# WS_ch05C.ipynb
# WESmith 11/28/22
## WS created this notebook to follow along with the code in the book:
## 'Bioinformatics with Python Cookbook' by Tiago Antao
## Each recipe will have its own notebook, suffixed by A, B, etc.

# RECIPE: 
# TRAVERSING GENOME ANNOTATIONS
## also see book code in Chapter05/Annotations.py

In [None]:
from collections import defaultdict
import os
import gffutils
import sqlite3

In [None]:
# Large datasets are kept under ~/Downloads so that they are not backed up.
remote_data_dir = '/home/smithw/Downloads/bioinformatics/ch05_data'
# Leaf names of the GFF annotation file and of the database file.
gambiae_gff, gambiae_gff_db = 'VectorBase-55_AgambiaePEST.gff', 'gambiae_gff.db'
# Full paths of both files inside the data directory.
file, save_it = (
    os.path.join(remote_data_dir, leaf)
    for leaf in (gambiae_gff, gambiae_gff_db)
)

### 1) CREATE ANNOTATION DATABASE

In [None]:
# this took 20 sec to create the db the first time
# Build the gffutils database at `save_it` from the GFF file at `file`; if that
# raises sqlite3.OperationalError, open `save_it` as an existing database instead.
# NOTE(review): presumably the error fires when the db file already exists from
# an earlier run -- confirm against the gffutils documentation.
try:
    db = gffutils.create_db(file, save_it)
except sqlite3.OperationalError:
    db = gffutils.FeatureDB(save_it)
### 2) LIST AVAILABLE FEATURE TYPES

In [None]:
# Show every feature type the database reports, as a single list.
print([*db.featuretypes()])

In [None]:
# One line per feature type: its name followed by how many such features exist.
for ftype in db.featuretypes():
    n_features = db.count_features_of_type(ftype)
    print(ftype, n_features)

### 3) LIST ALL SEQIDS

In [None]:
# Gather the distinct seqids found among all features of the database,
# then print them one per line (set order, i.e. unspecified).
seqids = {feature.seqid for feature in db.all_features()}
for seqid in seqids:
    print(seqid)

### 4) EXTRACT INFORMATION PER CHROMOSOME

In [None]:
# Survey the protein-coding genes of every seqid.
#   num_mRNAs[n] -> number of genes that have n mRNA children
#   num_exons[n] -> number of exon-checked features (each mRNA of a gene, or
#                   the gene itself when it has no mRNA) that have n exons
num_mRNAs = defaultdict(int)
num_exons = defaultdict(int)
max_exons = 0
max_span = 0
# Bind the "winner" names up front so that a re-run of this cell can never
# leave a stale gene from an earlier execution when no gene qualifies.
max_exons_gene = None
max_span_gene = None
tot = 0  # WS: running total of genes over all seqids
# `seqids` is a set, so its iteration order is unspecified; sorting makes the
# printout order and the tie-breaking between seqids reproducible across runs.
for seqid in sorted(seqids):
    cnt = 0
    for gene in db.region(seqid=seqid, featuretype='protein_coding_gene'):
        cnt += 1
        span = abs(gene.start - gene.end)  # strand
        if span > max_span:
            max_span = span
            max_span_gene = gene
        # WS interpretation: my_mRNAs are the number of 'alternative transcripts'
        #    in the book terminology
        my_mRNAs = list(db.children(gene, featuretype='mRNA'))
        num_mRNAs[len(my_mRNAs)] += 1
        # A gene without mRNA children is checked for exons directly;
        # otherwise each of its mRNAs is checked (WS: 'alternative transcripts'?).
        exon_check = my_mRNAs or [gene]
        for check in exon_check:
            my_exons = list(db.children(check, featuretype='exon'))
            num_exons[len(my_exons)] += 1
            if len(my_exons) > max_exons:
                max_exons = len(my_exons)
                max_exons_gene = gene
    tot += cnt
    print(f'seqid {seqid}, number of genes {cnt}')
print('total number of genes {}'.format(tot))  # WS

In [None]:
# WS modified printout, with WS interpretation:
# id of the gene recorded alongside the largest exon count, and that count.
print(
    'gene with the max number of exons: %s (%d)'
    % (max_exons_gene.id, max_exons)
)

In [None]:
# id of the gene recorded alongside the largest span, and that span.
print(
    'gene with the max span: %s (%d)'
    % (max_span_gene.id, max_span)
)

In [None]:
# WS interpretation of the dict: each key of num_mRNAs is a transcript count,
# each value the number of genes having that many mRNA transcripts.
for n_transcripts, n_genes in num_mRNAs.items():
    print(
        '{:5} genes have {:2} mRNA transcript (ie alternative transcripts)'
        .format(n_genes, n_transcripts)
    )
# Totals, as a check: genes summed over all entries, and transcripts as
# (genes x transcripts-per-gene) summed over all entries.
tot_gene = sum(num_mRNAs.values())
tot_tran = sum(n_genes * n_transcripts
               for n_transcripts, n_genes in num_mRNAs.items())
print(
    'total number of genes: {}\ntotal number of mRNA transcripts: {}'
    .format(tot_gene, tot_tran)
)

In [None]:
# WS interpretation of the dict: each key of num_exons is an exon count, each
# value the number of checked features having that many exons.
# NOTE: the survey cell checks each mRNA of a gene, or the gene itself when it
# has no mRNA, so the counts may include such genes as well as mRNA transcripts.
for n_exons, n_transcripts in num_exons.items():
    print('{:5} mRNA transcripts have {:2} exons'.format(n_transcripts, n_exons))
# a check: total number of transcripts
tot = sum(num_exons.values())
print('total number of mRNA transcripts: {}'.format(tot))