# WS_ch05C.ipynb
# WESmith 11/28/22
## WS created this notebook to follow along with the code in the book:
## 'Bioinformatics with Python Cookbook' by Tiago Antao
## Each recipe will have its own notebook, suffixed by A, B, etc.

# RECIPE: 
# TRAVERSING GENOME ANNOTATIONS
## also see book code in Chapter05/Annotations.py

In [5]:
from collections import defaultdict
import os
import gffutils
import sqlite3

In [6]:
# Locations used by the rest of the notebook.
data_dir = 'data/ch05_data'                 # directory holding the chapter data
name = 'VectorBase-55_AgambiaePEST.gff'     # GFF annotation file name
file = os.path.join(data_dir, name)         # full path of the GFF file
save_it = os.path.join(data_dir, 'ag.db')   # path of the annotation database

### 1) CREATE ANNOTATION DATABASE

In [8]:
# Build the gffutils annotation database from the GFF file and store it at
# save_it (the first build took about 20 sec).
try:
    db = gffutils.create_db(file, save_it)
except sqlite3.OperationalError:
    # NOTE(review): presumably raised when the database file already exists
    # from an earlier run -- confirm against the gffutils docs. In that case
    # open the saved database instead of rebuilding it.
    db = gffutils.FeatureDB(save_it)

### 2) LIST AVAILABLE FEATURE TYPES

In [9]:
# Show every feature type reported by the annotation database as one list.
print([*db.featuretypes()])

['CDS', 'RNase_MRP_RNA', 'RNase_P_RNA', 'SRP_RNA', 'exon', 'five_prime_UTR', 'lnc_RNA', 'mRNA', 'ncRNA', 'ncRNA_gene', 'pre_miRNA', 'protein_coding_gene', 'pseudogene', 'pseudogenic_transcript', 'rRNA', 'snRNA', 'snoRNA', 'tRNA', 'three_prime_UTR']


In [28]:
# Print each feature type followed by the number of features of that type.
for ftype in db.featuretypes():
    n_features = db.count_features_of_type(ftype)
    print(ftype, n_features)

CDS 67394
RNase_MRP_RNA 1
RNase_P_RNA 1
SRP_RNA 3
exon 61590
five_prime_UTR 17472
lnc_RNA 2
mRNA 15125
ncRNA 4
ncRNA_gene 729
pre_miRNA 77
protein_coding_gene 13094
pseudogene 9
pseudogenic_transcript 9
rRNA 242
snRNA 35
snoRNA 2
tRNA 362
three_prime_UTR 12236


### 3) LIST ALL SEQIDS

In [19]:
# Collect the distinct seqid values seen across all features in the database,
# then print them one per line (a set, so the order is not fixed).
seqids = {feature.seqid for feature in db.all_features()}
for seqid in seqids:
    print(seqid)

AgamP4_UNKN
AgamP4_3L
AgamP4_X
AgamP4_2R
AgamP4_Mt
AgamP4_2L
AgamP4_Y_unplaced
AgamP4_3R


### 4) EXTRACT INFORMATION PER CHROMOSOME

In [41]:
# Scan the protein-coding genes of every seqid and accumulate:
#   num_mRNAs[n] - number of genes having n mRNA children
#   num_exons[n] - number of checked features having n exon children
#   max_span     - largest abs(start - end) over all genes (max_span_gene)
#   max_exons    - largest exon count over all checked features (max_exons_gene)
num_mRNAs = defaultdict(int)
num_exons = defaultdict(int)
max_span = 0
max_exons = 0
tot = 0  # WS: genes summed over all seqids
for seqid in seqids:
    cnt = 0
    for gene in db.region(seqid=seqid, featuretype='protein_coding_gene'):
        cnt += 1

        # Track the gene with the widest coordinate span; abs() keeps the
        # value non-negative whichever coordinate is larger.
        span = abs(gene.start - gene.end)
        if span > max_span:
            max_span, max_span_gene = span, gene

        # WS interpretation: the gene's mRNA children are the 'alternative
        # transcripts' in the book terminology.
        transcripts = list(db.children(gene, featuretype='mRNA'))
        num_mRNAs[len(transcripts)] += 1

        # When a gene has no mRNA children, count the exons found as
        # children of the gene itself instead.
        for parent in (transcripts or [gene]):
            n_exons = len(list(db.children(parent, featuretype='exon')))
            num_exons[n_exons] += 1
            if n_exons > max_exons:
                max_exons, max_exons_gene = n_exons, gene
    tot += cnt
    print(f'seqid {seqid}, number of genes {cnt}')
print('total number of genes {}'.format(tot))  # WS

seqid AgamP4_UNKN, number of genes 478
seqid AgamP4_3L, number of genes 2216
seqid AgamP4_X, number of genes 1073
seqid AgamP4_2R, number of genes 3670
seqid AgamP4_Mt, number of genes 13
seqid AgamP4_2L, number of genes 2950
seqid AgamP4_Y_unplaced, number of genes 2
seqid AgamP4_3R, number of genes 2692
total number of genes 13094


In [42]:
# WS modified printout, with WS interpretation: id of the gene recorded with
# the largest exon count, followed by that count.
print('gene with the max number of exons: %s (%d)'
      % (max_exons_gene.id, max_exons))

gene with the max number of exons: AGAP001660 (69)


In [43]:
# Report the id of the gene recorded with the largest span, and that span.
print('gene with the max span: %s (%d)'
      % (max_span_gene.id, max_span))

gene with the max span: AGAP006656 (365621)


In [59]:
# WS interpretation of the dict: each key of num_mRNAs is a number of mRNA
# transcripts and each value is how many genes have that many.
for n_transcripts, n_genes in num_mRNAs.items():
    print('{:5} genes have {:2} mRNA transcript (ie alternative transcripts)'
          .format(n_genes, n_transcripts))

# Totals, as a check against the counts printed earlier.
tot_gene = sum(num_mRNAs.values())
tot_tran = sum(n_genes * n_transcripts
               for n_transcripts, n_genes in num_mRNAs.items())
print('total number of genes: {}\ntotal number of mRNA transcripts: {}'
      .format(tot_gene, tot_tran))

11753 genes have  1 mRNA transcript (ie alternative transcripts)
  964 genes have  2 mRNA transcript (ie alternative transcripts)
  234 genes have  3 mRNA transcript (ie alternative transcripts)
   85 genes have  4 mRNA transcript (ie alternative transcripts)
   27 genes have  5 mRNA transcript (ie alternative transcripts)
    2 genes have 12 mRNA transcript (ie alternative transcripts)
    3 genes have  9 mRNA transcript (ie alternative transcripts)
    5 genes have  7 mRNA transcript (ie alternative transcripts)
    4 genes have  8 mRNA transcript (ie alternative transcripts)
   10 genes have  6 mRNA transcript (ie alternative transcripts)
    1 genes have 20 mRNA transcript (ie alternative transcripts)
    3 genes have 11 mRNA transcript (ie alternative transcripts)
    1 genes have 10 mRNA transcript (ie alternative transcripts)
    2 genes have 13 mRNA transcript (ie alternative transcripts)
total number of genes: 13094
total number of mRNA transcripts: 15125


In [57]:
# WS interpretation of the dict: each key of num_exons is an exon count and
# each value is how many checked transcripts have that many exons.
for n_exons, n_transcripts in num_exons.items():
    print('{:5} mRNA transcripts have {:2} exons'.format(n_transcripts, n_exons))

# A check: the values should add up to the total number of transcripts.
tot = sum(num_exons.values())
print('total number of mRNA transcripts: {}'.format(tot))

 1552 mRNA transcripts have  5 exons
 1207 mRNA transcripts have  1 exons
 2059 mRNA transcripts have  4 exons
 3159 mRNA transcripts have  2 exons
 2763 mRNA transcripts have  3 exons
  786 mRNA transcripts have  7 exons
  530 mRNA transcripts have  8 exons
 1126 mRNA transcripts have  6 exons
  282 mRNA transcripts have 11 exons
   65 mRNA transcripts have 15 exons
  194 mRNA transcripts have 12 exons
  459 mRNA transcripts have  9 exons
  340 mRNA transcripts have 10 exons
   27 mRNA transcripts have 21 exons
   36 mRNA transcripts have 20 exons
  119 mRNA transcripts have 13 exons
   96 mRNA transcripts have 14 exons
   76 mRNA transcripts have 17 exons
    6 mRNA transcripts have 27 exons
   46 mRNA transcripts have 18 exons
    8 mRNA transcripts have 26 exons
   65 mRNA transcripts have 16 exons
   32 mRNA transcripts have 19 exons
   15 mRNA transcripts have 23 exons
    6 mRNA transcripts have 29 exons
   10 mRNA transcripts have 31 exons
    6 mRNA transcripts have 28 exons
 