# WS_ch05D.ipynb
# WESmith 11/29/22
## WS created this notebook to follow along with the code in the book:
## 'Bioinformatics with Python Cookbook' by Tiago Antao
## Each recipe will have its own notebook, suffixed by A, B, etc.

# RECIPE: 
# EXTRACTING GENES FROM A REFERENCE USING ANNOTATIONS
## also see book code in Chapter05/Getting_Gene.py

In [None]:
import os
from collections import defaultdict, OrderedDict  # WS added OrderedDict
import gffutils
from Bio import Seq, SeqIO
import matplotlib.pyplot as plt
import utils as ws

In [None]:
# Large data files are kept under Downloads so that they are not backed up.
remote_data_dir = '/home/smithw/Downloads/bioinformatics/ch05_data'

# File names expected inside remote_data_dir.
# NOTE(review): the GFF name says VectorBase-55 while the FASTA name says
#               VectorBase-57 -- confirm the two releases are meant to be mixed.
gambiae_gff = 'VectorBase-55_AgambiaePEST.gff'
gambiae_gff_db = 'gambiae_gff.db'  # this database was created in WS_ch05C.ipynb
gambiae_fasta = 'VectorBase-57_AgambiaePEST_Genome.fasta'

# Full paths of the annotation database and of the genome FASTA file.
db_name, fasta_name = (
    os.path.join(remote_data_dir, file_name)
    for file_name in (gambiae_gff_db, gambiae_fasta)
)

In [None]:
# Open the existing gffutils annotation database stored at db_name.
db = gffutils.FeatureDB(db_name)

### 1) GET GENE ANNOTATION INFORMATION

In [None]:
# voltage-gated sodium channel (VGSC) gene
gene_id = 'AGAP004707'

In [None]:
# Look the gene up in the annotation database by its ID and print the feature.
gene = db[gene_id]
print(gene)

In [None]:
print(gene.seqid, gene.strand)

### 2) GET THE SEQUENCE OF THE CHROMOSOME CONTAINING THE GENE

In [None]:
# Scan the genome FASTA for the record whose ID equals the gene's seqid and
# keep that record's sequence in my_seq. Each record description is printed as
# it is read, so the output lists every record up to and including the match.
# The file is opened in a `with` block so the handle is closed once the scan
# ends (the original left it open).
with open(fasta_name, 'rt', encoding='utf-8') as fasta_handle:
    recs = SeqIO.parse(fasta_handle, 'fasta')
    for rec in recs:
        print(rec.description)
        if rec.id == gene.seqid:
            my_seq = rec.seq
            break
    else:
        # No record matched: fail here instead of leaving my_seq undefined
        # (or stale from an earlier run of this cell).
        raise LookupError(
            'no record with id {!r} found in {}'.format(gene.seqid, fasta_name))

In [None]:
my_seq  # WS sequence for entire chromosome 2L

### 3) CONSTRUCT A FUNCTION TO RETURN A SEQUENCE FROM A LIST OF CDS

In [None]:
# WS added 'offset' to generalize to different phases
def get_sequence(chrom_seq, CDSs, strand, offset=0):
    """Concatenate the CDS windows of a transcript into a single sequence.

    Args:
        chrom_seq: Sequence of the chromosome holding the CDSs. It is sliced
            with 0-based indices and each slice is converted with ``str``.
        CDSs: Iterable (list or generator) of features exposing ``start`` and
            ``end``; they are treated as 1-based, inclusive coordinates and
            are concatenated in the order given.
        strand: ``'+'`` returns the concatenation unchanged; any other value
            returns its reverse complement.
        offset: WS addition -- number of bases by which EVERY CDS window is
            shifted along the chromosome (default 0). Note that this shifts
            each CDS separately; it is not a shift of the joined sequence.

    Returns:
        A ``Seq`` with the joined CDS windows (reverse-complemented unless
        ``strand`` is ``'+'``).

    WS NOTE 12/3/22: the frame column of the gff file is not used here (the
    book author left this as an open "FRAME???" question); a more complete
    function, 'protein_from_gene', is planned to take the frame into account.
    """
    # Collect the pieces as plain strings and join them once, instead of
    # growing a Seq with += (which re-copies the accumulated sequence on
    # every iteration).
    pieces = (
        str(chrom_seq[cds.start - 1 + offset: cds.end + offset])
        for cds in CDSs
    )
    seq = Seq.Seq(''.join(pieces))
    return seq if strand == '+' else seq.reverse_complement()

### 4) CHOOSE AN mRNA TRANSCRIPT FOR THE GENE FROM THOSE AVAILABLE

In [None]:
# Children of the gene restricted to feature type 'mRNA'.
# NOTE: a generator can be iterated only once -- re-run this cell before
#       re-running the loop that consumes mRNAs.
mRNAs = db.children(gene, featuretype='mRNA') # returns a generator

In [None]:
# Walk through the transcripts, printing each ID, and stop at the first one
# whose ID ends with 'RA'; `mRNA` stays bound to that transcript afterwards.
for mRNA in mRNAs:
    print(mRNA.id)
    if mRNA.id.endswith('RA'): # WS why choose 'RA'?
        break
else:
    # Reached only when the loop never hit `break`. Without this check `mRNA`
    # would silently be the last transcript seen, or a stale value from an
    # earlier run if the mRNAs generator was already exhausted.
    raise LookupError("no mRNA transcript with an ID ending in 'RA' was found")

In [None]:
print(mRNA)

### 5) GET THE CDSs FOR THIS PARTICULAR mRNA TRANSCRIPT,
### THEN THE NUCLEOTIDE SEQUENCE THEY REPRESENT, 
### THEN THE PROTEIN

In [None]:
# CDS children of the selected transcript, ordered by start coordinate.
CDSs = db.children(mRNA, featuretype='CDS', order_by='start')
# After this line CDSs has been consumed; use CDS_list from here on.
CDS_list = list(CDSs)   #WS turn generator into a list to examine

In [None]:
len(CDS_list)  
# I counted 30 CDSs for the 'RA' mRNA parent directly from the gff 
# file using emacs: correct

In [None]:
for k in CDS_list:
    print(k.id, k.frame)

In [None]:
CDS_list[0].__dict__

In [None]:
# Join the transcript's CDS windows cut from the chromosome sequence, using
# the strand recorded on the gene feature and the default offset of 0.
gene_seq = get_sequence(my_seq, CDS_list, gene.strand) # WS sending in a list

In [None]:
print(len(gene_seq), gene_seq)

In [None]:
# Translate the joined coding sequence (translate() called with its defaults)
# and print the protein together with its length.
prot = gene_seq.translate()
print(len(prot), prot)

### 6) GET GENE CODED IN NEGATIVE STRAND DIRECTION

In [None]:
reverse_gene_id = 'AGAP004708'  # WS added gene-level stuff
reverse_gene = db[reverse_gene_id]

In [None]:
print(reverse_gene.seqid, reverse_gene.strand) # explicitly shows '-'

In [None]:
reverse_transcript_id = 'AGAP004708-RA'

In [None]:
# CDS children of the reverse-strand transcript, looked up by transcript ID,
# ordered by start coordinate and materialized as a list.
reverse_CDSs = list(db.children(reverse_transcript_id, featuretype='CDS', 
                                order_by='start'))

In [None]:
for k in reverse_CDSs:  # WS print
    print(k.id, k.start)

In [None]:
# Join the CDS windows and reverse-complement the result (strand passed as '-').
# NOTE(review): my_seq was selected using gene.seqid of the first gene; this
#               presumably relies on reverse_gene.seqid being the same
#               chromosome -- confirm.
reverse_seq = get_sequence(my_seq, reverse_CDSs, '-')

In [None]:
print(len(reverse_seq), reverse_seq)

In [None]:
reverse_prot = reverse_seq.translate()
print(len(reverse_prot), reverse_prot)

### WS EXPERIMENTS

In [None]:
# WS NEXT1: find the stop codon in the above;
# then look at the other 5 frames for this sequence, and count the stop
# codons; the correct sequence only has 1 stop codon?

In [None]:
# look at the genetic code to see stop codons
from Bio.Data import CodonTable
standard_table = CodonTable.unambiguous_dna_by_id[1]
mito_table     = CodonTable.unambiguous_dna_by_id[2]
print(standard_table)

In [None]:
# Histogram of the symbols in the forward-strand protein, one line per symbol.
# NOTE(review): ws.amino_histo comes from the local utils module; presumably
#               it returns a mapping of symbol -> count -- confirm.
ordered_prot_hist = ws.amino_histo(prot)
for j, k in ordered_prot_hist.items():
    print('amino acid {} occurs {:4} times'.format(j, k))

In [None]:
# WS: the goal is to look at the 6 reading frames ('+' and '-' strand, 3 phases
# each) and count how many stop ('*') codons show up per case. The loop below
# actually runs 14 combinations: both strands times the offsets -3..3.
# Expectation: the 'true' '+' strand with zero offset should show only 1 stop
# codon (as per the Cell book).
x1 = ['+', '-']
x2 = list(range(-3,4))
vals = [(x,y) for x in x1 for y in x2]
for (j, k) in vals:
    # The same offset k is applied to every CDS window before they are joined.
    ss = get_sequence(my_seq, CDS_list, j, offset=k)
    dd = ss.translate()
    out = ws.amino_histo(dd)
    print('strand: {}, phase: {:2}, number of stop signals: {:4}'.format(j, k, out['*']))
# RESULT: only 1 combination is showing 1 stop signal.
# NOTE: not clear why the +3 offset isn't showing at most 2 stop signals (the
#       original one and possibly another one at the new end), and why the -3
#       offset isn't showing at most 1 stop (it loses the one at the end, and
#       picks up at most 1 at the new beginning).
# POSSIBLE EXPLANATION: we're concatenating CDS sections and reading each CDS
#       with the same offset, so they go together in complex ways: we're not
#       offsetting a single strand, but multiple concatenated strands; the ONLY
#       case with the single 'stop' is the '+' strand with 0 offset on all of
#       the CDSs.

In [None]:
sss = get_sequence(my_seq, CDS_list, '+', offset=0)
dd = sss.translate()
ws.amino_histo(dd, order=True)
# this gets a 'stop' once: good

In [None]:
# WS now look at reverse complement of '+' thread and see how many stop ('*') codons show up
sss

In [None]:
reverse_gene_seq = sss.reverse_complement()
reverse_gene_seq

In [None]:
reverse_gene_seq_prot = reverse_gene_seq.translate()

In [None]:
ordered_reverse_prot_hist = ws.amino_histo(reverse_gene_seq_prot, order=True)
for j, k in ordered_reverse_prot_hist.items():
    print('amino acid {} occurs {:4} times'.format(j, k))  
    # WS this gets a 'stop' 99 times: obviously bogus