# WS_ch03B.ipynb
# WESmith 11/10/22
## WS created this notebook to follow along with the chapter 3 code from the book
# 'Bioinformatics with Python Cookbook' by Tiago Antao
### Each recipe will have its own notebook, suffixed by A, B, etc.

# PERFORMING BASIC SEQUENCE ANALYSIS

In [None]:
from Bio import Entrez, Medline, SeqIO, SeqRecord
import os
import utils as ws

In [None]:
email = 'smiwarsky@gmail.com' 
Entrez.email = email # required

In [None]:
data_dir = 'data'

In [None]:
class EntrezFetch():
    '''
    WS convenience class to fetch data from Entrez databases.

    On construction the requested records are fetched with Entrez.efetch
    and parsed with SeqIO.parse into self.rec (a list of records).

    id:      ID of object(s) to fetch (str, or list/tuple of str) (required).
             If a single object, a list of one object is made internally.
    db:      database type (str) (default: 'nucleotide')
    rettype: return type (str) (default: 'gb'); also used as the SeqIO
             format name when parsing the response
    email:   email address of requestor (str). When empty, an already
             configured Entrez.email is left untouched.

    Raises ValueError when id is not supplied.
    '''
    def __init__(self, id=None, db='nucleotide', rettype='gb', email=''):
        # Validate before any network activity so a forgotten id fails fast
        # with a clear message (the parameter name 'id' is kept for callers).
        if id is None:
            raise ValueError('id is required: pass an ID string or a list of IDs')
        if isinstance(id, (list, tuple)):
            id = list(id)
        else:
            id = [id]
        self.id      = id
        self.db      = db
        self.rettype = rettype
        # Do not clobber a globally configured address with an empty string.
        if email:
            Entrez.email = email
        self.email   = Entrez.email
        self.info(db=self.db, rettype=self.rettype)
        self.handle  = Entrez.efetch(db=self.db,
                                     id=self.id,
                                     rettype=self.rettype)
        try:
            # Materialise the parser output before the handle is closed.
            self.rec = list(SeqIO.parse(self.handle, self.rettype))
        finally:
            self.handle.close()
        self.refs    = None  # filled lazily by references()

    def info(self, db='N/A', rettype='N/A', mode='N/A'):
        '''
        Print a one-line progress message describing the pending fetch.
        '''
        print('GETTING DATA FROM {} DB OF TYPE {} AS MODE {} ...'
              .format(db, rettype, mode))

    def references(self, db='pubmed', rettype='medline', retmode='text'):
        '''
        Get publications for each record.

        Returns a list of dictionaries, one per reference that carries a
        PubMed ID: {pubmed_id: [parsed Medline records]}.
        The result is cached on first use; later calls return the cached
        list and ignore their arguments.
        '''
        if self.refs is not None:
            return self.refs

        self.info(db=db, rettype=rettype, mode=retmode)
        out = []
        for j, rec in enumerate(self.rec):
            try:
                rr = rec.annotations['references']
            except KeyError:
                print('NO REFS FOR RECORD {}'.format(j))
                continue
            for ref in rr:
                if not ref.pubmed_id:
                    continue  # reference without a PubMed entry
                handle = Entrez.efetch(db=db, id=[ref.pubmed_id],
                                       rettype=rettype, retmode=retmode)
                try:
                    dd = list(Medline.parse(handle))  # make a list from the generator
                finally:
                    handle.close()
                out.append({ref.pubmed_id: dd})
        self.refs = out  # a list of dictionaries: one dictionary per reference
        return self.refs

    def print_refs(self, short=True):
        '''
        Print title, journal and citation of every fetched publication.

        short: when False the abstract is printed as well (default: True).
        Fields missing from a Medline record are printed as 'N/A'.
        '''
        for ref_dict in self.references():
            for med_recs in ref_dict.values():
                for med in med_recs:
                    title   = med.get('TI', 'N/A')
                    journal = med.get('JT', 'N/A')
                    cit     = med.get('SO', 'N/A')
                    if short:
                        print('TITLE: {}\nJOURNAL: {}\nCIT: {}\n\n'
                              .format(title, journal, cit))
                    else:
                        print('TITLE: {}\nABSTRACT: {}\nJOURNAL: {}\nCIT: {}\n\n'
                              .format(title, med.get('AB', 'N/A'), journal, cit))

In [None]:
# Fetch the NM_002299 record; the variable is named seq_id so the builtin
# id() is not shadowed for the rest of the session.
seq_id = 'NM_002299'
dd = EntrezFetch(id=seq_id, email=email)

In [None]:
dd.print_refs()

In [None]:
# NOTE(review): short=True is already the default of print_refs, so this
# repeats the call above; pass short=False to include the abstracts.
dd.print_refs(short=True)

In [None]:
rr = dd.references()

In [None]:
# Walk the list of {pubmed_id: [Medline records]} dictionaries and print a
# short citation for every record.
summary = 'TITLE: {}\nJOURNAL: {}\nCIT: {}\n\n'
for ref_map in rr:
    for med_recs in ref_map.values():
        for med in med_recs:
            print(summary.format(med['TI'], med['JT'], med['SO']))

In [None]:
for k in dd.rec:
    ws.print_dict(k.__dict__)

In [None]:
for k in dd.rec:
    print(k.annotations['references'])

In [None]:
hdl = Entrez.efetch(db='nucleotide', id=['NM_002299'], rettype='gb') # human lactase gene

In [None]:
type(hdl)

In [None]:
gb_rec = SeqIO.read(hdl, 'gb')

In [None]:
ws.print_dict(gb_rec.__dict__)

In [None]:
# references
refs = gb_rec.annotations['references']

In [None]:
refs

In [None]:
def get_refs(refs, db='pubmed', rettype='medline', retmode='text'):
    '''
    Get a dictionary of publications for this record.

    refs:    iterable of reference objects exposing a pubmed_id attribute
    db:      Entrez database to query (default: 'pubmed')
    rettype: Entrez return type (default: 'medline')
    retmode: Entrez return mode (default: 'text')

    Returns {pubmed_id: first parsed Medline record}. References without a
    PubMed ID, or whose fetch yields no Medline record, are skipped.
    '''
    out = {}
    for ref in refs:
        if not ref.pubmed_id:
            continue  # nothing to look up for this reference
        handle = Entrez.efetch(db=db, id=[ref.pubmed_id],
                               rettype=rettype, retmode=retmode)
        try:
            records = list(Medline.parse(handle))  # make a list from the generator
        finally:
            handle.close()
        if not records:
            print('NO MEDLINE RECORD FOR PUBMED ID {}'.format(ref.pubmed_id))
            continue
        out[ref.pubmed_id] = records[0]  # assumes only 1 reference per record
    return out

In [None]:
dd = get_refs(refs)

In [None]:
for k,v in dd.items():
    print(k, len(v))

In [None]:
ws.print_dict(dd, keywidth=10)

In [None]:
# Print every field of the Medline record(s) behind each reference that has
# a PubMed ID; the handle is closed even if parsing fails part-way.
for ref in refs:
    if not ref.pubmed_id:
        continue
    print(ref.pubmed_id)
    handle = Entrez.efetch(db="pubmed", id=[ref.pubmed_id],
                           rettype="medline", retmode="text")
    try:
        for med_rec in Medline.parse(handle):
            for k, v in med_rec.items():
                print('%s: %s' % (k, v))
    finally:
        handle.close()

In [None]:
# get location of gene
# WS NOTE: CDS means 'coding sequence': the actual part of the sequence that codes for a protein
cds_locations = [feature.location for feature in gb_rec.features
                 if feature.type == 'CDS']
if not cds_locations:
    # Fail with a clear message instead of a NameError on 'location' below.
    raise ValueError('no CDS feature found in record {}'.format(gb_rec.id))
location = cds_locations[-1]  # if several CDS features exist, the last one is used
print(location)

In [None]:
# Wrap the slice of the sequence between the CDS start and end in its own
# SeqRecord.
cds = SeqRecord.SeqRecord(
    seq=gb_rec.seq[location.start:location.end],
    id='NM_002299',
    description='LCT CDS only',
)

In [None]:
# WS here it is! the lactase (LCT) coding sequence, followed by its length
print(cds.seq)
print(len(cds.seq))

### 1) WRITE FASTA FILE

In [None]:
file = 'example.fasta'
# Make sure the output directory exists, then write the CDS record; the
# with-block closes the file even if SeqIO.write raises.
os.makedirs(data_dir, exist_ok=True)
with open(os.path.join(data_dir, file), 'w') as w_hdl:
    SeqIO.write([cds], w_hdl, 'fasta')

### 2) READ FILE

In [None]:
recs = SeqIO.parse(os.path.join(data_dir, file), 'fasta')

In [None]:
# Print the description and the first 10 bases of every record in the file.
# NOTE: recs comes from SeqIO.parse and is consumed by this loop; seq stays
# bound to the last record's sequence afterwards, and the transcribe and
# translate cells below read it.
for rec in recs:
    seq = rec.seq
    print(rec.description)
    print(seq[:10])

### 3) TRANSCRIBE SEQUENCE

In [None]:
rna = seq.transcribe()
print(rna[:15]) # WS T goes to U

### 4) TRANSLATE INTO PROTEIN

In [None]:
prot = seq.translate()
print(prot[:5])

## CODON TABLES

In [None]:
# WS this from biopython site:
from Bio.Data import CodonTable
standard_table = CodonTable.unambiguous_dna_by_id[1]
mito_table     = CodonTable.unambiguous_dna_by_id[2]

In [None]:
print(standard_table)

In [None]:
import utils as ws
ws.attrs(standard_table)

In [None]:
# WS the 20 amino acids
set(standard_table.forward_table.values())

In [None]:
standard_table.back_table  # WS this disregards the many-to-one nature of the forward table