# WS_ch03B.ipynb
# WESmith 11/10/22
## WS created this notebook to follow along with the chapter 3 code from the book
# 'Bioinformatics with Python Cookbook' by Tiago Antao
### Each recipe will have its own notebook, suffixed by A, B, etc.

# PERFORMING BASIC SEQUENCE ANALYSIS

In [None]:
from Bio import Entrez, Medline, SeqIO, SeqRecord
import os
import utils as ws
from collections import defaultdict

In [None]:
# Notebook-wide settings:
#   email    - contact address handed to ws.EntrezFetch when the record is fetched
#   data_dir - directory the FASTA file is written to and read back from
email    = 'smiwarsky@gmail.com' 
data_dir = 'data'

In [None]:
# WS created EntrezFetch class to do some of the basic data retrieval
# The identifier is kept in `record_id` rather than `id` so the builtin id()
# is not shadowed for the rest of the notebook session.
record_id = 'NM_002299'
dd = ws.EntrezFetch(id=record_id, email=email)

In [None]:
# NOTE(review): print_refs is a helper on ws.EntrezFetch (utils module, not shown here);
# presumably it lists the references of the fetched record in abbreviated form -- confirm.
dd.print_refs(short=True)

In [None]:
# NOTE(review): print_features is a helper on ws.EntrezFetch (utils module, not shown here);
# presumably it prints the full (non-abbreviated) feature listing of the record -- confirm.
dd.print_features(short=False)

In [None]:
# get location of gene
# WS NOTE: CDS means 'coding sequence': the actual part of the sequence that codes for a protein
vv = dd.rec[0]  # get the record from the new EntrezFetch object

# Start from None so that a record without a CDS feature is detected here,
# instead of raising NameError at the print below or silently reusing a stale
# `location` left over from an earlier run of this cell.
location = None
for feature in vv.features:
    if feature.type == 'CDS':
        location = feature.location  # if several CDS features exist, the last one wins
if location is None:
    raise ValueError('no CDS feature found in record {}'.format(vv.id))
print(location)

In [None]:
# Show the record's name and id; as the cell's last expression the pair is displayed as a tuple.
vv.name, vv.id

In [None]:
# Build a stand-alone record that holds only the slice of the fetched sequence
# between the CDS start and end; name and id are carried over from the source record.
cds = SeqRecord.SeqRecord(
    seq=vv.seq[location.start:location.end],
    id=vv.id,
    name=vv.name,
    description='LCT CDS only',
)

In [None]:
# WS the lactase (LCT) coding sequence, followed by its length.
# Two separate statements: joining the print() calls with a comma builds a
# (None, None) tuple that the notebook then echoes as the cell's output.
print(cds.seq)
print(len(cds.seq))

In [None]:
# Bare expression: the notebook displays the repr of the CDS-only record.
cds

### 1) WRITE FASTA FILE

In [None]:
file = 'ch03B.fasta'
# Make sure the output directory exists; open(..., 'w') does not create it.
os.makedirs(data_dir, exist_ok=True)
# The context manager closes the handle even if SeqIO.write raises.
with open(os.path.join(data_dir, file), 'w') as w_handle:
    SeqIO.write([cds], w_handle, 'fasta')

### 2) READ FILE

In [None]:
# SeqIO.parse returns a lazy, single-pass iterator of SeqRecord objects: once it
# has been consumed it yields nothing more, so re-run this cell to read the file again.
recs = SeqIO.parse(os.path.join(data_dir, file), 'fasta')

In [None]:
# Walk the parsed records, printing each description and the first 15 bases.
# `seq` intentionally stays bound after the loop: the transcription and
# translation cells further down operate on it.
for rec in recs:
    seq = rec.seq
    print(rec.description, seq[:15], sep='\n')

### 3) TRANSCRIBE SEQUENCE

In [None]:
# Transcribe the DNA sequence read from the FASTA file into RNA.
rna = seq.transcribe()
print(rna[:15]) # WS T goes to U (compare with the first 15 DNA bases printed above)

### 4) TRANSLATE INTO PROTEIN

In [None]:
# Translate the DNA sequence straight to protein. Called without arguments,
# translate() uses Biopython's default (standard) codon table and renders
# stop codons as '*'.
prot = seq.translate()
print(prot[:5])  # first five amino acids

## CODON TABLES

In [None]:
# WS this from biopython site:
# Tables are looked up by NCBI genetic-code id:
#   1 = Standard, 2 = Vertebrate Mitochondrial
from Bio.Data import CodonTable
standard_table = CodonTable.unambiguous_dna_by_id[1]
mito_table     = CodonTable.unambiguous_dna_by_id[2]

In [None]:
# Printing a CodonTable renders the whole codon -> amino-acid grid as text.
print(standard_table)

In [None]:
# NOTE(review): ws.attrs is a helper from the local utils module (not shown here);
# presumably it lists the attributes of the object it is given -- confirm.
ws.attrs(standard_table)

In [None]:
# Dump the table object's instance attributes (its __dict__).
# NOTE(review): ws.print_dict is a helper from the local utils module (not shown here);
# keywidth presumably sets the printed width of the key column -- confirm.
ws.print_dict(standard_table.__dict__, keywidth=15)

In [None]:
# WS the 20 amino acids
# forward_table maps each coding codon to its one-letter amino acid; stop codons
# are not part of it, so the distinct values are exactly the amino acids.
set(standard_table.forward_table.values())

In [None]:
# back_table is the reverse lookup (amino acid -> codon); it keeps a single codon
# per amino acid even though several codons may encode the same one.
standard_table.back_table  # WS this disregards the many-to-one nature of the forward table