# WS_ch03B.ipynb
# WESmith 11/10/22
## WS created this notebook to follow along with the chapter 3 code from the book
# 'Bioinformatics with Python Cookbook' by Tiago Antao
### Each recipe will have its own notebook, suffixed by A, B, etc.

# PERFORMING BASIC SEQUENCE ANALYSIS

In [1]:
from Bio import Entrez, SeqIO, SeqRecord
import os

In [2]:
Entrez.email = 'smiwarsky@gmail.com'  # required: must be set before the Entrez.efetch call below

In [3]:
# directory where the example FASTA file is written and later read back
data_dir = './data'

In [4]:
# request the GenBank-format ('gb') record of the human lactase gene, accession NM_002299
hdl = Entrez.efetch(
    db='nucleotide',
    id=['NM_002299'],
    rettype='gb',
)

In [5]:
# inspect what kind of object efetch handed back (a text file-like handle, see output)
type(hdl)

_io.TextIOWrapper

In [6]:
# parse the single GenBank record carried by the efetch response
gb_rec = SeqIO.read(hdl, 'gb')
# the handle is not needed once the record is parsed; close it so the
# underlying connection is released instead of lingering in the kernel
hdl.close()
type(gb_rec)

Bio.SeqRecord.SeqRecord

In [7]:
# get location of gene
# WS NOTE: CDS means 'coding sequence': the actual part of the sequence that codes for a protein
# Reset first so a value left over from an earlier run of this cell (or from
# another record) can never be mistaken for this record's CDS.
location = None
for feature in gb_rec.features:
    if feature.type == 'CDS':
        # if the record carries several CDS features, the last one wins
        location = feature.location
if location is None:
    raise ValueError('no CDS feature found in record {}'.format(gb_rec.id))
location, type(location)

(FeatureLocation(ExactPosition(15), ExactPosition(5799), strand=1),
 Bio.SeqFeature.FeatureLocation)

In [8]:
# cut the coding region out of the full record, then wrap it in its own SeqRecord
cds_seq = gb_rec.seq[location.start:location.end]
cds = SeqRecord.SeqRecord(cds_seq, id='NM_002299', description='LCT CDS only')

In [9]:
# WS here it is! the lactase coding sequence
print(cds.seq)
# Shown as the cell's result on its own line; pairing it with print(...) in a
# tuple displayed a spurious None, because print returns None.
len(cds.seq)

ATGGAGCTGTCTTGGCATGTAGTCTTTATTGCCCTGCTAAGTTTTTCATGCTGGGGGTCAGACTGGGAGTCTGATAGAAATTTCATTTCCACCGCTGGTCCTCTAACCAATGACTTGCTGCACAACCTGAGTGGTCTCCTGGGAGACCAGAGTTCTAACTTTGTAGCAGGGGACAAAGACATGTATGTTTGTCACCAGCCACTGCCCACTTTCCTGCCAGAATACTTCAGCAGTCTCCATGCCAGTCAGATCACCCATTATAAGGTATTTCTGTCATGGGCACAGCTCCTCCCAGCAGGAAGCACCCAGAATCCAGACGAGAAAACAGTGCAGTGCTACCGGCGACTCCTCAAGGCCCTCAAGACTGCACGGCTTCAGCCCATGGTCATCCTGCACCACCAGACCCTCCCTGCCAGCACCCTCCGGAGAACCGAAGCCTTTGCTGACCTCTTCGCCGACTATGCCACATTCGCCTTCCACTCCTTCGGGGACCTAGTTGGGATCTGGTTCACCTTCAGTGACTTGGAGGAAGTGATCAAGGAGCTTCCCCACCAGGAATCAAGAGCGTCACAACTCCAGACCCTCAGTGATGCCCACAGAAAAGCCTATGAGATTTACCACGAAAGCTATGCTTTTCAGGGCGGAAAACTCTCTGTTGTCCTGCGAGCTGAAGATATCCCGGAGCTCCTGCTAGAACCACCCATATCTGCGCTTGCCCAGGACACGGTCGATTTCCTCTCTCTTGATTTGTCTTATGAATGCCAAAATGAGGCAAGTCTGCGGCAGAAGCTGAGTAAATTGCAGACCATTGAGCCAAAAGTGAAAGTTTTCATCTTCAACCTAAAACTCCCAGACTGCCCCTCCACCATGAAGAACCCAGCCAGTCTGCTCTTCAGCCTTTTTGAAGCCATAAATAAAGACCAAGTGCTCACCATTGGGTTTGATATTAATGAGTTTCTGAGTTGTTCATCAAGTTCCAAGAAAAGCATGTCTTGTTCTC

(None, 5784)

### 1) WRITE FASTA FILE

In [10]:
file = 'example.fasta'
# make sure the output directory exists so the cell works on a fresh checkout
os.makedirs(data_dir, exist_ok=True)
# the context manager closes the file even if SeqIO.write raises
with open(os.path.join(data_dir, file), 'w') as w_hdl:
    SeqIO.write([cds], w_hdl, 'fasta')

### 2) READ FILE

In [11]:
# open the FASTA file written above; the records are iterated over in the next cell
fasta_path = os.path.join(data_dir, file)
recs = SeqIO.parse(fasta_path, 'fasta')

In [12]:
# show each record's description and its first ten bases
for rec in recs:
    print(rec.description)
    # seq stays bound after the loop; the transcription and translation cells use it
    seq = rec.seq
    print(seq[:10])

NM_002299 LCT CDS only
ATGGAGCTGT


### 3) TRANSCRIBE SEQUENCE

In [13]:
# transcribe the DNA sequence read from the FASTA file and show its first 15 letters
rna = seq.transcribe()
print(rna[:15]) # WS T goes to U

AUGGAGCUGUCUUGG


### 4) TRANSLATE INTO PROTEIN

In [14]:
# translate the DNA coding sequence into protein and show the first five residues
prot = seq.translate()
print(prot[:5])

MELSW


## CODON TABLES

In [15]:
# WS this from biopython site:
from Bio.Data import CodonTable
# look up two unambiguous-DNA codon tables by numeric id
standard_table = CodonTable.unambiguous_dna_by_id[1]  # table 1: 'Standard' (see printout below)
mito_table     = CodonTable.unambiguous_dna_by_id[2]  # presumably a mitochondrial table, going by the name -- TODO confirm

In [16]:
# print the table as a codon grid; '(s)' and 'Stop' annotations appear in the output
print(standard_table)

Table 1 Standard, SGC0

  |  T      |  C      |  A      |  G      |
--+---------+---------+---------+---------+--
T | TTT F   | TCT S   | TAT Y   | TGT C   | T
T | TTC F   | TCC S   | TAC Y   | TGC C   | C
T | TTA L   | TCA S   | TAA Stop| TGA Stop| A
T | TTG L(s)| TCG S   | TAG Stop| TGG W   | G
--+---------+---------+---------+---------+--
C | CTT L   | CCT P   | CAT H   | CGT R   | T
C | CTC L   | CCC P   | CAC H   | CGC R   | C
C | CTA L   | CCA P   | CAA Q   | CGA R   | A
C | CTG L(s)| CCG P   | CAG Q   | CGG R   | G
--+---------+---------+---------+---------+--
A | ATT I   | ACT T   | AAT N   | AGT S   | T
A | ATC I   | ACC T   | AAC N   | AGC S   | C
A | ATA I   | ACA T   | AAA K   | AGA R   | A
A | ATG M(s)| ACG T   | AAG K   | AGG R   | G
--+---------+---------+---------+---------+--
G | GTT V   | GCT A   | GAT D   | GGT G   | T
G | GTC V   | GCC A   | GAC D   | GGC G   | C
G | GTA V   | GCA A   | GAA E   | GGA G   | A
G | GTG V   | GCG A   | GAG E   | GGG G   | G
--+---------

In [17]:
# ws_utils is not part of Biopython (presumably a local helper module);
# judging by the output, attrs() reports the object's type followed by its public attribute names
import ws_utils as ws
ws.attrs(standard_table)

["OBJECT TYPE: <class 'Bio.Data.CodonTable.NCBICodonTableDNA'>",
 'back_table',
 'forward_table',
 'id',
 'names',
 'nucleotide_alphabet',
 'protein_alphabet',
 'start_codons',
 'stop_codons']

In [18]:
# WS the 20 amino acids: the distinct one-letter codes that codons map to in forward_table
set(standard_table.forward_table.values())

{'A',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'K',
 'L',
 'M',
 'N',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'V',
 'W',
 'Y'}

In [19]:
# back_table maps each amino acid to a single codon (and None to one stop codon), as the output shows
standard_table.back_table  # WS this disregards the many-to-one nature of the forward table

{'K': 'AAG',
 'N': 'AAT',
 'T': 'ACT',
 'R': 'CGT',
 'S': 'TCT',
 'I': 'ATT',
 'M': 'ATG',
 'Q': 'CAG',
 'H': 'CAT',
 'P': 'CCT',
 'L': 'TTG',
 'E': 'GAG',
 'D': 'GAT',
 'A': 'GCT',
 'G': 'GGT',
 'V': 'GTT',
 'Y': 'TAT',
 'C': 'TGT',
 'W': 'TGG',
 'F': 'TTT',
 None: 'TAA'}