# WS_ch03A.ipynb
# WESmith 11/09/22
## WS created this notebook to follow along with the chapter 3 code from the book
# 'Bioinformatics with Python Cookbook' by Tiago Antao
### Each recipe will have its own notebook, suffixed by A, B, etc.
## see the link below for reference to SeqIO data structures
### http://biopython.org/DIST/docs/tutorial/Tutorial.html

# ACCESSING GENBANK AND MOVING AROUND NCBI DATABASES

### 1) IMPORT MODULES, CONFIGURE EMAIL

In [None]:
from Bio import Entrez, SeqIO

In [None]:
# Contact address registered on Biopython's Entrez module before any query
# is issued in the cells below.
Entrez.email = 'smiwarsky@gmail.com'  # required

### 2) FIND CHLOROQUINE RESISTANCE TRANSPORTER (CRT) GENE IN PLASMODIUM FALCIPARUM

In [None]:
# Search the nucleotide database for CRT gene entries of Plasmodium falciparum.
# The handle wraps a network response, so it is closed as soon as the result
# has been parsed, even if parsing fails.
handle = Entrez.esearch(db='nucleotide', term='CRT[Gene Name] AND "Plasmodium falciparum"[Organism]')
try:
    rec_list = Entrez.read(handle)
finally:
    handle.close()

In [None]:
# Dump every field of the search result as "key = value", one per line.
for field in rec_list:
    print(f'{field} = {rec_list[field]}')

### 3) RETRIEVE THE RECORDS

In [None]:
# Take the IDs found by the search and request the matching records
# from the nucleotide database with rettype 'gb'.
id_list = rec_list['IdList']
hdl = Entrez.efetch(
    db='nucleotide',
    id=id_list,
    rettype='gb',
)

In [None]:
# Inspect what kind of object efetch returned.
type(hdl)

### 4) READ AND PARSE RESULT

In [None]:
# Parse every record from the efetch response into a list, then close the
# handle: it is consumed by this single pass and no later cell reads from it.
try:
    recs = list(SeqIO.parse(hdl, 'gb'))
finally:
    hdl.close()

### 5) LOOK AT A SINGLE RECORD (A DIFFERENT RECORD THAN IN BOOK)

In [None]:
# Display the name of the first parsed record.
recs[0].name

In [None]:
# Display the description of the first parsed record.
recs[0].description

In [None]:
# Show the built-in help for the first record object.
help(recs[0])

In [None]:
# ws_utils is a helper module that lives outside this notebook.
# Presumably attrs() lists an object's attributes and skip=True trims the
# listing -- confirm against ws_utils.
import ws_utils as ws
ws.attrs(recs[0], skip=True)

In [None]:
# Pass the object returned by the record's reverse_complement() to ws.attrs().
ws.attrs(recs[0].reverse_complement())  # WS exploring

In [None]:
# Display the annotations of the first record.
recs[0].annotations

In [None]:
# Display the features of the first record.
recs[0].features  # WS exploring

In [None]:
# Pass the first feature of the first record to ws.attrs().
ws.attrs(recs[0].features[0])

In [None]:
# Pass the qualifiers of that first feature to ws.attrs().
ws.attrs(recs[0].features[0].qualifiers)

In [None]:
# Print name and description of the first two records. Slicing instead of
# indexing means a search that returned fewer than two records prints what
# is available rather than raising IndexError.
for rec in recs[:2]:
    print('{}\n{}\n'.format(rec.name, rec.description))

In [None]:
# WS exploring: print the qualifiers of every feature of the first record,
# each preceded by a blank line.
for feat in recs[0].features:
    print(f'\n{feat.qualifiers}')

### 6) EXTRACT SEQUENCE FEATURES

In [None]:
# WS created function get_info(), not done in text
def get_info(rec):
    """Print a short summary line for every feature of a record.

    'gene' features are reported with their 'gene' qualifier, 'exon'
    features with the start, end and strand of their location, and every
    other feature type is printed in full, flagged as not processed.

    Args:
        rec: object exposing a ``features`` iterable whose items provide
            ``type``, ``qualifiers`` and ``location`` (the notebook passes
            records parsed by SeqIO).

    Returns:
        None. All output is written to stdout.

    Raises:
        KeyError: if a 'gene' feature has no 'gene' qualifier.
    """
    for feature in rec.features:
        if feature.type == 'gene':
            print('\nPROCESSED: GENE TYPE {}'.format(feature.qualifiers['gene']))
        elif feature.type == 'exon':
            loc = feature.location
            # One placeholder per value: with a single '{}', str.format()
            # silently discarded loc.end and loc.strand.
            print('\nPROCESSED: EXON TYPE {} {} {}'.format(
                loc.start, loc.end, loc.strand))
        else:
            print('\nNOT PROCESSED: {}'.format(feature))

In [None]:
# Run get_info() on the first record.
get_info(recs[0])

### 7) LOOK AT ANNOTATIONS

In [None]:
# Print each annotation of the record at index 10 as "KEY = value",
# with the key upper-cased.
# NOTE(review): the hard-coded index assumes the search returned at least
# 11 records -- confirm.
for key, val in recs[10].annotations.items():
    print(f'{key.upper()} = {val}')

### 8) LOOK AT SEQUENCE INFO

In [None]:
# Inspect the type of the second record's seq attribute.
type(recs[1].seq)

In [None]:
# Print the sequence of the second record.
print(recs[1].seq)

In [None]:
# Print the result of reverse_complement_rna() on the second record's sequence.
print(recs[1].seq.reverse_complement_rna())

In [None]:
# Here reverse_complement() is called on the record itself, not on its .seq,
# so what gets printed is whatever the record-level method returns.
print(recs[1].reverse_complement())

In [None]:
# Print the result of lower() on the second record's sequence.
print(recs[1].seq.lower())

In [None]:
# Print the result of reverse_complement() on the second record's sequence
# (sequence-level call, in contrast to the record-level call used earlier).
print(recs[1].seq.reverse_complement())

In [None]:
# Display the result of translate() on the third record's sequence.
recs[2].seq.translate()

In [None]:
# For every record print its ID, description, sequence and translation,
# followed by the PubMed ID of each reference attached to it.
for rec in recs:
    print('\nID:   {}\nDESC: {}\nSEQ:  {}\nTRANSLATE: {}'.format(
        rec.id, rec.description, rec.seq, rec.seq.translate()))
    # A record is not guaranteed to carry a 'references' annotation;
    # treat a missing key as "no references" instead of raising KeyError.
    for ref in rec.annotations.get('references', []):
        print('PUBMED ID: {}'.format(ref.pubmed_id))

In [None]:
# the above had no pubmed IDs, so didn't take this further: see this recipe in the book