# WS_ch03A.ipynb
# WESmith 11/09/22
## WS created this notebook to follow along chap 3 code from book 
# 'Bioinformatics with Python Cookbook' by Tiago Antao
### Each recipe will have its own notebook, suffixed by A, B, etc.
## see the link below for reference to SeqIO data structures
### http://biopython.org/DIST/docs/tutorial/Tutorial.html

# ACCESSING GENBANK AND MOVING AROUND NCBI DATABASES

### 1) IMPORT MODULES, CONFIGURE EMAIL

In [None]:
from Bio import Entrez, SeqIO, Medline
import utils as ws

In [None]:
# Contact address that Biopython attaches to every Entrez (NCBI E-utilities)
# request made later in this notebook.
Entrez.email = 'smiwarsky@gmail.com'  # required

### 2) FIND CHLOROQUINE RESISTANCE TRANSPORTER (CRT) GENE IN PLASMODIUM FALCIPARUM

In [None]:
# Search the nucleotide database for CRT gene entries from P. falciparum.
# NOTE: esearch only returns the first `retmax` ids (NCBI default: 20); pass
# retmax=... to Entrez.esearch if more matches are needed.
handle = Entrez.esearch(db='nucleotide',
                        term='CRT[Gene Name] AND "Plasmodium falciparum"[Organism]')
try:
    rec_list = Entrez.read(handle)  # parsed XML result; ids are under 'IdList'
finally:
    handle.close()  # do not leave the HTTP response open

In [None]:
# Show the parsed esearch result. ws.print_dict comes from the local `utils`
# module (not visible here); presumably it prints key/value pairs with the key
# column padded to `keywidth` characters — confirm against utils.py.
ws.print_dict(rec_list, keywidth=18)

### 3) RETRIEVE THE RECORDS

In [None]:
# Take the matching record ids out of the esearch result and request those
# records from the nucleotide database in GenBank ('gb') format.
# `hdl` is the open response handle; it is read by SeqIO.parse in step 4.
id_list = rec_list['IdList']
hdl     = Entrez.efetch(db='nucleotide', id=id_list, rettype='gb')

In [None]:
# Inspect the handle returned by efetch. ws.attrs is a helper from the local
# `utils` module (not visible here); presumably it lists the object's
# attributes — confirm against utils.py.
ws.attrs(hdl)

### 4) READ AND PARSE RESULT

In [None]:
# Parse the GenBank-formatted response into a list of records. The parser is
# an iterator, so it must be fully consumed before the handle is closed.
try:
    recs = list(SeqIO.parse(hdl, 'gb'))  # convert iterator to list
finally:
    hdl.close()  # release the connection held by the efetch handle

In [None]:
# Number of records parsed from the efetch response (shown as the cell output).
len(recs)

In [None]:
# WS mod: one summary line per record, "<name>: <description>"
for rec in recs:
    print(f'{rec.name}: {rec.description}')

### 5) LOOK AT A SINGLE RECORD (A DIFFERENT RECORD THAN IN BOOK)

In [None]:
# Inspect the first parsed record with the ws.attrs helper from the local
# `utils` module (not visible here).
# NOTE(review): the meaning of skip=True is defined in utils.py — confirm there.
ws.attrs(recs[0], skip=True)

In [None]:
# Print the string form of the first record.
print(recs[0])

In [None]:
# WS look at everything: dump the instance attribute dict of the first record
ws.print_dict(vars(recs[0]), keywidth=20)

In [None]:
# Display the feature list of the first record as the cell output.
recs[0].features # WS list of SeqFeature objects

### 6) EXTRACT SEQUENCE FEATURES

In [None]:
# Inspect the first feature of the first record with the ws.attrs helper
# (from the local `utils` module, not visible here).
ws.attrs(recs[0].features[0])  # WS attrs of SeqFeature object

In [None]:
# WS dump ALL attributes of every feature in the first record.
#   The qualifiers dict carries different keys for different feature types.
for feature in recs[0].features:
    ws.print_dict(vars(feature), keywidth=15)
    print()  # blank line between features

In [None]:
# Dump the instance attributes of the first feature's location object.
ws.print_dict(vars(recs[0].features[0].location))

### 7) LOOK AT ANNOTATIONS

In [None]:
# Show the annotations mapping of the first record via the ws.print_dict
# helper (from the local `utils` module, not visible here).
ws.print_dict(recs[0].annotations, keywidth=20)

### 8) LOOK AT SEQUENCE INFO

In [None]:
# Built-in help() on the record object prints the documentation of its class.
help(recs[0])  # WS all you need to know here

In [None]:
# Inspect the sequence object of the first record with the ws.attrs helper
# (from the local `utils` module, not visible here).
ws.attrs(recs[0].seq)  # WS attrs of Bio.Seq.Seq

In [None]:
# Print the sequence of the first record.
print(recs[0].seq)

In [None]:
# Print the reverse complement of the first record's sequence.
print(recs[0].seq.reverse_complement())

In [None]:
# Print the result of reverse_complement_rna() on the first record's sequence.
print(recs[0].seq.reverse_complement_rna())

In [None]:
# Print the lower-cased form of the first record's sequence.
print(recs[0].seq.lower())

In [None]:
# Translate the whole sequence of the first record, starting at its first base
# and using translate()'s default arguments.
# NOTE(review): the record is not necessarily a single in-frame CDS, so the
# output may contain stop symbols — confirm before relying on it.
print(recs[0].seq.translate())

In [None]:
# Summary of the first two records for comparison: id, description, raw
# sequence, its translation, and the PubMed id of every attached reference.
for rec in recs[0:2]:  # print first 2 to compare
    print(f'\nID:   {rec.id}\nDESC: {rec.description}'
          f'\nSEQ:  {rec.seq}\nTRANSLATE: {rec.seq.translate()}')
    # A record may lack a 'references' annotation; fall back to an empty
    # list instead of raising KeyError.
    for ref in rec.annotations.get('references', []):
        print(f'PUBMED ID: {ref.pubmed_id}')

### 9) GET REFERENCES

In [None]:
# Reference entries attached to the first record; `refs` is iterated in the
# next cell. Raises KeyError if the record has no 'references' annotation and
# IndexError below if the list is empty.
refs = recs[0].annotations['references']
# Inspect the first reference with the ws.attrs helper (from the local
# `utils` module, not visible here).
ws.attrs(refs[0])

In [None]:
# For every reference that carries a PubMed id, download its MEDLINE entry
# from PubMed and print each field as "<key>: <value>".
for ref in refs:
    if not ref.pubmed_id:
        continue  # reference has no PubMed id, nothing to fetch
    print(ref.pubmed_id)

    handle = Entrez.efetch(db="pubmed", id=[ref.pubmed_id],
                           rettype="medline", retmode="text")
    try:
        # Medline.parse is lazy, so it must be consumed while the handle
        # is still open.
        for med_rec in Medline.parse(handle):
            for k, v in med_rec.items():
                print(f'{k}: {v}')
    finally:
        handle.close()  # one connection per reference; do not leak them