# WS_ch03A.ipynb
# WESmith 11/09/22
## WS created this notebook to follow along with the chapter 3 code from the book
# 'Bioinformatics with Python Cookbook' by Tiago Antao
### Each recipe will have its own notebook, suffixed by A, B, etc.
## see the link below for reference to SeqIO data structures
### http://biopython.org/DIST/docs/tutorial/Tutorial.html

# ACCESSING GENBANK AND MOVING AROUND NCBI DATABASES

In [3]:
from Bio import Entrez, SeqIO

In [126]:
# Contact address sent along with Entrez requests (marked as required by the author).
# NOTE(review): this is a personal address saved in the notebook -- consider
# loading it from an environment variable before sharing or committing.
Entrez.email = 'smiwarsky@gmail.com'  # required

In [6]:
# find the chloroquine resistance transporter (CRT) gene in Plasmodium falciparum
# NOTE: esearch only returns the first RetMax IDs (20 here), not every hit in Count
handle = Entrez.esearch(db='nucleotide', term='CRT[Gene Name] AND "Plasmodium falciparum"[Organism]')
try:
    rec_list = Entrez.read(handle)
finally:
    handle.close()  # release the connection once the XML reply has been parsed

In [138]:
# show every field of the esearch reply, one "key = value" per line
for key, value in rec_list.items():
    print(f'{key} = {value}')

Count = 2925
RetMax = 20
RetStart = 0
IdList = ['2301594124', '2301594089', '2262825096', '2262825094', '2262825092', '2262825090', '2262825088', '2262825086', '2262825084', '2262825082', '2262825080', '2262825078', '2262825076', '2262825074', '2262825072', '2262825070', '2262825068', '2262825066', '2262825064', '2262825062']
TranslationSet = [{'From': '"Plasmodium falciparum"[Organism]', 'To': '"Plasmodium falciparum"[Organism]'}]
TranslationStack = [{'Term': 'CRT[Gene Name]', 'Field': 'Gene Name', 'Count': '5780', 'Explode': 'N'}, {'Term': '"Plasmodium falciparum"[Organism]', 'Field': 'Organism', 'Count': '261968', 'Explode': 'Y'}, 'AND']
QueryTranslation = CRT[Gene Name] AND "Plasmodium falciparum"[Organism]


In [9]:
# take the matched GenBank IDs from the esearch reply; without this line
# id_list is undefined and the cell fails on a fresh kernel
id_list = rec_list['IdList']
# download the full GenBank-format records for those IDs
hdl = Entrez.efetch(db='nucleotide', id=id_list, rettype='gb')

In [12]:
type(hdl)

_io.TextIOWrapper

In [13]:
# read every GenBank record from the stream into a list, then close the handle:
# it is single-use, so parsing it a second time would silently give an empty list
recs = list(SeqIO.parse(hdl, 'gb'))
hdl.close()

In [155]:
help(recs[0])

Help on SeqRecord in module Bio.SeqRecord object:

class SeqRecord(builtins.object)
 |  SeqRecord(seq, id='<unknown id>', name='<unknown name>', description='<unknown description>', dbxrefs=None, features=None, annotations=None, letter_annotations=None)
 |  
 |  A SeqRecord object holds a sequence and information about it.
 |  
 |  Main attributes:
 |   - id          - Identifier such as a locus tag (string)
 |   - seq         - The sequence itself (Seq object or similar)
 |  
 |  Additional attributes:
 |   - name        - Sequence name, e.g. gene name (string)
 |   - description - Additional text (string)
 |   - dbxrefs     - List of database cross references (list of strings)
 |   - features    - Any (sub)features defined (list of SeqFeature objects)
 |   - annotations - Further information about the whole sequence (dictionary).
 |     Most entries are strings, or lists of strings.
 |   - letter_annotations - Per letter/symbol annotation (restricted
 |     dictionary). This holds Py

In [193]:
def attrs(obj, skip=True, token='__'):  # WS convenience function
    """List an object's attribute names, headed by a line naming its type.

    Args:
        obj: any object to inspect.
        skip: when True, leave out every name that contains ``token``.
        token: substring marking names to hide; the default '__' hides
            dunder names (and any other name with a double underscore in it).

    Returns:
        list of str: 'OBJECT TYPE: <type>' first, followed by the surviving
        names from ``dir(obj)`` in the order ``dir`` yields them.
    """
    header = 'OBJECT TYPE: {}'.format(type(obj))
    # keep a name unless filtering is on AND the name contains the token
    names = [name for name in dir(obj) if not (skip and token in name)]
    return [header] + names

In [194]:
attrs(recs[0], skip=True)

["OBJECT TYPE: <class 'Bio.SeqRecord.SeqRecord'>",
 '_per_letter_annotations',
 '_seq',
 '_set_per_letter_annotations',
 '_set_seq',
 'annotations',
 'dbxrefs',
 'description',
 'features',
 'format',
 'id',
 'letter_annotations',
 'lower',
 'name',
 'reverse_complement',
 'seq',
 'translate',
 'upper']

In [195]:
attrs(recs[0].reverse_complement())  # WS exploring

["OBJECT TYPE: <class 'Bio.SeqRecord.SeqRecord'>",
 '_per_letter_annotations',
 '_seq',
 '_set_per_letter_annotations',
 '_set_seq',
 'annotations',
 'dbxrefs',
 'description',
 'features',
 'format',
 'id',
 'letter_annotations',
 'lower',
 'name',
 'reverse_complement',
 'seq',
 'translate',
 'upper']

In [187]:
recs[0].annotations

{'molecule_type': 'DNA',
 'topology': 'linear',
 'data_file_division': 'INV',
 'date': '19-SEP-2022',
 'accessions': ['MZ054304'],
 'sequence_version': 1,
 'keywords': [''],
 'source': 'Plasmodium falciparum 3D7',
 'organism': 'Plasmodium falciparum 3D7',
 'taxonomy': ['Eukaryota',
  'Sar',
  'Alveolata',
  'Apicomplexa',
  'Aconoidasida',
  'Haemosporida',
  'Plasmodiidae',
  'Plasmodium',
  'Plasmodium (Laverania)'],
 'references': [Reference(title='Direct Submission', ...)],
 'structured_comment': OrderedDict([('Assembly-Data',
               OrderedDict([('Sequencing Technology',
                             'Sanger dideoxy sequencing')]))])}

In [188]:
recs[0].description

'Plasmodium falciparum 3D7 isolate CRT33 chloroquine resistance transporter (crt) gene, partial cds'

In [60]:
recs[0].features  # WS exploring

[SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(209), strand=1), type='source'),
 SeqFeature(FeatureLocation(BeforePosition(0), AfterPosition(209), strand=1), type='gene'),
 SeqFeature(FeatureLocation(BeforePosition(0), AfterPosition(209), strand=1), type='mRNA'),
 SeqFeature(FeatureLocation(BeforePosition(0), AfterPosition(209), strand=1), type='CDS')]

In [196]:
attrs(recs[0].features[0])

["OBJECT TYPE: <class 'Bio.SeqFeature.SeqFeature'>",
 '_flip',
 '_get_location_operator',
 '_get_ref',
 '_get_ref_db',
 '_get_strand',
 '_set_location_operator',
 '_set_ref',
 '_set_ref_db',
 '_set_strand',
 '_shift',
 'extract',
 'id',
 'location',
 'location_operator',
 'qualifiers',
 'ref',
 'ref_db',
 'strand',
 'translate',
 'type']

In [197]:
attrs(recs[0].features[0].qualifiers)

["OBJECT TYPE: <class 'collections.OrderedDict'>",
 'clear',
 'copy',
 'fromkeys',
 'get',
 'items',
 'keys',
 'move_to_end',
 'pop',
 'popitem',
 'setdefault',
 'update',
 'values']

In [28]:
# show name and description of the first two records; slicing (rather than
# indexing with range(2)) avoids an IndexError when fewer than two came back
for rec in recs[:2]:
    print('{}\n{}\n'.format(rec.name, rec.description))

MZ054304
Plasmodium falciparum 3D7 isolate CRT33 chloroquine resistance transporter (crt) gene, partial cds

MZ054303
Plasmodium falciparum 3D7 isolate CRT4 chloroquine resistance transporter (crt) gene, partial cds



In [59]:
# WS exploring: print the qualifiers of each feature on the first record
for feat in recs[0].features:
    print(f'\n{feat.qualifiers}')


OrderedDict([('organism', ['Plasmodium falciparum 3D7']), ('mol_type', ['genomic DNA']), ('isolate', ['CRT33']), ('host', ['Homo sapiens']), ('db_xref', ['taxon:36329']), ('chromosome', ['7']), ('country', ['India: Nuapada District Hq Hospital']), ('collection_date', ['14-Jul-2019']), ('collected_by', ['Ramakanta Rana']), ('PCR_primers', ['fwd_name: pfcrt-f, fwd_seq: ggctcacgtttaggtgga, rev_name: pfcrt-r, rev_seq: tgaatttccctttttatttccaaa'])])

OrderedDict([('gene', ['crt']), ('note', ['pfcrt'])])

OrderedDict([('gene', ['crt']), ('product', ['chloroquine resistance transporter'])])

OrderedDict([('gene', ['crt']), ('codon_start', ['1']), ('product', ['chloroquine resistance transporter']), ('protein_id', ['UWU44816.1']), ('translation', ['LIFKEIKDNIFIYILSIIYLSVCVMNKIFAKRTLNKIGNYSFVTSETHNFICMIMFFIVYSLFGNKKGNS'])])


In [44]:
# WS created function, not done in text
def get_info(rec):
    """Print a one-entry summary for each feature of a record.

    Args:
        rec: object with a ``features`` iterable; each feature is expected
            to expose ``type``, ``qualifiers`` and ``location``.

    Output (printed, nothing is returned):
        - 'gene' features: the value stored under the 'gene' qualifier.
        - 'exon' features: start, end and strand of the feature location.
        - anything else: the feature itself, flagged as not processed.
    """
    for feature in rec.features:
        if feature.type == 'gene':
            print('\nPROCESSED: GENE TYPE {}'.format(feature.qualifiers['gene']))
        elif feature.type == 'exon':
            loc = feature.location
            # one placeholder per value: the earlier single '{}' silently
            # dropped the end position and the strand
            print('\nPROCESSED: EXON TYPE {} {} {}'.format(loc.start, loc.end, loc.strand))
        else:
            print('\nNOT PROCESSED: {}'.format(feature))

In [51]:
get_info(recs[0])


NOT PROCESSED: type: source
location: [0:209](+)
qualifiers:
    Key: PCR_primers, Value: ['fwd_name: pfcrt-f, fwd_seq: ggctcacgtttaggtgga, rev_name: pfcrt-r, rev_seq: tgaatttccctttttatttccaaa']
    Key: chromosome, Value: ['7']
    Key: collected_by, Value: ['Ramakanta Rana']
    Key: collection_date, Value: ['14-Jul-2019']
    Key: country, Value: ['India: Nuapada District Hq Hospital']
    Key: db_xref, Value: ['taxon:36329']
    Key: host, Value: ['Homo sapiens']
    Key: isolate, Value: ['CRT33']
    Key: mol_type, Value: ['genomic DNA']
    Key: organism, Value: ['Plasmodium falciparum 3D7']


PROCESSED: GENE TYPE ['crt']

NOT PROCESSED: type: mRNA
location: [<0:>209](+)
qualifiers:
    Key: gene, Value: ['crt']
    Key: product, Value: ['chloroquine resistance transporter']


NOT PROCESSED: type: CDS
location: [<0:>209](+)
qualifiers:
    Key: codon_start, Value: ['1']
    Key: gene, Value: ['crt']
    Key: product, Value: ['chloroquine resistance transporter']
    Key: protein

In [136]:
# list the annotations of record 10 with upper-cased keys
for key, val in recs[10].annotations.items():
    label = key.upper()
    print(f'{label} = {val}')

MOLECULE_TYPE = DNA
TOPOLOGY = linear
DATA_FILE_DIVISION = INV
DATE = 06-JUL-2022
ACCESSIONS = ['OM964461']
SEQUENCE_VERSION = 1
KEYWORDS = ['']
SOURCE = Plasmodium falciparum (malaria parasite P. falciparum)
ORGANISM = Plasmodium falciparum
TAXONOMY = ['Eukaryota', 'Sar', 'Alveolata', 'Apicomplexa', 'Aconoidasida', 'Haemosporida', 'Plasmodiidae', 'Plasmodium', 'Plasmodium (Laverania)']
REFERENCES = [Reference(title='Emergence and expansion of piperaquine resistance mediated by the PfCRTC350R mutation and plasmepsin gene amplifications hinder dihydroartemisinin-piperaquine efficacy in the Guiana Shield', ...), Reference(title='Direct Submission', ...)]
STRUCTURED_COMMENT = OrderedDict([('Assembly-Data', OrderedDict([('Sequencing Technology', 'Sanger dideoxy sequencing')]))])


In [128]:
type(recs[1].seq)

Bio.Seq.Seq

In [134]:
print(recs[1].seq)

CTTATTTTTAAAGAGATTAAGGATAATATTTTTATTTATATTTTAAGTATTATTTATTTAAGTGTATGTGTAATGAATAAAATTTTTGCTAAAAGAACTTTAAACAAAATTGGTAACTATAGTTTTGTAACATCCGAAACTCACAACTTTATTTGTATGATTATGTTCTTTATTGTTTATTCCTTATTTGGAAATAAAAAGGGAAATTCA


In [132]:
print(recs[1].seq.reverse_complement_rna())

UGAAUUUCCCUUUUUAUUUCCAAAUAAGGAAUAAACAAUAAAGAACAUAAUCAUACAAAUAAAGUUGUGAGUUUCGGAUGUUACAAAACUAUAGUUACCAAUUUUGUUUAAAGUUCUUUUAGCAAAAAUUUUAUUCAUUACACAUACACUUAAAUAAAUAAUACUUAAAAUAUAAAUAAAAAUAUUAUCCUUAAUCUCUUUAAAAAUAAG


In [81]:
print(recs[1].reverse_complement())

ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 4
Seq('TGAATTTCCCTTTTTATTTCCAAATAAGGAATAAACAATAAAGAACATAATCAT...AAG')


In [87]:
print(recs[1].seq.lower())

cttatttttaaagagattaaggataatatttttatttatattttaagtattatttatttaagtgtatgtgtaatgaataaaatttttgctaaaagaactttaaacaaaattggtaactatagttttgtaacatccgaaactcacaactttatttgtatgattatgttctttattgtttattccttatttggaaataaaaagggaaattca


In [88]:
print(recs[1].seq.reverse_complement())

TGAATTTCCCTTTTTATTTCCAAATAAGGAATAAACAATAAAGAACATAATCATACAAATAAAGTTGTGAGTTTCGGATGTTACAAAACTATAGTTACCAATTTTGTTTAAAGTTCTTTTAGCAAAAATTTTATTCATTACACATACACTTAAATAAATAATACTTAAAATATAAATAAAAATATTATCCTTAATCTCTTTAAAAATAAG


In [113]:
recs[2].seq.translate()

Seq('SCIQSPALAIAYYFKFLAVRIK')

In [124]:
# for each record: id, description, sequence, its translation, then the
# PubMed id of every attached reference
for rec in recs:
    summary = '\nID:   {}\nDESC: {}\nSEQ:  {}\nTRANSLATE: {}'.format(
        rec.id, rec.description, rec.seq, rec.seq.translate())
    print(summary)
    for reference in rec.annotations['references']:
        print('PUBMED ID: {}'.format(reference.pubmed_id))


ID:   MZ054304.1
DESC: Plasmodium falciparum 3D7 isolate CRT33 chloroquine resistance transporter (crt) gene, partial cds
SEQ:  CTTATTTTTAAAGAGATTAAGGATAATATTTTTATTTATATTTTAAGTATTATTTATTTAAGTGTATGTGTAATGAATAAAATTTTTGCTAAAAGAACTTTAAACAAAATTGGTAACTATAGTTTTGTAACATCCGAAACTCACAACTTTATTTGTATGATTATGTTCTTTATTGTTTATTCCTTATTTGGAAATAAAAAGGGAAATTC
TRANSLATE: LIFKEIKDNIFIYILSIIYLSVCVMNKIFAKRTLNKIGNYSFVTSETHNFICMIMFFIVYSLFGNKKGN
PUBMED ID: 

ID:   MZ054303.1
DESC: Plasmodium falciparum 3D7 isolate CRT4 chloroquine resistance transporter (crt) gene, partial cds
SEQ:  CTTATTTTTAAAGAGATTAAGGATAATATTTTTATTTATATTTTAAGTATTATTTATTTAAGTGTATGTGTAATGAATAAAATTTTTGCTAAAAGAACTTTAAACAAAATTGGTAACTATAGTTTTGTAACATCCGAAACTCACAACTTTATTTGTATGATTATGTTCTTTATTGTTTATTCCTTATTTGGAAATAAAAAGGGAAATTCA
TRANSLATE: LIFKEIKDNIFIYILSIIYLSVCVMNKIFAKRTLNKIGNYSFVTSETHNFICMIMFFIVYSLFGNKKGNS
PUBMED ID: 

ID:   OM964469.1
DESC: Plasmodium falciparum isolate SPK77 chloroquine resistance transporter (CRT) gene, partial cds
SEQ:  AGTTGTATAC

In [125]:
# The records shown above had no PubMed IDs, so I didn't take this further; see this recipe in the book.