In [69]:
import pickle
import argparse

from Bio import Entrez
from Bio import SeqIO
import numpy as np
import pandas as pd

def load_pickled(file_name):
    """Open ``file_name`` in binary mode and return the object unpickled from it.

    NOTE: ``pickle.load`` can execute arbitrary code; only use it on files
    from a trusted source.
    """
    with open(file_name, 'rb') as source:
        payload = pickle.load(source)
    return payload
    
def save_pickled(item, file_name):
    """Pickle ``item`` into ``file_name``, opened in binary write mode.

    An existing file at that path is overwritten.
    """
    with open(file_name, 'wb') as sink:
        pickle.dump(item, sink)

def getRecord(search_term, retmax=10, database="nucleotide",
              email="zed@uchicago.edu"):
    """Search an NCBI Entrez database and fetch the matching records.

    Parameters
    ----------
    search_term : str
        Entrez query; passed to ``Entrez.esearch`` as ``term``.
    retmax : int
        Maximum number of IDs requested from the search.
    database : str
        Entrez database name, used for both the search and the fetch.
    email : str
        Contact address assigned to ``Entrez.email`` before any request.

    Returns
    -------
    The result of ``Entrez.read`` on the XML ``efetch`` response, or an
    empty list when the search returned no IDs.
    """
    Entrez.email = email

    search_handle = Entrez.esearch(
        db=database,
        term=[search_term],
        retmax=retmax)
    try:
        search_result = Entrez.read(search_handle)
    finally:
        # Release the connection even if parsing fails.
        search_handle.close()

    id_list = search_result["IdList"]
    if not id_list:
        # Nothing matched: skip the fetch instead of requesting an empty ID list.
        return []

    fetch_handle = Entrez.efetch(db=database, id=id_list, retmode="xml")
    try:
        return Entrez.read(fetch_handle)
    finally:
        fetch_handle.close()

def procSequence(
    records,
    begIndex,
    endIndex,
    type_='nucleotide',
    N=10000,
    LMAX=35000):
    """Build a per-position character table from fetched GenBank records.

    For every record the sequence is left-padded with ``int(beg)`` and
    right-padded with ``LMAX - int(end)`` placeholder ``'x'`` characters,
    where ``beg`` / ``end`` are the first ``GBInterval_from`` /
    ``GBInterval_to`` values found in the record. The padded string is then
    sliced to ``[begIndex:endIndex]``.

    Parameters
    ----------
    records : iterable of dict-like
        Parsed records, e.g. the return value of ``getRecord``.
    begIndex, endIndex : int
        Slice bounds applied to the padded sequence string.
    type_ : {'nucleotide', 'protein'}
        ``'nucleotide'`` uses ``GBSeq_sequence``; ``'protein'`` uses the
        first ``translation`` qualifier in the feature table.
    N : int
        Maximum number of records to process.
    LMAX : int
        Reference length used to size the right-hand padding.

    Returns
    -------
    pandas.DataFrame
        One row per processed record and one column per position, with the
        ``'x'`` placeholder replaced by NaN and all-NaN columns dropped,
        plus an ``'accession'`` column.

    Raises
    ------
    ValueError
        If ``type_`` is neither ``'nucleotide'`` nor ``'protein'``.
    IndexError
        If a record lacks an interval, sequence/translation or accession.
    """
    sequences = []
    accessions = []

    for record in records:
        # Stop before parsing anything further once N sequences are collected.
        if len(sequences) >= N:
            break

        beg = list(item_generator(record, 'GBInterval_from'))[0]
        end = list(item_generator(record, 'GBInterval_to'))[0]

        # Collect the collection_date qualifier(s) of this record.
        # NOTE(review): `[0]` takes the first element of the qualifier value;
        # confirm the value's type, since on a plain string this would keep
        # only the first character.
        dt = []
        for feature in record['GBSeq_feature-table']:
            if 'GBFeature_quals' in feature:
                for qualifier in feature['GBFeature_quals']:
                    if qualifier['GBQualifier_name'] == 'collection_date':
                        dt.append(qualifier['GBQualifier_value'][0])

        # TODO: the collection date is already parsed above.
        # - pull sequences for each year
        # - similarly parse the country
        # - add code to pull sequences for a specific year and a specific
        #   hemisphere (northern vs southern); this requires mapping the
        #   country name to N vs S (find a way to do it)
        print(dt)

        if type_ == 'nucleotide':
            seq = list(item_generator(record, 'GBSeq_sequence'))[0]
        elif type_ == 'protein':
            translations = []
            for feature in record['GBSeq_feature-table']:
                if 'GBFeature_quals' in feature:
                    for qualifier in feature['GBFeature_quals']:
                        if qualifier['GBQualifier_name'] == 'translation':
                            translations.append(qualifier['GBQualifier_value'])
            seq = translations[0]
        else:
            raise ValueError('Not an available type: {}'.format(type_))

        acc = list(item_generator(record, 'GBSeq_primary-accession'))[0]

        # String repetition yields '' for non-positive counts, matching an
        # empty padding when end exceeds LMAX.
        left_pad = 'x' * int(beg)
        right_pad = 'x' * (LMAX - int(end))
        padded = left_pad + seq + right_pad

        # Plain lists avoid re-copying an array on every append.
        sequences.append(padded[begIndex:endIndex])
        accessions.append(acc)

    SF = pd.DataFrame([list(s) for s in sequences]).replace('x', np.nan)
    SF['accession'] = accessions
    SF = SF.dropna(how='all', axis=1)

    return SF
def item_generator(json_input, lookup_key):
    """Yield values stored under ``lookup_key`` in a nested dict/list structure.

    For a dict, keys are visited in order: each key that is not
    ``lookup_key`` contributes at most the first match found beneath its
    value, and iteration stops right after a key equal to ``lookup_key``
    yields its value. For a list, each element contributes at most its
    first match. Any other input type yields nothing.
    """
    no_match = object()  # sentinel: the sub-search produced nothing

    if isinstance(json_input, dict):
        for key, value in json_input.items():
            if key == lookup_key:
                yield value
                return
            first_hit = next(item_generator(value, lookup_key), no_match)
            if first_hit is not no_match:
                yield first_hit
    elif isinstance(json_input, list):
        for element in json_input:
            first_hit = next(item_generator(element, lookup_key), no_match)
            if first_hit is not no_match:
                yield first_hit
                


In [70]:
# Entrez search term; getRecord fetches up to `retmax` matching records
# from its default "nucleotide" database.
t='Influenza A HA'
R=getRecord(t,retmax=20)

In [71]:
# Records are pulled in XML format, which carries a lot of information
# beyond what we need.
# The sequence data has to be parsed out, which is done by procSequence.
# We also need the *collection date* for our purpose;
# please follow the code in procSequence to extract the collection date as well.
R[0]

In [72]:
# Processing for Influenza A HA. For NA, look up the corresponding procedure
# in pull_sequence.py.
# Uses the protein translation and keeps positions 0..549 of the padded sequence.
df = procSequence(
            R,
            begIndex=0,
            endIndex=550,
            type_='protein')

['21-Feb-2018']
['21-Feb-2018']
['21-Feb-2018']
['21-Feb-2018']
['21-Feb-2018']
['21-Feb-2018']
['22-Mar-2018']
['21-Feb-2018']
['22-Mar-2018']
['22-Mar-2018']
['18-Apr-2018']
['21-Feb-2018']
['22-Mar-2018']
['21-Feb-2018']
['21-Feb-2018']
['18-Apr-2018']
['18-Apr-2018']
['09-May-2018']
['18-Apr-2018']
['22-Mar-2018']


DictElement({'GBSeq_locus': 'MZ543285', 'GBSeq_length': '1714', 'GBSeq_strandedness': 'single', 'GBSeq_moltype': 'cRNA', 'GBSeq_topology': 'linear', 'GBSeq_division': 'VRL', 'GBSeq_update-date': '17-JUL-2021', 'GBSeq_create-date': '17-JUL-2021', 'GBSeq_definition': 'Influenza A virus (A/chicken/MUWRP-Uganda/830/2018(H9N2)) segment 4 hemagglutinin (HA) gene, complete cds', 'GBSeq_primary-accession': 'MZ543285', 'GBSeq_accession-version': 'MZ543285.1', 'GBSeq_other-seqids': ['gb|MZ543285.1|', 'gi|2065710251'], 'GBSeq_project': 'PRJNA430090', 'GBSeq_source': 'Influenza A virus', 'GBSeq_organism': 'Influenza A virus', 'GBSeq_taxonomy': 'Viruses; ssRNA viruses; ssRNA negative-strand viruses; Negarnaviricota; Polyploviricotina; Insthoviricetes; Articulavirales; Orthomyxoviridae; Alphainfluenzavirus', 'GBSeq_references': [DictElement({'GBReference_reference': '1', 'GBReference_position': '1..1714', 'GBReference_authors': ['Ducatez,M.', 'Kayali,G.', 'Byarugaba,D.', 'Djegui,F.', 'Go-Maro,E.', '