In [19]:
import pandas as pd
import requests as re
import untangle

In [190]:
# NCBI E-utilities: common URL prefix plus the two endpoint file names that
# the request helper appends to it.
baseurl = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
esearch, efetch = "esearch.fcgi", "efetch.fcgi"

# Values sent as the `db` and `retmax` request parameters.
db = "protein"
retmax = 100

# Empty module-level lists; not referenced by the code visible in this notebook.
species, glands = [], []

# Lookup table of organisms and glands read from disk.
variants = pd.read_csv("spider_silks.csv")

# Empty result table; only the column labels are defined here.
silk_df = pd.DataFrame(
    columns=[
        "Spidrion",
        "Gland / Silk Type",
        "Species",
        "Spinneret Used",
        "Size",
        "Use",
        "Database(s)",
        "Sequence",
        "Notes",
        "Experiment Considerations",
    ]
)

In [171]:
# Display the organism / gland table loaded from spider_silks.csv.
variants

Unnamed: 0,Organism,Glands
0,Latrodectus hesperus,Major ampullate
1,Araneus ventricosus,Flagelliform
2,Nephilia clavipes,Aggregate
3,Araneus diadematus,Minor ampullate
4,Octonoba varians,Tubuliform
5,Parasteatoda tepidariorum,Aciniform
6,Euprosthenops australis,Pyriform
7,,Cribellar


In [145]:
def make_db_request(link, params, timeout=30):
    """Send a GET request to an E-utilities endpoint and return the response text.

    Parameters
    ----------
    link : str
        Endpoint file name appended to the module-level ``baseurl``.
    params : dict
        Query parameters; handed to the HTTP client, which URL-encodes them.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30). Without
        a timeout a stalled connection would block the notebook indefinitely.

    Returns
    -------
    str
        The response body as text.

    Raises
    ------
    requests.HTTPError
        If the server replies with a 4xx/5xx status code.
    """
    # NOTE: in this notebook `re` is the `requests` package (see the import
    # cell), not the standard-library regex module.
    response = re.get(baseurl + link, params=params, timeout=timeout)
    # Fail loudly on HTTP errors instead of passing an error page on to the
    # XML parser used by the callers.
    response.raise_for_status()
    return response.text

In [146]:
def get_search_ids(species, gland):
    """Run an esearch query for ``species AND gland`` and return the matching ids.

    Parameters
    ----------
    species : str
        Organism name used as the first search term.
    gland : str
        Gland name used as the second search term.

    Returns
    -------
    list of str
        Text content of every child of ``eSearchResult.IdList`` in the
        response (empty when that element has no children).

    Raises
    ------
    TypeError
        If ``species`` or ``gland`` is not a string (e.g. a missing value read
        from the CSV).
    """
    # Build the term with plain spaces. The HTTP client URL-encodes `params`
    # itself, so inserting "+" by hand would be escaped again and reach the
    # server as a literal "%2B" instead of a space.
    query = " AND ".join((species, gland))
    params = {"db": db, "term": query, "retmax": retmax}
    response_xml = make_db_request(esearch, params)
    searches = untangle.parse(response_xml)
    # `uid` rather than `id`, to avoid shadowing the builtin.
    return [uid.cdata for uid in searches.eSearchResult.IdList.children]

# Example search executed when this cell runs; `ids` is the argument used by
# the commented-out get_document call further down.
ids = get_search_ids("Araneus ventricosus", "Major ampullate")

In [147]:
def parse_features(features):
    """Return the gene name from the first CDS feature that carries one.

    Parameters
    ----------
    features : object
        Parsed feature table. Each entry of ``features.children`` exposes
        ``GBFeature_key`` and, optionally, ``GBFeature_quals`` whose
        ``children`` expose ``GBQualifier_name`` / ``GBQualifier_value``.

    Returns
    -------
    str
        ``cdata`` of the first ``gene`` qualifier found under a ``CDS``
        feature, or ``""`` when there is none.
    """
    for feat in features.children:
        if not (feat.GBFeature_key == "CDS"):
            continue
        # A feature may have no qualifier list at all; skip it instead of
        # letting the attribute lookup raise and abort the whole batch.
        quals = getattr(feat, "GBFeature_quals", None)
        if quals is None:
            continue
        for qual in quals.children:
            if qual.GBQualifier_name == "gene":
                return qual.GBQualifier_value.cdata
    return ""

In [185]:
def get_document(idlist, species, gland, columns):
    """Fetch the records for ``idlist`` and tabulate those whose definition mentions ``gland``.

    Parameters
    ----------
    idlist : iterable of str
        Record ids; joined with commas into the ``id`` request parameter.
    species : str
        Copied verbatim into the third value of every row.
    gland : str
        Case-insensitive substring filter applied to each record's
        definition; also copied into the second value of every row.
    columns : sequence
        Column labels for the returned frame. Each row holds ten values in
        this order: gene, gland, species, "", length, "", "GenBank:" +
        accession, upper-cased sequence, "partial" or "", "".

    Returns
    -------
    pandas.DataFrame
        One row per matching record; empty (with ``columns``) when ``idlist``
        is empty or nothing matches.
    """
    idlist = list(idlist)
    # No ids means nothing to fetch: return early rather than sending an
    # empty `id` parameter and failing on the GBSet lookup below.
    if not idlist:
        return pd.DataFrame(columns=columns)

    params = {"db": db, "id": ",".join(idlist), "rettype": "gp", "retmode": "xml"}
    doc = untangle.parse(make_db_request(efetch, params))

    gland_lc = gland.lower()
    rows = []
    for gbseq in doc.GBSet.children:
        definition = gbseq.GBSeq_definition.cdata
        if gland_lc not in definition.lower():
            continue
        # Trace of every record that passed the gland filter.
        print(definition)
        size = gbseq.GBSeq_length.cdata
        # Case-sensitive on purpose: mirrors the wording of the definition line.
        partial = "partial" if "partial" in definition else ""
        accession = gbseq.GBSeq_accession_version.cdata
        gene = parse_features(gbseq.GBSeq_feature_table)
        sequence = gbseq.GBSeq_sequence.cdata.upper()
        database = "GenBank:" + accession
        rows.append([gene, gland, species, "", size, "", database, sequence, partial, ""])
    return pd.DataFrame(rows, columns=columns)
# Example (disabled): get_document(ids, "Araneus ventricosus", "Major ampullate", silk_df.columns)

In [191]:
def create_db(df, limit=1):
    """Query every gland/organism pair from ``variants`` and add the hits to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Existing table; its columns define the layout of the fetched rows and
        its rows (if any) are kept at the top of the result.
    limit : int or None, optional
        How many glands and how many organisms to take from the start of the
        ``variants`` table. The default of 1 reproduces the previously
        hard-coded ``[:1]`` slices; pass ``None`` to use every entry.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the rows of ``df`` followed by all fetched rows,
        re-indexed from 0. ``df`` itself is not modified.
    """
    # Missing values in the CSV are dropped: they are not strings and cannot
    # be used to build a search term.
    gland_names = variants["Glands"].dropna().to_list()[:limit]
    organisms = variants["Organism"].dropna().to_list()[:limit]

    frames = [df]
    for gland in gland_names:
        for org in organisms:
            ids = get_search_ids(org, gland)
            if not ids:
                # Nothing matched this pair; skip the fetch entirely.
                continue
            frames.append(get_document(ids, org, gland, df.columns))

    # DataFrame.append was removed in pandas 2.0; collect the pieces and
    # concatenate once. Empty pieces are left out so they cannot influence
    # the resulting dtypes.
    non_empty = [frame for frame in frames if not frame.empty]
    if not non_empty:
        return df.copy()
    return pd.concat(non_empty, ignore_index=True)

# Rebinds silk_df to the populated table. create_db keeps the rows of the
# frame it receives, so re-running this cell without first re-creating the
# empty silk_df adds the fetched rows a second time.
silk_df = create_db(silk_df)

Chain A, Major ampullate spidroin 1
major ampullate spidroin 2 variant 1, partial [Latrodectus hesperus]
major ampullate spidroin 3 variant 1, partial [Latrodectus hesperus]
major ampullate spidroin 1 variant 2, partial [Latrodectus hesperus]
major ampullate spidroin 1 variant 1, partial [Latrodectus hesperus]
major ampullate spidroin 2 variant 1, partial [Latrodectus hesperus]
major ampullate spidroin 3 variant 1, partial [Latrodectus hesperus]
major ampullate spidroin 1 variant 3, partial [Latrodectus hesperus]
major ampullate spidroin 1 variant 2, partial [Latrodectus hesperus]
major ampullate spidroin 1 variant 1, partial [Latrodectus hesperus]
major ampullate spidroin 1, partial [Latrodectus hesperus]
major ampullate spidroin 1 locus 2, partial [Latrodectus hesperus]
major ampullate spidroin 1 locus 2, partial [Latrodectus hesperus]
major ampullate spidroin 1 locus 1, partial [Latrodectus hesperus]
major ampullate spidroin 1 locus 2, partial [Latrodectus hesperus]
major ampullate 

In [192]:
# Display the table returned by create_db.
silk_df

Unnamed: 0,Spidrion,Gland / Silk Type,Species,Spinneret Used,Size,Use,Database(s),Sequence,Notes,Experiment Considerations
0,,Major ampullate,Latrodectus hesperus,,137,,GenBank:2N3E_A,GMGQANTPWSSKANADAFINSFISAASNTGSFSQDQMENMSLIGNT...,,
1,MaSp2,Major ampullate,Latrodectus hesperus,,178,,GenBank:AWK58757.1,AGPSGPGGSGAAAAAAAGGSGPGGFGQGPTGYGPSGPGGQQGYGPG...,partial,
2,MaSp3,Major ampullate,Latrodectus hesperus,,617,,GenBank:AWK58730.1,MTWSTRLALSLLSVLLSQAICALGQTNTPWSSKANADAFIQAFMKD...,partial,
3,MaSp1,Major ampullate,Latrodectus hesperus,,159,,GenBank:AWK58708.1,GAGQGGAAAAAAAAAAGGAGQGGYGGYGQQGGAGAAAAAASGPGQI...,partial,
4,MaSp1,Major ampullate,Latrodectus hesperus,,159,,GenBank:AWK58707.1,GAGQGGAAAAAAAAAAGGAGQGGYGGYGQQGGAGAAAAAASGPGQI...,partial,
5,MaSp2,Major ampullate,Latrodectus hesperus,,234,,GenBank:AWK58652.1,MNWSTRLVLSILVVLCTQSLCALGQANTPWSSKENADAFIGAFMNA...,partial,
6,MaSp3,Major ampullate,Latrodectus hesperus,,333,,GenBank:AWK58638.1,GRYGQSGFGQEGAGQGGAGSVATAGESGQGNTGINVKVAGNAGQGD...,partial,
7,MaSp1,Major ampullate,Latrodectus hesperus,,165,,GenBank:AWK58627.1,MTWSTRLALSFFAVICTQSIYALGQGNTPWSTKANADNFMNGFLSA...,partial,
8,MaSp1,Major ampullate,Latrodectus hesperus,,259,,GenBank:AWK58626.1,MTWSTRLALSFLLVLCTQSIYALAQANTPWSSKANADAFINSFISA...,partial,
9,MaSp1,Major ampullate,Latrodectus hesperus,,259,,GenBank:AWK58625.1,MTWSTRLALSFLFVLCTQSLYALAQANTPWSSKANADAFINSFISA...,partial,
