In [2]:
import pandas as pd
import requests as re
import untangle
from collections import Counter
import json

In [3]:
baseurl = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
esearch = "esearch.fcgi"
efetch = "efetch.fcgi"
db = "protein"
species = []
glands = []
variants = pd.read_csv("spider_silks.csv")

In [4]:
with open("silks.json", "r") as f:
    silk_json = json.loads(f.read())

In [5]:
silk_json

[{'name': 'Major Ampullate', 'alt': 'dragline'},
 {'name': 'Flagelliform', 'alt': ''},
 {'name': 'Aggregate spidroin', 'alt': ''},
 {'name': 'Minor ampullate', 'alt': ''},
 {'name': 'Tubuliform', 'alt': ''},
 {'name': 'Aciniform', 'alt': ''},
 {'name': 'Pyriform', 'alt': ''},
 {'name': 'Cribellar', 'alt': ''}]

In [6]:
def make_db_request(link, params):
    link = baseurl + link
    request = re.get(link, params=params)
    return request.text

def get_search_ids(gland):
    query = '"{}"'.format(gland["name"].replace(" ", "+"))
    params = {"db": db, "term": query, "retmax" : 1500}       
    request = make_db_request(esearch, params)
    searches = untangle.parse(request)
    idlist = [id.cdata for id in searches.eSearchResult.IdList.children]
    return idlist

def parse_features(features):
    for feat in features.children:
        if feat.GBFeature_key == "CDS":
            qualifiers = feat.GBFeature_quals.children
            for qual in qualifiers:
                if (qual.GBQualifier_name == "gene"):
                    return qual.GBQualifier_value.cdata
    return ""

In [39]:
def get_refs(reflist):
    ref = reflist.GBReference
    if (len(ref) >= 1 and ref[0] is not None):
        return ref[0].GBReference_title.cdata
    else:
        return ""
    
def filter_searches(definition, taxonomy, name, alt=""):
    definition = definition.lower()
    if ("Araneae" not in taxonomy):
        return False
    if ((name.lower() not in definition) and (alt.lower()) not in definition):
        return False
    return True
    
def parse_document(xmlstr, gland_obj):
    genbankurl = "https://www.ncbi.nlm.nih.gov/protein/"
    doc = untangle.parse(xmlstr)
    gbset =  doc.GBSet
    silk_db = []
    gland = gland_obj["name"]
    alt = gland_obj["alt"]
    for gbseq in gbset.children:
        definition = gbseq.GBSeq_definition.cdata
        taxonomy = gbseq.GBSeq_taxonomy.cdata
        if filter_searches(definition, taxonomy, gland, alt):
            sequence = gbseq.GBSeq_sequence.cdata.upper()
            species = gbseq.GBSeq_organism.cdata
            size = gbseq.GBSeq_length.cdata
            partial = "partial" if "partial" in definition else ""
            accession = gbseq.GBSeq_accession_version.cdata
            gene = parse_features(gbseq.GBSeq_feature_table)
            database = '=HYPERLINK("{url}{acc}","Genbank:{acc}")'.format(acc=accession, url=genbankurl)
            ref = ""
            if "GBSeq_references" in gbseq:
                ref = get_refs(gbseq.GBSeq_references)
            silk_db += [[gene, gland, species, "", size, "", database, sequence, partial, "", ref]]
    return silk_db

def get_document(idlist, gland, columns):
    retstart = 0
    retmax = 200
    silk_db = []
    while retstart < len(idlist):
        ids = ",".join(idlist[retstart:retstart+retmax])
        params = {"db": db, "id": ids, "rettype": "gp", "retmode": "xml"}
        xmlstr = make_db_request(efetch, params)
        silk_db += parse_document(xmlstr, gland)
        retstart += retmax
    print(len(silk_db))
    return pd.DataFrame(silk_db, columns=columns)

In [40]:
get_document(ids, "Major ampullate", silk_df.columns)

NameError: name 'ids' is not defined

In [41]:
def create_db(df):
    for gland in silk_json:
        print (gland)
        ids = get_search_ids(gland)
        df = pd.concat([df, get_document(ids, gland, df.columns)])
    return df

silk_df = pd.DataFrame(columns=["Spidrion", "Gland / Silk Type", "Species", "Spinneret Used", "Size", "Use", "Database(s)", "Sequence", "Notes", "Experiment Considerations", "References"])
silk_df = create_db(silk_df)

{'name': 'Major Ampullate', 'alt': 'dragline'}
407
{'name': 'Flagelliform', 'alt': ''}
58
{'name': 'Aggregate spidroin', 'alt': ''}
0
{'name': 'Minor ampullate', 'alt': ''}
148
{'name': 'Tubuliform', 'alt': ''}
214
{'name': 'Aciniform', 'alt': ''}
127
{'name': 'Pyriform', 'alt': ''}
39
{'name': 'Cribellar', 'alt': ''}
8


In [42]:
silk_df

Unnamed: 0,Spidrion,Gland / Silk Type,Species,Spinneret Used,Size,Use,Database(s),Sequence,Notes,Experiment Considerations,References
0,,Major Ampullate,Caerostris darwini,,169,,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/prote...",MVLIIGLTSVTSLKVEELQKSYRIAFNERQKRSPKEDCSNSYNDVL...,,,
1,MaSp3D1,Major Ampullate,Caerostris darwini,,2328,,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/prote...",MGWITNIIIFVLFCTQANAFNKETVMGPGMMRDFMNGMSNAMANSG...,,,
2,MaSp1F2,Major Ampullate,Caerostris darwini,,2985,,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/prote...",MTWTSRLALSLLVAICTQSMFALGQDNTPWSSTGTAESFMSSFMSA...,,,
3,MaSp2C1,Major Ampullate,Caerostris darwini,,710,,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/prote...",GGYGPGGQGPSGPGSQGPGGQGPYGPGGSAAAAAAAGGYGPGGQGP...,partial,,
4,MaSp2C1,Major Ampullate,Caerostris darwini,,3227,,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/prote...",MYSSTRLALTLLAVLCTQAVFTAAQAPSPWKRTAPAERFITSFITA...,partial,,
...,...,...,...,...,...,...,...,...,...,...,...
3,CrSp,Cribellar,Octonoba varians,,4449,,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/prote...",VVLSRKLKRILPAALLTADLSSGLGRLGAGAPASRYAQLVAGSIAL...,partial,,
4,CrSp,Cribellar,Octonoba sybotides,,2426,,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/prote...",MDWLTRSAMLLLLIFFRCNGIFGQSIAGQEISAPSNINSAQNFATS...,partial,,
5,CrSp,Cribellar,Octonoba sybotides,,986,,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/prote...",SAAAAAQAVEREEGVGTEGVSAGEGLRDITEGAKKVAQSAEKSEIF...,partial,,
6,CrSp,Cribellar,Octonoba grandiprojecta,,1964,,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/prote...",DLTVHLPSAPEHSEKLASAFELASGTSTQVVVPSQTESELSTLTEG...,partial,,


In [43]:
species = set(silk_df["Species"].to_list())
species

{'Agelenopsis aperta',
 'Aptostichus sp. AS220',
 'Araneus bicentenarius',
 'Araneus diadematus',
 'Araneus gemmoides',
 'Araneus ventricosus',
 'Argiope amoena',
 'Argiope argentata',
 'Argiope aurantia',
 'Argiope bruennichi',
 'Argiope trifasciata',
 'Argyroneta aquatica',
 'Bothriocyrtum californicum',
 'Caerostris darwini',
 'Caerostris extrusa',
 'Cybaeus angustiarum',
 'Cyrtophora moluccensis',
 'Deinopis spinosa',
 'Diguetia canities',
 'Euagrus chisoseus',
 'Euprosthenops australis',
 'Euprosthenops sp. NNP-2003',
 'Gasteracantha cancriformis',
 'Gea heptagon',
 'Kukulcania hibernalis',
 'Latrodectus geometricus',
 'Latrodectus hasseltii',
 'Latrodectus hesperus',
 'Latrodectus mactans',
 'Latrodectus tredecimguttatus',
 'Macrothele holsti',
 'Metepeira grandiosa',
 'Nephila pilipes',
 'Nephilengys cruentata',
 'Octonoba grandiconcava',
 'Octonoba grandiprojecta',
 'Octonoba okinawensis',
 'Octonoba sybotides',
 'Octonoba varians',
 'Octonoba yesoensis',
 'Oedothorax gibbosus'

In [44]:
def get_aa_comp(seq):
    total = len(seq)
    aa_count = Counter(seq).most_common(5)
    aa_comp = {k : round(v/total * 100,2)  for (k,v) in aa_count}
    return aa_comp

In [45]:
silk_df["Amino acid composition"] = silk_df["Sequence"].apply(get_aa_comp)

In [46]:
silk_df["Amino acid composition"]

0    {'S': 9.47, 'E': 7.69, 'L': 7.1, 'P': 7.1, 'R'...
1    {'G': 40.38, 'S': 20.53, 'A': 15.29, 'Y': 5.2,...
2    {'G': 47.6, 'A': 23.75, 'Q': 9.51, 'S': 5.9, '...
3    {'G': 35.49, 'A': 19.01, 'P': 14.93, 'S': 11.4...
4    {'G': 39.51, 'A': 17.97, 'P': 16.58, 'S': 9.02...
                           ...                        
3    {'A': 22.59, 'S': 10.16, 'E': 10.11, 'L': 9.89...
4    {'A': 21.19, 'E': 9.36, 'S': 8.99, 'L': 8.49, ...
5    {'A': 15.31, 'S': 12.27, 'L': 10.34, 'E': 9.33...
6    {'A': 22.71, 'S': 9.37, 'L': 9.22, 'E': 9.22, ...
7    {'A': 14.31, 'S': 12.91, 'L': 10.42, 'E': 9.95...
Name: Amino acid composition, Length: 1001, dtype: object

In [47]:
silk_df.to_csv("silk_data.csv", index=False)