# WS_ch08A.ipynb
### WESmith 04/16/23

## FINDING A PROTEIN IN MULTIPLE DATABASES
#### (see book code in Chapter08/Intro.py)

### WS created this notebook to follow along with code from the book
### 'Bioinformatics with Python Cookbook' by Tiago Antao
#### Each recipe will have its own notebook, suffixed by A, B, etc.

In [None]:
from collections import defaultdict
import requests
from Bio import ExPASy, SwissProt
import pandas as pd
import io

In [None]:
import Bio

In [None]:
# Display the installed Biopython version.
# WS: version 1.79 didn't work for SwissProt access; version 1.81 works.
Bio.__version__

In [None]:
# NOTE: the book's code for this chapter is out of date for accessing UniProt;
# see the REST API and the new formats at https://www.uniprot.org/help/api_queries

In [None]:
# Base URL of the UniProtKB search endpoint of the UniProt REST API.
# The trailing '?' is kept so that 'key=value' query parameters can be
# appended directly to this string.
# WS: the allowed values for 'fields' are listed at
# https://www.uniprot.org/help/return_fields
server = "https://rest.uniprot.org/uniprotkb/search?"

In [None]:
# Search terms, joined with AND: the gene name from the book's example,
# restricted to entries flagged 'reviewed:true'.
# (alternative test case, human organism id: 'organism_id:9606')
query = '+AND+'.join(['gene:P53', 'reviewed:true'])

# Query parameters for the search request: tab-separated output, the
# columns to return, and the number of results.
params = dict(
    query=query,
    format='tsv',
    fields=','.join((
        'accession', 'id', 'length', 'organism_id', 'organism_name',
        'xref_pdb', 'xref_hgnc',
    )),
    size='50',  # size tested on 'organism_id' query; it works
)

In [None]:
def get_request(server, **kwargs):  # WS new function
    """Send a GET request to `server` with `kwargs` as its query string.

    Each keyword argument is appended to `server` as a 'key=value' pair,
    with the pairs separated by '&'.  Values are converted with str() but
    are NOT URL-encoded, so the caller must pass them already in URL form
    (e.g. '+' in place of a space).

    Args:
        server: URL prefix the query string is appended to; it is expected
            to already end in '?'.
        **kwargs: query parameters, one 'key=value' pair per keyword.

    Returns:
        The requests.Response object for the request.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    # Joining the pairs (rather than appending '&' after each one and
    # trimming the last character) leaves `server` intact when no
    # parameters are given; str() lets non-string values such as size=50
    # through.
    query_string = '&'.join(
        key + '=' + str(value) for key, value in kwargs.items())
    req = requests.get(server + query_string)
    # raise_for_status() does nothing for a successful response, so no
    # separate `req.ok` check is needed.
    req.raise_for_status()
    return req

In [None]:
# Run the UniProt search: `params` is expanded into the query string
# that get_request() appends to `server`.
req = get_request(server, **params)

In [None]:
# Parse the tab-separated response body into a DataFrame and shorten the
# 'Organism (ID)' column name to 'ID' so it can be used as an attribute.
uniprot_list = pd.read_table(io.StringIO(req.text)).rename(
    columns={'Organism (ID)': 'ID'})
uniprot_list

In [None]:
# Accession ('Entry' column) of the first row whose organism ID is 9606
# and whose entry name contains 'P53'.
p53_human = uniprot_list.loc[
    (uniprot_list['ID'] == 9606)
    & uniprot_list['Entry Name'].str.contains('P53'),
    'Entry',
].iloc[0]
p53_human

In [None]:
# Fetch the raw Swiss-Prot entry for the selected accession from ExPASy
handle = ExPASy.get_sprot_raw(p53_human)

In [None]:
# Show the URL attached to the returned handle
handle.url

In [None]:
# Parse the single Swiss-Prot record available on the handle
sp_rec = SwissProt.read(handle)

In [None]:
# Summary of the parsed record; every group except the last is followed
# by a blank line.
print(sp_rec.entry_name, sp_rec.sequence_length, sp_rec.gene_name,
      end='\n\n')
print(sp_rec.description, end='\n\n')
print(sp_rec.organism, sp_rec.seqinfo, end='\n\n')
print(sp_rec.sequence, end='\n\n')
# sp_rec.comments is left out here because it is very long
print(sp_rec.keywords)

In [None]:
# Show the built-in help for the parsed record object
help(sp_rec)

In [None]:
# Number of entries in the record's feature list
print('Total features:', len(sp_rec.features))

In [None]:
# Print the first feature of each type while counting how many features
# of every type the record has.
done_features = set()
num_items = defaultdict(int)  # WS: number of features per feature type
for feature in sp_rec.features:
    # WS: had to add .type (book out of date)
    num_items[feature.type] += 1
    if feature.type not in done_features:
        # first time this type is seen: remember it and show the feature
        done_features.add(feature.type)
        print(feature)

In [None]:
# WS get the counts per feature
sum = 0
for j, k in num_items.items():
    sum += k
    print('{:10}{:5}'.format(j, k))
print('{:10}{:5}'.format('total', sum))

In [None]:
# Number of entries in the record's cross-reference list
print('Cross references: ',len(sp_rec.cross_references))

In [None]:
# Group the cross-references by their first element (used as the source
# name); the remaining elements of each entry are stored under that key.
per_source = defaultdict(list)
for xref in sp_rec.cross_references:
    source, details = xref[0], xref[1:]
    per_source[source].append(details)
print(per_source.keys())

In [None]:
# Cross-references whose source is 'PDB'.
# WS: I looked at a few of these with Jmol: very neat
per_source['PDB']

In [None]:
# Start with an empty set of GO keys that have already been printed
done_GOs = set()
# Number of cross-references whose source is 'GO'
print('Annotation SOURCES:', len(per_source['GO']))

In [None]:
# Print one GO annotation per distinct leading character of its second
# field (annot[1][0]); later annotations sharing that character are
# skipped.
for annot in per_source['GO']:
    if annot[1][0] not in done_GOs:
        done_GOs.add(annot[1][0])
        print(annot)

In [None]:
# Show the leading characters collected from the GO annotations
done_GOs

In [None]:
# Second field of the first GO cross-reference.
# WS: [0] of this is 'C', the first character
per_source['GO'][0][1]