# WS_ch08A.ipynb
### WESmith 04/16/23

## FINDING A PROTEIN IN MULTIPLE DATABASES
#### (see book code in Chapter08/Intro.py)

### WS created this notebook to follow along with code from the book
### 'Bioinformatics with Python Cookbook' by Tiago Antao
#### Each recipe will have its own notebook, suffixed by A, B, etc.

In [1]:
from collections import defaultdict
import requests
from Bio import ExPASy, SwissProt
import pandas as pd
import io

In [2]:
import Bio

In [75]:
# Show the installed Biopython version.
Bio.__version__  # version 1.79 didn't work for SwissProt access; version 1.81 works

'1.81'

In [4]:
# The book's code in this chapter is out of date for accessing UniProt; see the current REST API
# and the new formats at https://www.uniprot.org/help/api_queries

In [5]:
# WS: the allowed return fields are listed at https://www.uniprot.org/help/return_fields
# WS: base URL of the UniProt REST API search endpoint; query parameters are appended after the '?'
server = "https://rest.uniprot.org/uniprotkb/search?"

In [6]:
# Search terms are glued together with '+AND+'.
#   'gene:P53' is the example in the book;
#   'organism_id:9606' (human id) is a handy alternative test case.
query = '+AND+'.join(('gene:P53', 'reviewed:true'))

# Keyword arguments for the search request, in the order they appear in the URL.
params = dict(
    query=query,
    format='tsv',
    fields='accession,id,length,organism_id,organism_name,xref_pdb,xref_hgnc',
    size='50',  # size tested on 'organism_id' query; it works
)

In [7]:
def get_request(server, **kwargs):  # WS new function
    """Send a GET request to ``server`` with ``kwargs`` appended as a query string.

    The query string is assembled by hand as ``key=value`` pairs joined with
    '&'. It is deliberately NOT percent-encoded, so a value may carry
    already-encoded syntax such as ``gene:P53+AND+reviewed:true``.

    Parameters
    ----------
    server : str
        Endpoint URL that already ends with '?'.
    **kwargs
        Query parameters. Values are converted with ``str()``, so numbers
        (e.g. ``size=50``) are accepted as well as strings.

    Returns
    -------
    requests.Response
        The response; only returned when the status code signals success.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    requests.Timeout
        If the server does not respond within 30 seconds.
    """
    # Joining the pairs replaces the old "strip the trailing '&'" step, which
    # chopped the final '?' off ``server`` when no parameters were passed.
    pairs = ['{}={}'.format(key, value) for key, value in kwargs.items()]
    url = server + '&'.join(pairs)
    req = requests.get(url, timeout=30)  # do not hang forever on a stalled connection
    req.raise_for_status()  # does nothing for a successful response
    return req

In [27]:
# Run the UniProt search; params is expanded into keyword arguments for get_request.
req = get_request(server, **params)

In [29]:
# Parse the tab-separated response text into a DataFrame and shorten the
# 'Organism (ID)' column name to 'ID' in the same expression.
uniprot_list = (
    pd.read_table(io.StringIO(req.text))
    .rename(columns={'Organism (ID)': 'ID'})
)
uniprot_list

Unnamed: 0,Entry,Entry Name,Length,ID,Organism,PDB,HGNC
0,P04637,P53_HUMAN,393,9606,Homo sapiens (Human),1A1U;1AIE;1C26;1DT7;1GZH;1H26;1HS5;1JSP;1KZY;1...,HGNC:11998;
1,P10361,P53_RAT,391,10116,Rattus norvegicus (Rat),,
2,P02340,P53_MOUSE,390,10090,Mus musculus (Mouse),1HU8;2GEQ;2IOI;2IOM;2IOO;2P52;3EXJ;3EXL;,
3,Q42578,PER53_ARATH,335,3702,Arabidopsis thaliana (Mouse-ear cress),1PA2;1QO4;,
4,O09185,P53_CRIGR,393,10029,Cricetulus griseus (Chinese hamster) (Cricetul...,,
5,Q8SPZ3,P53_DELLE,387,9749,Delphinapterus leucas (Beluga whale),,
6,Q9TTA1,P53_TUPBE,393,37347,Tupaia belangeri (Common tree shrew) (Tupaia g...,,
7,P61260,P53_MACFU,393,9543,Macaca fuscata fuscata (Japanese macaque),,
8,P56424,P53_MACMU,393,9544,Macaca mulatta (Rhesus macaque),,
9,P79892,P53_HORSE,280,9796,Equus caballus (Horse),,


In [10]:
# Pick the accession ('Entry') of the first row that is both human (ID 9606)
# and has 'P53' in its entry name.
is_human = uniprot_list['ID'] == 9606
is_p53 = uniprot_list['Entry Name'].str.contains('P53')
p53_human = uniprot_list.loc[is_human & is_p53, 'Entry'].iloc[0]
p53_human

'P04637'

In [11]:
# Open a handle on the raw Swiss-Prot entry for the accession selected above.
handle = ExPASy.get_sprot_raw(p53_human)

In [12]:
# Show the URL the handle was opened on.
handle.url

'https://rest.uniprot.org/uniprotkb/P04637.txt'

In [13]:
# Parse the single Swiss-Prot record from the open handle, then close the
# handle so the connection behind it is released even if parsing fails.
try:
    sp_rec = SwissProt.read(handle)
finally:
    handle.close()

In [21]:
# Summarise the parsed record; every section except the last is followed by a blank line.
print(sp_rec.entry_name, sp_rec.sequence_length, sp_rec.gene_name, end='\n\n')
print(sp_rec.description, end='\n\n')
print(sp_rec.organism, sp_rec.seqinfo, end='\n\n')
print(sp_rec.sequence, end='\n\n')
# sp_rec.comments is not printed here because it is very long.
print(sp_rec.keywords)

P53_HUMAN 393 [{'Name': 'TP53', 'Synonyms': ['P53']}]

RecName: Full=Cellular tumor antigen p53; AltName: Full=Antigen NY-CO-13; AltName: Full=Phosphoprotein p53; AltName: Full=Tumor suppressor p53;

Homo sapiens (Human). (393, 43653, 'AD5C149FD8106131')

MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPGGSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD

['3D-structure', 'Acetylation', 'Activator', 'Alternative promoter usage', 'Alternative splicing', 'Apoptosis', 'Biological rhythms', 'Cell cycle', 'Cytoplasm', 'Cytoskeleton', 'Direct protein sequencing', 'Disease variant', 'DNA-binding', 'Endoplasmic reticulum', 'Glycoprotein', 'Host-virus interaction', 'Isopeptide bond', 'Li-Fraumeni syndrome',

In [23]:
# Exploration aid: list the attributes that the parsed SwissProt record exposes.
help(sp_rec)

Help on Record in module Bio.SwissProt object:

class Record(builtins.object)
 |  Holds information from a SwissProt record.
 |  
 |  Attributes:
 |   - entry_name        Name of this entry, e.g. RL1_ECOLI.
 |   - data_class        Either 'STANDARD' or 'PRELIMINARY'.
 |   - molecule_type     Type of molecule, 'PRT',
 |   - sequence_length   Number of residues.
 |   - accessions        List of the accession numbers, e.g. ['P00321']
 |   - created           A tuple of (date, release).
 |   - sequence_update   A tuple of (date, release).
 |   - annotation_update A tuple of (date, release).
 |   - description       Free-format description.
 |   - gene_name         A list of dictionaries with keys 'Name', 'Synonyms',
 |                       'OrderedLocusNames' and 'ORFNames'.
 |   - organism          The source of the sequence.
 |   - organelle         The origin of the sequence.
 |   - organism_classification  The taxonomy classification.  List of strings.
 |     (http://www.ncbi.nlm.nih.

In [32]:
# Count every feature annotated on the record.
print('Total features:', len(sp_rec.features))

Total features: 1513


In [79]:
# Print the first feature seen of each type, and tally how many features of
# each type the record holds.
num_items = defaultdict(int)  # WS calculate how many of each feature type
done_features = set()
for feat in sp_rec.features:
    kind = feat.type  # WS had to add .type (book out of date)
    num_items[kind] += 1
    if kind not in done_features:
        done_features.add(kind)
        print(feat)

type: CHAIN
location: [0:393]
id: PRO_0000185703
qualifiers:
    Key: note, Value: Cellular tumor antigen p53

type: DNA_BIND
location: [101:292]
qualifiers:
    Key: evidence, Value: ECO:0000269|PubMed:16793544,ECO:0000269|PubMed:18996393, ECO:0000269|PubMed:20364130

type: REGION
location: [0:320]
qualifiers:
    Key: evidence, Value: ECO:0000269|PubMed:25732823
    Key: note, Value: Interaction with CCAR2

type: MOTIF
location: [16:25]
qualifiers:
    Key: note, Value: TADI

type: COMPBIAS
location: [63:92]
qualifiers:
    Key: evidence, Value: ECO:0000256|SAM:MobiDB-lite
    Key: note, Value: Pro residues

type: BINDING
location: [175:176]
qualifiers:
    Key: evidence, Value: ECO:0000269|PubMed:14534297,ECO:0000269|PubMed:16793544, ECO:0000269|PubMed:17015838,ECO:0000269|PubMed:18650397, ECO:0000269|PubMed:19515728,ECO:0000269|PubMed:20142040, ECO:0000269|PubMed:20364130
    Key: ligand, Value: Zn(2+)
    Key: ligand_id, Value: ChEBI:CHEBI:29105

type: SITE
location: [119:120]
qua

In [92]:
# WS get the counts per feature
# WS get the counts per feature
# The running total is kept in `total` rather than `sum`, so the built-in
# sum() is not rebound to an int for the rest of the kernel session.
total = 0
for feature_type, count in num_items.items():
    total += count
    print('{:10}{:5}'.format(feature_type, count))
print('{:10}{:5}'.format('total', total))

CHAIN         1
DNA_BIND      1
REGION       19
MOTIF         5
COMPBIAS      4
BINDING       4
SITE          1
MOD_RES      28
CROSSLNK      4
VAR_SEQ       6
VARIANT    1363
MUTAGEN      36
HELIX        14
TURN          7
STRAND       20
total      1513


In [46]:
# Count the cross-references held by the record.
print('Cross references: ',len(sp_rec.cross_references))

Cross references:  998


In [47]:
# Group the cross-references by source: the first element of each entry is the
# source name, and the remaining elements are stored under that name.
per_source = defaultdict(list)
for entry in sp_rec.cross_references:
    per_source[entry[0]].append(entry[1:])
print(per_source.keys())

dict_keys(['EMBL', 'CCDS', 'PIR', 'RefSeq', 'PDB', 'PDBsum', 'AlphaFoldDB', 'BMRB', 'SMR', 'BioGRID', 'ComplexPortal', 'CORUM', 'DIP', 'ELM', 'IntAct', 'MINT', 'STRING', 'BindingDB', 'ChEMBL', 'DrugBank', 'MoonDB', 'TCDB', 'GlyCosmos', 'GlyGen', 'iPTMnet', 'MetOSite', 'PhosphoSitePlus', 'BioMuta', 'DMDM', 'SWISS-2DPAGE', 'CPTAC', 'EPD', 'jPOST', 'MassIVE', 'MaxQB', 'PaxDb', 'PeptideAtlas', 'ProteomicsDB', 'ABCD', 'Antibodypedia', 'CPTC', 'DNASU', 'Ensembl', 'GeneID', 'KEGG', 'MANE-Select', 'UCSC', 'AGR', 'CTD', 'DisGeNET', 'GeneCards', 'GeneReviews', 'HGNC', 'HPA', 'MalaCards', 'MIM', 'neXtProt', 'OpenTargets', 'Orphanet', 'PharmGKB', 'VEuPathDB', 'eggNOG', 'GeneTree', 'InParanoid', 'OMA', 'OrthoDB', 'PhylomeDB', 'TreeFam', 'PathwayCommons', 'Reactome', 'SABIO-RK', 'SignaLink', 'SIGNOR', 'BioGRID-ORCS', 'ChiTaRS', 'EvolutionaryTrace', 'GeneWiki', 'GenomeRNAi', 'Pharos', 'PRO', 'Proteomes', 'RNAct', 'Bgee', 'ExpressionAtlas', 'Genevisible', 'GO', 'CDD', 'DisProt', 'Gene3D', 'IDEAL', 'In

In [93]:
# Entries recorded for the 'PDB' source (the source name itself is stripped off).
per_source['PDB']  # I looked at a few of these with Jmol: very neat

[('1A1U', 'NMR', '-', 'A/C=324-358'),
 ('1AIE', 'X-ray', '1.50 A', 'A=326-356'),
 ('1C26', 'X-ray', '1.70 A', 'A=325-356'),
 ('1DT7', 'NMR', '-', 'X/Y=367-388'),
 ('1GZH', 'X-ray', '2.60 A', 'A/C=95-292'),
 ('1H26', 'X-ray', '2.24 A', 'E=376-386'),
 ('1HS5', 'NMR', '-', 'A/B=324-357'),
 ('1JSP', 'NMR', '-', 'A=367-386'),
 ('1KZY', 'X-ray', '2.50 A', 'A/B=95-289'),
 ('1MA3', 'X-ray', '2.00 A', 'B=372-389'),
 ('1OLG', 'NMR', '-', 'A/B/C/D=319-360'),
 ('1OLH', 'NMR', '-', 'A/B/C/D=319-360'),
 ('1PES', 'NMR', '-', 'A/B/C/D=325-355'),
 ('1PET', 'NMR', '-', 'A/B/C/D=325-355'),
 ('1SAE', 'NMR', '-', 'A/B/C/D=319-360'),
 ('1SAF', 'NMR', '-', 'A/B/C/D=319-360'),
 ('1SAK', 'NMR', '-', 'A/B/C/D=319-360'),
 ('1SAL', 'NMR', '-', 'A/B/C/D=319-360'),
 ('1TSR', 'X-ray', '2.20 A', 'A/B/C=94-312'),
 ('1TUP', 'X-ray', '2.20 A', 'A/B/C=94-312'),
 ('1UOL', 'X-ray', '1.90 A', 'A/B=94-312'),
 ('1XQH', 'X-ray', '1.75 A', 'B/F=369-377'),
 ('1YC5', 'X-ray', '1.40 A', 'B=372-389'),
 ('1YCQ', 'X-ray', '2.30 A', '

In [73]:
# done_GOs collects the leading letter of each GO term already printed by the next cell.
done_GOs = set()
# Number of cross-references whose source is 'GO'.
print('Annotation SOURCES:', len(per_source['GO']))

Annotation SOURCES: 185


In [74]:
# Print one example GO annotation per leading letter of the term
# (presumably the GO aspect: C / F / P -- confirm against the GO docs).
# The tracking set is reset here so that re-running this cell prints the
# examples again instead of silently skipping everything already recorded.
done_GOs = set()
for annot in per_source['GO']:
    aspect = annot[1][0]  # e.g. 'C' from 'C:centrosome'
    if aspect not in done_GOs:
        done_GOs.add(aspect)
        print(annot)

('GO:0005813', 'C:centrosome', 'IDA:UniProtKB')
('GO:0071889', 'F:14-3-3 protein binding', 'EXP:DisProt')
('GO:0006914', 'P:autophagy', 'IMP:CAFA')


In [70]:
# Distinct leading letters collected from the GO terms.
done_GOs

{'C', 'F', 'P'}

In [71]:
# Second field of the first GO entry; its first character is what done_GOs stores.
per_source['GO'][0][1] # [0] of this is 'C' the first character

'C:centrosome'