# UniProt 서버에서 pdb 검색

* https://www.uniprot.org/help/
* https://www.uniprot.org/help/programmatic_access

In [1]:
from collections import defaultdict
import requests

In [2]:
# Helper for querying the UniProt API

# NOTE(review): this looks like UniProt's legacy endpoint — confirm it is still served.
server = 'http://www.uniprot.org/uniprot'

# Seconds to wait for the server before giving up; without a timeout a stalled
# connection would block the notebook cell indefinitely.
REQUEST_TIMEOUT = 30

def do_request(server, ID='', **kwargs):
    """Send a GET request to ``<server>/<ID>`` and return the response.

    Parameters
    ----------
    server : str
        Base URL of the service, without a trailing slash.
    ID : str, optional
        Path component appended after ``server``. With the default empty
        string the URL is ``<server>/``.
    **kwargs
        Passed to ``requests.get`` as URL query parameters.

    Returns
    -------
    The response object returned by ``requests.get``.

    Raises
    ------
    Whatever ``raise_for_status()`` raises for an unsuccessful HTTP status,
    and whatever ``requests.get`` raises when the timeout expires.
    """
    req = requests.get('%s/%s' % (server, ID), params=kwargs,
                       timeout=REQUEST_TIMEOUT)
    # raise_for_status() is a no-op for successful responses, so no separate
    # ``req.ok`` check is needed.
    req.raise_for_status()
    return req

In [3]:
# Search UniProt for entries of the p53 gene.
# reviewed:yes - request only reviewed entries.
# An organism restriction (AND organism:Human) is intentionally not part of the query.

search_params = {
    'query': 'gene:p53 AND reviewed:yes',
    'format': 'tab',
    'columns': 'id,entry name,length,organism,organism-id,database(PDB),database(HGNC)',
    'limit': '50',
}
req = do_request(server, **search_params)

In [4]:
import pandas as pd
import io

# Parse the response text into a DataFrame and shorten the
# 'Organism ID' column name to 'ID'.
response_buffer = io.StringIO(req.text)
uniprot_list = pd.read_table(response_buffer).rename(columns={'Organism ID': 'ID'})
uniprot_list

Unnamed: 0,Entry,Entry name,Length,Organism,ID,Cross-reference (PDB),Cross-reference (HGNC)
0,P79820,P53_ORYLA,352,Oryzias latipes (Japanese rice fish) (Japanese...,8090,,
1,Q96A56,T53I1_HUMAN,240,Homo sapiens (Human),9606,,18022;
2,Q9W679,P53_TETMU,367,Tetraodon miurus (Congo puffer),94908,,
3,P04637,P53_HUMAN,393,Homo sapiens (Human),9606,1A1U;1AIE;1C26;1DT7;1GZH;1H26;1HS5;1JSP;1KZY;1...,11998;
4,Q42578,PER53_ARATH,335,Arabidopsis thaliana (Mouse-ear cress),3702,1PA2;1QO4;,
5,Q9TTA1,P53_TUPBE,393,Tupaia belangeri (Common tree shrew) (Tupaia g...,37347,,
6,P79892,P53_HORSE,280,Equus caballus (Horse),9796,,
7,Q8SPZ3,P53_DELLE,387,Delphinapterus leucas (Beluga whale),9749,,
8,Q92143,P53_XIPMA,342,Xiphophorus maculatus (Southern platyfish) (Pl...,8083,,
9,Q29537,P53_CANLF,381,Canis lupus familiaris (Dog) (Canis familiaris),9615,,


In [5]:
from Bio import ExPASy, SwissProt

In [6]:
# In the table above, 9606 is the organism (taxonomy) ID of Homo sapiens; it
# identifies the species, not a gene. Take the accession of the first human
# row and fetch its SwissProt record.
# NOTE(review): the first human row in the table is T53I1_HUMAN (Q96A56), not
# P53_HUMAN (P04637) — confirm which entry is actually intended here.

human_entries = uniprot_list[uniprot_list.ID == 9606]['Entry'].tolist()
if not human_entries:
    # Fail with a readable message instead of a bare IndexError.
    raise ValueError('no Homo sapiens (organism ID 9606) entry in the UniProt result')
p53_human = human_entries[0]
print(p53_human)
# Close the remote handle as soon as the record has been parsed.
with ExPASy.get_sprot_raw(p53_human) as handle:
    sp_rec = SwissProt.read(handle)

Q96A56


In [7]:
# Summarise the record: name / length / gene name, description,
# organism / sequence info, and finally the residues themselves.
identity_fields = (sp_rec.entry_name, sp_rec.sequence_length, sp_rec.gene_name)
print(*identity_fields, sep=' / ')
print(sp_rec.description)
origin_fields = (sp_rec.organism, sp_rec.seqinfo)
print(*origin_fields, sep=' / ')
print(sp_rec.sequence)  # protein residue sequence

T53I1_HUMAN / 240 / Name=TP53INP1; Synonyms=P53DINP1, SIP;
RecName: Full=Tumor protein p53-inducible nuclear protein 1; AltName: Full=Stress-induced protein; AltName: Full=p53-dependent damage-inducible nuclear protein 1; Short=p53DINP1;
Homo sapiens (Human). / (240, 27366, '1950D66EDDB4A186')
MFQRLNKMFVGEVSSSSNQEPEFNEKEDDEWILVDFIDTCTGFSAEEEEEEEDISEESPTEHPSVFSCLPASLECLADTSDSCFLQFESCPMEESWFITPPPCFTAGGLTTIKVETSPMENLLIEHPSMSVYAVHNSCPGLSEATRGTDELHSPSSPRVEAQNEMGQHIHCYVAALAAHTTFLEQPKSFRPSQWIKEHSERQPLNRNSLRRQNLTRDCHPRQVKHNGWVVHQPCPRQYNY


In [8]:
# Print the record's comments followed by a blank line, then its keywords.
print(sp_rec.comments, end=' \n\n')
print(sp_rec.keywords)

["FUNCTION: Antiproliferative and proapoptotic protein involved in cell stress response which acts as a dual regulator of transcription and autophagy. Acts as a positive regulator of autophagy. In response to cellular stress or activation of autophagy, relocates to autophagosomes where it interacts with autophagosome-associated proteins GABARAP, GABARAPL1/L2, MAP1LC3A/B/C and regulates autophagy. Acts as an antioxidant and plays a major role in p53/TP53-driven oxidative stress response. Possesses both a p53/TP53-independent intracellular reactive oxygen species (ROS) regulatory function and a p53/TP53-dependent transcription regulatory function. Positively regulates p53/TP53 and p73/TP73 and stimulates their capacity to induce apoptosis and regulate cell cycle. In response to double-strand DNA breaks, promotes p53/TP53 phosphorylation on 'Ser-46' and subsequent apoptosis. Acts as a tumor suppressor by inducing cell death by an autophagy and caspase-dependent mechanism. Can reduce cell 

In [9]:
# Show the built-in help for the parsed record object; the help text lists
# the attributes available on a SwissProt Record.
help(sp_rec)

Help on Record in module Bio.SwissProt object:

class Record(builtins.object)
 |  Holds information from a SwissProt record.
 |  
 |  Attributes:
 |   - entry_name        Name of this entry, e.g. RL1_ECOLI.
 |   - data_class        Either 'STANDARD' or 'PRELIMINARY'.
 |   - molecule_type     Type of molecule, 'PRT',
 |   - sequence_length   Number of residues.
 |   - accessions        List of the accession numbers, e.g. ['P00321']
 |   - created           A tuple of (date, release).
 |   - sequence_update   A tuple of (date, release).
 |   - annotation_update A tuple of (date, release).
 |   - description       Free-format description.
 |   - gene_name         Gene name.  See userman.txt for description.
 |   - organism          The source of the sequence.
 |   - organelle         The origin of the sequence.
 |   - organism_classification  The taxonomy classification.  List of strings.
 |     (http://www.ncbi.nlm.nih.gov/Taxonomy/)
 |   - taxonomy_id       A list of NCBI taxonomy id's.

In [10]:
# Print the number of features and each feature, the cross-referenced
# external databases, and a sample of the GO annotations.

done_features = []  # not used anywhere in this cell
n_features = len(sp_rec.features)
print(n_features)

for feature in sp_rec.features:
    print(feature)

n_xrefs = len(sp_rec.cross_references)
print(n_xrefs)   # number of cross-references to other databases

# Group the cross-references by their first field (the source database name),
# keeping the remaining fields of each record.
per_source = defaultdict(list)
for xref in sp_rec.cross_references:
    source, details = xref[0], xref[1:]
    per_source[source].append(details)
print(per_source.keys(), '\n')

# Gene Ontology annotations: print the total count, then only the first
# annotation seen for each distinct leading character of the second field.
done_GOs = set()
go_annotations = per_source['GO']
print(len(go_annotations))
for annot in go_annotations:
    go_key = annot[1][0]
    if go_key not in done_GOs:
        done_GOs.add(go_key)
        print(annot)

6
type: CHAIN
location: [0:240]
id: PRO_0000072406
qualifiers:
    Key: note, Value: Tumor protein p53-inducible nuclear protein 1

type: MOTIF
location: [24:37]
qualifiers:
    Key: note, Value: LIR

type: COMPBIAS
location: [13:17]
qualifiers:
    Key: note, Value: Poly-Ser

type: COMPBIAS
location: [19:61]
qualifiers:
    Key: note, Value: Glu-rich

type: VAR_SEQ
location: [158:240]
id: VSP_013176
qualifiers:
    Key: evidence, Value: ECO:0000303|PubMed:11511362,ECO:0000303|PubMed:12067065
    Key: note, Value: VEAQNEMGQHIHCYVAALAAHTTFLEQPKSFRPSQWIKEHSERQPLNRNSLRRQNLTRDCHPRQVKHNGWVVHQPCPRQYNY -> ARKSCL (in isoform 2)

type: VARIANT
location: [74:75]
id: VAR_051404
qualifiers:
    Key: note, Value: C -> R (in dbSNP:rs11991800)

100
dict_keys(['EMBL', 'CCDS', 'RefSeq', 'BioGRID', 'IntAct', 'STRING', 'iPTMnet', 'PhosphoSitePlus', 'BioMuta', 'DMDM', 'jPOST', 'MassIVE', 'PaxDb', 'PeptideAtlas', 'PRIDE', 'ProteomicsDB', 'Antibodypedia', 'DNASU', 'Ensembl', 'GeneID', 'KEGG', 'UCSC', 'CTD',