In [16]:
import mysql.connector as sql
import pandas as pd
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
from time import sleep
import time

In [17]:
def get_protein_location(uniprot_id):
    """
    Given a UniProt ID, returns the subcellular location of the corresponding protein.

    This function scrapes data from the UniProt website, specifically from the XML data
    of the protein corresponding to the UniProt ID. It finds all subcellular locations
    listed and returns them as a joined string, with each location separated by a comma.

    Note that there is a delay of one second at the beginning of the function to prevent
    overwhelming the server with requests.

    Parameters:
    uniprot_id (str): The UniProt ID of the protein.

    Returns:
    str: A string of the subcellular locations of the protein, separated by commas.

    Example:
    # >>> get_protein_location("Q9Y2G9")
    'Mitochondrion matrix, Mitochondrion inner membrane'
    """
    time.sleep(1)
    url = f"https://www.uniprot.org/uniprot/{uniprot_id}.xml"
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'lxml-xml')

    subcellular_locations = soup.findAll('subcellularLocation')
    locations = [location.get_text(separator=' ', strip=True) for location in subcellular_locations]

    return ', '.join(locations)

In [18]:
df = pd.read_csv('drug_central/tchem_drugs_05122020.tsv', sep='\t')
df

Unnamed: 0,uniprot,swissprot,drug_name,act_value,act_type,action_type,source_name,reference,smiles,ChEMBL_Id
0,P42338,PK3CB_HUMAN,copanlisib,8.431798,IC50,INHIBITOR,SCIENTIFIC LITERATURE,http://www.ncbi.nlm.nih.gov/pubmed/24170767,COC1=C(OCCCN2CCOCC2)C=CC2=C1N=C(NC(=O)C1=CN=C(...,CHEMBL3218576
1,P21917,DRD4_HUMAN,brexpiprazole,8.200659,Ki,AGONIST,SCIENTIFIC LITERATURE,http://www.ncbi.nlm.nih.gov/pubmed/24947465,O=C1NC2=C(C=C1)C=CC(OCCCCN1CCN(CC1)C1=C3C=CSC3...,CHEMBL2105760
2,P50406,5HT6R_HUMAN,brexpiprazole,7.236572,Ki,,SCIENTIFIC LITERATURE,http://www.ncbi.nlm.nih.gov/pubmed/24947465,O=C1NC2=C(C=C1)C=CC(OCCCCN1CCN(CC1)C1=C3C=CSC3...,CHEMBL2105760
3,P51575,P2RX1_HUMAN,adenosine triphosphate,7.250000,EC50,AGONIST,IUPHAR,http://www.guidetopharmacology.org/GRAC/Ligand...,NC1=NC=NC2=C1N=CN2[C@@H]1O[C@H](COP(O)(=O)OP(O...,CHEMBL14249
4,Q99571,P2RX4_HUMAN,adenosine triphosphate,6.300000,EC50,AGONIST,IUPHAR,http://www.guidetopharmacology.org/GRAC/Ligand...,NC1=NC=NC2=C1N=CN2[C@@H]1O[C@H](COP(O)(=O)OP(O...,CHEMBL14249
...,...,...,...,...,...,...,...,...,...,...
638,Q9Y6E0,STK24_HUMAN,neratinib,8.190000,Kd,,CHEMBL,https://www.ebi.ac.uk/chembl/compound/inspect/...,CCOC1=CC2=NC=C(C#N)C(NC3=CC=C(OCC4=CC=CC=N4)C(...,CHEMBL180022
639,Q8IVH8,M4K3_HUMAN,neratinib,8.110000,Kd,,CHEMBL,https://www.ebi.ac.uk/chembl/compound/inspect/...,CCOC1=CC2=NC=C(C#N)C(NC3=CC=C(OCC4=CC=CC=N4)C(...,CHEMBL180022
640,P0DMS8,AA3R_HUMAN,fostamatinib,7.091515,IC50,INHIBITOR,SCIENTIFIC LITERATURE,http://www.ncbi.nlm.nih.gov/pubmed/16946104,COC1=CC(NC2=NC(NC3=CC=C4OC(C)(C)C(=O)N(COP(O)(...,CHEMBL2103830
641,Q92800,EZH1_HUMAN,tazemetostat,6.410000,IC50,INHIBITOR,DRUG LABEL,https://www.accessdata.fda.gov/drugsatfda_docs...,CCN(C1CCOCC1)C1=CC(=CC(C(=O)NCC2=C(C)C=C(C)NC2...,CHEMBL3414621


In [19]:
df_drug_central = df[['smiles','uniprot']]
df_drug_central

Unnamed: 0,smiles,uniprot
0,COC1=C(OCCCN2CCOCC2)C=CC2=C1N=C(NC(=O)C1=CN=C(...,P42338
1,O=C1NC2=C(C=C1)C=CC(OCCCCN1CCN(CC1)C1=C3C=CSC3...,P21917
2,O=C1NC2=C(C=C1)C=CC(OCCCCN1CCN(CC1)C1=C3C=CSC3...,P50406
3,NC1=NC=NC2=C1N=CN2[C@@H]1O[C@H](COP(O)(=O)OP(O...,P51575
4,NC1=NC=NC2=C1N=CN2[C@@H]1O[C@H](COP(O)(=O)OP(O...,Q99571
...,...,...
638,CCOC1=CC2=NC=C(C#N)C(NC3=CC=C(OCC4=CC=CC=N4)C(...,Q9Y6E0
639,CCOC1=CC2=NC=C(C#N)C(NC3=CC=C(OCC4=CC=CC=N4)C(...,Q8IVH8
640,COC1=CC(NC2=NC(NC3=CC=C4OC(C)(C)C(=O)N(COP(O)(...,P0DMS8
641,CCN(C1CCOCC1)C1=CC(=CC(C(=O)NCC2=C(C)C=C(C)NC2...,Q92800


In [20]:
len(set(list(df_drug_central['uniprot'])))

334

In [21]:
get_cell_location = dict()
count = 0
for unipro_id in set(list(df_drug_central['uniprot'])):
    count +=1
    print(count, end="  -  ")
    get_cell_location[unipro_id] = get_protein_location(unipro_id)

1  -  2  -  3  -  4  -  5  -  6  -  7  -  8  -  9  -  10  -  11  -  12  -  13  -  14  -  15  -  16  -  17  -  18  -  19  -  20  -  21  -  22  -  23  -  24  -  25  -  26  -  27  -  28  -  29  -  30  -  31  -  32  -  33  -  34  -  35  -  36  -  37  -  38  -  39  -  40  -  41  -  42  -  43  -  44  -  45  -  46  -  47  -  48  -  49  -  50  -  51  -  52  -  53  -  54  -  55  -  56  -  57  -  58  -  59  -  60  -  61  -  62  -  63  -  64  -  65  -  66  -  67  -  68  -  69  -  70  -  71  -  72  -  73  -  74  -  75  -  76  -  77  -  78  -  79  -  80  -  81  -  82  -  83  -  84  -  85  -  86  -  87  -  88  -  89  -  90  -  91  -  92  -  93  -  94  -  95  -  96  -  97  -  98  -  99  -  100  -  101  -  102  -  103  -  104  -  105  -  106  -  107  -  108  -  109  -  110  -  111  -  112  -  113  -  114  -  115  -  116  -  117  -  118  -  119  -  120  -  121  -  122  -  123  -  124  -  125  -  126  -  127  -  128  -  129  -  130  -  131  -  132  -  133  -  134  -  135  -  136  -  137  -  138  -  139 

In [22]:
df_pos = pd.DataFrame(list(get_cell_location.items()), columns=['uni_prot_id', 'location'])
df_pos

Unnamed: 0,uni_prot_id,location
0,O43172,"Nucleus, Nucleus speckle"
1,O95342,Apical cell membrane Multi-pass membrane prote...
2,P68363,Cytoplasm Cytoskeleton
3,P33981,
4,Q401N2,Cell membrane Multi-pass membrane protein
...,...,...
329,P27361,"Cytoplasm, Nucleus, Membrane Caveola, Cell jun..."
330,O43293,"Nucleus, Cytoplasm, Nucleus, Cytoplasm, Nucleu..."
331,Q9H3N8,Cell membrane Multi-pass membrane protein
332,Q08209,"Cytoplasm, Cell membrane Peripheral membrane p..."


In [23]:
df_pos.to_csv('drug_central_sub_location.csv', index=False)