# Parse the DrugBank XML and extract TSVs

Run using Python 3 to avoid a non-ascii character error when writing to file with the csv module.

In [1]:
import os
import csv
import gzip
import collections
import re
import io
import json
import xml.etree.ElementTree as ET

import requests
import pandas
import numpy as np

In [2]:
import gseapy as gp
from gseapy.parser import Biomart

In [3]:
bm = Biomart(verbose=False, host="asia.ensembl.org")

In [5]:
xml_path = os.path.join('drugbank/', 'full database.xml')
with open(xml_path) as xml_file:
    tree = ET.parse(xml_file)
root = tree.getroot()

In [6]:
ns = '{http://www.drugbank.ca}'
inchikey_template = "{ns}calculated-properties/{ns}property[{ns}kind='InChIKey']/{ns}value"
inchi_template = "{ns}calculated-properties/{ns}property[{ns}kind='InChI']/{ns}value"

rows = list()
for i, drug in enumerate(root):
    row = collections.OrderedDict()
    assert drug.tag == ns + 'drug'
    row['type'] = drug.get('type')
    row['drugbank_id'] = drug.findtext(ns + "drugbank-id[@primary='true']")
    row['name'] = drug.findtext(ns + "name")
    row['description'] = drug.findtext(ns + "description")
    row['groups'] = [group.text for group in
        drug.findall("{ns}groups/{ns}group".format(ns = ns))]
    row['atc_codes'] = [code.get('code') for code in
        drug.findall("{ns}atc-codes/{ns}atc-code".format(ns = ns))]
    row['categories'] = [x.findtext(ns + 'category') for x in
        drug.findall("{ns}categories/{ns}category".format(ns = ns))]
    row['inchi'] = drug.findtext(inchi_template.format(ns = ns))
    row['inchikey'] = drug.findtext(inchikey_template.format(ns = ns))
    
    # Add drug aliases
    aliases = {
        elem.text for elem in 
        drug.findall("{ns}international-brands/{ns}international-brand".format(ns = ns)) +
        drug.findall("{ns}synonyms/{ns}synonym[@language='English']".format(ns = ns)) +
        drug.findall("{ns}international-brands/{ns}international-brand".format(ns = ns)) +
        drug.findall("{ns}products/{ns}product/{ns}name".format(ns = ns))

    }
    aliases.add(row['name'])
    row['aliases'] = sorted(aliases)

    rows.append(row)

In [7]:
alias_dict = {row['drugbank_id']: row['aliases'] for row in rows}
with open('./drugbank/data/aliases.json', 'w') as fp:
    json.dump(alias_dict, fp, indent=2, sort_keys=True)

In [8]:
def collapse_list_values(row):
    for key, value in row.items():
        if isinstance(value, list):
            row[key] = '|'.join(value)
    return row

rows = list(map(collapse_list_values, rows))

In [9]:
columns = ['drugbank_id', 'name', 'type', 'groups', 'atc_codes', 'categories', 'inchikey', 'inchi', 'description']
drugbank_df = pandas.DataFrame.from_dict(rows)[columns]
drugbank_df.head()

Unnamed: 0,drugbank_id,name,type,groups,atc_codes,categories,inchikey,inchi,description
0,DB00001,Lepirudin,biotech,approved,B01AE02,"Amino Acids, Peptides, and Proteins|Anticoagul...",,,Lepirudin is identical to natural hirudin exce...
1,DB00002,Cetuximab,biotech,approved,L01XC06,"Amino Acids, Peptides, and Proteins|Antibodies...",,,Cetuximab is an epidermal growth factor recept...
2,DB00003,Dornase alfa,biotech,approved,R05CB13,"Amino Acids, Peptides, and Proteins|Cough and ...",,,Dornase alfa is a biosynthetic form of human d...
3,DB00004,Denileukin diftitox,biotech,approved|investigational,L01XX29,"ADP Ribose Transferases|Amino Acids, Peptides,...",,,A recombinant DNA-derived cytotoxic protein co...
4,DB00005,Etanercept,biotech,approved|investigational,L04AB01,"Agents reducing cytokine levels|Amino Acids, P...",,,Dimeric fusion protein consisting of the extra...


In [10]:
drugbank_df.shape

(13563, 9)

In [11]:
drugbank_slim_df = drugbank_df[
    drugbank_df.groups.map(lambda x: 'approved' in x) &
    drugbank_df.inchi.map(lambda x: x is not None) &
    drugbank_df.type.map(lambda x: x == 'small molecule')
]
drugbank_slim_df.head()

Unnamed: 0,drugbank_id,name,type,groups,atc_codes,categories,inchikey,inchi,description
5,DB00006,Bivalirudin,small molecule,approved|investigational,B01AE06,"Amino Acids, Peptides, and Proteins|Anticoagul...",OIRCOABEOLEUMC-GEJPAHFPSA-N,InChI=1S/C98H138N24O33/c1-5-52(4)82(96(153)122...,Bivalirudin is a synthetic 20 residue peptide ...
6,DB00007,Leuprolide,small molecule,approved|investigational,L02AE51|L02AE02,Adrenal Cortex Hormones|Agents Causing Muscle ...,GFIJNRVAKGFPGQ-LIJARHBVSA-N,InChI=1S/C59H84N16O12/c1-6-63-57(86)48-14-10-2...,Leuprolide belongs to the general class of dru...
13,DB00014,Goserelin,small molecule,approved,L02AE03,"Adrenal Cortex Hormones|Amino Acids, Peptides,...",BLCLNMBMMGCOAS-URPVMXJPSA-N,InChI=1S/C59H84N18O14/c1-31(2)22-40(49(82)68-3...,"Goserelin is a synthetic hormone. In men, it s..."
25,DB00027,Gramicidin D,small molecule,approved,R02AB30,"Amino Acids, Peptides, and Proteins|Anti-Bacte...",NDAYQJDHGXTBJL-MWWSRJDJSA-N,InChI=1S/C96H135N19O16/c1-50(2)36-71(105-79(11...,Gramcidin D is a heterogeneous mixture of thre...
33,DB00035,Desmopressin,small molecule,approved,H01BA02,"Agents that produce hypertension|Amino Acids, ...",NFLWUMRGJYTJIN-PNIOQBSNSA-N,InChI=1S/C46H64N14O12S2/c47-35(62)15-14-29-40(...,"Desmopressin (dDAVP), a synthetic analogue of ..."


In [12]:
drugbank_slim_df.shape

(2618, 9)

In [13]:
# write drugbank tsv
path = os.path.join('drugbank/data', 'drugbank.tsv')
drugbank_df.to_csv(path, sep='\t', index=False)

# write slim drugbank tsv
path = os.path.join('drugbank/data', 'drugbank-slim.tsv')
drugbank_slim_df.to_csv(path, sep='\t', index=False)

## Extract protein information

In [14]:
protein_rows = list()
for i, drug in enumerate(root):
    drugbank_id = drug.findtext(ns + "drugbank-id[@primary='true']")
    for category in ['target', 'enzyme', 'carrier', 'transporter']:
        proteins = drug.findall('{ns}{cat}s/{ns}{cat}'.format(ns=ns, cat=category))
        for protein in proteins:
            row = {'drugbank_id': drugbank_id, 'category': category}
            row['organism'] = protein.findtext('{}organism'.format(ns))
            row['known_action'] = protein.findtext('{}known-action'.format(ns))
            actions = protein.findall('{ns}actions/{ns}action'.format(ns=ns))
            row['actions'] = '|'.join(action.text for action in actions)
            uniprot_ids = [polypep.text for polypep in protein.findall(
                "{ns}polypeptide/{ns}external-identifiers/{ns}external-identifier[{ns}resource='UniProtKB']/{ns}identifier".format(ns=ns))]            
            if len(uniprot_ids) != 1:
                continue
            row['uniprot_id'] = uniprot_ids[0]
            ref_text = protein.findtext("{ns}references[@format='textile']".format(ns=ns))
            if str(type(ref_text)) != "<class 'NoneType'>" :
                pmids = re.findall(r'pubmed/([0-9]+)', ref_text)
                row['pubmed_ids'] = '|'.join(pmids)
            else:
                row['pubmed_ids'] = None
            protein_rows.append(row)

protein_df = pandas.DataFrame.from_dict(protein_rows)

In [15]:
# Read our uniprot to entrez_gene mapping
response = requests.get('http://git.dhimmel.com/uniprot/data/map/GeneID.tsv.gz', stream=True)
text = io.TextIOWrapper(gzip.GzipFile(fileobj=response.raw))
uniprot_df = pandas.read_table(text, engine='python')
uniprot_df.rename(columns={'uniprot': 'uniprot_id', 'GeneID': 'entrez_gene_id'}, inplace=True)

# merge uniprot mapping with protein_df
entrez_df = protein_df.merge(uniprot_df, how='inner')

In [16]:
columns = ['drugbank_id', 'category', 'uniprot_id', 'entrez_gene_id', 'organism',
           'known_action', 'actions', 'pubmed_ids']
entrez_df = entrez_df[columns]

In [17]:
path = os.path.join('drugbank/data', 'proteins.tsv')
entrez_df.to_csv(path, sep='\t', index=False)

In [18]:
entrez_df

Unnamed: 0,drugbank_id,category,uniprot_id,entrez_gene_id,organism,known_action,actions,pubmed_ids
0,DB00001,target,P00734,2147,Humans,yes,inhibitor,
1,DB00006,target,P00734,2147,Humans,yes,inhibitor,
2,DB00025,enzyme,P00734,2147,Humans,unknown,activator,
3,DB00055,target,P00734,2147,Humans,unknown,,
4,DB00100,target,P00734,2147,Humans,unknown,,
...,...,...,...,...,...,...,...,...
27322,DB15493,target,Q8IXJ6,22933,Humans,yes,inhibitor,
27323,DB15495,target,P35232,5245,Humans,yes,inhibitor,
27324,DB15496,target,P35232,5245,Humans,yes,inhibitor,
27325,DB15570,target,P51679,1233,Humans,unknown,antagonist,


In [19]:
entrez_df.actions.value_counts()

                                              11195
inhibitor                                      5042
substrate                                      4145
antagonist                                     1564
agonist                                        1101
                                              ...  
binder|potentiator                                1
intercalation                                     1
desensitize the target                            1
antagonist|inhibitory allosteric modulator        1
inducer|acetylation                               1
Name: actions, Length: 112, dtype: int64

In [20]:
# Number of unique genes with an interaction
len(set(entrez_df.entrez_gene_id))

4628

In [21]:
# Number of unique drugs  with an interaction
len(set(entrez_df.drugbank_id))

7057

In [22]:
drugbank_df.head()

Unnamed: 0,drugbank_id,name,type,groups,atc_codes,categories,inchikey,inchi,description
0,DB00001,Lepirudin,biotech,approved,B01AE02,"Amino Acids, Peptides, and Proteins|Anticoagul...",,,Lepirudin is identical to natural hirudin exce...
1,DB00002,Cetuximab,biotech,approved,L01XC06,"Amino Acids, Peptides, and Proteins|Antibodies...",,,Cetuximab is an epidermal growth factor recept...
2,DB00003,Dornase alfa,biotech,approved,R05CB13,"Amino Acids, Peptides, and Proteins|Cough and ...",,,Dornase alfa is a biosynthetic form of human d...
3,DB00004,Denileukin diftitox,biotech,approved|investigational,L01XX29,"ADP Ribose Transferases|Amino Acids, Peptides,...",,,A recombinant DNA-derived cytotoxic protein co...
4,DB00005,Etanercept,biotech,approved|investigational,L04AB01,"Agents reducing cytokine levels|Amino Acids, P...",,,Dimeric fusion protein consisting of the extra...


In [23]:
drugbank_df.columns

Index(['drugbank_id', 'name', 'type', 'groups', 'atc_codes', 'categories',
       'inchikey', 'inchi', 'description'],
      dtype='object')

In [24]:
# Maping multiple columns
drugbank_df.index = drugbank_df.drugbank_id
entrez_df.index = entrez_df.drugbank_id
for i in ['name', 'type', 'groups', 'atc_codes', 'categories', 'description']:
    entrez_df[i] = entrez_df.drugbank_id.map(drugbank_df[i])

In [25]:
entrez_df.head()

Unnamed: 0_level_0,drugbank_id,category,uniprot_id,entrez_gene_id,organism,known_action,actions,pubmed_ids,name,type,groups,atc_codes,categories,description
drugbank_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
DB00001,DB00001,target,P00734,2147,Humans,yes,inhibitor,,Lepirudin,biotech,approved,B01AE02,"Amino Acids, Peptides, and Proteins|Anticoagul...",Lepirudin is identical to natural hirudin exce...
DB00006,DB00006,target,P00734,2147,Humans,yes,inhibitor,,Bivalirudin,small molecule,approved|investigational,B01AE06,"Amino Acids, Peptides, and Proteins|Anticoagul...",Bivalirudin is a synthetic 20 residue peptide ...
DB00025,DB00025,enzyme,P00734,2147,Humans,unknown,activator,,"Antihemophilic factor, human recombinant",biotech,approved|investigational,B02BD02,"Amino Acids, Peptides, and Proteins|Biological...",Human recombinant antihemophilic factor (AHF) ...
DB00055,DB00055,target,P00734,2147,Humans,unknown,,,Drotrecogin alfa,biotech,approved|investigational|withdrawn,B01AD10,"Amino Acids, Peptides, and Proteins|Anti-Infec...",Drotrecogin alfa is activated human protein C ...
DB00100,DB00100,target,P00734,2147,Humans,unknown,,,Coagulation Factor IX (Recombinant),biotech,approved|investigational,,Blood Coagulation Factors|Hemostatics|Increase...,Recombinant Coagulation Factor IX is a purifie...


In [26]:
# Mapping Entrez to gene symbol
glist = np.unique(entrez_df.entrez_gene_id.values).tolist()
glist = [str(x) for x in glist if type(x) ==int]
results = bm.query(dataset='hsapiens_gene_ensembl', attributes=['entrezgene_id','hgnc_symbol'],#, 'go_id'],
                   filters={'entrezgene_id': glist})

In [27]:
entrez_df

Unnamed: 0_level_0,drugbank_id,category,uniprot_id,entrez_gene_id,organism,known_action,actions,pubmed_ids,name,type,groups,atc_codes,categories,description
drugbank_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
DB00001,DB00001,target,P00734,2147,Humans,yes,inhibitor,,Lepirudin,biotech,approved,B01AE02,"Amino Acids, Peptides, and Proteins|Anticoagul...",Lepirudin is identical to natural hirudin exce...
DB00006,DB00006,target,P00734,2147,Humans,yes,inhibitor,,Bivalirudin,small molecule,approved|investigational,B01AE06,"Amino Acids, Peptides, and Proteins|Anticoagul...",Bivalirudin is a synthetic 20 residue peptide ...
DB00025,DB00025,enzyme,P00734,2147,Humans,unknown,activator,,"Antihemophilic factor, human recombinant",biotech,approved|investigational,B02BD02,"Amino Acids, Peptides, and Proteins|Biological...",Human recombinant antihemophilic factor (AHF) ...
DB00055,DB00055,target,P00734,2147,Humans,unknown,,,Drotrecogin alfa,biotech,approved|investigational|withdrawn,B01AD10,"Amino Acids, Peptides, and Proteins|Anti-Infec...",Drotrecogin alfa is activated human protein C ...
DB00100,DB00100,target,P00734,2147,Humans,unknown,,,Coagulation Factor IX (Recombinant),biotech,approved|investigational,,Blood Coagulation Factors|Hemostatics|Increase...,Recombinant Coagulation Factor IX is a purifie...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DB15493,DB15493,target,Q8IXJ6,22933,Humans,yes,inhibitor,,Cambinol,small molecule,experimental,,Pyrimidines,Cambinol is a beta-naphtol derivative that inh...
DB15495,DB15495,target,P35232,5245,Humans,yes,inhibitor,,Rocaglamide,small molecule,experimental,,"Heterocyclic Compounds, Fused-Ring","Rocaglamide, also referred to as rocaglamide-A..."
DB15496,DB15496,target,P35232,5245,Humans,yes,inhibitor,,Didesmethylrocaglamide,small molecule,experimental,,,Didesmethylrocaglamide is a naturally-occurrin...
DB15570,DB15570,target,P51679,1233,Humans,unknown,antagonist,,FLX475,small molecule,investigational,,,FLX475 is a small molecule CCR4 antagonist bei...


In [28]:
entrez_df.index = np.arange(entrez_df.shape[0])
results =  results.dropna().drop_duplicates().reset_index(drop=True)
#entrez_df['hgnc_symbol'] = 0
entrez_df['hgnc_symbol'] = entrez_df.entrez_gene_id.map(results['hgnc_symbol'])
#for i in range(entrez_df.shape[0]):
#    entrez_df.iloc[i,-1] = results.loc[results.entrezgene_id==entrez_df.iloc[i,3],'hgnc_symbol'][0].values#entrez_df.entrez_gene_id.map(results['hgnc_symbol'])

In [29]:
results.shape

(3005, 2)

In [30]:
entrez_df.head()

Unnamed: 0,drugbank_id,category,uniprot_id,entrez_gene_id,organism,known_action,actions,pubmed_ids,name,type,groups,atc_codes,categories,description,hgnc_symbol
0,DB00001,target,P00734,2147,Humans,yes,inhibitor,,Lepirudin,biotech,approved,B01AE02,"Amino Acids, Peptides, and Proteins|Anticoagul...",Lepirudin is identical to natural hirudin exce...,RET
1,DB00006,target,P00734,2147,Humans,yes,inhibitor,,Bivalirudin,small molecule,approved|investigational,B01AE06,"Amino Acids, Peptides, and Proteins|Anticoagul...",Bivalirudin is a synthetic 20 residue peptide ...,RET
2,DB00025,enzyme,P00734,2147,Humans,unknown,activator,,"Antihemophilic factor, human recombinant",biotech,approved|investigational,B02BD02,"Amino Acids, Peptides, and Proteins|Biological...",Human recombinant antihemophilic factor (AHF) ...,RET
3,DB00055,target,P00734,2147,Humans,unknown,,,Drotrecogin alfa,biotech,approved|investigational|withdrawn,B01AD10,"Amino Acids, Peptides, and Proteins|Anti-Infec...",Drotrecogin alfa is activated human protein C ...,RET
4,DB00100,target,P00734,2147,Humans,unknown,,,Coagulation Factor IX (Recombinant),biotech,approved|investigational,,Blood Coagulation Factors|Hemostatics|Increase...,Recombinant Coagulation Factor IX is a purifie...,RET


In [31]:
entrez_df.to_csv('DrugBank_Drug_Target.csv')

### Integrate MedDRA side effect database

In [111]:
se_df = pandas.read_csv('MedDRA/side-effects.tsv',sep='\t') #pandas.read_csv('MedDRA/meddra_freq_with_cols.tsv',index_col=0)

In [112]:
se_df.head(5)

Unnamed: 0,drugbank_id,drugbank_name,umls_cui_from_meddra,side_effect_name
0,DB07768,"(10ALPHA,13ALPHA,14BETA,17ALPHA)-17-HYDROXYAND...",C0000729,Abdominal cramps
1,DB07768,"(10ALPHA,13ALPHA,14BETA,17ALPHA)-17-HYDROXYAND...",C0000737,Abdominal pain
2,DB07768,"(10ALPHA,13ALPHA,14BETA,17ALPHA)-17-HYDROXYAND...",C0232492,Abdominal pain upper
3,DB07768,"(10ALPHA,13ALPHA,14BETA,17ALPHA)-17-HYDROXYAND...",C0740651,Abdominal symptom
4,DB07768,"(10ALPHA,13ALPHA,14BETA,17ALPHA)-17-HYDROXYAND...",C0877331,Abnormal clotting factor


In [113]:
se_df_unique = se_df.groupby('drugbank_id').agg({'umls_cui_from_meddra': lambda x: list(x),'umls_cui_from_meddra': lambda x: list(x),
                                      'side_effect_name': lambda x: list(x),'side_effect_name': lambda x: list(x),
                                     })
se_df_unique.head(2)

Unnamed: 0_level_0,umls_cui_from_meddra,side_effect_name
drugbank_id,Unnamed: 1_level_1,Unnamed: 2_level_1
DB00014,"[C0000737, C0702166, C0948089, C0559546, C0233...","[Abdominal pain, Acne, Acute coronary syndrome..."
DB00035,"[C1291077, C0000729, C0000731, C0000737, C0233...","[Abdominal bloating, Abdominal cramps, Abdomin..."


In [114]:
# Maping side effect to drug bank using drukbank_id columns
#se_df_unique.index = se_df_unique.drugbank_id
#entrez_df.index = entrez_df.drugbank_id
for i in ['umls_cui_from_meddra', 'side_effect_name']:
    entrez_df[i] = entrez_df.drugbank_id.map(se_df_unique[i])

### Integrate MedDRA indication database

In [64]:
ind_df = pandas.read_csv('MedDRA/indications.tsv',sep='\t')
ind_df.head()

Unnamed: 0,drugbank_id,drugbank_name,pubchem_id,stitch_id_flat,umls_cui_from_label,method,concept_name,meddra_type,umls_cui_from_meddra,meddra_name
0,DB00014,Goserelin,47725,CID100047725,C0002871,text_mention,Anemia,PT,C0002871,Anaemia
1,DB00014,Goserelin,47725,CID100047725,C0006142,NLP_indication,Malignant neoplasm of breast,PT,C0006142,Breast cancer
2,DB00014,Goserelin,47725,CID100047725,C0006826,NLP_precondition,Malignant Neoplasms,PT,C0006826,Neoplasm malignant
3,DB00014,Goserelin,47725,CID100047725,C0014175,NLP_indication,Endometriosis,PT,C0014175,Endometriosis
4,DB00014,Goserelin,47725,CID100047725,C0025323,text_mention,Menorrhagia,PT,C0025323,Menorrhagia


In [65]:
ind_df.method.unique()

array(['text_mention', 'NLP_indication', 'NLP_precondition'], dtype=object)

In [97]:
ind_df_unique = ind_df.groupby('drugbank_id').agg({'umls_cui_from_meddra': lambda x: list(x),'umls_cui_from_meddra': lambda x: list(x),
                                                   'meddra_name': lambda x: list(x),'meddra_name': lambda x: np.unique(list(x)).tolist(),
                                                   })
ind_df_unique.columns = ['umls_cui_from_meddra','Meddra_Indication']
ind_df_unique.head(2)

Unnamed: 0_level_0,umls_cui_from_meddra,Meddra_Indication
drugbank_id,Unnamed: 1_level_1,Unnamed: 2_level_1
DB00014,"[C0002871, C0006142, C0006826, C0014175, C0025...","[Anaemia, Bone cancer metastatic, Breast cance..."
DB00035,"[C0008533, C0011848, C0032617, C0014394, C0018...","[Blood osmolarity decreased, Diabetes insipidu..."


In [98]:
# Maping indication to drug bank using drukbank_id columns
#se_df_unique.index = se_df_unique.drugbank_id
#entrez_df.index = entrez_df.drugbank_id
for i in [ 'Meddra_Indication']:
    entrez_df[i] = entrez_df.drugbank_id.map(ind_df_unique[i])

In [99]:
entrez_df.to_csv('DrugBank_Drug_Target_SideEffect_Indication.csv')

In [123]:
idx = se_df.side_effect_name.str.lower().str.contains('titis')
se_df_ = se_df.loc[idx,:]
se_df_

Unnamed: 0,drugbank_id,drugbank_name,umls_cui_from_meddra,side_effect_name
15,DB07768,"(10ALPHA,13ALPHA,14BETA,17ALPHA)-17-HYDROXYAND...",C0162820,Allergic contact dermatitis
109,DB07768,"(10ALPHA,13ALPHA,14BETA,17ALPHA)-17-HYDROXYAND...",C0011603,Dermatitis
110,DB07768,"(10ALPHA,13ALPHA,14BETA,17ALPHA)-17-HYDROXYAND...",C0234894,Dermatitis acneiform
111,DB07768,"(10ALPHA,13ALPHA,14BETA,17ALPHA)-17-HYDROXYAND...",C0011615,Dermatitis atopic
112,DB07768,"(10ALPHA,13ALPHA,14BETA,17ALPHA)-17-HYDROXYAND...",C0085932,Dermatitis bullous
...,...,...,...,...
153522,DB08439,parecoxib,C0019158,Hepatitis
153549,DB08439,parecoxib,C0030305,Pancreatitis
153576,DB05383,pimagedine HCl,C0011603,Dermatitis
153605,DB07424,tripotassium (1R)-4-biphenyl-4-yl-1-phosphonat...,C0011603,Dermatitis


In [100]:
#entrez_df = entrez_df.drop(columns=['meddra_name'])

In [124]:
idx = np.logical_or(se_df.side_effect_name.str.lower().str.contains('toxic') , se_df.side_effect_name.str.contains('poisining'))
se_df_ = se_df.loc[idx,:]
se_df_.shape

(819, 4)

In [125]:
se_df_

Unnamed: 0,drugbank_id,drugbank_name,umls_cui_from_meddra,side_effect_name
1214,DB08567,"(1S,4S)-4-(3,4-dichlorophenyl)-N-methyl-1,2,3,...",C0014518,Toxic epidermal necrolysis
1329,DB07129,"(2R)-1-(2,6-dimethylphenoxy)propan-2-amine",C0919924,Pulmonary toxicity
1678,DB08298,(2S)-2-(6-methoxynaphthalen-2-yl)propanoic acid,C0014518,Toxic epidermal necrolysis
1803,DB02932,(2r)-N-[4-Cyano-3-(Trifluoromethyl)Phenyl]-3-[...,C0235378,Hepatotoxicity
2084,DB08472,(3R)-N-methyl-3-phenyl-3-[4-(trifluoromethyl)p...,C0876994,Cardiotoxicity
...,...,...,...,...
153072,DB05291,lidocaine patch,C3160947,CNS toxicity
153142,DB05291,lidocaine patch,C0235032,Neurotoxicity
153330,DB05302,naltrexone depot,C0235378,Hepatotoxicity
153565,DB08439,parecoxib,C0014518,Toxic epidermal necrolysis


In [103]:
# Selecting side effects with Toxicity only
se_df_ = se_df.loc[idx,:]
se_df_unique_ = se_df_.groupby('drugbank_id').agg({'umls_cui_from_meddra': lambda x: list(x),'umls_cui_from_meddra': lambda x: list(x),
                                                 'side_effect_name': lambda x: list(x),'side_effect_name': lambda x: list(x),
                                                 })
for i in ['umls_cui_from_meddra', 'side_effect_name']:
    entrez_df[i] = entrez_df.drugbank_id.map(se_df_unique_[i])
entrez_df.to_csv('DrugBank_Drug_Target_Toxicity_Only_Indication.csv')

In [104]:
entrez_df.head()

Unnamed: 0,drugbank_id,category,uniprot_id,entrez_gene_id,organism,known_action,actions,pubmed_ids,name,type,groups,atc_codes,categories,description,hgnc_symbol,umls_cui_from_meddra,side_effect_name,Meddra_Indication
0,DB00001,target,P00734,2147,Humans,yes,inhibitor,,Lepirudin,biotech,approved,B01AE02,"Amino Acids, Peptides, and Proteins|Anticoagul...",Lepirudin is identical to natural hirudin exce...,RET,,,
1,DB00006,target,P00734,2147,Humans,yes,inhibitor,,Bivalirudin,small molecule,approved|investigational,B01AE06,"Amino Acids, Peptides, and Proteins|Anticoagul...",Bivalirudin is a synthetic 20 residue peptide ...,RET,,,
2,DB00025,enzyme,P00734,2147,Humans,unknown,activator,,"Antihemophilic factor, human recombinant",biotech,approved|investigational,B02BD02,"Amino Acids, Peptides, and Proteins|Biological...",Human recombinant antihemophilic factor (AHF) ...,RET,,,
3,DB00055,target,P00734,2147,Humans,unknown,,,Drotrecogin alfa,biotech,approved|investigational|withdrawn,B01AD10,"Amino Acids, Peptides, and Proteins|Anti-Infec...",Drotrecogin alfa is activated human protein C ...,RET,,,
4,DB00100,target,P00734,2147,Humans,unknown,,,Coagulation Factor IX (Recombinant),biotech,approved|investigational,,Blood Coagulation Factors|Hemostatics|Increase...,Recombinant Coagulation Factor IX is a purifie...,RET,,,


In [105]:
# Drugbank entries with SE only
entrez_df_with_se = entrez_df.loc[~entrez_df.side_effect_name.isna(),:]

In [106]:
entrez_df_with_se.head()

Unnamed: 0,drugbank_id,category,uniprot_id,entrez_gene_id,organism,known_action,actions,pubmed_ids,name,type,groups,atc_codes,categories,description,hgnc_symbol,umls_cui_from_meddra,side_effect_name,Meddra_Indication
121,DB00281,target,P00533,1956,Humans,unknown,antagonist,,Lidocaine,small molecule,approved|vet_approved,S01HA07|D04AB01|R02AD02|C01BB01|S02DA01|N01BB5...,Agents for Treatment of Hemorrhoids and Anal F...,Ever since its discovery and availability for ...,ACOT13,"[C3160947, C0235032]","[CNS toxicity, Neurotoxicity]","[Acute coronary syndrome, Acute myocardial inf..."
122,DB00317,target,P00533,1956,Humans,yes,antagonist,,Gefitinib,small molecule,approved|investigational,L01XE02,Antineoplastic Agents|Antineoplastic and Immun...,Gefitinib (originally coded ZD1839) is a drug ...,ACOT13,[C0919924],[Pulmonary toxicity],"[Haemoglobin, Haemorrhage, Idiopathic pulmonar..."
123,DB00530,target,P00533,1956,Humans,yes,antagonist,,Erlotinib,small molecule,approved|investigational,L01XE03,Antineoplastic Agents|Antineoplastic and Immun...,Erlotinib is an inhibitor of the epidermal gro...,ACOT13,"[C0235378, C0919924]","[Hepatotoxicity, Pulmonary toxicity]","[Metastases to pancreas, Neoplasm, Neoplasm ma..."
124,DB01259,target,P00533,1956,Humans,yes,antagonist,,Lapatinib,small molecule,approved|investigational,L01XE07,Antineoplastic Agents|Antineoplastic and Immun...,Lapatinib is an anti-cancer drug developed by ...,ACOT13,"[C0876994, C0235378]","[Cardiotoxicity, Hepatotoxicity]","[Breast cancer, Breast cancer stage IV, Diseas..."
137,DB08916,target,P00533,1956,Humans,yes,inhibitor,,Afatinib,small molecule,approved,L01XE13,Amides|Antineoplastic Agents|Antineoplastic an...,Afatinib is a 4-anilinoquinazoline tyrosine ki...,ACOT13,[C0919924],[Pulmonary toxicity],"[Neoplasm, Non-small cell lung cancer, Small c..."
