# Analyzing the SIDER 4.1 data

+ http://sideeffects.embl.de/
+ http://thinklab.com/d/30#4

In [48]:
import csv
import gzip
import collections

In [49]:
import pandas

In [50]:
mkdir MedDRA

In [52]:
# MedDRA SIDER data
base_url = 'http://sideeffects.embl.de/media/download/'
filenames = [
    'README',
    'meddra_all_indications.tsv.gz',
    'meddra_all_se.tsv.gz',
    'meddra_freq.tsv.gz',
]
for filename in filenames:
    ! wget --no-verbose --timestamping --directory-prefix MedDRA {base_url}/{filename}

! mv MedDRA/README MedDRA/README.txt

2020-05-26 18:26:41 URL:http://sideeffects.embl.de/media/download//README [3304/3304] -> "MedDRA/README" [1]
2020-05-26 18:26:41 URL:http://sideeffects.embl.de/media/download//meddra_all_indications.tsv.gz [344689/344689] -> "MedDRA/meddra_all_indications.tsv.gz" [1]
2020-05-26 18:26:42 URL:http://sideeffects.embl.de/media/download//meddra_all_se.tsv.gz [2381171/2381171] -> "MedDRA/meddra_all_se.tsv.gz" [1]
2020-05-26 18:26:43 URL:http://sideeffects.embl.de/media/download//meddra_freq.tsv.gz [2058445/2058445] -> "MedDRA/meddra_freq.tsv.gz" [1]


## STITCH to DrugBank mapping utilities

In [53]:
def stitch_flat_to_pubchem(cid):
    """
    Convert a flat STITCH compound ID such as 'CID100047725' into an
    integer PubChem compound ID (47725).

    The 'CID' prefix is stripped and 100,000,000 is subtracted from the
    remaining number. Raises AssertionError when `cid` does not start
    with 'CID'.
    """
    assert cid.startswith('CID'), cid
    # Integer offset (not the float literal 1e8) so the result stays an int
    # and can be used directly as an integer merge key.
    return int(cid[3:]) - 100_000_000

def stitch_stereo_to_pubchem(cid):
    """
    Convert a stereo STITCH compound ID such as 'CID000047725' into an
    integer PubChem compound ID (47725) by stripping the 'CID' prefix.

    Raises AssertionError when `cid` does not start with 'CID'.
    """
    prefix = 'CID'
    assert cid.startswith(prefix)
    numeric_part = cid[len(prefix):]
    return int(numeric_part)

In [54]:
# DrugBank compounds: keep only the identifier and the name (renamed to drugbank_name)
url = 'https://raw.githubusercontent.com/dhimmel/drugbank/3e87872db5fca5ac427ce27464ab945c0ceb4ec6/data/drugbank.tsv'
drugbank_df = (
    pandas.read_csv(url, sep='\t')
    .loc[:, ['drugbank_id', 'name']]
    .rename(columns={'name': 'drugbank_name'})
)

# Mapping table between PubChem and DrugBank identifiers
url = 'https://raw.githubusercontent.com/dhimmel/drugbank/3e87872db5fca5ac427ce27464ab945c0ceb4ec6/data/mapping/pubchem.tsv'
drugbank_map_df = pandas.read_csv(url, sep='\t')

## meddra_freq.tsv.gz

In [55]:
# The downloaded file has no header row, so column names are supplied here
columns = [
    'stitch_id_flat',
    'stitch_id_sterio',
    'umls_cui_from_label',
    'placebo',
    'frequency',
    'lower',
    'upper',
    'meddra_type',
    'umls_cui_from_meddra',
    'side_effect_name',
]
freq_df = pandas.read_csv('MedDRA/meddra_freq.tsv.gz', sep='\t', names=columns)
freq_df.head(2)

Unnamed: 0,stitch_id_flat,stitch_id_sterio,umls_cui_from_label,placebo,frequency,lower,upper,meddra_type,umls_cui_from_meddra,side_effect_name
0,CID100000085,CID000010917,C0000737,,21%,0.21,0.21,LLT,C0000737,Abdominal pain
1,CID100000085,CID000010917,C0000737,,21%,0.21,0.21,PT,C0000737,Abdominal pain


In [69]:
# Save the frequency table with its column names.
# Written tab-separated and without the index so the contents match the
# .tsv extension and the other tables saved by this notebook.
freq_df.to_csv('MedDRA/meddra_freq_with_cols.tsv', sep='\t', index=False)

## meddra_all_se.tsv.gz

In [71]:
# The downloaded file has no header row, so column names are supplied here
columns = [
    'stitch_id_flat',
    'stitch_id_sterio',
    'umls_cui_from_label',
    'meddra_type',
    'umls_cui_from_meddra',
    'side_effect_name',
]
se_df = pandas.read_csv('MedDRA/meddra_all_se.tsv.gz', sep='\t', names=columns)
# Derive the PubChem ID from the stereo STITCH ID
se_df = se_df.assign(pubchem_id=se_df.stitch_id_sterio.map(stitch_stereo_to_pubchem))
# Attach DrugBank IDs; the merge joins on the columns the two frames share
se_df = drugbank_map_df.merge(se_df)
se_df.head(2)

Unnamed: 0,drugbank_id,pubchem_id,stitch_id_flat,stitch_id_sterio,umls_cui_from_label,meddra_type,umls_cui_from_meddra,side_effect_name
0,DB00014,47725,CID100047725,CID000047725,C0000737,LLT,C0000737,Abdominal pain
1,DB00014,47725,CID100047725,CID000047725,C0000737,PT,C0687713,Gastrointestinal pain


In [72]:
# Row count per MedDRA term type
se_df.meddra_type.value_counts()

PT     158671
LLT    141652
Name: meddra_type, dtype: int64

In [57]:
# Distinct MedDRA term types, including missing values
se_df['meddra_type'].unique()

array(['LLT', 'PT', nan], dtype=object)

In [58]:
# Keep one complete row per (drug, side effect CUI) pair
se_df = (
    se_df[['drugbank_id', 'umls_cui_from_meddra', 'side_effect_name']]
    .dropna()
    .drop_duplicates(['drugbank_id', 'umls_cui_from_meddra'])
)
# Add DrugBank names and order rows by drug name, then side effect name
se_df = drugbank_df.merge(se_df).sort_values(['drugbank_name', 'side_effect_name'])
len(se_df)

153663

In [59]:
# Build a lookup table of side effect IDs and names
se_terms_df = se_df[['umls_cui_from_meddra', 'side_effect_name']].drop_duplicates()
# No side effect name may appear more than once
assert not se_terms_df.side_effect_name.duplicated().any()
se_terms_df = se_terms_df.sort_values('side_effect_name')
se_terms_df.to_csv('MedDRA/side-effect-terms.tsv', index=False, sep='\t')

In [61]:
# Count of distinct DrugBank drugs with at least one side effect
se_df['drugbank_id'].nunique()

1223

In [62]:
# Count of distinct UMLS side effect concepts
se_df['umls_cui_from_meddra'].nunique()

5734

In [63]:
# Write the drug-to-side-effect table as a tab-separated file
se_df.to_csv('MedDRA/side-effects.tsv', index=False, sep='\t')

## meddra_all_indications.tsv.gz

In [64]:
# The downloaded file has no header row, so column names are supplied here
columns = [
    'stitch_id_flat',
    'umls_cui_from_label',
    'method',
    'concept_name',
    'meddra_type',
    'umls_cui_from_meddra',
    'meddra_name',
]
indication_df = pandas.read_csv('MedDRA/meddra_all_indications.tsv.gz', sep='\t', names=columns)
# This file only provides the flat STITCH ID, so the PubChem ID is derived from it
indication_df = indication_df.assign(
    pubchem_id=indication_df.stitch_id_flat.map(stitch_flat_to_pubchem)
)

In [65]:
# Attach DrugBank IDs and names, then keep only rows whose meddra_type is 'PT'
indication_df = (
    drugbank_df
    .merge(drugbank_map_df.merge(indication_df))
    .query("meddra_type == 'PT'")
)
indication_df.head(2)

Unnamed: 0,drugbank_id,drugbank_name,pubchem_id,stitch_id_flat,umls_cui_from_label,method,concept_name,meddra_type,umls_cui_from_meddra,meddra_name
1,DB00014,Goserelin,47725,CID100047725,C0002871,text_mention,Anemia,PT,C0002871,Anaemia
3,DB00014,Goserelin,47725,CID100047725,C0006142,NLP_indication,Malignant neoplasm of breast,PT,C0006142,Breast cancer


In [74]:
# Preview the first 20 MedDRA indication names
indication_df['meddra_name'].head(20)

1                       Anaemia
3                 Breast cancer
5            Neoplasm malignant
7                 Endometriosis
9                   Menorrhagia
11                 Metrorrhagia
13                   Metastasis
15                         Pain
17            Uterine leiomyoma
19       Bone cancer metastatic
21    Urinary tract obstruction
23              Prostate cancer
25              Prostate cancer
27                Breast cancer
29         Factor IX deficiency
31           Diabetes insipidus
33                     Polyuria
35                     Enuresis
37                  Head injury
39                Haemarthrosis
Name: meddra_name, dtype: object

In [66]:
# Names of drugs indicated for multiple sclerosis (UMLS CUI C0026769)
indication_df.query("umls_cui_from_meddra == 'C0026769'")['drugbank_name'].tolist()

['Baclofen',
 'Betamethasone',
 'Carbamazepine',
 'Triamcinolone',
 'Prednisone',
 'Tizanidine',
 'Hydrocortisone',
 'Prednisolone',
 'Methylprednisolone',
 'Mitoxantrone',
 'Dantrolene',
 'Dexamethasone',
 'FTY 720',
 'Dalfampridine',
 '(11alpha,14beta)-11,17,21-trihydroxypregn-4-ene-3,20-dione',
 'Fingolimod']

In [67]:
# Write the drug-to-indication table as a tab-separated file
indication_df.to_csv('MedDRA/indications.tsv', index=False, sep='\t')