# Analyzing the SIDER 4.1 data

+ http://sideeffects.embl.de/
+ http://thinklab.com/d/30#4

In [1]:
import csv
import gzip
import collections

In [4]:
import pandas
import requests

In [5]:
import os
import requests

# Define base URL and filenames
base_url = 'http://sideeffects.embl.de/media/download/'
filenames = [
    'README',
    'meddra_all_indications.tsv.gz',
    'meddra_all_se.tsv.gz',
    'meddra_freq.tsv.gz',
]

# Create download directory if it doesn't exist
download_dir = 'download'
os.makedirs(download_dir, exist_ok=True)

# Download each file. A file that fails is reported and skipped so that the
# remaining files are still attempted.
for filename in filenames:
    url = f"{base_url}{filename}"
    file_path = os.path.join(download_dir, filename)
    # Stream into a temporary sibling first so that an interrupted transfer
    # never leaves a truncated file under the final name.
    partial_path = f"{file_path}.part"
    try:
        # The timeout stops a stalled server from hanging the notebook; the
        # context manager releases the streamed connection on every path,
        # including non-200 responses whose body is never read.
        with requests.get(url, stream=True, timeout=60) as response:
            if response.status_code != 200:
                print(f"Failed to download: {filename}")
                continue
            with open(partial_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
    except requests.RequestException as error:
        # Drop whatever was written before the connection broke.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        print(f"Failed to download: {filename} ({error})")
        continue
    os.replace(partial_path, file_path)
    print(f"Downloaded: {filename}")

# Rename README file. os.replace overwrites an existing README.txt on every
# platform, so re-running this cell does not fail on Windows.
readme_path = os.path.join(download_dir, 'README')
if os.path.exists(readme_path):
    os.replace(readme_path, os.path.join(download_dir, 'README.txt'))
    print("Renamed README to README.txt")
else:
    print("README file not found")


Downloaded: README
Downloaded: meddra_all_indications.tsv.gz
Downloaded: meddra_all_se.tsv.gz
Downloaded: meddra_freq.tsv.gz
Renamed README to README.txt


In [3]:
# Download SIDER data
# This cell relies on IPython shell escapes (`!`), so it needs the `wget` and
# `mv` executables on the PATH; the recorded output below shows a run where
# `wget` was not found.
base_url = 'http://sideeffects.embl.de/media/download/'
filenames = [
    'README',
    'meddra_all_indications.tsv.gz',
    'meddra_all_se.tsv.gz',
    'meddra_freq.tsv.gz',
]
for filename in filenames:
    # NOTE(review): base_url already ends in '/', so the URL built here
    # contains a double slash before the filename.
    ! wget --no-verbose --timestamping --directory-prefix download {base_url}/{filename}

# Give the README a .txt extension
! mv download/README download/README.txt

zsh:1: command not found: wget
zsh:1: command not found: wget
zsh:1: command not found: wget
zsh:1: command not found: wget
mv: download/README: No such file or directory


## STITCH to DrugBank mapping utilities

In [6]:
def stitch_flat_to_pubchem(cid):
    """Convert a flat STITCH compound ID to an integer PubChem compound ID.

    The numeric part that follows the ``CID`` prefix is offset by 100,000,000
    relative to the PubChem ID, e.g. ``'CID100047725'`` -> ``47725``.

    Parameters
    ----------
    cid : str
        Flat STITCH identifier starting with ``'CID'``.

    Returns
    -------
    int
        The PubChem compound ID.

    Raises
    ------
    AssertionError
        If ``cid`` does not start with ``'CID'``.
    ValueError
        If the text after the prefix is not an integer.
    """
    assert cid.startswith('CID'), f'not a STITCH compound ID: {cid!r}'
    # Subtract an integer offset: the float literal 1e8 would turn the result
    # into a float, which is the wrong type for an identifier.
    return int(cid[3:]) - 100_000_000

def stitch_stereo_to_pubchem(cid):
    """Return the PubChem compound ID encoded in a stereo STITCH ID.

    The digits after the ``CID`` prefix are parsed as an integer, e.g.
    ``'CID000047725'`` -> ``47725``. An ``AssertionError`` is raised when the
    prefix is missing.
    """
    prefix = 'CID'
    assert cid.startswith(prefix)
    digits = cid[len(prefix):]
    return int(digits)

In [7]:
# DrugBank IDs and names, read from a commit-pinned copy of dhimmel/drugbank
url = 'https://raw.githubusercontent.com/dhimmel/drugbank/3e87872db5fca5ac427ce27464ab945c0ceb4ec6/data/drugbank.tsv'
drugbank_df = pandas.read_table(url)
drugbank_df = drugbank_df[['drugbank_id', 'name']]
drugbank_df = drugbank_df.rename(columns={'name': 'drugbank_name'})

# PubChem compound to DrugBank mapping from the same commit
url = 'https://raw.githubusercontent.com/dhimmel/drugbank/3e87872db5fca5ac427ce27464ab945c0ceb4ec6/data/mapping/pubchem.tsv'
drugbank_map_df = pandas.read_table(url)

## meddra_freq.tsv.gz

In [8]:
# The file has no header row, so column names are supplied here
columns = [
    'stitch_id_flat', 'stitch_id_sterio',
    'umls_cui_from_label',
    'placebo', 'frequency', 'lower', 'upper',
    'meddra_type', 'umls_cui_from_meddra',
    'side_effect_name',
]
freq_df = pandas.read_table(
    'download/meddra_freq.tsv.gz',
    header=None,
    names=columns,
)
freq_df.head(n=2)

Unnamed: 0,stitch_id_flat,stitch_id_sterio,umls_cui_from_label,placebo,frequency,lower,upper,meddra_type,umls_cui_from_meddra,side_effect_name
0,CID100000085,CID000010917,C0000737,,21%,0.21,0.21,LLT,C0000737,Abdominal pain
1,CID100000085,CID000010917,C0000737,,21%,0.21,0.21,PT,C0000737,Abdominal pain


## meddra_all_se.tsv.gz

In [9]:
# The file has no header row, so column names are supplied here
columns = [
    'stitch_id_flat', 'stitch_id_sterio',
    'umls_cui_from_label',
    'meddra_type', 'umls_cui_from_meddra',
    'side_effect_name',
]
se_df = pandas.read_table(
    'download/meddra_all_se.tsv.gz',
    header=None,
    names=columns,
)
# Derive the PubChem ID from the stereo STITCH ID, then attach DrugBank IDs
se_df = se_df.assign(
    pubchem_id=se_df.stitch_id_sterio.map(stitch_stereo_to_pubchem),
)
se_df = pandas.merge(drugbank_map_df, se_df)
se_df.head(n=2)

Unnamed: 0,drugbank_id,pubchem_id,stitch_id_flat,stitch_id_sterio,umls_cui_from_label,meddra_type,umls_cui_from_meddra,side_effect_name
0,DB00014,47725,CID100047725,CID000047725,C0000737,LLT,C0000737,Abdominal pain
1,DB00014,47725,CID100047725,CID000047725,C0000737,PT,C0687713,Gastrointestinal pain


In [10]:
# Keep one complete row per (drug, side effect) pair
se_df = (
    se_df[['drugbank_id', 'umls_cui_from_meddra', 'side_effect_name']]
    .dropna()
    .drop_duplicates(subset=['drugbank_id', 'umls_cui_from_meddra'])
)
# Attach DrugBank names and order rows by drug, then by side effect
se_df = drugbank_df.merge(se_df).sort_values(
    by=['drugbank_name', 'side_effect_name'],
)
len(se_df)

153663

In [11]:
# Create a reference of side effect IDs and Names
se_terms_df = se_df[['umls_cui_from_meddra', 'side_effect_name']].drop_duplicates()
# After de-duplicating (ID, name) pairs, a repeated name would mean that one
# side effect name is attached to more than one UMLS CUI.
assert se_terms_df.side_effect_name.duplicated().sum() == 0
se_terms_df = se_terms_df.sort_values('side_effect_name')
# Ensure the output directory exists; to_csv does not create missing folders.
os.makedirs('data', exist_ok=True)
se_terms_df.to_csv('data/side-effect-terms.tsv', sep='\t', index=False)

In [12]:
# Rows for DrugBank ID DB00907 (cocaine)
se_df[se_df.drugbank_id == 'DB00907']

Unnamed: 0,drugbank_id,drugbank_name,umls_cui_from_meddra,side_effect_name
80494,DB00907,Cocaine,C0085631,Agitation
80495,DB00907,Cocaine,C0233571,Excitement
80486,DB00907,Cocaine,C0014549,Grand mal convulsion
80487,DB00907,Cocaine,C0020517,Hypersensitivity
80488,DB00907,Cocaine,C0026961,Mydriasis
80489,DB00907,Cocaine,C0027769,Nervousness
80496,DB00907,Cocaine,C1145670,Respiratory failure
80497,DB00907,Cocaine,C1325847,Sensitisation
80490,DB00907,Cocaine,C0233494,Tension
80491,DB00907,Cocaine,C0040822,Tremor


In [13]:
# How many distinct DrugBank drugs have at least one side effect
se_df['drugbank_id'].nunique()

1223

In [14]:
# How many distinct UMLS side effect concepts appear
se_df['umls_cui_from_meddra'].nunique()

5734

In [15]:
# Save side effects
# Ensure the output directory exists; to_csv does not create missing folders.
os.makedirs('data', exist_ok=True)
se_df.to_csv('data/side-effects.tsv', sep='\t', index=False)

## meddra_all_indications.tsv.gz

In [16]:
# The file has no header row, so column names are supplied here
columns = [
    'stitch_id_flat',
    'umls_cui_from_label',
    'method', 'concept_name',
    'meddra_type', 'umls_cui_from_meddra', 'meddra_name',
]
indication_df = pandas.read_table(
    'download/meddra_all_indications.tsv.gz',
    header=None,
    names=columns,
)
# Only the flat STITCH ID is available here, so map it to a PubChem ID
indication_df = indication_df.assign(
    pubchem_id=indication_df.stitch_id_flat.map(stitch_flat_to_pubchem),
)

In [17]:
# Attach DrugBank IDs via the PubChem mapping, then DrugBank names
indication_df = drugbank_map_df.merge(indication_df)
indication_df = drugbank_df.merge(indication_df)
# Keep only rows whose MedDRA type is 'PT'
indication_df = indication_df[indication_df.meddra_type == 'PT']
indication_df.head(n=2)

Unnamed: 0,drugbank_id,drugbank_name,pubchem_id,stitch_id_flat,umls_cui_from_label,method,concept_name,meddra_type,umls_cui_from_meddra,meddra_name
1,DB00014,Goserelin,47725,CID100047725,C0002871,text_mention,Anemia,PT,C0002871,Anaemia
3,DB00014,Goserelin,47725,CID100047725,C0006142,NLP_indication,Malignant neoplasm of breast,PT,C0006142,Breast cancer


In [18]:
# Names of drugs indicated for multiple sclerosis (UMLS CUI C0026769)
is_multiple_sclerosis = indication_df.umls_cui_from_meddra == 'C0026769'
indication_df.loc[is_multiple_sclerosis, 'drugbank_name'].tolist()

['Baclofen',
 'Betamethasone',
 'Carbamazepine',
 'Triamcinolone',
 'Prednisone',
 'Tizanidine',
 'Hydrocortisone',
 'Prednisolone',
 'Methylprednisolone',
 'Mitoxantrone',
 'Dantrolene',
 'Dexamethasone',
 'FTY 720',
 'Dalfampridine',
 '(11alpha,14beta)-11,17,21-trihydroxypregn-4-ene-3,20-dione',
 'Fingolimod']

In [19]:
# Save indications
# Ensure the output directory exists; to_csv does not create missing folders.
os.makedirs('data', exist_ok=True)
indication_df.to_csv('data/indications.tsv', sep='\t', index=False)