In [3]:
from utility import *

from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import random
import pandas as pd
import collections
import json as json

In [25]:
# df display side by side - helper function for data visualization

from IPython.display import display_html
from itertools import chain,cycle
def display_side_by_side(*args,titles=cycle([''])):
    """Render several DataFrames next to each other in the notebook output.

    Parameters
    ----------
    *args : pandas.DataFrame
        Frames to show, in left-to-right order.
    titles : iterable of str, optional
        Heading for each frame, matched positionally. If it yields fewer
        items than there are frames, the remaining headings are a bare line
        break. Defaults to an empty heading for every frame.

    Returns
    -------
    None
        The assembled HTML is passed to ``display_html`` with ``raw=True``.
    """
    fragments = []
    for df, title in zip(args, chain(titles, cycle(['<br>']))):
        # Style only the opening <table tag. A blanket replace of the word
        # "table" also rewrites the closing tag and any cell text that
        # happens to contain "table".
        table_html = df.to_html().replace('<table', '<table style="display:inline"', 1)
        fragments.append(
            '<th style="text-align:center"><td style="vertical-align:top">'
            f'<h2 style="text-align: center;">{title}</h2>'
            f'{table_html}'
            '</td></th>'
        )
    display_html(''.join(fragments), raw=True)

# Zitnik Dataset Loading

The data sets needed for the loaders can be found at snap.stanford.edu/decagon. The side-effect information was curated from the TWOSIDES, OFFSIDES, and SIDER databases.

In [9]:
# Drug-drug combinations and their side effects.
combo2stitch, combo2se, se2name = load_combo_se(
    fname='../data/decagon_data/bio-decagon-combo.csv'
)
# Protein-protein interaction network.
net, node2idx = load_ppi(
    fname='../data/decagon_data/bio-decagon-ppi.csv'
)
# Side effects of individual drugs.
stitch2se, se2name_mono = load_mono_se(
    fname='../data/decagon_data/bio-decagon-mono.csv'
)
# Drug -> protein targets.
stitch2proteins = load_targets(
    fname='../data/decagon_data/bio-decagon-targets.csv'
)
# Side-effect categories.
se2class, se2name_class = load_categories(
    fname='../data/decagon_data/bio-decagon-effectcategories.csv'
)

# Fold the mono and category name maps into the combo one; on a key clash the
# category entry wins over the mono entry, which wins over the combo entry.
se2name.update({**se2name_mono, **se2name_class})

Reading: ../data/decagon_data/bio-decagon-combo.csv
Drug combinations: 63473 Side effects: 1317
Drug-drug interactions: 4649441
Reading: ../data/decagon_data/bio-decagon-ppi.csv
Edges: 715612
Nodes: 19081
Reading: ../data/decagon_data/bio-decagon-mono.csv
Reading: ../data/decagon_data/bio-decagon-targets.csv
Reading: ../data/decagon_data/bio-decagon-effectcategories.csv


In [10]:
# Summary of the original dataset: how many drugs have each kind of information.

# drugs with protein info
drugs_w_protein = list(stitch2proteins)
print(f"drug proteins: {len(drugs_w_protein)}")

# drugs with individual SE info
drugs_w_indiv_se = list(stitch2se)
print(f"indiv drugs: {len(drugs_w_indiv_se)}")

# drugs with combo SE info (every drug that appears in at least one pair)
combo_drugs = np.unique([drug for pair in combo2stitch.values() for drug in pair])
print(f"combo drugs: {len(combo_drugs)}")

# total drugs: sorted union of the three groups, built in one pass instead of
# extending a list from inside a throwaway list comprehension
total_drugs = np.unique([*drugs_w_protein, *drugs_w_indiv_se, *combo_drugs])
print(f"total unique drugs: {len(total_drugs)}")

drug proteins: 284
indiv drugs: 639
combo drugs: 645
total unique drugs: 645


# CVD Dataset Creation

In [12]:
# Load the NCATS drug exports for the three cardiovascular conditions
# (MI, CAD, CHF); each file is tab-separated.

mi_df, cad_df, chf_df = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (
        '../data/NCATS_exports/export_all_uid_MI.tsv',
        '../data/NCATS_exports/export_all_uid_CAD.tsv',
        '../data/NCATS_exports/export_all_uid_CHF.tsv',
    )
)

In [14]:
# Load the current and legacy UNII tables downloaded from the GSRS archive:
# https://precision.fda.gov/uniisearch/archive

unii_records, legacy_unii_records = (
    pd.read_csv(txt_path, sep='\t', low_memory=False)
    for txt_path in (
        "../data/UNII_Data/UNII_Records_13Apr2023.txt",
        "../data/UNII_Data/Legacy UNIIs.txt",
    )
)

### Merge UNII records with total drugs and cvd drug lists

In [15]:
# Pair every drug ID with a numeric ID so it can be joined against the
# PUBCHEM column of the UNII records.
total_df = pd.DataFrame({
    "total_drugs": total_drugs,
    # Drop the 3-character prefix and parse the rest. int() already ignores
    # leading zeros; stripping them first would leave an empty string (and a
    # ValueError) for an all-zero suffix.
    # NOTE(review): assumes IDs look like "CID000002173" - confirm.
    "drug_num": [int(d[3:]) for d in total_drugs],
})

# Left join keeps every drug; drugs with no PUBCHEM match get a null UNII.
total_merged = pd.merge(total_df, unii_records, how='left', right_on = 'PUBCHEM', left_on='drug_num')
total_merged = total_merged[["total_drugs", "drug_num", "UNII"]]

### Get drugs with no matched UNIIs to look up manually

In [19]:
# Rows whose UNII is still missing after the merge, exported for manual lookup.
unii_missing_mask = total_merged['UNII'].isna()
null_drugs = total_merged.loc[unii_missing_mask]
null_drugs.to_csv("../data/UNII_Data/null_drugs.csv")

### Read in drugs that were looked up manually

In [20]:
# Load the manually looked-up UNIIs. Later cells read its "total_drugs" and
# "UNII" columns; "UNII" may hold a code, the literal "Non-live", or be empty.
# NOTE(review): presumably produced by hand from null_drugs.csv - confirm.
manual = pd.read_csv("../data/UNII_Data/manual_null_drugs.csv")

In [22]:
# Break the manual lookups down by outcome: flagged "Non-live", left empty,
# or anything else (counted as a valid UNII).

nonlive = int((manual["UNII"] == "Non-live").sum())
null = int(manual["UNII"].isna().sum())
other = len(manual) - nonlive - null

print(f"Non-live: {nonlive}")
print(f"Null: {null}")
print(f"Valid UNII: {other}")

Non-live: 33
Null: 66
Valid UNII: 193


### Find the CVD drugs (MI, CAD, CHF) that appear in the total drug list

In [23]:
# Attach the manually looked-up UNIIs. Both frames have a "UNII" column, so the
# merge yields UNII_x (automatic match) and UNII_y (manual lookup).
total_merged = pd.merge(total_merged, manual[["total_drugs", "UNII"]],
         on = "total_drugs", how = "left")

# Prefer the automatic match and fall back to the manual value where it is
# missing. fillna treats every missing marker (NaN, None, pd.NA) alike, whereas
# an `is not np.nan` identity test only recognises the np.nan singleton itself.
total_merged["UNII"] = total_merged["UNII_x"].fillna(total_merged["UNII_y"])
total_merged = total_merged[["total_drugs", "drug_num", "UNII"]]

# Keep only the drugs whose UNII appears in each condition's NCATS export.
mi_merged = pd.merge(total_merged, mi_df, on = 'UNII', how='inner')
cad_merged = pd.merge(total_merged, cad_df, on = 'UNII', how='inner')
chf_merged = pd.merge(total_merged, chf_df, on = 'UNII', how='inner')

In [26]:
# Show the matched drugs for each condition next to one another.

display_side_by_side(
    *(matched[["UNII", "Name"]] for matched in (mi_merged, cad_merged, chf_merged)),
    titles=["MI Drugs", "CAD Drugs", "CHF Drugs"],
)

Unnamed: 0,UNII,Name
0,8NZ41MIK1O,ENOXAPARIN SODIUM
1,2679MF687A,NIACIN
2,R16CO5Y76E,ASPIRIN
3,0K47UL67F2,CARVEDILOL
4,GEB06NHM23,METOPROLOL
5,G59M7S0WS3,Nitroglycerin
6,L35JN3I7SJ,RAMIPRIL
7,80M03YXJ7I,VALSARTAN

Unnamed: 0,UNII,Name
0,1J444QC288,AMLODIPINE
1,OM90ZUW7M1,TICLOPIDINE

Unnamed: 0,UNII,Name
0,77W477J15H,CHLOROTHIAZIDE
1,M5DP350VZV,ETHACRYNIC ACID
2,F089I0511L,INDAPAMIDE
3,TZ7V40X7VX,METOLAZONE
4,L35JN3I7SJ,RAMIPRIL
5,27O7W4T232,SPIRONOLACTONE


### Create CVD df

In [27]:
# Combined CVD drug table: stack the three condition exports, keep the drugs
# present in the dataset, and retain one row per UNII (the first occurrence).

cvd = pd.concat([mi_df, cad_df, chf_df])
cvd_df = (
    total_merged
    .merge(cvd, on='UNII', how='inner')
    .drop_duplicates(subset="UNII")
    .reset_index(drop=True)
)
cvd_df.to_csv("../data/cvd_df.csv")