# TDC

In [15]:
import pandas as pd 
import numpy as np
import itertools

from tdc.multi_pred import DDI
from tdc.chem_utils import MolConvert
import sklearn.metrics.pairwise
from sklearn.metrics import pairwise_distances

import nbimporter
from assistant import distinct, drug_index, labels_to_tensor, merge_smile, vectorize_smile, save, drug_combi_index, data_indexed_combi, vectorize_drugcombi, tanimoto_similarity_matrix

## TDC Twosides Dataset

In [7]:
# One-time download of the TWOSIDES dataset through TDC (left disabled; the next cell reads the local CSV copy).
# Uncomment to fetch the file again:
# data = DDI(name = 'TWOSIDES').get_data() 

Data downloaded as ./data/twosides.csv automatically

In [2]:
# Load the full TWOSIDES table from the local CSV copy.
data = pd.read_csv("./data/twosides.csv")

# Column names are replaced so that this frame matches the other datasets.
column_names = ["Drug1", "Drug2", "SE_Index", "SE", "Drug1_smile", "Drug2_smile"]
data.columns = column_names

In [3]:
# Preview the loaded table (the recorded output is abbreviated with "..." rows).
data

Unnamed: 0,Drug1,Drug2,SE_Index,SE,Drug1_smile,Drug2_smile
0,CID000002173,CID000003345,1024,hypermagnesemia,CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...,CCC(=O)N(C1CCN(CC1)CCC2=CC=CC=C2)C3=CC=CC=C3
1,CID000002173,CID000003345,767,retinopathy of prematurity,CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...,CCC(=O)N(C1CCN(CC1)CCC2=CC=CC=C2)C3=CC=CC=C3
2,CID000002173,CID000003345,79,atelectasis,CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...,CCC(=O)N(C1CCN(CC1)CCC2=CC=CC=C2)C3=CC=CC=C3
3,CID000002173,CID000003345,25,alkalosis,CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...,CCC(=O)N(C1CCN(CC1)CCC2=CC=CC=C2)C3=CC=CC=C3
4,CID000002173,CID000003345,85,Back Ache,CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...,CCC(=O)N(C1CCN(CC1)CCC2=CC=CC=C2)C3=CC=CC=C3
...,...,...,...,...,...,...
4649436,CID000003461,CID000003954,1008,deep vein thromboses,C1=CN(C(=O)N=C1N)C2C(C(C(O2)CO)O)(F)F,CN(C)C(=O)C(CC[NH+]1CCC(CC1)(C2=CC=C(C=C2)Cl)O...
4649437,CID000003461,CID000003954,769,rhabdomyolysis,C1=CN(C(=O)N=C1N)C2C(C(C(O2)CO)O)(F)F,CN(C)C(=O)C(CC[NH+]1CCC(CC1)(C2=CC=C(C=C2)Cl)O...
4649438,CID000003461,CID000003954,930,loss of weight,C1=CN(C(=O)N=C1N)C2C(C(C(O2)CO)O)(F)F,CN(C)C(=O)C(CC[NH+]1CCC(CC1)(C2=CC=C(C=C2)Cl)O...
4649439,CID000003461,CID000003954,72,ascites,C1=CN(C(=O)N=C1N)C2C(C(C(O2)CO)O)(F)F,CN(C)C(=O)C(CC[NH+]1CCC(CC1)(C2=CC=C(C=C2)Cl)O...


### Indexing Dataset

In [4]:
# Collect the distinct drugs (from both drug columns) and the distinct side effects.
distinct_drug, distinct_SE = distinct(data, "Drug1", "Drug2", "SE")

# Drugs get an index from the project helper.
drug_idx = drug_index(distinct_drug)

# Side effects keep the index shipped with the dataset: one row per
# (SE, SE_Index) pair, ordered by that index.
SE_idx = (
    data[["SE", "SE_Index"]]
    .drop_duplicates()
    .sort_values("SE_Index")
    .reset_index(drop=True)
)

In [5]:
# Attach the drug indices (once per drug column) and the side-effect index to
# every row, then keep only the columns used downstream under their final names.
# The two drug_idx merges produce the Drug_Index_x / Drug_Index_y suffixes.
# NOTE(review): `data` already carries SE_Index, so the SE_idx merge only adds a
# second copy (SE_Index_y) that is dropped below - confirm whether it is needed.
data_indexed = (
    data
    .merge(drug_idx, left_on="Drug1", right_on="Drug_ID")
    .merge(drug_idx, left_on="Drug2", right_on="Drug_ID")
    .merge(SE_idx, on="SE")
    [[
        "Drug1", "Drug2", "SE",
        "Drug1_smile", "Drug2_smile",
        "Drug_Index_x", "Drug_Index_y", "SE_Index_x",
    ]]
    .rename(
        columns={
            "Drug_Index_x": "Drug1_Index",
            "Drug_Index_y": "Drug2_Index",
            "SE_Index_x": "SE_Index",
        }
    )
)
data_indexed

Unnamed: 0,Drug1,Drug2,SE,Drug1_smile,Drug2_smile,Drug1_Index,Drug2_Index,SE_Index
0,CID000002173,CID000003345,hypermagnesemia,CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...,CCC(=O)N(C1CCN(CC1)CCC2=CC=CC=C2)C3=CC=CC=C3,606,345,1024
1,CID000000853,CID000003345,hypermagnesemia,C1=C(C=C(C(=C1I)OC2=CC(=C(C(=C2)I)O)I)I)CC(C(=...,CCC(=O)N(C1CCN(CC1)CCC2=CC=CC=C2)C3=CC=CC=C3,397,345,1024
2,CID000003143,CID000003345,hypermagnesemia,CC1=C2C(C(=O)C3(C(CC4C(C3C(C(C2(C)C)(CC1OC(=O)...,CCC(=O)N(C1CCN(CC1)CCC2=CC=CC=C2)C3=CC=CC=C3,356,345,1024
3,CID000003168,CID000003345,hypermagnesemia,C1CN(CC=C1N2C3=CC=CC=C3NC2=O)CCCC(=O)C4=CC=C(C...,CCC(=O)N(C1CCN(CC1)CCC2=CC=CC=C2)C3=CC=CC=C3,616,345,1024
4,CID000000853,CID000004873,hypermagnesemia,C1=C(C=C(C(=C1I)OC2=CC(=C(C(=C2)I)O)I)I)CC(C(=...,[Cl-].[K+],397,271,1024
...,...,...,...,...,...,...,...,...
4649436,CID000002308,CID000002541,renal agenesis,CC1CC2C3CCC4=CC(=O)C=CC4(C3(C(CC2(C1(C(=O)CO)O...,CCOC1=NC2=CC=CC(=C2N1CC3=CC=C(C=C3)C4=CC=CC=C4...,337,403,1246
4649437,CID000000450,CID000060852,estrogen replacement,CC12CCC3C(C1CCC2O)CCC4=C3C=CC(=C4)O,CCCCCN(C)CCC(O)(P(=O)(O)O)P(=O)(O)O,447,486,303
4649438,CID000010631,CID000060852,estrogen replacement,CC1CC2C(CCC3(C2CCC3(C(=O)C)O)C)C4(C1=CC(=O)CC4)C,CCCCCN(C)CCC(O)(P(=O)(O)O)P(=O)(O)O,226,486,303
4649439,CID000004585,CID000005076,hypogonadotropic hypogonadism,CC1=CC2=C(NC3=CC=CC=C3N=C2S1)N4CCN(CC4)C,CC(C)C1=NC(=CS1)CN(C)C(=O)NC(C(C)C)C(=O)NC(CC2...,260,338,501


## Labels 

In [7]:
# Integer label triplets, one row per record: [Drug1 index, Drug2 index, side-effect index].
triplet_columns = ["Drug1_Index", "Drug2_Index", "SE_Index"]
labels_triplets = data_indexed[triplet_columns].to_numpy().astype(int)

In [8]:
# Build the label tensor (Yabc) from the index triplets with the project helper.
# The recorded run shows shape (645, 645, 1317); presumably the axes are
# (drug1, drug2, side effect) - confirm against labels_to_tensor.
labels_tensor = labels_to_tensor(labels_triplets, drug_idx, SE_idx)
labels_tensor.shape

(645, 645, 1317)

In [None]:
# Optional persistence of the label triplets (left disabled).
# NOTE(review): np.loadtxt returns floats by default, so cast back with
# .astype(int) after reloading - confirm against the format `save` writes.
# save(labels_triplets, "../Final_DF/TDC_Label.txt")
# labels_triplets = np.loadtxt("../Final_DF/TDC_Label.txt") 

## Feature Vectorization

### Drug 
[1] SMILES

In [9]:
# Attach each drug's SMILES string to the drug index table, then keep a single
# SMILES column under the neutral name "Drug_smile".
drug_idx_smile = (
    merge_smile(
        data_indexed,
        drug_idx,
        columns=[
            ["Drug1_Index", "Drug1_smile"],
            ["Drug2_Index", "Drug2_smile"],
        ],
    )
    [["Drug_ID", "Drug_Index", "Drug1_smile"]]
    .rename(columns={"Drug1_smile": "Drug_smile"})
)

In [13]:
# One-hot encode every drug's SMILES string with the project helper.
drug_vectorized_smile = vectorize_smile(drug_idx_smile.Drug_smile)

# Flatten each drug's encoding into one feature row. -1 lets reshape infer the
# flattened width instead of multiplying the trailing dimensions by hand
# (assumes the helper returns a 3-D array, as the original shape[1] * shape[2] did).
drug_vectorized_smile = drug_vectorized_smile.reshape(drug_vectorized_smile.shape[0], -1)

# Show the resulting shape so the cell reproduces its recorded output: (645, 9389).
drug_vectorized_smile.shape

(645, 9389)

[2] Morgan fingerprint 

The molecule conversion API of TDC is applied.

In [16]:
# TDC converter from SMILES strings to Morgan fingerprints.
converter = MolConvert(src="SMILES", dst="Morgan")

# Convert every drug's SMILES string and store the fingerprints as int8.
# Expected shape per the original note: (645, 1024).
drug_vectorized_morgan = converter(
    drug_idx_smile["Drug_smile"].values.tolist()
).astype("int8")

### Side Effect 

Drug-drug pair labels are used as side information.

In [18]:
# Index every drug-drug combination; the original note gives shape (207690, 3).
combi_idx = drug_combi_index(drug_idx)

# Add the combination index to the indexed dataset.
data_indexed_comb = data_indexed_combi(data_indexed, combi_idx)

# Integer rows of [Drug1 index, Drug2 index, side-effect index, combination index].
labels_triplets_drugcombi = (
    data_indexed_comb[["Drug1_Index", "Drug2_Index", "SE_Index", "Combi_Index"]]
    .to_numpy()
    .astype(int)
)

In [22]:
# Vectorize the side effects into a 2-D array over the drug-drug combinations,
# using the project helper on the indexed label rows.
SE_vectorized = vectorize_drugcombi(labels_triplets_drugcombi, combi_idx, SE_idx)  # (1317, 207690)

## Similarity Kernels

### Drug

[1] SMILES

In [28]:
# Drug kernel: cosine similarity between the flattened one-hot SMILES vectors.
K_drug_smile_cos = sklearn.metrics.pairwise.cosine_similarity(drug_vectorized_smile)

In [None]:
# Drug kernel: Tanimoto similarity between the flattened one-hot SMILES vectors,
# computed by the project helper imported from `assistant`.
K_drug_smile_tanimoto = tanimoto_similarity_matrix(drug_vectorized_smile)

[2] Morgan fingerprint

In [29]:
# Drug kernel: cosine similarity between the Morgan fingerprint vectors.
K_drug_morgan_cos = sklearn.metrics.pairwise.cosine_similarity(drug_vectorized_morgan)

In [None]:
# Drug kernel: Tanimoto similarity between the Morgan fingerprint vectors,
# computed by the project helper imported from `assistant`.
K_drug_morgan_tanimoto = tanimoto_similarity_matrix(drug_vectorized_morgan)

### Side Effect 

In [30]:
# Side-effect kernel: cosine similarity between the rows of SE_vectorized.
K_SE_cos = sklearn.metrics.pairwise.cosine_similarity(SE_vectorized)

In [None]:
# Side-effect kernel: Jaccard similarity, obtained as 1 - Jaccard distance.
# TODO: the side-effect vectors are built from drug-drug pair labels taken from
# the dataset itself, so this kernel has to be computed from the training data only.
K_SE_jacc = 1 - pairwise_distances(SE_vectorized, metric="jaccard")

In [None]:
# Optional persistence of the similarity kernels (left disabled).
# NOTE(review): the Morgan/Tanimoto file name below is spelled "morgana" -
# confirm the intended name before enabling, since other code may load it.
# save(K_drug_smile_cos, "../Final_DF/TDC_drug_smile_cos.txt")
# save(K_drug_smile_tanimoto, "../Final_DF/TDC_drug_smile_tanimoto.txt")
# save(K_drug_morgan_cos, "../Final_DF/TDC_drug_morgan_cos.txt")
# save(K_drug_morgan_tanimoto, "../Final_DF/TDC_drug_morgana_tanimoto.txt")
# save(K_SE_cos, "../Final_DF/TDC_SE_cos.txt")
# save(K_SE_jacc, "../Final_DF/TDC_SE_jacc.txt")