# Installation

In [5]:
# uncomment the line below and run this code cell once
# %run '../scripts/package_install.py'

In [6]:
# Third-party imports used throughout the notebook.
import numpy as np
import pandas as pd
# matplotlib was previously aliased to `np`, which rebound the name and
# silently replaced numpy for every later cell; give it its own alias.
import matplotlib as mpl
from sklearn.preprocessing import OneHotEncoder



# Loading Dataset From TDC package

In [7]:
# uncomment the line below and run this code cell once
#%run '../scripts/data_loader.py'

In [8]:
# Read the three pre-split CSV files (train / validation / test) in one pass.
split_paths = ('../data/train.csv', '../data/valid.csv', '../data/test.csv')
train, valid, test = map(pd.read_csv, split_paths)

# Exploratory Data Analysis

In [9]:
# Report (rows, columns) for each split using one shared message template.
shape_report = {
    'training': train.shape,
    'cross-validation': valid.shape,
    'test': test.shape,
}
for split_label, split_shape in shape_report.items():
    print(f'The shape of the {split_label} set is:{split_shape}')

The shape of the training set is:(134265, 6)
The shape of the cross-validation set is:(19181, 6)
The shape of the test set is:(38362, 6)


In [10]:
# Summarise the raw training split: column names, dtypes and non-null counts.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134265 entries, 0 to 134264
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  134265 non-null  int64 
 1   Drug1_ID    134265 non-null  object
 2   Drug1       134265 non-null  object
 3   Drug2_ID    134265 non-null  object
 4   Drug2       134265 non-null  object
 5   Y           134265 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 6.1+ MB


### train.info() returns information about the train dataset, namely the column names, and their data types
From the result above, the first column is the serial number. The dataset will be cleaned and the first column removed

In [11]:
# Drop the row-number column that was written into the CSV ('Unnamed: 0').
# Selecting it by name (and tolerating its absence) keeps this cell idempotent:
# a positional drop of columns[0] would remove Drug1_ID if the cell ran twice.
train = train.drop(columns=['Unnamed: 0'], errors='ignore')

In [12]:
# Re-inspect the training split to confirm the first column was removed.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134265 entries, 0 to 134264
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   Drug1_ID  134265 non-null  object
 1   Drug1     134265 non-null  object
 2   Drug2_ID  134265 non-null  object
 3   Drug2     134265 non-null  object
 4   Y         134265 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 5.1+ MB


In [13]:
# Summarise the raw test split: column names, dtypes and non-null counts.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38362 entries, 0 to 38361
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  38362 non-null  int64 
 1   Drug1_ID    38362 non-null  object
 2   Drug1       38362 non-null  object
 3   Drug2_ID    38362 non-null  object
 4   Drug2       38362 non-null  object
 5   Y           38362 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 1.8+ MB


### test.info() returns information about the test dataset, namely the column names and their data types
From the result above, the first column is the serial number. The dataset will be cleaned and the first column removed

In [14]:
# Drop the row-number column that was written into the CSV ('Unnamed: 0').
# Selecting it by name (and tolerating its absence) keeps this cell idempotent:
# a positional drop of columns[0] would remove Drug1_ID if the cell ran twice.
test = test.drop(columns=['Unnamed: 0'], errors='ignore')

In [15]:
# Re-inspect the test split to confirm the first column was removed.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38362 entries, 0 to 38361
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Drug1_ID  38362 non-null  object
 1   Drug1     38362 non-null  object
 2   Drug2_ID  38362 non-null  object
 3   Drug2     38362 non-null  object
 4   Y         38362 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.5+ MB


In [16]:
# Summarise the raw validation split: column names, dtypes and non-null counts.
valid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19181 entries, 0 to 19180
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  19181 non-null  int64 
 1   Drug1_ID    19181 non-null  object
 2   Drug1       19181 non-null  object
 3   Drug2_ID    19181 non-null  object
 4   Drug2       19181 non-null  object
 5   Y           19181 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 899.2+ KB


### valid.info() returns information about the validation dataset, namely the column names and their data types
From the result above, the first column is the serial number. The dataset will be cleaned and the first column removed

In [17]:
# Drop the row-number column that was written into the CSV ('Unnamed: 0').
# Selecting it by name (and tolerating its absence) keeps this cell idempotent:
# a positional drop of columns[0] would remove Drug1_ID if the cell ran twice.
valid = valid.drop(columns=['Unnamed: 0'], errors='ignore')

In [18]:
# Separate the training frame positionally: every column except the last
# becomes the feature frame, and the last column becomes the target series.
train_X, train_y = train.iloc[:, :-1], train.iloc[:, -1]

In [19]:
# Preview the first rows of the feature columns.
train_X.head()


Unnamed: 0,Drug1_ID,Drug1,Drug2_ID,Drug2
0,DB04571,CC1=CC2=CC3=C(OC(=O)C=C3C)C(C)=C2O1,DB00460,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
1,DB09536,O=[Ti]=O,DB00460,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
2,DB01600,CC(C(O)=O)C1=CC=C(S1)C(=O)C1=CC=CC=C1,DB00460,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
3,DB09000,CC(CN(C)C)CN1C2=CC=CC=C2SC2=C1C=C(C=C2)C#N,DB00460,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
4,DB11630,OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)...,DB00460,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...


In [20]:
# Preview the first rows of the target column.
train_y.head()

0    1
1    1
2    1
3    1
4    1
Name: Y, dtype: int64

In [21]:
# Count the samples per label to see how unevenly the classes are distributed.
train_y.value_counts()

Y
49    42592
47    24102
73    16509
75     6539
60     5945
      ...  
43        7
41        7
26        6
62        4
42        3
Name: count, Length: 86, dtype: int64

In [22]:
# List the distinct label values present in the training target.
train_y.unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
       69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85,
       86])

### To see which type of relation each label index corresponds to, run the cell below

In [25]:
from tdc.utils import get_label_map


In [None]:
# Look up what each integer label in Y stands for in the DrugBank DDI task.
get_label_map(name='DrugBank', task='DDI')

# Featurisation

In [27]:
# Execute the featuriser script inside this kernel. Per its logged output it
# serves the Ersilia model eos4wt0 (morgan-fps), featurises the Drug1 and Drug2
# columns of each split CSV, and writes e.g. ../data/featurised_train.csv.
%run '../scripts/featuriser.py'

Fetching model eos4wt0...
Model fetched successfully
🚀 Serving model eos4wt0: morgan-fps

   URL: http://0.0.0.0:52469
   PID: -1
   SRV: pulled_docker
   Output source: local-only

👉 To run model:
   - run

💁 Information:
   - info
Ersilia model server is running
Processing ../data/train.csv...
Featurizing Drug1 from ../data/train.csv...
Successfully featurized Drug1 in ../data/train.csv
Featurizing Drug2 from ../data/train.csv...
Successfully featurized Drug2 in ../data/train.csv
Featurized data saved to ../data/featurised_train.csv
Deleted temporary file: ../data/train_drug1.csv
Deleted temporary file: ../data/train_drug1_feat.csv
Deleted temporary file: ../data/train_drug2.csv
Deleted temporary file: ../data/train_drug2_feat.csv
Processing ../data/valid.csv...
Featurizing Drug1 from ../data/valid.csv...
Successfully featurized Drug1 in ../data/valid.csv
Featurizing Drug2 from ../data/valid.csv...
Successfully featurized Drug2 in ../data/valid.csv
Featurized data saved to ../data/fe