## Data Preprocessing

In [1]:
# Import necessary packages
import pandas as pd
import numpy as np
import pycaret
from sklearn.preprocessing import LabelEncoder
from pycaret.regression import *
import warnings
warnings.filterwarnings('ignore')

In [30]:
# Load the training table from CSV. low_memory=False makes pandas parse the
# file in a single pass rather than in chunks.
data = pd.read_csv(
    '../../layer3_latent30_dr3/train_X_y.csv',
    low_memory=False,
)


In [12]:
# Preview the first five rows of the raw table
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,Barcodes,CELL_LINE_NAME,ANCHOR_NAME,LIBRARY_NAME,ANCHOR_CONC,LIBRARY_CONC,SYNERGY_OBS_EMAX,Latent0,Latent1,...,Latent20,Latent21,Latent22,Latent23,Latent24,Latent25,Latent26,Latent27,Latent28,Latent29
0,0,AU565_AAACCAGTTTGG-0,AU565,5-Fluorouracil,AZD7762,2.5,1.0,0.126641,-0.124173,3.909615,...,-2.715222,0.072449,-1.256836,0.222209,0.244195,0.593301,-0.116758,0.11183,0.044526,0.130066
1,1,AU565_AAACCAGTTTGG-0,AU565,5-Fluorouracil,AZD7762,10.0,1.0,0.17702,-0.124173,3.909615,...,-2.715222,0.072449,-1.256836,0.222209,0.244195,0.593301,-0.116758,0.11183,0.044526,0.130066
2,2,AU565_AAACCAGTTTGG-0,AU565,AZD7762,Camptothecin,0.0625,0.1,0.384358,-0.124173,3.909615,...,-2.715222,0.072449,-1.256836,0.222209,0.244195,0.593301,-0.116758,0.11183,0.044526,0.130066
3,3,AU565_AAACCAGTTTGG-0,AU565,AZD7762,Camptothecin,0.25,0.1,0.260706,-0.124173,3.909615,...,-2.715222,0.072449,-1.256836,0.222209,0.244195,0.593301,-0.116758,0.11183,0.044526,0.130066
4,4,AU565_AAACCAGTTTGG-0,AU565,AZD7762,Crizotinib,0.0625,10.0,0.831054,-0.124173,3.909615,...,-2.715222,0.072449,-1.256836,0.222209,0.244195,0.593301,-0.116758,0.11183,0.044526,0.130066


In [31]:
# Data cleaning: discard every row that has a missing value, then remove the
# two identifier columns (barcode and cell-line name).
data = (
    data
    .dropna()
    .drop(columns=['Barcodes', 'CELL_LINE_NAME'])
)

In [32]:
# Remove the leading column, which is the row index that was saved into the CSV
# (pandas auto-names such a header-less column "Unnamed: ...").
# The drop is guarded by name so that re-running this cell is a no-op; an
# unconditional positional drop would delete the next real column each time
# the cell is executed again.
leading_col = data.columns[0]
if str(leading_col).startswith('Unnamed'):
    data = data.drop(columns=[leading_col])

In [33]:
# Display the cleaned frame (pandas truncates the rendering for large frames)
data

Unnamed: 0,ANCHOR_NAME,LIBRARY_NAME,ANCHOR_CONC,LIBRARY_CONC,SYNERGY_OBS_EMAX,Latent0,Latent1,Latent2,Latent3,Latent4,...,Latent20,Latent21,Latent22,Latent23,Latent24,Latent25,Latent26,Latent27,Latent28,Latent29
0,5-Fluorouracil,AZD7762,2.5000,1.0,0.126641,-0.124173,3.909615,-0.075322,-2.735342,0.067569,...,-2.715222,0.072449,-1.256836,0.222209,0.244195,0.593301,-0.116758,0.11183,0.044526,0.130066
1,5-Fluorouracil,AZD7762,10.0000,1.0,0.177020,-0.124173,3.909615,-0.075322,-2.735342,0.067569,...,-2.715222,0.072449,-1.256836,0.222209,0.244195,0.593301,-0.116758,0.11183,0.044526,0.130066
2,AZD7762,Camptothecin,0.0625,0.1,0.384358,-0.124173,3.909615,-0.075322,-2.735342,0.067569,...,-2.715222,0.072449,-1.256836,0.222209,0.244195,0.593301,-0.116758,0.11183,0.044526,0.130066
3,AZD7762,Camptothecin,0.2500,0.1,0.260706,-0.124173,3.909615,-0.075322,-2.735342,0.067569,...,-2.715222,0.072449,-1.256836,0.222209,0.244195,0.593301,-0.116758,0.11183,0.044526,0.130066
4,AZD7762,Crizotinib,0.0625,10.0,0.831054,-0.124173,3.909615,-0.075322,-2.735342,0.067569,...,-2.715222,0.072449,-1.256836,0.222209,0.244195,0.593301,-0.116758,0.11183,0.044526,0.130066
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2389232,Vorinostat,NU7441,2.0000,10.0,0.104265,0.094143,-0.807228,0.026398,-3.262593,0.176699,...,-3.246577,-0.107799,0.374694,-0.064594,0.112217,0.221923,-0.096319,0.29775,0.220406,0.187305
2389233,Vorinostat,SCH772984,0.5000,5.0,0.658678,0.094143,-0.807228,0.026398,-3.262593,0.176699,...,-3.246577,-0.107799,0.374694,-0.064594,0.112217,0.221923,-0.096319,0.29775,0.220406,0.187305
2389234,Vorinostat,SCH772984,2.0000,5.0,0.216893,0.094143,-0.807228,0.026398,-3.262593,0.176699,...,-3.246577,-0.107799,0.374694,-0.064594,0.112217,0.221923,-0.096319,0.29775,0.220406,0.187305
2389235,Vorinostat,Trametinib,0.5000,1.0,0.611517,0.094143,-0.807228,0.026398,-3.262593,0.176699,...,-3.246577,-0.107799,0.374694,-0.064594,0.112217,0.221923,-0.096319,0.29775,0.220406,0.187305


In [34]:
# Build the anchor drug name -> integer code mapping.
# Only the fitted classes are needed here, so use fit() rather than
# fit_transform(), whose encoded output was being computed and discarded.
anchor = LabelEncoder()
anchor.fit(data['ANCHOR_NAME'])
# classes_ is sorted; a class's position is the code LabelEncoder assigns to it
{name: code for code, name in enumerate(anchor.classes_)}

{'5-Fluorouracil': 0,
 'AZD7762': 1,
 'Bortezomib': 2,
 'Cisplatin': 3,
 'Gemcitabine': 4,
 'Linsitinib': 5,
 'MK-1775': 6,
 'Navitoclax': 7,
 'Pictilisib': 8,
 'Taselisib': 9,
 'Vorinostat': 10}

In [35]:
# Build the library drug name -> integer code mapping.
# Only the fitted classes are needed here, so use fit() rather than
# fit_transform(), whose encoded output was being computed and discarded.
library = LabelEncoder()
library.fit(data['LIBRARY_NAME'])
# classes_ is sorted; a class's position is the code LabelEncoder assigns to it
{name: code for code, name in enumerate(library.classes_)}

{'5-Fluorouracil': 0,
 'AZD7762': 1,
 'Afatinib': 2,
 'Alisertib': 3,
 'Axitinib': 4,
 'Camptothecin': 5,
 'Cisplatin': 6,
 'Crizotinib': 7,
 'Docetaxel': 8,
 'Entinostat': 9,
 'GSK269962A': 10,
 'Gemcitabine': 11,
 'JQ1': 12,
 'Lapatinib': 13,
 'MK-1775': 14,
 'MK-2206': 15,
 'NU7441': 16,
 'Navitoclax': 17,
 'Nutlin-3a (-)': 18,
 'Olaparib': 19,
 'Paclitaxel': 20,
 'RO-3306': 21,
 'SCH772984': 22,
 'Sapitinib': 23,
 'Tozasertib': 24,
 'Trametinib': 25,
 'Uprosertib': 26,
 'Wee1 Inhibitor': 27,
 'ZM447439': 28}

In [36]:
# Integer-encode the two drug-name columns and store the codes next to them.
# fit_transform already returns integer codes, so no extra cast is needed
# (the previous `.astype(int)` calls discarded their result and did nothing).
labelencoder = LabelEncoder()
data['ANCHOR_CAT'] = labelencoder.fit_transform(data['ANCHOR_NAME'])
# The same encoder object is refit here, so from this point on `labelencoder`
# only holds the LIBRARY_NAME classes.
data['LIBRARY_CAT'] = labelencoder.fit_transform(data['LIBRARY_NAME'])

# Drop the raw drug-name columns now that their encoded versions exist
finaldata = data.drop(columns=['ANCHOR_NAME', 'LIBRARY_NAME'])

In [37]:
# Show the two encoded columns side by side.
# (A bare `.groupby()` with neither `by` nor `level` raises TypeError.)
data[['ANCHOR_CAT', 'LIBRARY_CAT']]

Unnamed: 0,ANCHOR_CAT,LIBRARY_CAT
0,0,1
1,0,1
2,1,5
3,1,5
4,1,7
...,...,...
2389232,10,16
2389233,10,22
2389234,10,22
2389235,10,25


In [21]:
# Display the modelling table: the frame after the raw drug-name columns were dropped
finaldata

Unnamed: 0,ANCHOR_CONC,LIBRARY_CONC,SYNERGY_OBS_EMAX,Latent0,Latent1,Latent2,Latent3,Latent4,Latent5,Latent6,...,Latent22,Latent23,Latent24,Latent25,Latent26,Latent27,Latent28,Latent29,ANCHOR_CAT,LIBRARY_CAT
0,2.5000,1.0,0.126641,-0.124173,3.909615,-0.075322,-2.735342,0.067569,-2.247642,-0.343655,...,-1.256836,0.222209,0.244195,0.593301,-0.116758,0.11183,0.044526,0.130066,0,1
1,10.0000,1.0,0.177020,-0.124173,3.909615,-0.075322,-2.735342,0.067569,-2.247642,-0.343655,...,-1.256836,0.222209,0.244195,0.593301,-0.116758,0.11183,0.044526,0.130066,0,1
2,0.0625,0.1,0.384358,-0.124173,3.909615,-0.075322,-2.735342,0.067569,-2.247642,-0.343655,...,-1.256836,0.222209,0.244195,0.593301,-0.116758,0.11183,0.044526,0.130066,1,5
3,0.2500,0.1,0.260706,-0.124173,3.909615,-0.075322,-2.735342,0.067569,-2.247642,-0.343655,...,-1.256836,0.222209,0.244195,0.593301,-0.116758,0.11183,0.044526,0.130066,1,5
4,0.0625,10.0,0.831054,-0.124173,3.909615,-0.075322,-2.735342,0.067569,-2.247642,-0.343655,...,-1.256836,0.222209,0.244195,0.593301,-0.116758,0.11183,0.044526,0.130066,1,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2389232,2.0000,10.0,0.104265,0.094143,-0.807228,0.026398,-3.262593,0.176699,-0.068403,-2.296632,...,0.374694,-0.064594,0.112217,0.221923,-0.096319,0.29775,0.220406,0.187305,10,16
2389233,0.5000,5.0,0.658678,0.094143,-0.807228,0.026398,-3.262593,0.176699,-0.068403,-2.296632,...,0.374694,-0.064594,0.112217,0.221923,-0.096319,0.29775,0.220406,0.187305,10,22
2389234,2.0000,5.0,0.216893,0.094143,-0.807228,0.026398,-3.262593,0.176699,-0.068403,-2.296632,...,0.374694,-0.064594,0.112217,0.221923,-0.096319,0.29775,0.220406,0.187305,10,22
2389235,0.5000,1.0,0.611517,0.094143,-0.807228,0.026398,-3.262593,0.176699,-0.068403,-2.296632,...,0.374694,-0.064594,0.112217,0.221923,-0.096319,0.29775,0.220406,0.187305,10,25


In [22]:
# List the dtype of every column in the modelling table
finaldata.dtypes

ANCHOR_CONC         float64
LIBRARY_CONC        float64
SYNERGY_OBS_EMAX    float64
Latent0             float64
Latent1             float64
Latent2             float64
Latent3             float64
Latent4             float64
Latent5             float64
Latent6             float64
Latent7             float64
Latent8             float64
Latent9             float64
Latent10            float64
Latent11            float64
Latent12            float64
Latent13            float64
Latent14            float64
Latent15            float64
Latent16            float64
Latent17            float64
Latent18            float64
Latent19            float64
Latent20            float64
Latent21            float64
Latent22            float64
Latent23            float64
Latent24            float64
Latent25            float64
Latent26            float64
Latent27            float64
Latent28            float64
Latent29            float64
ANCHOR_CAT            int64
LIBRARY_CAT           int64
dtype: object

## Train ML Regression Baseline Model & Comparison

In [24]:
# Configure the PyCaret regression experiment and benchmark the candidate models.
# session_id fixes the random seed so the data split, CV folds and stochastic
# learners produce the same leaderboard on every Restart & Run All.
# NOTE(review): ANCHOR_CAT / LIBRARY_CAT are label-encoded drug identifiers that
# are declared numeric here; confirm that an ordinal treatment is intended.
# NOTE(review): many rows share identical latent vectors; confirm the default
# random split does not leak near-duplicate rows between train and validation.
s = setup(
    finaldata,
    target='SYNERGY_OBS_EMAX',
    numeric_features=['ANCHOR_CONC', 'LIBRARY_CONC', 'ANCHOR_CAT', 'LIBRARY_CAT'],
    session_id=42,
)
base = compare_models(
    include=['rf', 'lightgbm', 'et', 'xgboost', 'dummy', 'dt', 'lr'],
)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,0.0007,0.0,0.0057,0.9994,0.004,0.0161,68.436
rf,Random Forest Regressor,0.0008,0.0001,0.0088,0.9986,0.0062,0.0058,159.146
dt,Decision Tree Regressor,0.0007,0.0002,0.0143,0.9964,0.0099,0.0035,7.233
xgboost,Extreme Gradient Boosting,0.0383,0.0028,0.0527,0.9512,0.0374,1.1551,80.927
lightgbm,Light Gradient Boosting Machine,0.0695,0.008,0.0895,0.8591,0.0629,1.2679,3.912
lr,Linear Regression,0.1761,0.0446,0.2111,0.2163,0.1472,3.9745,2.821
dummy,Dummy Regressor,0.2052,0.0568,0.2384,-0.0,0.167,11.6826,0.249
