In [1]:
# Import necessary packages
import pandas as pd
import numpy as np
import pycaret
from sklearn.preprocessing import LabelEncoder
from pycaret.regression import *
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the training table from its CSV file
train_csv_path = '../../layer3_latent30_dr3/train_X_y.csv'
data = pd.read_csv(train_csv_path, low_memory=False)

In [3]:
# Data cleaning, expressed as a single chain:
#   1. discard every row that contains a missing value
#   2. remove the 'Barcodes' and 'CELL_LINE_NAME' columns
#   3. remove whichever column is in first position after step 2
data = (
    data
    .dropna()
    .drop(columns=['Barcodes', 'CELL_LINE_NAME'])
    .pipe(lambda frame: frame.drop(frame.columns[0], axis=1))
)

In [4]:
# Encode the two drug-name columns as integer category codes.
# Each column gets its own LabelEncoder so that both fitted encoders (and their
# `classes_` arrays, which give the code -> name lookup) remain available after
# this cell; refitting a single shared encoder would discard the anchor fit.
anchor_encoder = LabelEncoder()
library_encoder = LabelEncoder()

# fit_transform already returns integer codes, so no extra cast is required.
data['ANCHOR_CAT'] = anchor_encoder.fit_transform(data['ANCHOR_NAME'])
data['LIBRARY_CAT'] = library_encoder.fit_transform(data['LIBRARY_NAME'])

# Backward-compatible alias: `labelencoder` previously ended this cell fitted
# on LIBRARY_NAME, so keep that name pointing at the library encoder.
labelencoder = library_encoder

# Drop the original drug-name columns now that their codes are in place
finaldata = data.drop(columns=['ANCHOR_NAME', 'LIBRARY_NAME'])

In [38]:
# Write the encoded training table to disk without the row index, then show
# the frame as this cell's output.
finaldata.to_csv('final_training_data.csv', index=False)
finaldata

Unnamed: 0,ANCHOR_CONC,LIBRARY_CONC,SYNERGY_OBS_EMAX,Latent0,Latent1,Latent2,Latent3,Latent4,Latent5,Latent6,...,Latent22,Latent23,Latent24,Latent25,Latent26,Latent27,Latent28,Latent29,ANCHOR_CAT,LIBRARY_CAT
0,2.5000,1.0,0.126641,-0.124173,3.909615,-0.075322,-2.735342,0.067569,-2.247642,-0.343655,...,-1.256836,0.222209,0.244195,0.593301,-0.116758,0.11183,0.044526,0.130066,0,1
1,10.0000,1.0,0.177020,-0.124173,3.909615,-0.075322,-2.735342,0.067569,-2.247642,-0.343655,...,-1.256836,0.222209,0.244195,0.593301,-0.116758,0.11183,0.044526,0.130066,0,1
2,0.0625,0.1,0.384358,-0.124173,3.909615,-0.075322,-2.735342,0.067569,-2.247642,-0.343655,...,-1.256836,0.222209,0.244195,0.593301,-0.116758,0.11183,0.044526,0.130066,1,5
3,0.2500,0.1,0.260706,-0.124173,3.909615,-0.075322,-2.735342,0.067569,-2.247642,-0.343655,...,-1.256836,0.222209,0.244195,0.593301,-0.116758,0.11183,0.044526,0.130066,1,5
4,0.0625,10.0,0.831054,-0.124173,3.909615,-0.075322,-2.735342,0.067569,-2.247642,-0.343655,...,-1.256836,0.222209,0.244195,0.593301,-0.116758,0.11183,0.044526,0.130066,1,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2389232,2.0000,10.0,0.104265,0.094143,-0.807228,0.026398,-3.262593,0.176699,-0.068403,-2.296632,...,0.374694,-0.064594,0.112217,0.221923,-0.096319,0.29775,0.220406,0.187305,10,16
2389233,0.5000,5.0,0.658678,0.094143,-0.807228,0.026398,-3.262593,0.176699,-0.068403,-2.296632,...,0.374694,-0.064594,0.112217,0.221923,-0.096319,0.29775,0.220406,0.187305,10,22
2389234,2.0000,5.0,0.216893,0.094143,-0.807228,0.026398,-3.262593,0.176699,-0.068403,-2.296632,...,0.374694,-0.064594,0.112217,0.221923,-0.096319,0.29775,0.220406,0.187305,10,22
2389235,0.5000,1.0,0.611517,0.094143,-0.807228,0.026398,-3.262593,0.176699,-0.068403,-2.296632,...,0.374694,-0.064594,0.112217,0.221923,-0.096319,0.29775,0.220406,0.187305,10,25


In [6]:
# Train the model
# A fixed session_id seeds PyCaret's randomness (train/test split, CV folds,
# estimator random_state) so that a restart-and-run-all reproduces the scores.
RANDOM_SEED = 42

s = setup(
    finaldata,
    target='SYNERGY_OBS_EMAX',
    # The concentration columns and the label-encoded drug codes are all
    # declared as numeric features.
    numeric_features=['ANCHOR_CONC', 'LIBRARY_CONC', 'ANCHOR_CAT', 'LIBRARY_CAT'],
    session_id=RANDOM_SEED,
)

# NOTE(review): in the training-table preview several rows share identical
# Latent* values, so a random row-level split may place near-duplicate feature
# rows in both train and hold-out sets. Confirm the reported score with a split
# grouped by cell line before relying on it.
# Only the Extra Trees regressor ('et') is evaluated; compare_models returns it.
model = compare_models(include=['et'])

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,0.0007,0.0,0.0056,0.9994,0.0039,0.0146,72.43


## Make predictions using patient data

In [82]:
# Load the patient table that predictions will be made for
patient_csv_path = '../../X_pred.csv'
patient_data = pd.read_csv(patient_csv_path, low_memory=False)

In [105]:
# Collect every distinct (ANCHOR_CAT, LIBRARY_CAT, ANCHOR_CONC, LIBRARY_CONC)
# combination that exists in the training data, as a list of tuples in the
# sorted order produced by groupby.
combination_keys = ['ANCHOR_CAT', 'LIBRARY_CAT', 'ANCHOR_CONC', 'LIBRARY_CONC']
combination_groups = finaldata[combination_keys].groupby(combination_keys)
drug_combinations = list(combination_groups.size().index)

In [None]:
# Generate a prediction table in which each cell from the patient samples is
# paired with every drug combination we have in the dataset.
#
# Every row is given the full list of combinations in one object-dtype
# assignment, and explode() then emits one row per (cell, combination) pair.
# Building the column in one step avoids seeding it with a numeric placeholder
# and writing a list into each row individually.
# After this cell, 'ANCHOR_CAT' temporarily holds the whole combination tuple.
patient_data['ANCHOR_CAT'] = pd.Series(
    [drug_combinations] * len(patient_data),
    index=patient_data.index,
    dtype=object,
)
patient_data = patient_data.explode('ANCHOR_CAT')

In [62]:
# Table clean-up: split the combination tuple held in 'ANCHOR_CAT' into its
# four separate columns. 'ANCHOR_CAT' itself is overwritten last because the
# other three columns are read from it.
for target_column, tuple_position in (
    ('LIBRARY_CAT', 1),
    ('LIBRARY_CONC', 3),
    ('ANCHOR_CONC', 2),
    ('ANCHOR_CAT', 0),
):
    patient_data[target_column] = patient_data['ANCHOR_CAT'].map(
        lambda combination: combination[tuple_position]
    )

In [64]:
# Remove the column that sits in first position of the expanded patient table
leading_column = patient_data.columns[0]
final_data = patient_data.drop(leading_column, axis=1)

In [65]:
# Persist the expanded patient table (row index included) to CSV
patient_output_path = 'patient_final_data.csv'
final_data.to_csv(patient_output_path)

#### Predict drug response (Emax) for each patient cell

In [66]:
# Model inputs: every column of final_data except the first one
feature_slice = slice(1, None)
pred_x = final_data.iloc[:, feature_slice]
pred_x.shape

(6495972, 34)

In [67]:
# Score every (cell, drug combination) row with the trained model
pred_y = predict_model(model, data=pred_x)

In [68]:
# Display the prediction table; the rendered output shows the model's
# prediction in the 'Label' column.
pred_y

Unnamed: 0,Latent0,Latent1,Latent2,Latent3,Latent4,Latent5,Latent6,Latent7,Latent8,Latent9,...,Latent25,Latent26,Latent27,Latent28,Latent29,ANCHOR_CAT,LIBRARY_CAT,LIBRARY_CONC,ANCHOR_CONC,Label
0,-0.133713,-0.488713,-0.102538,1.393550,0.253198,0.219163,1.114243,-0.146467,0.000089,0.190237,...,-0.060139,-0.078290,-0.091746,0.088409,0.076120,0,1,1.0,2.5000,0.329662
0,-0.133713,-0.488713,-0.102538,1.393550,0.253198,0.219163,1.114243,-0.146467,0.000089,0.190237,...,-0.060139,-0.078290,-0.091746,0.088409,0.076120,0,1,1.0,10.0000,0.361817
0,-0.133713,-0.488713,-0.102538,1.393550,0.253198,0.219163,1.114243,-0.146467,0.000089,0.190237,...,-0.060139,-0.078290,-0.091746,0.088409,0.076120,1,5,0.1,0.0625,0.324865
0,-0.133713,-0.488713,-0.102538,1.393550,0.253198,0.219163,1.114243,-0.146467,0.000089,0.190237,...,-0.060139,-0.078290,-0.091746,0.088409,0.076120,1,5,0.1,0.2500,0.305970
0,-0.133713,-0.488713,-0.102538,1.393550,0.253198,0.219163,1.114243,-0.146467,0.000089,0.190237,...,-0.060139,-0.078290,-0.091746,0.088409,0.076120,1,7,10.0,0.0625,0.567809
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63685,-0.125376,0.277588,0.079182,-0.318675,-0.003294,-1.091115,0.699207,-0.012308,0.076053,0.091409,...,-0.092716,-0.052029,-0.046705,0.035486,0.081573,10,16,10.0,2.0000,0.418179
63685,-0.125376,0.277588,0.079182,-0.318675,-0.003294,-1.091115,0.699207,-0.012308,0.076053,0.091409,...,-0.092716,-0.052029,-0.046705,0.035486,0.081573,10,22,5.0,0.5000,0.662988
63685,-0.125376,0.277588,0.079182,-0.318675,-0.003294,-1.091115,0.699207,-0.012308,0.076053,0.091409,...,-0.092716,-0.052029,-0.046705,0.035486,0.081573,10,22,5.0,2.0000,0.387312
63685,-0.125376,0.277588,0.079182,-0.318675,-0.003294,-1.091115,0.699207,-0.012308,0.076053,0.091409,...,-0.092716,-0.052029,-0.046705,0.035486,0.081573,10,25,1.0,0.5000,0.669647


In [69]:
# Re-attach the cell barcodes to the predictions; the assignment aligns on the
# row index shared by final_data and pred_y.
barcode_column = final_data['Barcodes']
pred_y['Barcodes'] = barcode_column

In [71]:
# Preview: list the predicted 'Label' values for each barcode / drug regime
regime_keys = ['Barcodes', 'ANCHOR_CAT', 'LIBRARY_CAT', 'ANCHOR_CONC', 'LIBRARY_CONC']
pred_y.groupby(regime_keys)['Label'].agg(list)

Barcodes                                          ANCHOR_CAT  LIBRARY_CAT  ANCHOR_CONC  LIBRARY_CONC
AAACCTGAGAAAGTGG_EGAD00001006608_BIOKEY_26_Pre-1  0           1            2.5000       1.0             [0.34000147923827173]
                                                                           10.0000      1.0             [0.32252435371279714]
                                                  1           5            0.0625       0.1              [0.4683538453280926]
                                                                           0.2500       0.1              [0.2979623860120773]
                                                              7            0.0625       10.0             [0.6734612885117531]
                                                                                                                ...          
TTTGTCATCTTGTTTG_EGAD00001006608_BIOKEY_17_Pre-1  10          16           2.0000       10.0            [0.42989618472754954]
                 

In [92]:
# Extract the predicted best drug regime for each barcode.
# The stored value is the position of the smallest 'Label' within that
# barcode's rows.
# NOTE(review): presumably this position lines up with the order of
# drug_combinations; verify before using it as a lookup key.
labels_per_barcode = pred_y.groupby(['Barcodes'])['Label'].agg(list)
predicted_best_drug_regime = labels_per_barcode.apply(np.argmin)

In [93]:
# Display the per-barcode result: one integer position per barcode.
predicted_best_drug_regime

Barcodes
AAACCTGAGAAAGTGG_EGAD00001006608_BIOKEY_26_Pre-1    49
AAACCTGAGACAGACC_EGAD00001006608_BIOKEY_26_On-1     49
AAACCTGAGACGACGT_EGAD00001006608_BIOKEY_35_Pre-1    60
AAACCTGAGACTCGGA_EGAD00001006608_BIOKEY_15_Pre-1    55
AAACCTGAGACTGTAA_EGAD00001006608_BIOKEY_23_On-1     95
                                                    ..
TTTGTCATCTGGAGCC_EGAD00001006608_BIOKEY_5_Pre-1     73
TTTGTCATCTGGTTCC_EGAD00001006608_BIOKEY_10_Pre-1    60
TTTGTCATCTTCGGTC_EGAD00001006608_BIOKEY_10_Pre-1    61
TTTGTCATCTTGTATC_EGAD00001006608_BIOKEY_26_On-1      7
TTTGTCATCTTGTTTG_EGAD00001006608_BIOKEY_17_Pre-1    67
Name: Label, Length: 63686, dtype: int64

#### Export prediction results along with the latent space vectors

In [None]:
# Reload the patient table, drop its first column, index it by barcode, and
# attach the per-barcode result (aligned on the barcode index) before exporting.
patient_table_raw = pd.read_csv('../../X_pred.csv', low_memory=False)
patient_data = (
    patient_table_raw
    .iloc[:, 1:]
    .set_index('Barcodes')
    .assign(predicted_best_drug_regime=predicted_best_drug_regime)
)
patient_data.to_csv('predicted_best_drug_regime_for_patient_cells.csv')

#### Export drug regime mappings for future reference

In [101]:
# Integer code -> drug name lookup tables.
# Names are listed in code order, so each name's list position is its code.
# NOTE(review): presumably these mirror the label-encoded ANCHOR_CAT /
# LIBRARY_CAT codes; verify against the fitted encoders.
anchor_names = [
    '5-Fluorouracil',
    'AZD7762',
    'Bortezomib',
    'Cisplatin',
    'Gemcitabine',
    'Linsitinib',
    'MK-1775',
    'Navitoclax',
    'Pictilisib',
    'Taselisib',
    'Vorinostat',
]

library_names = [
    '5-Fluorouracil',
    'AZD7762',
    'Afatinib',
    'Alisertib',
    'Axitinib',
    'Camptothecin',
    'Cisplatin',
    'Crizotinib',
    'Docetaxel',
    'Entinostat',
    'GSK269962A',
    'Gemcitabine',
    'JQ1',
    'Lapatinib',
    'MK-1775',
    'MK-2206',
    'NU7441',
    'Navitoclax',
    'Nutlin-3a (-)',
    'Olaparib',
    'Paclitaxel',
    'RO-3306',
    'SCH772984',
    'Sapitinib',
    'Tozasertib',
    'Trametinib',
    'Uprosertib',
    'Wee1 Inhibitor',
    'ZM447439',
]

# enumerate() pairs each name with its zero-based position, giving {code: name}
anchor = dict(enumerate(anchor_names))
library = dict(enumerate(library_names))

In [104]:
# Build the regime lookup table: one row per drug combination, in the order of
# drug_combinations. The two name columns start out holding integer codes and
# are then translated to drug names through the anchor / library dictionaries.
regime_columns = ['ANCHOR_NAME', 'LIBRARY_NAME', 'ANCHOR_CONC', 'LIBRARY_CONC']
drug_regime_map = pd.DataFrame(drug_combinations, columns=regime_columns)
for name_column, code_to_name in (('ANCHOR_NAME', anchor), ('LIBRARY_NAME', library)):
    drug_regime_map[name_column] = drug_regime_map[name_column].map(
        lambda code: code_to_name[code]
    )
drug_regime_map.to_csv('drug_regime_id_mappings.csv')
drug_regime_map

Unnamed: 0,ANCHOR_NAME,LIBRARY_NAME,ANCHOR_CONC,LIBRARY_CONC
0,5-Fluorouracil,AZD7762,2.5000,1.0
1,5-Fluorouracil,AZD7762,10.0000,1.0
2,AZD7762,Camptothecin,0.0625,0.1
3,AZD7762,Camptothecin,0.2500,0.1
4,AZD7762,Crizotinib,0.0625,10.0
...,...,...,...,...
97,Vorinostat,NU7441,2.0000,10.0
98,Vorinostat,SCH772984,0.5000,5.0
99,Vorinostat,SCH772984,2.0000,5.0
100,Vorinostat,Trametinib,0.5000,1.0
