## LINCS MOA prediction for LINCS curated using decision trees

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing  import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [2]:
# Load the expression table from the HDF file on the lab server.
# NOTE(review): the path is absolute and machine-specific; the notebook only
# runs where this file exists.
file_prostate = pd.read_hdf(
    "/opt/raid10/genomics/rashid/GCN/data/GSE92742_fully_restricted.hdf"
)




In [3]:
# Move index levels 0 through 6 out of the MultiIndex and into ordinary
# columns, so the metadata can be selected/dropped by column name below.
df_prostate = file_prostate.reset_index(level=list(range(7)))

In [4]:
# Target vector: the raw "moa" column, taken before the column is dropped.
prostate_y = df_prostate["moa"].values

# Remove the seven metadata columns so that only feature columns remain.
df_prostate = df_prostate.drop(
    [
        "pert_id",
        "pert_name",
        "cell_id",
        "primary_site",
        "subtype",
        "moa",
        "Fold",
    ],
    axis=1,
)

# Feature matrix (978 gene columns according to the original author's note).
prostate_X = df_prostate.values


In [5]:
# Integer-encode the MOA labels. `labels` holds one integer code per row and
# `uniques` holds the distinct original values, so uniques[code] maps a code
# back to its MOA name.
(labels, uniques) = pd.factorize(values=prostate_y)
prostate_y = labels

In [6]:
# Standardize every feature column with scikit-learn's StandardScaler.
# NOTE(review): the scaler is fit on the full data set before the train/test
# split, so test-set statistics leak into the scaling. Decision-tree splits
# depend only on the ordering of values within a feature, so the effect on
# this model should be negligible, but confirm before reusing with other models.
sc = StandardScaler()
sc.fit(prostate_X)
prostate_X = sc.transform(prostate_X)

In [7]:
# SPLIT DATA
# Hold out 20% of the rows for testing; the seed is fixed so the partition is
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    prostate_X,
    prostate_y,
    random_state=42,
    test_size=0.2,
)

In [8]:
# Baseline: a decision tree with scikit-learn's default hyperparameters.
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at every split (even with splitter="best"), so an unseeded tree can
# differ from run to run. 42 matches the seed used for the train/test split.
dct = DecisionTreeClassifier(random_state=42)
dct.fit(X_train, y_train)
pred_dct = dct.predict(X_test)  # original author's timing note: about 1 min

In [9]:
# METRICS (untuned decision tree)
acc_dct = accuracy_score(y_test, pred_dct)  # overall accuracy of the decision tree

print(confusion_matrix(y_test, pred_dct))
print(classification_report(y_test, pred_dct))
print(acc_dct)  # print the accuracy score

# Results do not quite match the paper, which reports accuracy 53.2 ± 1.16 and
# macro F1 32.6 ± 0.91. Took 4-5 minutes on a MacBook Pro (2.7 GHz Intel Core i5
# processor, 16 GB 1867 MHz DDR3 memory, Intel Iris Graphics 6100 1536 MB).

[[  21   21   22 ...   51  111    9]
 [  17   32   18 ...   50  149   16]
 [  11   26   22 ...   62  128   13]
 ...
 [  45   61   57 ... 1339  422  274]
 [ 133  167  113 ...  424 1468  111]
 [   6   12   20 ...  261  101  118]]
              precision    recall  f1-score   support

           0       0.04      0.04      0.04       583
           1       0.04      0.05      0.04       710
           2       0.03      0.03      0.03       634
           3       0.13      0.12      0.13       453
           4       0.03      0.03      0.03       205
           5       0.04      0.04      0.04       588
           6       0.05      0.04      0.05       212
           7       0.04      0.03      0.03       441
           8       0.58      0.54      0.56       859
           9       0.03      0.03      0.03       433
          10       0.01      0.01      0.01       337
          11       0.01      0.01      0.01       290
          12       0.03      0.03      0.03       487
          13   

In [10]:
# Persist the untuned tree to disk.
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23, so prefer the standalone joblib package and fall back to the vendored
# copy only on old installations that lack it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
joblib.dump(dct,'decisionTree_MOA_FullCurated_notTuned')

['decisionTree_MOA_FullCurated_notTuned']

In [11]:
# Decision tree with explicitly specified hyperparameters (presumably the
# tuned/benchmark configuration; where these values came from is not shown here).
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at every split (even with splitter="best"), so an unseeded tree can
# differ from run to run. 42 matches the seed used for the train/test split.
dct_benchmark = DecisionTreeClassifier(
    criterion='entropy',
    splitter='best',
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=2,
    min_weight_fraction_leaf=0.0020780796100258966,
    max_features=None,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0012257673197162205,
    random_state=42,
)
dct_benchmark.fit(X_train, y_train)
pred_dct_benchmark = dct_benchmark.predict(X_test)  # original author's timing note: about 1 min

In [12]:
# METRICS (decision tree with explicit hyperparameters)
acc_dct_benchmark = accuracy_score(y_test, pred_dct_benchmark)  # overall accuracy of the decision tree

print(confusion_matrix(y_test, pred_dct_benchmark))
# Classes that are never predicted get precision 0.00 in the report; the
# captured output shows scikit-learn emitting a warning about this.
print(classification_report(y_test, pred_dct_benchmark))
print(acc_dct_benchmark)  # print the accuracy score

# Results almost match the paper, which reports accuracy 53.2 ± 1.16 and
# macro F1 32.6 ± 0.91. Took 4-5 minutes on a MacBook Pro (2.7 GHz Intel Core i5
# processor, 16 GB 1867 MHz DDR3 memory, Intel Iris Graphics 6100 1536 MB).

[[   0    0    0 ...   30  503    0]
 [   0    0    0 ...   50  565    0]
 [   0    0    0 ...   43  544    0]
 ...
 [   0    0    0 ... 1194 2085    0]
 [   0    0    0 ...  309 4706    0]
 [   0    0    0 ...  353  464    0]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       583
           1       0.00      0.00      0.00       710
           2       0.00      0.00      0.00       634
           3       0.22      0.11      0.15       453
           4       0.00      0.00      0.00       205
           5       0.00      0.00      0.00       588
           6       0.00      0.00      0.00       212
           7       0.00      0.00      0.00       441
           8       0.56      0.58      0.57       859
           9       0.00      0.00      0.00       433
          10       0.00      0.00      0.00       337
          11       0.00      0.00      0.00       290
          12       0.00      0.00      0.00       487
          13   

  'precision', 'predicted', average, warn_for)


In [13]:

# Persist the tree fitted with explicit hyperparameters. Relies on `joblib`
# having been imported by an earlier cell.
joblib.dump(
    value=dct_benchmark,
    filename='decisionTree_MOA_FullCurated_Tuned',
)

['decisionTree_MOA_FullCurated_Tuned']