## LINCS MOA Predicition for LINCS curated using K nearest neighbors

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing  import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [2]:
file_prostate=pd.read_hdf("/opt/raid10/genomics/rashid/GCN/data/GSE92742_fully_restricted.hdf")

In [3]:
df_prostate=file_prostate.reset_index(level=[0,1,2,3,4,5,6]) #change from multi index to single index

In [4]:
prostate_y=df_prostate["moa"].values #change moa column to numpy array
df_prostate= df_prostate.drop(columns=["pert_id","pert_name","cell_id","primary_site","subtype","moa","Fold"])
prostate_X=df_prostate.values #get the matrix of feature values for 978 genes


In [5]:
labels,uniques=pd.factorize(prostate_y) #labels are the encodings
prostate_y=labels

In [6]:
sc=StandardScaler()
prostate_X=sc.fit_transform(prostate_X) #normalize feature data

In [7]:
#SPLIT DATA
X_train,X_test,y_train,y_test=train_test_split(prostate_X,prostate_y,test_size=0.2,random_state=42)#split into train and test

In [8]:
knn = KNeighborsClassifier(n_neighbors=13) #with 5 it has 0.41 accuracy,

knn.fit(X_train,y_train)
pred_knn=knn.predict(X_test) #3-4 mins

In [9]:
#METRICS
print(confusion_matrix(y_test,pred_knn))
print(classification_report(y_test,pred_knn))
acc_knn=accuracy_score(y_test,pred_knn) #result for svm
print(acc_knn) #print precision score 
#results doesn't quite matches in the paper where accuracy is 66.5 ± 0.71, macro F1 is 46.2 +/- 0.89. Took 4-5 minutes on macbook pro 2.7 GHz inter core i5 processor, 16 GB 1867 MHz DDR3 Memomry, Intel Iris Graphics 6100 1536 MB Graphics

[[  19   17    3 ...  101  173    7]
 [  20   20    5 ...  107  186    7]
 [  20   17    8 ...   98  203    2]
 ...
 [  29   12    5 ... 2303  353   57]
 [ 122   48   17 ...  921 1859   50]
 [   4    1    2 ...  546   52   67]]
              precision    recall  f1-score   support

           0       0.02      0.03      0.03       583
           1       0.05      0.03      0.04       710
           2       0.05      0.01      0.02       634
           3       0.34      0.08      0.13       453
           4       0.03      0.04      0.03       205
           5       0.04      0.04      0.04       588
           6       0.32      0.08      0.12       212
           7       0.04      0.02      0.02       441
           8       0.93      0.50      0.65       859
           9       0.03      0.01      0.01       433
          10       0.01      0.00      0.00       337
          11       0.02      0.02      0.02       290
          12       0.08      0.02      0.03       487
          13   

In [10]:
from sklearn.externals import joblib
joblib.dump(knn,'KNN_MOA_FullCurated_notTuned')

['KNN_MOA_FullCurated_notTuned']

In [11]:
knn_benchmark=KNeighborsClassifier(metric='canberra',p=1, weights='distance', n_neighbors= 12)
knn_benchmark.fit(X_train,y_train)
pred_knn_benchmark=knn_benchmark.predict(X_test) #4-5 mins

In [12]:
#METRICS
print(confusion_matrix(y_test,pred_knn_benchmark))
print(classification_report(y_test,pred_knn_benchmark))
acc_knn_benchmark=accuracy_score(y_test,pred_knn_benchmark) #result for svm
print(acc_knn_benchmark) #print precision score 
#results match MIT benchmark experiment  after using tuned hyperparameters from the experiment

[[  27   32   16 ...   56  233    5]
 [  22   50    9 ...   72  281    5]
 [  12   29   47 ...   76  242    8]
 ...
 [   7    2    8 ... 2918  300  124]
 [  33   43   22 ...  552 3804   70]
 [   0    0    0 ...  532   31  264]]
              precision    recall  f1-score   support

           0       0.08      0.05      0.06       583
           1       0.09      0.07      0.08       710
           2       0.12      0.07      0.09       634
           3       0.37      0.20      0.26       453
           4       0.11      0.02      0.04       205
           5       0.07      0.04      0.05       588
           6       0.53      0.14      0.22       212
           7       0.08      0.03      0.04       441
           8       0.88      0.65      0.75       859
           9       0.08      0.03      0.05       433
          10       0.04      0.01      0.02       337
          11       0.03      0.02      0.02       290
          12       0.07      0.04      0.05       487
          13   

In [13]:

joblib.dump(knn_benchmark,'KNN_MOA_FullCurated_Tuned')

['KNN_MOA_FullCurated_Tuned']