## LINCS MOA Predicition for LINCS curated using K nearest neighbors

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing  import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [2]:
file_prostate=pd.read_hdf("/opt/raid10/genomics/rashid/GCN/data/GSE92742_fully_restricted.hdf")

In [3]:
df_prostate=file_prostate.reset_index(level=[0,1,2,3,4,5,6]) #change from multi index to single index

In [4]:
prostate_y=df_prostate["moa"].values #change moa column to numpy array
df_prostate= df_prostate.drop(columns=["pert_id","pert_name","cell_id","primary_site","subtype","moa","Fold"])
prostate_X=df_prostate.values #get the matrix of feature values for 978 genes


In [5]:
labels,uniques=pd.factorize(prostate_y) #labels are the encodings
prostate_y=labels

In [6]:
sc=StandardScaler()
prostate_X=sc.fit_transform(prostate_X) #normalize feature data

In [7]:
#SPLIT DATA
X_train,X_test,y_train,y_test=train_test_split(prostate_X,prostate_y,test_size=0.2,random_state=42)#split into train and test

In [10]:
knn = KNeighborsClassifier(n_neighbors=13) #with 5 it has 0.41 accuracy,

knn.fit(X_train,y_train)
pred_knn=knn.predict(X_test) #3-4 mins

In [11]:
#METRICS
print(confusion_matrix(y_test,pred_knn))
print(classification_report(y_test,pred_knn))
acc_knn=accuracy_score(y_test,pred_knn) #result for svm
print(acc_knn) #print precision score 
#results doesn't quite matches in the paper where accuracy is 66.5 ± 0.71, macro F1 is 46.2 +/- 0.89. Took 4-5 minutes on macbook pro 2.7 GHz inter core i5 processor, 16 GB 1867 MHz DDR3 Memomry, Intel Iris Graphics 6100 1536 MB Graphics

[[  7   0 110   2   1  22  19  77   1]
 [  3 130  16   1   0  16   8  48   0]
 [  8   0 225   1   3  35  24 130   2]
 [  2   0  84   0   0  26  16  91   0]
 [  4   1  75   0   3  20  19  82   0]
 [  8   1 136   0   1 595  28 158   0]
 [  3   0 219   0   0  65 460 152   3]
 [  9   0 606   0   3 229 179 717   3]
 [  2   0  42   0   0  11 129  31  11]]
              precision    recall  f1-score   support

           0       0.15      0.03      0.05       239
           1       0.98      0.59      0.73       222
           2       0.15      0.53      0.23       428
           3       0.00      0.00      0.00       219
           4       0.27      0.01      0.03       204
           5       0.58      0.64      0.61       927
           6       0.52      0.51      0.52       902
           7       0.48      0.41      0.44      1746
           8       0.55      0.05      0.09       226

    accuracy                           0.42      5113
   macro avg       0.41      0.31      0.30      511

In [None]:
from sklearn.externals import joblib
joblib.dump(knn,'KNN_MOA_FullCurated_notTuned')

In [17]:
knn_benchmark=KNeighborsClassifier(metric='canberra',p=1, weights='distance', n_neighbors= 12)
knn_benchmark.fit(X_train,y_train)
pred_knn_benchmark=knn_benchmark.predict(X_test) #4-5 mins

In [18]:
#METRICS
print(confusion_matrix(y_test,pred_knn_benchmark))
print(classification_report(y_test,pred_knn_benchmark))
acc_knn_benchmark=accuracy_score(y_test,pred_knn_benchmark) #result for svm
print(acc_knn_benchmark) #print precision score 
#results match MIT benchmark experiment  after using tuned hyperparameters from the experiment

[[  14    1   19    8    8    9   11  165    4]
 [   0  160    5    0    2    9    6   40    0]
 [  10    1  133   15   12   21   10  221    5]
 [   7    1   29    4    8    8   10  148    4]
 [   6    1   16    7   16    5    9  141    3]
 [   2    2   18    1    1  784    6  111    2]
 [   0    0   17    5    1   16  680  154   29]
 [  11    2   68    9   11   38   61 1528   18]
 [   2    0    2    0    0    1  139   20   62]]
              precision    recall  f1-score   support

           0       0.27      0.06      0.10       239
           1       0.95      0.72      0.82       222
           2       0.43      0.31      0.36       428
           3       0.08      0.02      0.03       219
           4       0.27      0.08      0.12       204
           5       0.88      0.85      0.86       927
           6       0.73      0.75      0.74       902
           7       0.60      0.88      0.72      1746
           8       0.49      0.27      0.35       226

    accuracy             

In [None]:

joblib.dump(knn_benchmark,'KNN_MOA_FullCurated_Tuned')