## LINCS MOA classification on the LINCS curated dataset using logistic regression (linear classifier)

In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing  import StandardScaler

In [2]:
# Load the expression table from its HDF5 file (file name suggests the
# GSE92742 "fully restricted" subset).
hdf_path="/opt/raid10/genomics/rashid/GCN/data/GSE92742_fully_restricted.hdf"
file_prostate=pd.read_hdf(hdf_path)

In [3]:
# Move the first seven index levels (0..6) out of the MultiIndex and into
# ordinary columns, so the frame ends up with a single index.
index_levels=list(range(7))
df_prostate=file_prostate.reset_index(level=index_levels)

In [4]:
# Target vector: the "moa" column as a NumPy array.
prostate_y=df_prostate.loc[:,"moa"].values
# Everything that is not a feature: identifiers, annotations, the label itself
# and the fold assignment.
metadata_cols=["pert_id","pert_name","cell_id","primary_site","subtype","moa","Fold"]
df_prostate=df_prostate.drop(columns=metadata_cols)
# Feature matrix: the remaining columns (per the author, values for 978 genes).
# NOTE: this cell overwrites df_prostate, so re-running it without re-running
# the previous cell raises a KeyError on "moa".
prostate_X=df_prostate.values


In [5]:
# Integer-encode the class labels: `labels` holds one code per row and
# `uniques[code]` maps a code back to the original label value.
labels,uniques=pd.factorize(values=prostate_y)
# From here on the target vector is the integer encoding.
prostate_y=labels

In [6]:
sc=StandardScaler()
prostate_X=sc.fit_transform(prostate_X) #normalize feature data

In [7]:
#SPLIT DATA
X_train,X_test,y_train,y_test=train_test_split(prostate_X,prostate_y,test_size=0.2,random_state=42)#split into train and test

In [8]:
# Untuned baseline: SGDClassifier with library-default hyperparameters.
# NOTE: the default loss is 'hinge' (a linear SVM), not logistic regression.
# A fixed random_state makes the run reproducible: SGD shuffles the training
# data every epoch, so an unseeded model gives different metrics on each run.
# 101 is the same seed the tuned model in this notebook uses.
lc = SGDClassifier(random_state=101)
lc.fit(X_train,y_train)
pred_lc=lc.predict(X_test)  #3-4 min



In [9]:
# METRICS for the untuned classifier
# Confusion matrix: rows are true classes, columns are predicted classes.
print(confusion_matrix(y_true=y_test,y_pred=pred_lc))
# Per-class precision / recall / F1 and their averages.
print(classification_report(y_true=y_test,y_pred=pred_lc))
acc_lc=accuracy_score(y_true=y_test,y_pred=pred_lc) # overall accuracy of the linear classifier
print(acc_lc) # accuracy (fraction of correct predictions), not precision
# The results do not quite match the paper, which reports accuracy 63.8 ± 0.52 and macro F1 42.6 ± 1.03.
# Took 3-4 minutes on a MacBook Pro: 2.7 GHz Intel Core i5 processor, 16 GB 1867 MHz DDR3 memory,
# Intel Iris Graphics 6100 1536 MB graphics.

[[  31    1   15   10    9   20   28  125    0]
 [   6  158    3    2    2   13   13   25    0]
 [  22    1  126   29   16   37   37  157    3]
 [  10    0   19   21   10   14   28  116    1]
 [  12    0   10   12   28   16   24  102    0]
 [  16    0   11   11   20  673   39  152    5]
 [  16    1   17   15   13   20  525  206   89]
 [  83    2   61   57   85   87  181 1178   12]
 [   4    0    3    2    0    3  115   40   59]]
              precision    recall  f1-score   support

           0       0.15      0.13      0.14       239
           1       0.97      0.71      0.82       222
           2       0.48      0.29      0.36       428
           3       0.13      0.10      0.11       219
           4       0.15      0.14      0.14       204
           5       0.76      0.73      0.74       927
           6       0.53      0.58      0.55       902
           7       0.56      0.67      0.61      1746
           8       0.35      0.26      0.30       226

    accuracy             

In [None]:
# Persist the untuned classifier so it can be reloaded without retraining.
# Uses the standalone `joblib` package: the vendored `sklearn.externals.joblib`
# was deprecated in scikit-learn 0.21 and removed in 0.23, so importing it
# fails on current releases.
joblib.dump(lc,'linearClassifier_MOA_FullCurated_notTuned')

In [11]:
# Tuned model: logistic regression trained with SGD, using fixed
# hyperparameters (presumably taken from the benchmark being reproduced).
# NOTE(review): loss='log' was renamed to 'log_loss' in scikit-learn 1.1 and the
# old spelling was removed in 1.3; update it when moving to a newer release.
# NOTE(review): per the scikit-learn docs l1_ratio is only used with
# penalty='elasticnet', so it has no effect with penalty='l1'.
lc_benchmark = SGDClassifier(
    loss='log',
    penalty='l1',
    alpha=0.0012311722512335377,
    l1_ratio=0.40558624196055393,
    learning_rate='invscaling',
    eta0=0.00031685190815167104,
    power_t=0.1840171707888663,
    max_iter=1000,
    tol=1e-05,
    random_state=101,
    n_jobs=-1,
)
lc_benchmark.fit(X_train, y_train)
pred_lc_benchmark = lc_benchmark.predict(X_test)  #3-4 min

In [14]:
# METRICS for the tuned classifier
# Confusion matrix: rows are true classes, columns are predicted classes.
print(confusion_matrix(y_true=y_test,y_pred=pred_lc_benchmark))
# Per-class precision / recall / F1 and their averages.
print(classification_report(y_true=y_test,y_pred=pred_lc_benchmark))
acc_lc_benchmark=accuracy_score(y_true=y_test,y_pred=pred_lc_benchmark) # overall accuracy of the tuned linear classifier
print(acc_lc_benchmark) # accuracy (fraction of correct predictions), not precision
# The result matches the actual benchmark test.

[[  10    0   17    2    0    8   21  180    1]
 [   3  167    0    0    0    6    7   39    0]
 [   5    0  157    4    2   26   25  208    1]
 [   0    0   29    0    1   11   19  158    1]
 [   2    1    5    1    8   11   11  165    0]
 [   3    1    6    1    2  661   24  229    0]
 [   1    0    2    3    0    8  625  252   11]
 [   5    1   15    4    5   35   96 1581    4]
 [   0    0    1    0    0    1  172   39   13]]
              precision    recall  f1-score   support

           0       0.34      0.04      0.07       239
           1       0.98      0.75      0.85       222
           2       0.68      0.37      0.48       428
           3       0.00      0.00      0.00       219
           4       0.44      0.04      0.07       204
           5       0.86      0.71      0.78       927
           6       0.62      0.69      0.66       902
           7       0.55      0.91      0.69      1746
           8       0.42      0.06      0.10       226

    accuracy             

In [None]:

# Persist the tuned classifier so it can be reloaded without retraining.
joblib.dump(value=lc_benchmark,filename='linearClassifier_MOA_FullCurated_Tuned')