## LINCS MOA classification on the curated LINCS dataset using logistic regression (linear classifier)

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing  import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [2]:
# Load the expression data from its HDF5 file into a pandas object.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this file exists at exactly this location.
hdf_path = "/opt/raid10/genomics/rashid/GCN/data/GSE92742_fully_restricted.hdf"
file_prostate = pd.read_hdf(hdf_path)

In [3]:
# Turn the first seven index levels (positions 0-6) into ordinary columns so
# they can be selected and dropped by name in the following cell.
index_levels = list(range(7))
df_prostate = file_prostate.reset_index(level=index_levels)

In [4]:
# Columns that describe the sample rather than measure it; they are removed
# from the feature matrix below.
metadata_columns = [
    "pert_id",
    "pert_name",
    "cell_id",
    "primary_site",
    "subtype",
    "moa",
    "Fold",
]

# Target vector: the "moa" column as a NumPy array.
prostate_y = df_prostate["moa"].values

# NOTE(review): df_prostate is rebound without the "moa" column, so running
# this cell a second time raises a KeyError on the line above. Re-run the
# reset_index cell first if this cell needs to be executed again.
df_prostate = df_prostate.drop(columns=metadata_columns)

# Feature matrix: every remaining column (per the original author's note,
# the expression values of 978 genes -- TODO confirm the column count).
prostate_X = df_prostate.values


In [5]:
# Encode the MOA labels as integer codes.
# `labels` holds one integer code per sample; `uniques` holds the distinct
# original values, so uniques[code] recovers the original label.
factorized = pd.factorize(prostate_y)
labels = factorized[0]
uniques = factorized[1]

# From here on the target vector is the integer encoding.
prostate_y = labels

In [6]:
# Standardize the feature matrix column by column with StandardScaler.
# NOTE(review): the scaler is fitted on the complete matrix, i.e. before the
# train/test split performed in a later cell, so statistics of the test rows
# influence the scaling of the training rows. Consider fitting on the
# training split only.
sc = StandardScaler()
sc.fit(prostate_X)
prostate_X = sc.transform(prostate_X)

In [7]:
# Split the data: 80% of the samples for training, 20% held out for testing.
# The fixed random_state keeps the partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    prostate_X,
    prostate_y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Baseline: linear classifier trained with SGD using scikit-learn's default
# hyper-parameters (no tuning).
# A fixed random_state is passed because SGD training is stochastic; without
# it every run yields a different model and different metrics, which makes
# the numbers printed below impossible to reproduce. 42 matches the seed used
# for the train/test split.
# NOTE(review): no `loss` is given, so the scikit-learn default applies --
# confirm that this is the intended model for a "logistic regression"
# baseline (the tuned model further down sets loss='log' explicitly).
lc = SGDClassifier(random_state=42)
lc.fit(X_train, y_train)
pred_lc = lc.predict(X_test)  # author's note: this cell takes about 3-4 min



In [9]:
# Metrics for the untuned linear classifier on the held-out test split.
confusion_lc = confusion_matrix(y_test, pred_lc)
print(confusion_lc)

report_lc = classification_report(y_test, pred_lc)
print(report_lc)

# Overall accuracy (fraction of test samples classified correctly).
acc_lc = accuracy_score(y_test, pred_lc)
print(acc_lc)

# Author's note: these results do not quite match the paper, which reports an
# accuracy of 63.8 ± 0.52 and a macro F1 of 42.6 ± 1.03.
# Run time was 3-4 minutes on a MacBook Pro (2.7 GHz Intel Core i5 processor,
# 16 GB 1867 MHz DDR3 memory, Intel Iris Graphics 6100 with 1536 MB).

[[   0    3    1 ...   15  530    0]
 [   1    4    3 ...   12  635    0]
 [   2    3   10 ...   20  563    0]
 ...
 [   0    8    3 ... 1765 1484  105]
 [   3   24   14 ...  223 4635    2]
 [   0    0    0 ...  329  403   94]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       583
           1       0.04      0.01      0.01       710
           2       0.16      0.02      0.03       634
           3       0.72      0.13      0.22       453
           4       0.00      0.00      0.00       205
           5       0.07      0.00      0.00       588
           6       1.00      0.03      0.06       212
           7       0.00      0.00      0.00       441
           8       0.95      0.58      0.72       859
           9       0.00      0.00      0.00       433
          10       0.00      0.00      0.00       337
          11       0.00      0.00      0.00       290
          12       0.22      0.00      0.01       487
          13   

  'precision', 'predicted', average, warn_for)


In [10]:
# Persist the untuned classifier to disk so it can be reloaded without
# retraining.
# `sklearn.externals.joblib` was deprecated and later removed from
# scikit-learn, so prefer the standalone `joblib` package and fall back to the
# vendored copy only on old installations where standalone joblib is missing.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

joblib.dump(lc,'linearClassifier_MOA_FullCurated_notTuned')

['linearClassifier_MOA_FullCurated_notTuned']

In [11]:
# Tuned ("benchmark") configuration of the SGD linear classifier, using a
# logistic loss with an L1 penalty and a fixed seed for reproducibility.
# NOTE(review): according to the scikit-learn documentation `l1_ratio` is only
# used when penalty='elasticnet'; with penalty='l1' it appears to have no
# effect -- confirm this is intended.
benchmark_params = dict(
    loss='log',
    penalty='l1',
    alpha=0.0012311722512335377,
    l1_ratio=0.40558624196055393,
    learning_rate='invscaling',
    eta0=0.00031685190815167104,
    power_t=0.1840171707888663,
    max_iter=1000,
    tol=1e-05,
    random_state=101,
    n_jobs=-1,
)
lc_benchmark = SGDClassifier(**benchmark_params)
lc_benchmark.fit(X_train, y_train)
pred_lc_benchmark = lc_benchmark.predict(X_test)  # author's note: about 3-4 min

In [12]:
# Metrics for the tuned (benchmark) linear classifier on the held-out test split.
confusion_benchmark = confusion_matrix(y_test, pred_lc_benchmark)
print(confusion_benchmark)

report_benchmark = classification_report(y_test, pred_lc_benchmark)
print(report_benchmark)

# Overall accuracy (fraction of test samples classified correctly).
acc_lc_benchmark = accuracy_score(y_test, pred_lc_benchmark)
print(acc_lc_benchmark)

# Author's note: this result matches the actual benchmark test.

[[   0    0    1 ...   48  476    1]
 [   0    0    0 ...   53  574    1]
 [   0    0    1 ...   68  519    1]
 ...
 [   0    0    1 ... 2559  861    3]
 [   0    0    1 ...  398 4711    5]
 [   0    0    0 ...  688  138    6]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       583
           1       0.00      0.00      0.00       710
           2       0.10      0.00      0.00       634
           3       0.55      0.19      0.28       453
           4       0.00      0.00      0.00       205
           5       0.00      0.00      0.00       588
           6       1.00      0.02      0.05       212
           7       0.00      0.00      0.00       441
           8       0.83      0.71      0.77       859
           9       0.00      0.00      0.00       433
          10       0.00      0.00      0.00       337
          11       0.00      0.00      0.00       290
          12       0.00      0.00      0.00       487
          13   

  'precision', 'predicted', average, warn_for)


In [13]:

# Persist the tuned classifier to disk so it can be reloaded without
# retraining. Relies on `joblib` having been imported in an earlier cell.
tuned_model_file = 'linearClassifier_MOA_FullCurated_Tuned'
joblib.dump(lc_benchmark, tuned_model_file)

['linearClassifier_MOA_FullCurated_Tuned']