## MOA prediction on the LINCS Curated dataset using a Random Forest Classifier

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing  import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [None]:
# Load the expression table from HDF5.
# NOTE(review): the path is absolute and machine-specific; adjust it when
# running this notebook on another host.
file_prostate = pd.read_hdf(
    "/opt/raid10/genomics/rashid/GCN/data/GSE92742_fully_restricted.hdf"
)

In [None]:
# Preview the first 100 rows of the loaded table.
file_prostate.head(n=100)

In [None]:
# Flatten the multi-level index: index levels 0 through 6 become regular
# columns of the frame.
df_prostate = file_prostate.reset_index(level=list(range(7)))
# Preview the first 20 rows of the flattened frame.
df_prostate.head(n=20)

In [None]:
# Number of samples per MOA label.
# Original author's note: 9 MOA classes.
# TODO (open question from the original author): what does the "Fold" column
# in this dataframe represent?
df_prostate["moa"].value_counts()

In [None]:
# Number of samples per perturbagen name.
# Original author's note: 239 perturbagens.
df_prostate["pert_name"].value_counts()

In [None]:
# Number of samples per cell line.
# Original author's note: 2 cell lines.
df_prostate["cell_id"].value_counts()

In [None]:
# Number of samples per primary site.
# Original author's note: a single primary site (prostate) with 25565 samples.
df_prostate["primary_site"].value_counts()

In [None]:
# Number of samples per subtype.
# Original author's note: 2 subtypes.
df_prostate["subtype"].value_counts()

In [None]:
# Extract the MOA column as a NumPy array. This is the classification target
# (the labels to be predicted), not a predictor.
prostate_y = df_prostate.moa.values
prostate_y

In [None]:
# Remove the metadata and label columns so only the feature columns remain.
df_prostate = df_prostate.drop(
    [
        "pert_id",
        "pert_name",
        "cell_id",
        "primary_site",
        "subtype",
        "moa",
        "Fold",
    ],
    axis="columns",
)
# Feature matrix as a NumPy array; per the original author's note these are
# the values of 978 genes.
prostate_X = df_prostate.values
prostate_X

In [None]:
# Integer-encode the MOA labels: `labels` holds one code per sample and
# `uniques` holds the distinct MOA values, indexed by code.
labels, uniques = pd.factorize(prostate_y)
prostate_y = labels  # the integer codes are the target from here on
print(uniques)
print(labels)

In [None]:
# Standardize each feature column with StandardScaler.
# NOTE(review): the scaler is fitted on the full matrix before the train/test
# split, so the held-out rows contribute to the scaling statistics.
sc = StandardScaler().fit(prostate_X)
prostate_X = sc.transform(prostate_X)
prostate_X

In [None]:
# Split the data: hold out 20% of the samples for testing. The seed is fixed
# so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    prostate_X,
    prostate_y,
    test_size=0.2,
    random_state=42,
)

### Random Forest Classifier

In [None]:
# Baseline random forest with 200 trees; every other hyperparameter is left
# at its scikit-learn default.
# random_state is pinned (same seed as the train/test split) so that the
# fitted forest, its predictions, the reported metrics and the saved model
# are repeatable from run to run.
rfc = RandomForestClassifier(n_estimators=200, random_state=42)
rfc.fit(X_train, y_train)  # train on the training portion only
pred_rfc = rfc.predict(X_test)  # predicted class codes for the held-out samples

In [None]:
# Peek at the first 20 predicted class codes.
pred_rfc[0:20]

In [None]:
#METRICS
print(confusion_matrix(y_test,pred_rfc))
print(classification_report(y_test,pred_rfc))
cm=accuracy_score(y_test,pred_rfc) #result for randomforest
print(cm) #print precision score 
#results matches in the paper where accuracy is 60.4+/-0.48, macro F1 is 37.4 +/- 0.41. Took 4-5 minutes on macbook pro 2.7 GHz inter core i5 processor, 16 GB 1867 MHz DDR3 Memomry, Intel Iris Graphics 6100 1536 MB Graphics

In [None]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23. Import the standalone `joblib` package (a scikit-learn dependency)
# and fall back to the bundled copy only on very old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted baseline forest; the relative path resolves against the
# current working directory.
joblib.dump(rfc, 'randomForest_MOA_FullCurated')

In [None]:
# Second random forest with hard-coded hyperparameters.
# NOTE(review): these values look like the output of a tuning run (the model
# is later saved with a "_Tuned" suffix); confirm their provenance.
# random_state is pinned (same seed as the train/test split) so that the
# fitted forest, its predictions, the reported metrics and the saved model
# are repeatable from run to run.
rfc_benchmark = RandomForestClassifier(
    n_estimators=211,
    criterion='gini',
    max_depth=100,
    max_leaf_nodes=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=1.2722643563513202e-06,
    min_impurity_decrease=1.6979071770883573e-05,
    random_state=42,
)
rfc_benchmark.fit(X_train, y_train)  # train on the training portion only
# Predicted class codes for the held-out samples.
pred_rfc_benchmark = rfc_benchmark.predict(X_test)

In [None]:
#METRICS
print(confusion_matrix(y_test,pred_rfc_benchmark))
print(classification_report(y_test,pred_rfc_benchmark))
cm_benchmark=accuracy_score(y_test,pred_rfc_benchmark) #result for randomforest
print(cm_benchmark) #print precision score 
#results matches in the paper where accuracy is 60.4+/-0.48, macro F1 is 37.4 +/- 0.41. Took 4-5 minutes on macbook pro 2.7 GHz inter core i5 processor, 16 GB 1867 MHz DDR3 Memomry, Intel Iris Graphics 6100 1536 MB Graphics

In [None]:
# Persist the fitted benchmark forest; the relative path resolves against the
# current working directory. Relies on `joblib` having been imported by an
# earlier cell.
joblib.dump(rfc_benchmark, filename='randomForest_MOA_FullCurated_Tuned')