In [42]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn import preprocessing

In [43]:
data = pd.read_csv("data/combined_expression.csv")

In [44]:
data.head()

Unnamed: 0,CELL_LINE_NAME,classification,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,...,COL15A1,C6orf10,TMEM225,NOTCH4,PBX2,AGER,RNF5,AGPAT1,DFNB59,PRRT1
0,1240121,1,6.419526,3.182094,9.320548,3.759654,3.802619,3.215753,4.698729,7.873672,...,3.245454,2.953508,3.543429,3.352022,4.67231,3.641128,3.13531,3.737072,3.450927,3.1688
1,1240122,2,7.646494,2.626819,10.153853,3.564755,3.942749,3.29076,3.551675,8.252413,...,2.786709,3.077382,3.728232,3.208882,4.58684,3.395654,3.5868,3.519128,3.115323,3.051645
2,1240123,1,8.319417,3.111183,9.643558,4.757258,3.919757,3.602185,3.329644,9.07695,...,3.459089,3.085394,3.462811,3.33903,4.614897,3.395845,3.419193,3.971646,3.72931,3.320022
3,1240124,1,9.006994,3.028173,9.6867,4.280504,3.147646,3.188881,3.293807,8.67879,...,2.835403,2.960303,3.415083,3.290171,4.770123,3.400821,3.383734,3.798107,2.822404,3.297547
4,1240127,1,7.985676,2.694729,10.676134,4.159685,3.804637,3.481942,3.111261,7.555407,...,2.896523,2.849899,3.480114,3.226128,5.83271,3.612179,3.347095,4.457963,5.198524,4.553586


In [45]:
X = data.drop(columns=['CELL_LINE_NAME', 'classification'])
y = data['classification']
feat_labels = list(X.columns)

In [46]:
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(X)
X = pd.DataFrame(x_scaled)

In [47]:
# 20% test, 80% train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [48]:
# create and train the classifier
clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [49]:
# associating each feature with its relative importance
feat_importances = list(zip(feat_labels, clf.feature_importances_))

In [60]:
# sorting the feature importances
def sort_tuple(tup):
    tup.sort(key = lambda x: x[1], reverse=True)
    return tup
sort_tuple(feat_importances)

[('TIAL1', 0.0021065491682396814),
 ('LRIT2', 0.0020019450295935067),
 ('ERBB3', 0.0018976702881125262),
 ('UCK2', 0.0018677106994603632),
 ('CTSA', 0.0017387772508319545),
 ('FRA10AC1', 0.0014497532246973014),
 ('CCT3', 0.001417116143827872),
 ('RHOC', 0.0014054444495184423),
 ('NOP58', 0.0013459054137422276),
 ('DEXI', 0.0013320257992684007),
 ('AMBN', 0.001327802470173914),
 ('ATOX1', 0.0013250772416098925),
 ('MED15', 0.0013166147435092767),
 ('TMEM159', 0.0013016221769980006),
 ('C19orf54', 0.00128897912193914),
 ('RASGEF1A', 0.0012772804470981968),
 ('TTPAL', 0.0012756813469479956),
 ('PLAGL1', 0.0012194142058848782),
 ('FAM98B', 0.0011940232441150617),
 ('ECH1', 0.0011875823000959032),
 ('FAM199X', 0.0011798965959089406),
 ('MNF1', 0.001178985151762741),
 ('HOMER1', 0.0011703323280341862),
 ('HPS5', 0.001153268806665033),
 ('NCL', 0.0011403490467444348),
 ('KHDRBS2', 0.0011295106001797724),
 ('EEPD1', 0.001129119230012539),
 ('TESK2', 0.0011209069263379316),
 ('PTGES3L.AARSD1', 

In [64]:
# creating selector object that uses random forest classifier to identify features
# importance of more than e-04 will be used to select features
sfm = SelectFromModel(clf, threshold=1e-3)
sfm.fit(X_train, y_train)

SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                                 class_weight=None,
                                                 criterion='gini',
                                                 max_depth=None,
                                                 max_features='auto',
                                                 max_leaf_nodes=None,
                                                 max_samples=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100, n_jobs=-1,
                                                 oob_score=False,
 

In [65]:
selected_feat= X_train.columns[(sfm.get_support())]

In [66]:
x = list(selected_feat)
len(x)

40

In [67]:
# transforming data to create new dataset containing only the most important features
X_important_train = sfm.transform(X_train)
X_important_test = sfm.transform(X_test)

In [68]:
# creating and training new random forest classifier for most important features
clf_important = RandomForestClassifier(n_estimators=X.shape[1], random_state=0, n_jobs=-1)
clf_important.fit(X_important_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=16381,
                       n_jobs=-1, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [69]:
# apply the full-featured classifier to test data
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.5968992248062015

In [70]:
# apply important features classifier to test data
y_important_pred = clf_important.predict(X_important_test)
accuracy_score(y_test, y_important_pred)

0.6511627906976745

In [59]:
s_feats = pd.DataFrame(selected_feat)
s_feats.to_csv('cleaned/selected_genes.csv', index=False)