In [91]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
from sklearn import preprocessing

# Transforming and Splitting Data

In [92]:
# Read the combined expression table (one row per cell line, with a
# `classification` label followed by per-gene expression columns) and
# preview its first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer="data/combined_expression.csv")
df.head(n=5)

Unnamed: 0,CELL_LINE_NAME,classification,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,...,COL15A1,C6orf10,TMEM225,NOTCH4,PBX2,AGER,RNF5,AGPAT1,DFNB59,PRRT1
0,1240121,1,6.419526,3.182094,9.320548,3.759654,3.802619,3.215753,4.698729,7.873672,...,3.245454,2.953508,3.543429,3.352022,4.67231,3.641128,3.13531,3.737072,3.450927,3.1688
1,1240122,2,7.646494,2.626819,10.153853,3.564755,3.942749,3.29076,3.551675,8.252413,...,2.786709,3.077382,3.728232,3.208882,4.58684,3.395654,3.5868,3.519128,3.115323,3.051645
2,1240123,1,8.319417,3.111183,9.643558,4.757258,3.919757,3.602185,3.329644,9.07695,...,3.459089,3.085394,3.462811,3.33903,4.614897,3.395845,3.419193,3.971646,3.72931,3.320022
3,1240124,1,9.006994,3.028173,9.6867,4.280504,3.147646,3.188881,3.293807,8.67879,...,2.835403,2.960303,3.415083,3.290171,4.770123,3.400821,3.383734,3.798107,2.822404,3.297547
4,1240127,1,7.985676,2.694729,10.676134,4.159685,3.804637,3.481942,3.111261,7.555407,...,2.896523,2.849899,3.480114,3.226128,5.83271,3.612179,3.347095,4.457963,5.198524,4.553586


In [99]:
# Every column except the cell-line identifier and the target label is
# treated as a candidate feature; the original column order is kept.
features = [
    column
    for column in df.columns
    if column not in ('CELL_LINE_NAME', 'classification')
]
len(features)

16381

In [100]:
# Feature matrix: one column per entry of `features`, in the same order.
X = df.loc[:, features].values
# Target vector, flattened to one dimension.
Y = np.ravel(df['classification'].values)

In [101]:
# Rescale each column of X with a MinMaxScaler built with default settings.
# The scaler is fitted on the full matrix and X is overwritten with the
# scaled values, so the unscaled matrix is no longer available afterwards.
# NOTE(review): random forests are generally insensitive to monotonic
# per-feature rescaling, so this step may be unnecessary — confirm.
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler.fit(X)
X = min_max_scaler.transform(X)

In [102]:
# Base estimator for Boruta: a shallow, class-balanced random forest using
# all available cores. The Boruta GitHub page advises a tree max_depth of
# roughly 3-7.
rf = RandomForestClassifier(
    max_depth=3,
    class_weight='balanced',
    n_jobs=-1,
)

# verbose=2 prints the confirmed / tentative / rejected counts after every
# iteration; random_state=1 seeds the selector.
boruta_feature_selector = BorutaPy(
    estimator=rf,
    n_estimators='auto',
    perc=99,
    max_iter=50,
    random_state=1,
    verbose=2,
)

# Fit on the whole scaled matrix; the fitted selector is the cell's output.
boruta_feature_selector.fit(X, Y)

Iteration: 	1 / 50
Confirmed: 	0
Tentative: 	16381
Rejected: 	0
Iteration: 	2 / 50
Confirmed: 	0
Tentative: 	16381
Rejected: 	0
Iteration: 	3 / 50
Confirmed: 	0
Tentative: 	16381
Rejected: 	0
Iteration: 	4 / 50
Confirmed: 	0
Tentative: 	16381
Rejected: 	0
Iteration: 	5 / 50
Confirmed: 	0
Tentative: 	16381
Rejected: 	0
Iteration: 	6 / 50
Confirmed: 	0
Tentative: 	16381
Rejected: 	0
Iteration: 	7 / 50
Confirmed: 	0
Tentative: 	16381
Rejected: 	0
Iteration: 	8 / 50
Confirmed: 	0
Tentative: 	1424
Rejected: 	14957
Iteration: 	9 / 50
Confirmed: 	207
Tentative: 	1217
Rejected: 	14957
Iteration: 	10 / 50
Confirmed: 	207
Tentative: 	1217
Rejected: 	14957
Iteration: 	11 / 50
Confirmed: 	207
Tentative: 	1217
Rejected: 	14957
Iteration: 	12 / 50
Confirmed: 	239
Tentative: 	797
Rejected: 	15345
Iteration: 	13 / 50
Confirmed: 	239
Tentative: 	797
Rejected: 	15345
Iteration: 	14 / 50
Confirmed: 	239
Tentative: 	797
Rejected: 	15345
Iteration: 	15 / 50
Confirmed: 	239
Tentative: 	797
Rejected: 	15345


BorutaPy(alpha=0.05,
         estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                          class_weight='balanced',
                                          criterion='gini', max_depth=3,
                                          max_features='auto',
                                          max_leaf_nodes=None, max_samples=None,
                                          min_impurity_decrease=0.0,
                                          min_impurity_split=None,
                                          min_samples_leaf=1,
                                          min_samples_split=2,
                                          min_weight_fraction_leaf=0.0,
                                          n_estimators=1135, n_jobs=-1,
                                          oob_score=False,
                                          random_state=RandomState(MT19937) at 0x1A98C33678,
                                          verbose=0, warm_start=F

In [103]:
# Reduce X to the columns selected by the fitted Boruta selector.
# weak=False (the default) is spelled out here.
X_filtered = boruta_feature_selector.transform(X, weak=False)
np.shape(X_filtered)

(642, 280)

In [104]:
# Map the selector's boolean mask back to feature names.
# `support_` is indexed like the columns of X, which were taken from
# `features` in order, so positions in the mask are positions in `features`.
# np.where on the mask returns a 1-tuple holding the selected positions.
indices = np.where(boruta_feature_selector.support_)

# Iterate the position array directly rather than through np.nditer: nditer
# raises ValueError ("Iteration of zero-sized operands is not enabled") when
# no feature is selected, whereas this simply yields an empty list.
final_features = [features[position] for position in indices[0]]
final_features

['LASP1',
 'KDM1A',
 'CX3CL1',
 'RHBDF1',
 'PSMB1',
 'MRC2',
 'PTBP1',
 'TMEM159',
 'FHL1',
 'NUP160',
 'SKIV2L2',
 'STAU2',
 'ZIC2',
 'GOPC',
 'R3HDM1',
 'MRTO4',
 'NOP58',
 'ZNF280C',
 'CTSA',
 'WDR18',
 'ERBB3',
 'TMEM206',
 'DIP2B',
 'ZNRD1',
 'KIF2A',
 'NUCKS1',
 'TESK2',
 'PDCD2',
 'NDE1',
 'SCARB1',
 'MARK3',
 'FMO4',
 'ANKRD13A',
 'PAG1',
 'TYR',
 'TP53INP2',
 'DUSP12',
 'CD82',
 'BCORL1',
 'SEH1L',
 'DIMT1',
 'TFAP2C',
 'RFX2',
 'KHSRP',
 'C20orf26',
 'TEKT2',
 'CDC5L',
 'CDC7',
 'HNRNPM',
 'PACSIN2',
 'PRMT5',
 'CEP128',
 'KIAA0247',
 'ZMYND8',
 'ARFGAP1',
 'EEA1',
 'MEDAG',
 'ZNF423',
 'USP31',
 'PIH1D1',
 'SF3A2',
 'ISYNA1',
 'TMEM59L',
 'WDR91',
 'COBL',
 'FUBP3',
 'TRDMT1',
 'NPM3',
 'CUEDC2',
 'SLC6A4',
 'MANBA',
 'GAR1',
 'CRYAB',
 'CPT1A',
 'RNGTT',
 'FANCE',
 'RNF8',
 'BAG2',
 'KHDRBS2',
 'E2F3',
 'WASF1',
 'VNN2',
 'MDFI',
 'BYSL',
 'GHR',
 'TCERG1',
 'NCL',
 'ELMOD3',
 'ORC2',
 'SUMO1',
 'FARSB',
 'EBNA1BP2',
 'CD3EAP',
 'CASP8AP2',
 'UBE3D',
 'HEATR1',
 'NEK6',
 'I

In [105]:
# Persist the selected feature names as a single-column CSV (no index column).
# NOTE(review): the file name reads like perc=99 / max_iter=25 / alpha=0.01,
# but the selector in this notebook is built with perc=99, max_iter=50 and
# the default alpha (0.05 in the fitted repr) — confirm the name is intended.
s_feats = pd.DataFrame(data=final_features)
s_feats.to_csv(path_or_buf='cleaned/boruta-99-25-0.01.csv', index=False)