In [80]:
from pathlib import Path

import numpy as np
import pandas as pd
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier

# Transforming and Splitting Data

In [81]:
# Load the combined expression table. Besides the expression columns it carries
# 'CELL_LINE_NAME', 'classification' and an 'Unnamed: 0' column (see preview).
df = pd.read_csv(filepath_or_buffer="data/combined_expression.csv")
# Preview the first five rows to sanity-check the layout.
df.head(n=5)

Unnamed: 0,CELL_LINE_NAME,classification,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,...,COL15A1,C6orf10,TMEM225,NOTCH4,PBX2,AGER,RNF5,AGPAT1,DFNB59,PRRT1
0,1240121,5,6.419526,3.182094,9.320548,3.759654,3.802619,3.215753,4.698729,7.873672,...,3.245454,2.953508,3.543429,3.352022,4.67231,3.641128,3.13531,3.737072,3.450927,3.1688
1,1240122,6,7.646494,2.626819,10.153853,3.564755,3.942749,3.29076,3.551675,8.252413,...,2.786709,3.077382,3.728232,3.208882,4.58684,3.395654,3.5868,3.519128,3.115323,3.051645
2,1240123,5,8.319417,3.111183,9.643558,4.757258,3.919757,3.602185,3.329644,9.07695,...,3.459089,3.085394,3.462811,3.33903,4.614897,3.395845,3.419193,3.971646,3.72931,3.320022
3,1240124,1,9.006994,3.028173,9.6867,4.280504,3.147646,3.188881,3.293807,8.67879,...,2.835403,2.960303,3.415083,3.290171,4.770123,3.400821,3.383734,3.798107,2.822404,3.297547
4,1240127,6,7.985676,2.694729,10.676134,4.159685,3.804637,3.481942,3.111261,7.555407,...,2.896523,2.849899,3.480114,3.226128,5.83271,3.612179,3.347095,4.457963,5.198524,4.553586


In [82]:
# Columns that describe a sample rather than measure expression and therefore must
# not be offered to the feature selector:
#   - 'Unnamed: 0'      : the row index (0, 1, 2, ...) that was written into the CSV;
#                         it carries no biological signal and only reflects file order.
#   - 'CELL_LINE_NAME'  : the sample identifier.
#   - 'classification'  : the target label.
NON_FEATURE_COLUMNS = {'Unnamed: 0', 'CELL_LINE_NAME', 'classification'}

# Every remaining column is treated as a candidate feature, in file order.
features = [col for col in df.columns if col not in NON_FEATURE_COLUMNS]
len(features)

16381

In [83]:
# Feature matrix: the columns listed in `features`, as a plain NumPy array.
X = df.loc[:, features].values

# Target vector: the 'classification' column flattened to a 1-D array.
target_column = df['classification']
Y = target_column.values.ravel()

In [84]:
# Base estimator for Boruta. The Boruta GitHub page advises shallow trees
# (max_depth of roughly 3-7); class_weight='balanced' reweights the classes and
# n_jobs=-1 uses every available core.
rf = RandomForestClassifier(
    max_depth=3,
    class_weight='balanced',
    n_jobs=-1,
)

# Boruta wrapper around the forest. n_estimators='auto' lets Boruta pick the
# number of trees itself; random_state=1 fixes the seed and verbose=2 prints the
# per-iteration confirmed/tentative/rejected counts.
boruta_feature_selector = BorutaPy(
    estimator=rf,
    n_estimators='auto',
    perc=99,
    alpha=0.01,
    max_iter=25,
    random_state=1,
    verbose=2,
)

# Run the selection; the fitted selector is the cell's displayed value.
boruta_feature_selector.fit(X, Y)

Iteration: 	1 / 25
Confirmed: 	0
Tentative: 	16381
Rejected: 	0
Iteration: 	2 / 25
Confirmed: 	0
Tentative: 	16381
Rejected: 	0
Iteration: 	3 / 25
Confirmed: 	0
Tentative: 	16381
Rejected: 	0
Iteration: 	4 / 25
Confirmed: 	0
Tentative: 	16381
Rejected: 	0
Iteration: 	5 / 25
Confirmed: 	0
Tentative: 	16381
Rejected: 	0
Iteration: 	6 / 25
Confirmed: 	0
Tentative: 	16381
Rejected: 	0
Iteration: 	7 / 25
Confirmed: 	0
Tentative: 	16381
Rejected: 	0
Iteration: 	8 / 25
Confirmed: 	0
Tentative: 	16381
Rejected: 	0
Iteration: 	9 / 25
Confirmed: 	0
Tentative: 	16381
Rejected: 	0
Iteration: 	10 / 25
Confirmed: 	0
Tentative: 	354
Rejected: 	16027
Iteration: 	11 / 25
Confirmed: 	56
Tentative: 	298
Rejected: 	16027
Iteration: 	12 / 25
Confirmed: 	56
Tentative: 	298
Rejected: 	16027
Iteration: 	13 / 25
Confirmed: 	56
Tentative: 	298
Rejected: 	16027
Iteration: 	14 / 25
Confirmed: 	56
Tentative: 	298
Rejected: 	16027
Iteration: 	15 / 25
Confirmed: 	78
Tentative: 	237
Rejected: 	16066
Iteration: 	16 / 

BorutaPy(alpha=0.01,
         estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                          class_weight='balanced',
                                          criterion='gini', max_depth=3,
                                          max_features='auto',
                                          max_leaf_nodes=None, max_samples=None,
                                          min_impurity_decrease=0.0,
                                          min_impurity_split=None,
                                          min_samples_leaf=1,
                                          min_samples_split=2,
                                          min_weight_fraction_leaf=0.0,
                                          n_estimators=788, n_jobs=-1,
                                          oob_score=False,
                                          random_state=RandomState(MT19937) at 0x1A98C33BA0,
                                          verbose=0, warm_start=Fa

In [85]:
# Reduce X to the columns Boruta selected. weak=False (the library default) is
# spelled out so it is clear that tentative features are not carried along.
X_filtered = boruta_feature_selector.transform(X, weak=False)
# (n_samples, n_selected_features)
X_filtered.shape

(544, 86)

In [86]:
# `support_` is a boolean mask with one entry per column of X, in the same order
# as `features`; True marks a feature Boruta selected.
selected_mask = boruta_feature_selector.support_

# Pair each feature name with its mask entry and keep the selected ones.
# Filtering with the mask directly (rather than np.where + np.nditer) also works
# when nothing was selected: np.nditer raises ValueError on a zero-sized array,
# whereas this simply yields an empty list.
final_features = [name for name, keep in zip(features, selected_mask) if keep]
final_features

['ITGA3',
 'TNFRSF12A',
 'TRAF3IP3',
 'ZCCHC8',
 'BCAR1',
 'DCBLD2',
 'NCKAP1',
 'KIF26A',
 'HLTF',
 'ACAP1',
 'ARHGAP15',
 'CTTN',
 'GNA11',
 'GRAMD1A',
 'SH2D3C',
 'MFNG',
 'MANBAL',
 'CORO1A',
 'TJP1',
 'OLFM2',
 'RASAL3',
 'PIK3CG',
 'MET',
 'HOXA2',
 'WASL',
 'SH3D19',
 'EFEMP1',
 'FHL2',
 'ERRFI1',
 'CD48',
 'LRMP',
 'SPINK4',
 'NCKAP1L',
 'SDC4',
 'MTRR',
 'X06.Sep',
 'NT5C',
 'OPRL1',
 'FAM78A',
 'GMFG',
 'SPIRE1',
 'DOCK2',
 'CTSL1',
 'EHF',
 'FAM129B',
 'YAP1',
 'MYOF',
 'IQGAP1',
 'ESCO1',
 'CYR61',
 'CD53',
 'PTPN7',
 'PRSS23',
 'TEX30',
 'FAM177A1',
 'FLI1',
 'DSPP',
 'ELMO1',
 'ANKLE1',
 'PDPN',
 'KCNJ3',
 'RPL22L1',
 'DYNLT3',
 'GTF2A1',
 'A2ML1',
 'KATNAL2',
 'HRASLS5',
 'PPIC',
 'TM4SF1',
 'FAM174A',
 'NT5DC1',
 'NQO1',
 'OR4A16',
 'P2RY8',
 'IKZF1',
 'C7orf61',
 'ARHGAP30',
 'SPIN4',
 'ZKSCAN4',
 'FBXL22',
 'HIST1H2BH',
 'PARVA',
 'NRAP',
 'S100A6',
 'SNHG12',
 'RCSD1']

In [87]:
# Destination for the selected feature names. NOTE(review): the file name
# presumably mirrors the Boruta settings used above (perc=99, max_iter=25,
# alpha=0.01) - keep the two in sync if those settings change.
output_path = Path('cleaned/boruta-99-25-0.01.csv')

# to_csv does not create missing directories, so make sure the target folder
# exists; otherwise a fresh checkout fails only after the long Boruta run.
output_path.parent.mkdir(parents=True, exist_ok=True)

# One selected feature name per row. The frame is built from a plain list, so
# its single column gets the default label 0, which becomes the CSV header.
s_feats = pd.DataFrame(final_features)
s_feats.to_csv(output_path, index=False)