In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
from sklearn import preprocessing

# Transforming and Splitting Data

In [2]:
# Load the combined expression table from disk and preview its first rows.
# NOTE(review): the preview lists an 'Unnamed: 0' column ahead of
# CELL_LINE_NAME; it looks like a row index saved by an earlier to_csv call
# rather than an expression feature -- confirm before treating it as one.
df = pd.read_csv("data/combined_expression.csv")
df.head()

Unnamed: 0,CELL_LINE_NAME,cluster,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,...,C6orf10,TMEM225,NOTCH4,PBX2,AGER,RNF5,AGPAT1,DFNB59,PRRT1,FKBPL
0,1240123,2,8.319417,3.111183,9.643558,4.757258,3.919757,3.602185,3.329644,9.07695,...,3.085394,3.462811,3.33903,4.614897,3.395845,3.419193,3.971646,3.72931,3.320022,6.447316
1,1240131,1,7.611268,2.704739,10.276079,3.650299,3.481567,3.145538,3.565127,7.861068,...,2.801456,2.985889,3.180068,5.415729,3.299858,3.028414,3.877889,3.911516,3.379405,4.729557
2,1240132,1,7.678658,2.845781,10.180954,3.573048,3.431235,3.090781,4.116643,8.12119,...,2.934962,2.952937,3.164655,5.707506,3.434295,2.961345,4.272194,3.085696,3.002557,5.653588
3,1240134,1,3.265063,3.063746,10.490285,3.340791,3.676912,3.512821,3.873922,8.790851,...,3.041839,3.398847,3.10671,5.773963,3.412641,3.13611,4.422262,3.522122,3.509437,5.953242
4,1240140,1,7.090138,2.988043,10.264692,4.119555,3.432585,3.308033,3.318371,6.927761,...,3.028787,3.225982,3.27582,5.334283,3.864678,3.259242,3.840581,5.809553,3.674587,5.577503


In [3]:
# Every column except identifiers and the label is treated as a feature.
# 'Unnamed: 0' is the row index that an earlier to_csv() call stored in the
# file; it carries no expression signal, and leaving it in lets Boruta rank a
# row counter as if it were a gene. Filtering by name is a no-op when the
# column is absent, so this also works on a CSV written with index=False.
# NOTE: with the index column present, this count is one lower than in runs
# that predate this exclusion.
non_feature_columns = ('Unnamed: 0', 'CELL_LINE_NAME', 'cluster')
features = [f for f in df.columns if f not in non_feature_columns]
len(features)

16382

In [4]:
# Design matrix: one row per sample, one column per entry in `features`.
X = df.loc[:, features].to_numpy()
# Target vector: the cluster label of each sample, flattened to 1-D.
Y = df.loc[:, 'cluster'].to_numpy().ravel()

In [5]:
# Rescale every feature column to the [0, 1] range (MinMaxScaler defaults).
# The scaler learns each column's min/max from X and then replaces X with
# the scaled values.
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler.fit(X)
X = min_max_scaler.transform(X)

In [7]:
# Base learner for Boruta: a class-balanced random forest using all cores.
# The Boruta GitHub page advises a max_depth of roughly 3-7.
rf = RandomForestClassifier(
    max_depth=5,
    class_weight='balanced',
    n_jobs=-1,
)

# n_estimators='auto' lets Boruta choose the forest size itself, verbose=2
# prints the confirmed/tentative/rejected counts on every iteration, and the
# search is capped at 100 iterations with a fixed seed.
boruta_feature_selector = BorutaPy(
    estimator=rf,
    n_estimators='auto',
    max_iter=100,
    random_state=1,
    verbose=2,
)
boruta_feature_selector.fit(X, Y)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	16382
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	16382
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	16382
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	16382
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	16382
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	16382
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	16382
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	1479
Rejected: 	14903
Iteration: 	9 / 100
Confirmed: 	255
Tentative: 	1224
Rejected: 	14903
Iteration: 	10 / 100
Confirmed: 	255
Tentative: 	1224
Rejected: 	14903
Iteration: 	11 / 100
Confirmed: 	255
Tentative: 	1224
Rejected: 	14903
Iteration: 	12 / 100
Confirmed: 	265
Tentative: 	848
Rejected: 	15269
Iteration: 	13 / 100
Confirmed: 	265
Tentative: 	848
Rejected: 	15269
Iteration: 	14 / 100
Confirmed: 	265
Tentative: 	848
Rejected: 	15269
Iteration: 	15 / 100
Confirmed: 	265
Tentative: 	848
Re

BorutaPy(estimator=RandomForestClassifier(class_weight='balanced', max_depth=5,
                                          n_estimators=561, n_jobs=-1,
                                          random_state=RandomState(MT19937) at 0x1A1E8C5990),
         n_estimators='auto', random_state=RandomState(MT19937) at 0x1A1E8C5990,
         verbose=2)

In [8]:
# Boolean mask over the fitted columns: True marks a feature Boruta kept.
# numpy abbreviates the printed array, so the output alone does not show
# which features were selected.
boruta_feature_selector.support_

array([False, False, False, ..., False, False, False])

In [9]:
# Per-feature rank assigned by Boruta, in the same column order as X.
# NOTE(review): BorutaPy documents rank 1 as confirmed and rank 2 as
# tentative -- verify against the installed version.
boruta_feature_selector.ranking_

array([  244,  4614,  4220, ..., 10181, 11398, 15509])

In [10]:
# Reduce X to the columns the fitted selector kept, then show the resulting
# (n_samples, n_selected_features) shape.
X_filtered = boruta_feature_selector.transform(X)
X_filtered.shape

(541, 301)

In [11]:
# Map Boruta's boolean support mask back to column names. Pairing each name
# with its mask entry (instead of walking np.where(...) through np.nditer)
# also works when nothing was selected: np.nditer raises ValueError on a
# zero-sized operand, whereas this simply yields an empty list.
selected_mask = boruta_feature_selector.support_
# The mask is positional, so it must line up with `features` one-to-one;
# zip() would otherwise silently truncate to the shorter of the two.
assert len(features) == len(selected_mask), "features and support_ are out of sync"
final_features = [name for name, keep in zip(features, selected_mask) if keep]
final_features

['FAM214B',
 'ITGA3',
 'TNFRSF12A',
 'ALDH3B1',
 'RHBDF1',
 'CYTH3',
 'HFE',
 'MVP',
 'GPRC5A',
 'CCDC88C',
 'WWTR1',
 'SAMD4A',
 'VIM',
 'CTNNA1',
 'POLR2B',
 'DTNBP1',
 'VAMP3',
 'BCAR1',
 'FOXC1',
 'DCBLD2',
 'NCKAP1',
 'GPC1',
 'CTSA',
 'SUGP2',
 'SNX24',
 'PTPN21',
 'DAZAP1',
 'ACTN1',
 'PPP2R3A',
 'IGF2BP2',
 'NTN4',
 'NUAK1',
 'SEMA3C',
 'RASAL2',
 'FNDC3B',
 'FOSL2',
 'PLD1',
 'RBMS2',
 'EDN1',
 'ITGB5',
 'SMAP2',
 'CD59',
 'CTTN',
 'EPB41L1',
 'SNX5',
 'KDM2B',
 'PXN',
 'LAMB1',
 'TBC1D2',
 'CDC7',
 'KDELR3',
 'VRK1',
 'NOP56',
 'POLA1',
 'PLS3',
 'CORO1A',
 'GABPB1',
 'TJP1',
 'UBR5',
 'CLASRP',
 'RASAL3',
 'SUGP1',
 'TFPI2',
 'OGDH',
 'CAV2',
 'CAV1',
 'MET',
 'HIBADH',
 'SERPINE1',
 'EZH2',
 'PLEKHA1',
 'DKK1',
 'BLMH',
 'ABCC3',
 'DUSP3',
 'TNFAIP1',
 'SH3D19',
 'CCND1',
 'PRPF19',
 'ARHGEF17',
 'CPSF6',
 'RPS12',
 'AMOTL2',
 'FHL2',
 'RND3',
 'EPAS1',
 'RPL22',
 'ERRFI1',
 'F3',
 'ARID1A',
 'RAB32',
 'MYB',
 'CTGF',
 'LTBP2',
 'AVPI1',
 'RCL1',
 'TGFBI',
 'B4GALT4',
 'KHD

In [12]:
# Persist the selected feature names, one per row, without the row index.
# The single column is left unnamed, so pandas writes its default label (0)
# as the header line of the file.
s_feats = pd.DataFrame(data=final_features)
s_feats.to_csv(path_or_buf='cleaned/boruta.csv', index=False)