In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from boruta import BorutaPy
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier

# Transforming and Splitting Data

In [2]:
# Read the combined expression table (one row per cell line) and preview the first rows.
expression_csv = "data/combined_expression.csv"
df = pd.read_csv(expression_csv)
df.head()

Unnamed: 0,CELL_LINE_NAME,cluster,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,...,C6orf10,TMEM225,NOTCH4,PBX2,AGER,RNF5,AGPAT1,DFNB59,PRRT1,FKBPL
0,1240123,2,8.319417,3.111183,9.643558,4.757258,3.919757,3.602185,3.329644,9.07695,...,3.085394,3.462811,3.33903,4.614897,3.395845,3.419193,3.971646,3.72931,3.320022,6.447316
1,1240131,1,7.611268,2.704739,10.276079,3.650299,3.481567,3.145538,3.565127,7.861068,...,2.801456,2.985889,3.180068,5.415729,3.299858,3.028414,3.877889,3.911516,3.379405,4.729557
2,1240132,1,7.678658,2.845781,10.180954,3.573048,3.431235,3.090781,4.116643,8.12119,...,2.934962,2.952937,3.164655,5.707506,3.434295,2.961345,4.272194,3.085696,3.002557,5.653588
3,1240134,1,3.265063,3.063746,10.490285,3.340791,3.676912,3.512821,3.873922,8.790851,...,3.041839,3.398847,3.10671,5.773963,3.412641,3.13611,4.422262,3.522122,3.509437,5.953242
4,1240140,1,7.090138,2.988043,10.264692,4.119555,3.432585,3.308033,3.318371,6.927761,...,3.028787,3.225982,3.27582,5.334283,3.864678,3.259242,3.840581,5.809553,3.674587,5.577503


In [3]:
# Columns that are identifiers or labels rather than gene-expression measurements.
# 'Unnamed: 0' is pandas' auto-generated name for the header-less first CSV column;
# in the preview it holds a running row number (0, 1, 2, ...), so leaving it in would
# offer the row order to Boruta as a candidate feature.
non_feature_columns = {'Unnamed: 0', 'CELL_LINE_NAME', 'cluster'}

# Keep the original column order so positions in X line up with `features`.
features = [f for f in df.columns if f not in non_feature_columns]
len(features)

16382

In [4]:
# Design matrix: one column per entry of `features`, in the same order.
X = df.loc[:, features].values
# Target labels: the 'cluster' column flattened to a 1-D array.
Y = np.ravel(df['cluster'].values)

In [5]:
# Learn each column's min/max from X, then rescale X with the fitted scaler.
min_max_scaler = preprocessing.MinMaxScaler().fit(X)
X = min_max_scaler.transform(X)

In [None]:
# Base estimator for Boruta. The Boruta GitHub page advises shallow trees
# (max_depth roughly 3-7), hence max_depth=5.
rf = RandomForestClassifier(
    n_jobs=-1,
    class_weight='balanced',
    max_depth=5,
)

# Wrap the forest in Boruta: tree count chosen automatically, progress printed
# every iteration, fixed seed, at most 100 iterations.
boruta_feature_selector = BorutaPy(
    rf,
    n_estimators='auto',
    verbose=2,
    random_state=1,
    max_iter=100,
)
boruta_feature_selector.fit(X, Y)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	16382
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	16382
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	16382
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	16382
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	16382
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	16382
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	16382
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	5858
Tentative: 	10524
Rejected: 	0
Iteration: 	9 / 100
Confirmed: 	5858
Tentative: 	10524
Rejected: 	0


In [None]:
# Inspect the selection mask: one entry per column of X, used further down as a
# boolean mask to pick names out of `features`.
# NOTE(review): presumably True marks a feature Boruta confirmed - verify against the BorutaPy docs.
boruta_feature_selector.support_

In [None]:
# Inspect the per-feature ranking produced by the fit.
# NOTE(review): per the BorutaPy docs rank 1 should mean confirmed and rank 2 tentative -
# confirm for the installed version.
boruta_feature_selector.ranking_

In [None]:
# Reduce X to the columns the fitted selector keeps, then show the resulting shape.
# NOTE(review): presumably only confirmed (not tentative) features are kept by default -
# verify against the BorutaPy `transform` docs.
X_filtered = boruta_feature_selector.transform(X)
X_filtered.shape

In [None]:
# Map Boruta's boolean support mask back to the column names in `features`.
# Pairing each name with its mask entry (instead of np.where + np.nditer) also
# works when the mask has no True entries: np.nditer raises ValueError on a
# zero-sized index array, which would crash this cell if nothing was selected.
final_features = [
    name
    for name, is_selected in zip(features, boruta_feature_selector.support_)
    if is_selected
]
final_features

In [None]:
# Persist the selected feature names as a single-column CSV (no index column).
s_feats = pd.DataFrame(final_features)

output_path = Path('cleaned/boruta.csv')
# to_csv does not create missing directories, so make sure 'cleaned/' exists
# before writing; otherwise the Boruta results are lost to an OSError.
output_path.parent.mkdir(parents=True, exist_ok=True)
s_feats.to_csv(output_path, index=False)