In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import fnmatch

from sklearn.linear_model import LassoCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV, SelectKBest, f_classif
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('./data/clean_data_fe.csv')
df.head()

Unnamed: 0,var8,a.2,a.4,var5,b.5,b.6,b.7,b.8,b.9,b.12,...,month_6,month_7,month_8,month_9,month_10,month_11,month_12,var9_Mono,var9_Multi,var9_nan
0,2.0,5,57,34,1,2.0,1,1.0,2,2,...,0,0,0,0,0,0,0,1,0,0
1,2.0,4,57,34,2,3.0,1,1.0,2,2,...,0,0,0,0,0,0,0,1,0,0
2,2.0,5,57,42,2,1.0,1,1.0,1,1,...,0,0,0,0,0,0,0,0,1,0
3,2.0,6,57,34,2,4.0,1,1.0,2,2,...,0,0,0,0,0,0,0,1,0,0
4,2.0,8,22,1,1,2.0,1,2.0,1,1,...,0,0,0,0,0,0,0,0,1,0


In [3]:
target = 'pov6'
col_ls = df.columns.tolist()
col_ls.remove(target)
col_ls

X = df[col_ls].copy()
y = df[[target]].copy()

In [4]:
X_train, X_tempt, y_train, y_tempt = train_test_split( X, y, test_size=0.4, random_state=42, stratify=y)
X_test, X_val, y_test, y_val = train_test_split( X_tempt, y_tempt, test_size=0.5, random_state=42,stratify=y_tempt)

In [5]:
X_train.shape, X_test.shape, X_val.shape, X_train.shape, y_test.shape, y_val.shape

((11027, 261), (3676, 261), (3676, 261), (11027, 261), (3676, 1), (3676, 1))

## Feature selection through voting

In [6]:
df_feature = pd.DataFrame(col_ls, columns=['features'])  
df_feature.head()

Unnamed: 0,features
0,var8
1,a.2
2,a.4
3,var5
4,b.5


### 1. df_corr

In [None]:
target = y.columns[0]
X_corr = pd.concat([X_train, y_train], axis=1)
df_corr = X_corr.corr()
df_corr_target = df_corr[target].abs().sort_values(ascending = False).to_frame()
df_corr_target.drop(index=target,inplace=True)
df_corr_target
df_corr_target.reset_index(inplace=True)
df_corr_target.columns = ['features', 'corr']
df_feature = pd.merge(df_feature, df_corr_target, on='features', how='left')

### 2. LassoCV

In [None]:
lasso_model = LassoCV(alphas=[0.1, 1.0, 10.0],cv=StratifiedKFold(10), max_iter=3000)
lasso_model.fit(X_train, y_train)
print('lasso_cv score = ',lasso_model.score(X_train,y_train))
df_lasso = pd.DataFrame(list(zip(X_train.columns,abs(lasso_model.coef_))), columns=['features', 'lasso'])
df_lasso[df_lasso['lasso']!= 0].shape
df_feature = pd.merge(df_feature, df_lasso, on='features', how='left')

  y = column_or_1d(y, warn=True)
  model = cd_fast.enet_coordinate_descent_gram(


lasso_cv score =  0.32243052472121714


In [None]:
df_feature

Unnamed: 0,features,corr,lasso
0,var8,0.018433,0.000000
1,a.2,0.050498,0.000000
2,a.4,0.039626,0.000667
3,var5,0.029138,0.000454
4,b.5,0.193846,0.000000
...,...,...,...
256,month_11,0.016806,0.000000
257,month_12,0.012828,0.000000
258,var9_Mono,0.160413,0.000000
259,var9_Multi,0.155672,0.000000


### 3. Selectkbest

In [None]:
select_k = SelectKBest(score_func=f_classif, k = 30)
fit = select_k.fit(X_train, y_train)
df_selectkbest = pd.DataFrame(list(zip(X_train.columns,fit.scores_)), columns=['features', 'k_best'])
df_selectkbest.sort_values('k_best', ascending=False, inplace = True)
# df_selectkbest.plot(kind ='barh')
df_feature = pd.merge(df_feature, df_selectkbest, on='features', how='left')

  y = column_or_1d(y, warn=True)
  f = msb / msw


### 4. Feature importance - Random Forest

In [20]:
rf = RandomForestClassifier(n_estimators=500)
rf.fit(X_train, y_train)
df_feat_impt = pd.DataFrame(list(zip(rf.feature_names_in_,rf.feature_importances_)), columns=['features', 'rf-importance'])
df_feature = pd.merge(df_feature, df_feat_impt, on='features', how='left')

  rf.fit(X_train, y_train)


In [21]:
df_feature

Unnamed: 0,features,corr,lasso,k_best,rf-importance
0,var8,0.018433,0.000000,1.601460,0.002567
1,a.2,0.050498,0.000000,73.303880,0.007101
2,a.4,0.039626,0.000667,5.673678,0.004935
3,var5,0.029138,0.000454,5.091987,0.005190
4,b.5,0.193846,0.000000,204.162117,0.004541
...,...,...,...,...,...
256,month_11,0.016806,0.000000,0.652200,0.000595
257,month_12,0.012828,0.000000,2.566551,0.000920
258,var9_Mono,0.160413,0.000000,71.158401,0.004569
259,var9_Multi,0.155672,0.000000,67.169040,0.004125


### 5. Recursive Feature Selection - Decision Tree

In [22]:
rfecv_model = RFECV(estimator=DecisionTreeClassifier(), cv=StratifiedKFold(10), scoring='f1_weighted', min_features_to_select=10, step=3,verbose=1, n_jobs=-1)
rfecv_model.fit(X,y)
feat_ls = [feat for feat, result in list(zip(X, rfecv_model.support_)) if result == True ]
feat_coeff = rfecv_model.estimator_.feature_importances_
df_rfe = pd.DataFrame(list(zip(feat_ls,feat_coeff)), columns=['features', 'rfecv'])
df_feature = pd.merge(df_feature, df_rfe, on='features', how='left')

Fitting estimator with 261 features.
Fitting estimator with 258 features.
Fitting estimator with 255 features.
Fitting estimator with 252 features.
Fitting estimator with 249 features.
Fitting estimator with 246 features.
Fitting estimator with 243 features.
Fitting estimator with 240 features.
Fitting estimator with 237 features.
Fitting estimator with 234 features.
Fitting estimator with 231 features.
Fitting estimator with 228 features.
Fitting estimator with 225 features.
Fitting estimator with 222 features.
Fitting estimator with 219 features.
Fitting estimator with 216 features.
Fitting estimator with 213 features.
Fitting estimator with 210 features.
Fitting estimator with 207 features.
Fitting estimator with 204 features.
Fitting estimator with 201 features.
Fitting estimator with 198 features.
Fitting estimator with 195 features.
Fitting estimator with 192 features.
Fitting estimator with 189 features.
Fitting estimator with 186 features.
Fitting estimator with 183 features.
F

## Voting for Feature Selection using ranking for each method
1. Select up to top 30 features for each method

In [23]:
for i in df_feature.columns[1:]:
    df_feature[f'{i}_rank'] = df_feature[i].rank(ascending = False)
    df_feature[f'{i}_rank'] = df_feature[f'{i}_rank'].apply(lambda x : 1 if x <= 30 else 0)
rank_ls = fnmatch.filter(df_feature.columns,'*_rank')
df_feature['voting'] = df_feature[rank_ls].sum(axis=1)
df_feature.sort_values('voting', ascending=False, inplace = True)
df_feature.reset_index(drop=True, inplace=True)

In [24]:
df_feature[df_feature['voting'] > 0]

Unnamed: 0,features,corr,lasso,k_best,rf-importance,rfecv,corr_rank,lasso_rank,k_best_rank,rf-importance_rank,rfecv_rank,voting
0,b.22,0.397158,0.005682,556.230795,0.082671,0.201592,1,1,1,1,1,5
1,e.96,0.508463,0.087968,2489.587832,0.041386,,1,1,1,1,0,4
2,e.100,0.482191,0.044884,1836.404214,0.034945,,1,1,1,1,0,4
3,e.97,0.485365,0.037164,1982.477083,0.053395,,1,1,1,1,0,4
4,c.48,0.179160,0.000000,290.815181,0.028782,0.015384,1,0,1,1,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...
64,b.6,0.172598,0.000000,115.860810,0.006093,,1,0,0,0,0,1
65,b.24,0.059944,0.000280,14.596418,0.001428,,0,1,0,0,0,1
66,var5,0.029138,0.000454,5.091987,0.005190,,0,1,0,0,0,1
67,c.65,0.059097,0.000000,63.833334,0.008347,,0,0,0,1,0,1


In [27]:
df_feature.to_csv('./data/fs_data.csv')

In [28]:
X_train.to_csv('./data/X_train.csv')
X_test.to_csv('./data/X_test.csv')
X_val.to_csv('./data/X_val.csv')

y_train.to_csv('./data/y_train.csv')
y_test.to_csv('./data/y_test.csv')
y_val.to_csv('./data/y_val.csv')
