In [11]:
import pandas as pd
import numpy as np
import pickle
from sklearn.ensemble import RandomForestClassifier

In [4]:
# Load data
with open('../hhold.pickle', 'rb') as file:
    hhold = pickle.load(file)
with open('../indiv.pickle', 'rb') as file:
    indiv = pickle.load(file)

In [7]:
# Indices and target variable are in protected class
protected_cols = ['id', 'iid', 'poor']
# Get a list of dummy features
feat_list = [col for col in indiv.columns if col not in protected_cols]
# Separate data into independent and dependent variables
X = indiv[feat_list]
Y = indiv['poor']

In [8]:
# Fit forest
forest = RandomForestClassifier(n_estimators=10000, n_jobs=-1)
forest.fit(X, Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10000, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [9]:
# Gather feature importances
importances = forest.feature_importances_

In [13]:
# Gather ranked indices of feature importances
indices = np.argsort(importances)[::-1]
for f in range(len(feat_list)):
    print('{0:3d} ({1:.4f}) {2}'.format(f + 1, importances[indices[f]], feat_list[indices[f]]))

  1 (0.2209) i_num_002
  2 (0.0279) i_cat_014_BQEnF
  3 (0.0242) i_cat_014_flBEG
  4 (0.0186) i_cat_014_VGNER
  5 (0.0170) i_cat_023_vtkRP
  6 (0.0169) i_cat_023_Qydia
  7 (0.0164) i_cat_010_scxJu
  8 (0.0145) i_cat_002_kzSFB
  9 (0.0129) i_cat_014_iyqBV
 10 (0.0114) i_cat_017_OeQKE
 11 (0.0110) i_cat_022_AyuSE
 12 (0.0108) i_cat_010_bJTYb
 13 (0.0105) i_cat_002_mOlYV
 14 (0.0104) i_cat_033_gCSRj
 15 (0.0102) i_cat_022_hCKQi
 16 (0.0101) i_num_001
 17 (0.0099) i_cat_014_lcEtN
 18 (0.0095) i_cat_019_dpMMl
 19 (0.0092) i_cat_033_uEstx
 20 (0.0092) i_cat_014_bszTA
 21 (0.0092) i_cat_018_CneHb
 22 (0.0087) i_cat_036_rkLqZ
 23 (0.0086) i_cat_017_XNPgB
 24 (0.0086) i_cat_036_xUYIC
 25 (0.0085) i_cat_012_HHynv
 26 (0.0084) i_cat_012_yhUHu
 27 (0.0083) i_cat_007_fOUHD
 28 (0.0082) i_cat_012_DgtXD
 29 (0.0080) i_cat_010_tqINY
 30 (0.0080) i_cat_012_GmSKW
 31 (0.0078) i_cat_013_pdgUV
 32 (0.0075) i_cat_004_HIvIU
 33 (0.0074) i_cat_012_YEngm
 34 (0.0071) i_cat_012_EaHvf
 35 (0.0071) i_cat_012_AYc

In [20]:
# Convert to numpy array
feat_list = np.array(feat_list)
# Gather a list of the top features
top_features = feat_list[np.where(importances > 0.0092)]
top_features

array(['i_num_001', 'i_num_002', 'i_cat_002_kzSFB', 'i_cat_002_mOlYV',
       'i_cat_010_bJTYb', 'i_cat_010_scxJu', 'i_cat_014_BQEnF',
       'i_cat_014_VGNER', 'i_cat_014_flBEG', 'i_cat_014_iyqBV',
       'i_cat_014_lcEtN', 'i_cat_017_OeQKE', 'i_cat_019_dpMMl',
       'i_cat_022_AyuSE', 'i_cat_022_hCKQi', 'i_cat_023_Qydia',
       'i_cat_023_vtkRP', 'i_cat_033_gCSRj', 'i_cat_033_uEstx'],
      dtype='<U15')

In [22]:
# Save importances for re-use
with open('importances.pickle', 'wb') as f:
    pickle.dump(importances, f, pickle.HIGHEST_PROTOCOL)