In [4]:
# Notebook setup: (re)load IPython's autoreload extension and use mode 2 so
# edited modules are re-imported before each cell runs.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [5]:
from datetime import datetime
from os import path
import warnings

# Globally silence FutureWarning and UserWarning. The filters are installed
# before the third-party imports below, so warnings emitted while importing
# those packages are suppressed as well.
# NOTE(review): this also hides deprecation notices from the libraries used
# here -- consider narrowing the filters.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

import eli5
from eli5.sklearn import PermutationImportance
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib as mpl
import matplotlib.pyplot as plt
from tqdm import tqdm

# Use the 'ggplot' style sheet for every matplotlib figure in this notebook.
mpl.style.use('ggplot')

In [6]:
# Locate the train/test CSVs in the sibling ``data`` directory, resolved
# relative to the current working directory.
data_dir = path.join(path.abspath('..'), 'data')
path_to_train = path.join(data_dir, 'train.csv')
path_to_test = path.join(data_dir, 'test.csv')

train = pd.read_csv(path_to_train)

# Every column except the row id and the label is a model feature.
features = [col for col in train.columns if col not in ['id', 'target']]

# Split the features by storage dtype. The builtin ``object`` is used instead
# of the ``np.object`` alias, which was deprecated in NumPy 1.20 and removed
# in NumPy 1.24 (it raises AttributeError there).
obj_cols = [col for col in features if train[col].dtype == object]
num_cols = [col for col in features if train[col].dtype == np.float64]

In [7]:
# Add a 0/1 missing-value indicator column ``<feature>_na`` for each feature.
#
# The selection below keeps this cell safe to re-run: ``features`` is rebound
# further down to include the indicator columns, and the NaNs are filled in
# place, so a naive second run would create ``*_na_na`` columns and reset the
# existing indicators to 0. Features that already have an indicator, and the
# indicator columns themselves, are therefore skipped.
na_suffix = '_na'
pending = [
    col for col in features
    if col + na_suffix not in train.columns
    and not (col.endswith(na_suffix) and col[:-len(na_suffix)] in train.columns)
]
for feature in tqdm(pending, ascii=True):
    train[feature + na_suffix] = train[feature].isna().astype('int64')

# Replace missing values with explicit sentinels: the string 'NAN' for object
# columns and -1 for float columns.
train[obj_cols] = train[obj_cols].fillna(value='NAN')
train[num_cols] = train[num_cols].fillna(value=-1)

# Encode every object column as integer codes (one fresh encoder per column).
for col in obj_cols:
    train[col] = LabelEncoder().fit_transform(train[col])

# Refresh the feature list so that it now includes the indicator columns.
features = [col for col in train.columns if col not in ['id', 'target']]

# Shuffled 70/30 hold-out split with a fixed seed.
train_x, test_x, train_y, test_y = train_test_split(
    train[features],
    train['target'],
    shuffle=True,
    train_size=0.7,
    random_state=42)

100%|##########| 23/23 [00:02<00:00, 10.58it/s]


In [10]:
def show_weights(expl):
    """Print one line per row of ``expl``: feature name, weight and 2*std.

    ``expl`` must provide ``feature``, ``weight`` and ``std`` columns. The
    weight and twice the std are multiplied by 100 before printing; the
    feature name is left-aligned and padded to 8 characters.
    """
    triples = zip(expl['feature'], expl['weight'], expl['std'])
    for name, weight, spread in triples:
        print(f'{name:8s}: {100*weight:5.2f} +- {100*2*spread:.3f}')

In [9]:
# Fit a 50-tree LightGBM classifier on the training split.
# NOTE(review): every column in ``features`` -- including the float columns
# filled with -1 and the ``*_na`` indicators -- is declared categorical here;
# confirm this is intended.
estimator = lgb.LGBMClassifier(n_estimators=50)
estimator.fit(train_x, train_y, categorical_feature=features)

# Permutation importance of the fitted model, measured on the held-out split.
# * ``scoring='roc_auc'`` uses scikit-learn's built-in ROC-AUC scorer instead
#   of ``make_scorer(roc_auc_score, needs_proba=True)``; ``needs_proba`` is
#   deprecated in scikit-learn 1.4 and removed in later releases.
# * ``random_state`` pins the shuffles so the reported weights are
#   reproducible across runs (same seed as the train/test split).
perm_estimator = PermutationImportance(
    estimator=estimator,
    scoring='roc_auc',
    n_iter=3,
    random_state=42)
perm_estimator.fit(test_x, test_y)

# Collect the weights of all features (``top=None`` disables truncation)
# into a DataFrame.
expl = eli5.format_as_dataframe(
    eli5.explain_weights(
        perm_estimator,
        top=None,
        feature_names=features
    )
)

In [11]:
# Print the permutation-importance table: weight and 2*std per feature, both scaled by 100.
show_weights(expl)

ord_3   :  8.04 +- 0.181
ord_2   :  3.09 +- 0.098
ord_5   :  2.54 +- 0.115
month   :  2.07 +- 0.110
ord_0   :  2.04 +- 0.070
nom_8   :  2.01 +- 0.152
nom_7   :  1.81 +- 0.076
ord_4   :  1.12 +- 0.079
nom_9   :  0.77 +- 0.061
nom_1   :  0.63 +- 0.024
bin_2   :  0.60 +- 0.050
bin_0   :  0.57 +- 0.041
ord_1   :  0.55 +- 0.047
nom_3   :  0.36 +- 0.046
nom_5   :  0.32 +- 0.058
day     :  0.31 +- 0.010
nom_2   :  0.17 +- 0.011
nom_4   :  0.09 +- 0.003
nom_6   :  0.01 +- 0.029
bin_4   :  0.00 +- 0.000
nom_0   :  0.00 +- 0.000
ord_4_na:  0.00 +- 0.000
ord_0_na:  0.00 +- 0.000
bin_3   :  0.00 +- 0.000
ord_5_na:  0.00 +- 0.000
bin_1   :  0.00 +- 0.000
ord_3_na:  0.00 +- 0.000
ord_2_na:  0.00 +- 0.000
ord_1_na:  0.00 +- 0.000
nom_5_na:  0.00 +- 0.000
nom_4_na:  0.00 +- 0.000
nom_9_na:  0.00 +- 0.000
nom_8_na:  0.00 +- 0.000
nom_7_na:  0.00 +- 0.000
nom_6_na:  0.00 +- 0.000
day_na  :  0.00 +- 0.000
bin_0_na:  0.00 +- 0.000
bin_1_na:  0.00 +- 0.000
bin_2_na:  0.00 +- 0.000
bin_3_na:  0.00 +- 0.000


In [12]:
# Preview the first rows of ``train`` after preprocessing (filled sentinels,
# label-encoded object columns and the added ``*_na`` indicator columns).
train.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_8_na,nom_9_na,ord_0_na,ord_1_na,ord_2_na,ord_3_na,ord_4_na,ord_5_na,day_na,month_na
0,0,0.0,0.0,0.0,0,0,3,5,3,6,...,0,0,0,0,0,0,0,0,0,0
1,1,1.0,1.0,0.0,0,2,3,4,0,5,...,0,0,0,0,0,0,0,0,0,0
2,2,0.0,1.0,0.0,0,0,3,1,3,0,...,0,1,0,1,0,0,0,0,0,0
3,3,-1.0,0.0,0.0,0,0,3,0,3,3,...,0,0,0,0,0,0,0,1,0,0
4,4,0.0,-1.0,0.0,2,0,3,6,3,2,...,1,0,0,0,0,0,0,0,0,0
