In [1]:
# Load (or reload) IPython's autoreload extension.
%reload_ext autoreload
# Mode 2: re-import all modules before each cell runs, so edits to imported
# .py files are picked up without restarting the kernel.
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Standard library.
from datetime import datetime
from os import path
import warnings

# Silence FutureWarning and UserWarning process-wide. The filters are installed
# before the third-party imports below, so warnings raised while those packages
# are being imported are suppressed as well.
# NOTE(review): this also hides deprecation notices coming from the notebook's
# own code; consider narrowing the filters while upgrading dependencies.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

# Third-party: model explanation, gradient boosting, data handling, plotting.
import eli5
from eli5.sklearn import PermutationImportance
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib as mpl
import matplotlib.pyplot as plt

# Apply the 'ggplot' matplotlib style to every figure drawn afterwards.
mpl.style.use('ggplot')

In [3]:
# Locate the CSV files in the `data` directory that sits next to the
# notebook's parent directory.
data_dir = path.join(path.abspath('..'), 'data')
path_to_train = path.join(data_dir, 'train.csv')
path_to_test = path.join(data_dir, 'test.csv')

train = pd.read_csv(path_to_train)

# Every column except 'id' and 'target' is used as a model feature.
features = [col for col in train.columns if col not in ['id', 'target']]

# Split the features by storage dtype. The builtin `object` is used instead of
# the `np.object` alias: the alias was deprecated in NumPy 1.20 and removed in
# 1.24, where it raises AttributeError. Both compare equal to the same dtype.
obj_cols = [col for col in features if train[col].dtype == object]
num_cols = [col for col in features if train[col].dtype == np.float64]

In [4]:
# Fill missing values with sentinels: the string 'NAN' in object columns and
# -1 in float columns.
train[obj_cols] = train[obj_cols].fillna('NAN')
train[num_cols] = train[num_cols].fillna(-1)

# Integer-encode each object column, using a fresh LabelEncoder per column.
for column in obj_cols:
    label_encoder = LabelEncoder()
    train[column] = label_encoder.fit_transform(train[column])

# Shuffled 70/30 hold-out split with a fixed seed.
train_x, test_x, train_y, test_y = train_test_split(
    train[features],
    train['target'],
    train_size=0.7,
    shuffle=True,
    random_state=42,
)

In [15]:
def show_weights(expl):
    """Print one line per row of a feature-importance table.

    Args:
        expl: DataFrame with ``feature``, ``weight`` and ``std`` columns.

    Each line shows the feature name left-aligned to at least 5 characters,
    the weight multiplied by 100, and twice the std multiplied by 100.
    Returns None.
    """
    for feature, weight, std in zip(expl['feature'], expl['weight'], expl['std']):
        print(f'{feature:5s}: {100*weight:5.2f} +- {100*2*std:.3f}')

In [16]:
# Fit a 50-tree LightGBM classifier on the training split, declaring every
# feature column as categorical.
estimator = lgb.LGBMClassifier(n_estimators=50)
estimator.fit(train_x, train_y, categorical_feature=features)

# Permutation importance of the fitted model, measured as the drop in ROC AUC
# on the hold-out split when a single column is shuffled.
# - scoring='roc_auc' is the built-in scorer; for this binary classifier it
#   scores the positive-class probability, as the former
#   make_scorer(roc_auc_score, needs_proba=True) did, without relying on the
#   `needs_proba` argument that newer scikit-learn releases deprecate.
# - random_state pins the shuffles so the reported weights are reproducible
#   across runs (the same seed value is used for the train/test split).
perm_estimator = PermutationImportance(
    estimator=estimator,
    scoring='roc_auc',
    n_iter=3,
    random_state=42)
perm_estimator.fit(test_x, test_y)

# Collect the weight and std of every feature (top=None keeps them all) as a
# DataFrame.
expl = eli5.format_as_dataframe(
    eli5.explain_weights(
        perm_estimator,
        top=None,
        feature_names=features
    )
)

show_weights(expl)

ord_3:  8.02 +- 0.169
ord_2:  3.07 +- 0.045
ord_5:  2.52 +- 0.044
month:  2.10 +- 0.056
nom_8:  2.08 +- 0.078
ord_0:  2.00 +- 0.158
nom_7:  1.76 +- 0.018
ord_4:  1.08 +- 0.029
nom_9:  0.79 +- 0.047
nom_1:  0.62 +- 0.026
bin_2:  0.58 +- 0.023
ord_1:  0.57 +- 0.021
bin_0:  0.56 +- 0.048
nom_3:  0.35 +- 0.067
nom_5:  0.33 +- 0.038
day  :  0.30 +- 0.044
nom_2:  0.18 +- 0.031
nom_4:  0.09 +- 0.015
nom_6:  0.05 +- 0.015
nom_0:  0.00 +- 0.000
bin_4:  0.00 +- 0.000
bin_3:  0.00 +- 0.000
bin_1:  0.00 +- 0.000
