In [69]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [70]:
from datetime import datetime
from os import path
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

import eli5
from eli5.sklearn import PermutationImportance
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib as mpl
import matplotlib.pyplot as plt
from tqdm import tqdm

mpl.style.use('ggplot')

In [71]:
path_to_train = path.join(path.abspath('..'), 'data', 'train.csv')
path_to_test = path.join(path.abspath('..'), 'data', 'test.csv')

train = pd.read_csv(path_to_train)
test = pd.read_csv(path_to_test)

In [72]:
def add_woe_feature(train, test, feature, verbose=True):
    n_events = train['target'].sum()
    n_non_events = len(train) - n_events
    
    bins = train.groupby(feature)['target'].agg(['sum', 'count'])    
    bins['n_non_events'] = bins['count'] - bins['sum']
    bins['p_event'] = bins['sum'] / n_events
    bins['p_non_event'] = bins['n_non_events'] / n_non_events
    bins['woe'] = np.log(bins['p_event'] / bins['p_non_event'])
    
    train[feature + '_woe'] = train[feature].map(bins['woe'].to_dict())
    test[feature + '_woe'] = test[feature].map(bins['woe'].to_dict())
    
    if verbose:
        iv = ((bins['p_event'] - bins['p_non_event']) * bins['woe']).sum()
        print(f'{feature}: IV {iv:.2f}')

    return train, test

def add_woe_max(train, features):
    train['min_woe'] = train[features].min(axis=1)
    train['max_woe'] = train[features].max(axis=1)
    train['woe'] = train['max_woe']
    mask = train['min_woe'].abs() > train['max_woe'].abs()
    train.loc[mask, 'woe'] = train.loc[mask, 'min_woe']    
    return train

In [73]:
features = ['bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4', 
            'nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4', 
            'ord_0', 'ord_1', 'ord_2', 'ord_3', 'ord_4', 
            'day', 'month']

for feature in features:
    train, test = add_woe_feature(train, test, feature)

features_woe = [feature + '_woe' for feature in features]    
train = add_woe_max(train, features_woe)
test = add_woe_max(test, features_woe)

for feature in features_woe:
    train[feature] = train[feature].fillna(train['woe'])
    test[feature] = test[feature].fillna(test['woe'])

bin_0: IV 0.03
bin_1: IV 0.00
bin_2: IV 0.03
bin_3: IV 0.00
bin_4: IV 0.00
nom_0: IV 0.00
nom_1: IV 0.03
nom_2: IV 0.01
nom_3: IV 0.02
nom_4: IV 0.01
ord_0: IV 0.07
ord_1: IV 0.03
ord_2: IV 0.11
ord_3: IV 0.25
ord_4: IV 0.05
day: IV 0.02
month: IV 0.08


In [74]:
features = [col for col in train.columns if col not in ['id', 'target']]
obj_cols = [col for col in features if train[col].dtype == np.object]
num_cols = [col for col in features if train[col].dtype == np.float64]

train[obj_cols] = train[obj_cols].fillna(value='NAN')
train[num_cols] = train[num_cols].fillna(value=-1)

for col in obj_cols:    
    encoder = LabelEncoder()    
    train[col] = encoder.fit(train[col]).transform(train[col])

In [75]:
train_x, test_x, train_y, test_y = train_test_split(
    train[features], 
    train['target'], 
    shuffle=True, 
    train_size=0.7, 
    random_state=42)    

In [76]:
def show_weights(expl):
    for row in expl.itertuples():
        print(f'{row.feature:8s}: {100*row.weight:5.2f} +- {100*2*row.std:.3f}')

In [77]:
cat_features = [col for col in test.columns if ('woe' not in col) and (col not in ['id'])]
estimator = lgb.LGBMClassifier(n_estimators=50, categorical_features=cat_features)
estimator.fit(train_x, train_y)

# initialize permutation importance class
perm_estimator = PermutationImportance(
    estimator=estimator, 
    scoring=make_scorer(roc_auc_score, needs_proba=True),
    n_iter=3)
perm_estimator.fit(test_x, test_y)

# calculate feature weights and return it as DataFrame
expl = eli5.format_as_dataframe(
    eli5.explain_weights(
        perm_estimator,
        top=None,
        feature_names=features
    )
)

In [78]:
show_weights(expl)

ord_5   :  2.88 +- 0.099
woe     :  1.66 +- 0.011
ord_2_woe:  1.02 +- 0.067
max_woe :  0.87 +- 0.042
min_woe :  0.86 +- 0.049
nom_1_woe:  0.73 +- 0.016
ord_4   :  0.60 +- 0.021
day_woe :  0.51 +- 0.064
month_woe:  0.48 +- 0.055
ord_1_woe:  0.44 +- 0.031
nom_3_woe:  0.42 +- 0.014
ord_0_woe:  0.41 +- 0.011
bin_2   :  0.36 +- 0.025
ord_0   :  0.32 +- 0.047
nom_2_woe:  0.28 +- 0.018
month   :  0.22 +- 0.040
ord_3   :  0.22 +- 0.005
ord_3_woe:  0.18 +- 0.034
nom_4   :  0.16 +- 0.020
ord_1   :  0.07 +- 0.016
bin_2_woe:  0.06 +- 0.022
ord_4_woe:  0.06 +- 0.010
nom_8   :  0.05 +- 0.008
bin_0   :  0.03 +- 0.011
nom_3   :  0.03 +- 0.008
bin_1   :  0.02 +- 0.001
ord_2   :  0.02 +- 0.008
bin_4   :  0.02 +- 0.006
nom_0   :  0.01 +- 0.002
nom_4_woe:  0.01 +- 0.003
nom_1   :  0.01 +- 0.004
nom_7   :  0.01 +- 0.003
bin_0_woe:  0.00 +- 0.000
day     :  0.00 +- 0.002
bin_1_woe:  0.00 +- 0.001
nom_0_woe:  0.00 +- 0.001
nom_2   :  0.00 +- 0.001
bin_4_woe:  0.00 +- 0.001
nom_5   :  0.00 +- 0.000
nom_6   : 

In [79]:
cat_features

['bin_0',
 'bin_1',
 'bin_2',
 'bin_3',
 'bin_4',
 'nom_0',
 'nom_1',
 'nom_2',
 'nom_3',
 'nom_4',
 'nom_5',
 'nom_6',
 'nom_7',
 'nom_8',
 'nom_9',
 'ord_0',
 'ord_1',
 'ord_2',
 'ord_3',
 'ord_4',
 'ord_5',
 'day',
 'month']