In [1]:
# Enable the autoreload extension in mode 2 so that edits to imported modules
# are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [1]:
from datetime import datetime
from os import path
import warnings

# Silence FutureWarning and UserWarning for the whole session. The filters are
# installed before the third-party imports below, so warnings raised while
# those packages are imported are hidden as well.
# NOTE(review): this also hides any notice that libraries report through these
# two categories; consider narrowing the filters.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

import eli5
from eli5.sklearn import PermutationImportance
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib as mpl
import matplotlib.pyplot as plt
from cafeen import utils

# Use the built-in 'ggplot' matplotlib style for every figure in the notebook.
mpl.style.use('ggplot')

In [3]:
# Locate the CSV files in the `data` directory one level above the notebook.
path_to_train = path.join(path.abspath('..'), 'data', 'train.csv')
path_to_test = path.join(path.abspath('..'), 'data', 'test.csv')

train = pd.read_csv(path_to_train)

# Every column except the row id and the label is treated as a model input.
features = [col for col in train.columns if col not in ['id', 'target']]
# Compare against the builtin `object`: the `np.object` alias was deprecated in
# NumPy 1.20 and removed in 1.24, and it was only ever an alias for `object`.
obj_cols = [col for col in features if train[col].dtype == object]
num_cols = [col for col in features if train[col].dtype == np.float64]

In [4]:
# Make missing values explicit: a sentinel string for object columns and -1
# for float columns.
train[obj_cols] = train[obj_cols].fillna('NAN')
train[num_cols] = train[num_cols].fillna(-1)

# Integer-encode every object column, using a fresh encoder per column.
for col in obj_cols:
    encoder = LabelEncoder()
    train[col] = encoder.fit_transform(train[col])

# Shuffled 70/30 hold-out split with a fixed seed.
train_x, test_x, train_y, test_y = train_test_split(
    train[features],
    train['target'],
    train_size=0.7,
    shuffle=True,
    random_state=42)

In [4]:
def show_weights(expl, min_width=5):
    """Print one aligned line per feature: weight and spread, both in percent.

    Parameters
    ----------
    expl : pandas.DataFrame
        Frame with a string column ``feature`` and numeric columns ``weight``
        and ``std``; rows are printed in the frame's own order.
    min_width : int, default 5
        Minimum width of the feature-name column. The column grows to fit the
        longest name so the table stays aligned; with names of at most five
        characters the output is the same as with a fixed width of 5.

    Each line has the form ``<feature>: <100*weight> +- <100*2*std>``.
    """
    rows = list(expl.itertuples())
    # Pad to the longest feature name (never below `min_width`) so the colons
    # line up even for long names.
    width = max([min_width] + [len(row.feature) for row in rows])
    for row in rows:
        print(f'{row.feature:{width}s}: {100*row.weight:5.2f} +- {100*2*row.std:.3f}')

In [16]:
# Baseline model: every input column is passed to LightGBM as categorical.
estimator = lgb.LGBMClassifier(n_estimators=50)
estimator.fit(train_x, train_y, categorical_feature=features)

# Permutation importance of the already-fitted model, measured on the hold-out
# split as the drop in ROC AUC over 3 shuffles per feature.
# The built-in 'roc_auc' scorer replaces make_scorer(..., needs_proba=True),
# whose `needs_proba` argument is deprecated in recent scikit-learn releases.
# `random_state` pins the shuffles so the printed weights are reproducible.
perm_estimator = PermutationImportance(
    estimator=estimator,
    scoring='roc_auc',
    n_iter=3,
    random_state=42)
perm_estimator.fit(test_x, test_y)

# Collect the per-feature weights (all of them: top=None) as a DataFrame.
expl = eli5.format_as_dataframe(
    eli5.explain_weights(
        perm_estimator,
        top=None,
        feature_names=features
    )
)

show_weights(expl)

ord_3:  8.02 +- 0.169
ord_2:  3.07 +- 0.045
ord_5:  2.52 +- 0.044
month:  2.10 +- 0.056
nom_8:  2.08 +- 0.078
ord_0:  2.00 +- 0.158
nom_7:  1.76 +- 0.018
ord_4:  1.08 +- 0.029
nom_9:  0.79 +- 0.047
nom_1:  0.62 +- 0.026
bin_2:  0.58 +- 0.023
ord_1:  0.57 +- 0.021
bin_0:  0.56 +- 0.048
nom_3:  0.35 +- 0.067
nom_5:  0.33 +- 0.038
day  :  0.30 +- 0.044
nom_2:  0.18 +- 0.031
nom_4:  0.09 +- 0.015
nom_6:  0.05 +- 0.015
nom_0:  0.00 +- 0.000
bin_4:  0.00 +- 0.000
bin_3:  0.00 +- 0.000
bin_1:  0.00 +- 0.000


In [3]:
import logging

import lightgbm as lgb
import pandas as pd
from sklearn.impute import MissingIndicator

from cafeen import config, steps, utils

# Load the data through the project helper and select the model input columns.
df = utils.read_data()
features = utils.get_features(df.columns)

# NOTE(review): the meaning of `threshold=85` is defined in cafeen.utils and is
# not visible from this notebook — confirm there.
df = utils.mark_as_na(df, ['nom_5', 'nom_6', 'nom_9'], threshold=85)

# Missing-value flags as 0/1 integers. The flags are named by their position
# in MissingIndicator's output, not by the originating feature name.
ind_features = MissingIndicator().fit_transform(df[features])
ind_columns = ['ind_' + str(i) for i in range(ind_features.shape[1])]
# Build the indicator frame on df's own index: pandas aligns each assigned
# column on index, so a frame with a fresh 0..n-1 index would be misaligned
# (or filled with NaN) whenever df does not carry a default RangeIndex.
df[ind_columns] = pd.DataFrame(
    ind_features, index=df.index, columns=ind_columns).astype('int')

# Project-specific encoding, imputation and count features (see cafeen.utils).
df = utils.encode_ordinal_features(df, features)
df = utils.fill_na(df, features, initial_strategy='most_frequent')
df = utils.add_counts(
    df, ['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9', 'ord_5'])

train, test = utils.split_data(utils.encode_features(df))

[IterativeImputer] Completing matrix with shape (1000000, 23)
[IterativeImputer] Ending imputation round 1/10, elapsed time 61.85
[IterativeImputer] Change: 896.7260140560772, scaled tolerance: 0.2008 
[IterativeImputer] Ending imputation round 2/10, elapsed time 124.20
[IterativeImputer] Change: 0.9492554429233921, scaled tolerance: 0.2008 
[IterativeImputer] Ending imputation round 3/10, elapsed time 186.03
[IterativeImputer] Change: 0.0019703717794072872, scaled tolerance: 0.2008 
[IterativeImputer] Early stopping criterion reached.


encoding: 0it [00:00, ?it/s]


In [5]:
# Re-derive the input columns from the engineered training frame.
features = utils.get_features(train.columns)

# Shuffled 70/30 hold-out split with a fixed seed.
train_x, test_x, train_y, test_y = train_test_split(
    train[features],
    train['target'],
    shuffle=True,
    train_size=0.7,
    random_state=42)

# Same model settings as before, but without declaring categorical features.
estimator = lgb.LGBMClassifier(n_estimators=50)
estimator.fit(train_x, train_y)

# Permutation importance of the already-fitted model, measured on the hold-out
# split as the drop in ROC AUC over 3 shuffles per feature.
# The built-in 'roc_auc' scorer replaces make_scorer(..., needs_proba=True),
# whose `needs_proba` argument is deprecated in recent scikit-learn releases.
# `random_state` pins the shuffles so the printed weights are reproducible.
perm_estimator = PermutationImportance(
    estimator=estimator,
    scoring='roc_auc',
    n_iter=3,
    random_state=42)
perm_estimator.fit(test_x, test_y)

# Collect the per-feature weights (all of them: top=None) as a DataFrame.
expl = eli5.format_as_dataframe(
    eli5.explain_weights(
        perm_estimator,
        top=None,
        feature_names=features
    )
)

show_weights(expl)

ord_3:  7.50 +- 0.202
ord_2:  2.89 +- 0.078
ord_5:  2.69 +- 0.036
month:  2.14 +- 0.051
nom_8:  2.12 +- 0.091
ord_0:  1.93 +- 0.075
nom_7:  1.88 +- 0.096
nom_9:  1.81 +- 0.054
ord_4:  1.15 +- 0.083
nom_1:  0.67 +- 0.022
nom_5:  0.64 +- 0.078
bin_2:  0.59 +- 0.028
bin_0:  0.59 +- 0.044
ord_1:  0.59 +- 0.039
day  :  0.43 +- 0.018
nom_3:  0.42 +- 0.039
nom_6:  0.21 +- 0.025
nom_2:  0.20 +- 0.022
nom_4:  0.14 +- 0.018
bin_1:  0.00 +- 0.000
bin_4:  0.00 +- 0.001
nom_9_count:  0.00 +- 0.000
nom_0:  0.00 +- 0.000
bin_3:  0.00 +- 0.000
ind_0:  0.00 +- 0.000
ind_1:  0.00 +- 0.000
ord_5_count:  0.00 +- 0.000
ind_3:  0.00 +- 0.000
ind_16:  0.00 +- 0.000
nom_8_count:  0.00 +- 0.000
nom_7_count:  0.00 +- 0.000
nom_6_count:  0.00 +- 0.000
nom_5_count:  0.00 +- 0.000
ind_22:  0.00 +- 0.000
ind_21:  0.00 +- 0.000
ind_20:  0.00 +- 0.000
ind_19:  0.00 +- 0.000
ind_18:  0.00 +- 0.000
ind_17:  0.00 +- 0.000
ind_15:  0.00 +- 0.000
ind_4:  0.00 +- 0.000
ind_14:  0.00 +- 0.000
ind_13:  0.00 +- 0.000
ind_12: 

In [8]:
# Display one of the missing-value indicator columns to check its contents.
train.loc[:, 'ind_14']

0         0
1         0
2         1
3         0
4         0
         ..
599995    0
599996    0
599997    0
599998    0
599999    0
Name: ind_14, Length: 600000, dtype: int64