# Model Risk MVP

_Initial commit: Anton Markov, 1 October 2021_

_Latest edit: Anton Markov, 30 October 2021_


In [1]:
import time
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import datasets, metrics, model_selection
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

from hyperopt import hp
# from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

# for HyperOpt class
import lightgbm as lgb
import xgboost as xgb
# import catboost as ctb
from hyperopt import fmin, tpe, STATUS_OK, STATUS_FAIL, Trials

In [2]:
# New package: feature_engine
from feature_engine.encoding import WoEEncoder
from feature_engine.creation import CombineWithReferenceFeature
from feature_engine.selection import RecursiveFeatureAddition

In [3]:
from sklearn.pipeline import Pipeline

In [6]:
# Load the factor metadata from factors.json into a dict.
# The encoding is passed explicitly: JSON text is UTF-8 by specification, whereas a
# bare open() falls back to the platform locale and can mis-decode non-ASCII content.
with open('../datasets/01_german/factors.json', encoding='utf-8') as json_file:
    factors_dict = json.load(json_file)

In [7]:
# Inspect the 'cat_vals' entry of the loaded factors; it is passed to WoEEncoder below.
factors_dict['cat_vals']

['cheq_acc',
 'cred_hist',
 'purp',
 'save_acc',
 'empl_t',
 'pers_status',
 'guarant_flg',
 'prop',
 'inst_plan',
 'house',
 'job',
 'tel_flg',
 'foreign_flg']

In [8]:
# Fixed seed, passed as random_state to the LightGBM model below.
seed = 42

In [9]:
def Gini(y, y_pred):
    """Print and return the Gini coefficient, computed as 2 * ROC AUC - 1.

    Parameters
    ----------
    y : true labels, forwarded unchanged to ``roc_auc_score``.
    y_pred : predicted scores, forwarded unchanged to ``roc_auc_score``.

    Returns
    -------
    The Gini value; the same number is also printed as ``Gini: <value>``.
    """
    auc = roc_auc_score(y, y_pred)
    gini = auc * 2 - 1
    print(f"Gini: {gini}")
    return gini

## With pipeline

In [16]:
# Feature matrices for the train and test samples.
X_train = pd.read_parquet('../datasets/01_german/samples/X_train.parquet')
X_test = pd.read_parquet('../datasets/01_german/samples/X_test.parquet')

# Targets are stored as single-column frames; keep only the 'target' column as a Series.
y_train = pd.read_parquet('../datasets/01_german/samples/y_train.parquet')['target']
y_test = pd.read_parquet('../datasets/01_german/samples/y_test.parquet')['target']

In [17]:
# LightGBM classifier with a small tree size and L1/L2 regularisation.
# NOTE(review): this single instance is handed to the feature selector below and is
# also reused as the final pipeline step -- confirm the selector does not fit it in place.
lgbm_mdl = LGBMClassifier(
    num_leaves=10,
    learning_rate=0.1,
    reg_alpha=8,
    reg_lambda=8,
    random_state=seed,
)

# Weight-of-evidence encoder applied to the variables listed under 'cat_vals'.
woe = WoEEncoder(variables=factors_dict['cat_vals'])

# Feature generation: the full column list of X_train is used both as the variables
# to combine and as the reference variables, with multiplication as the only operation.
feat_eng = CombineWithReferenceFeature(
    variables_to_combine=list(X_train.columns),
    reference_variables=list(X_train.columns),
    operations=['mul'],
)

# Feature selection driven by the LightGBM model with a 0.005 threshold.
feat_sel = RecursiveFeatureAddition(
    estimator=lgbm_mdl,
    threshold=0.005,
)

In [18]:
# Chain the steps in order: encoding -> feature generation -> feature selection -> classifier.
mdl_pipe = Pipeline(
    steps=[
        ('encode', woe),
        ('feat_eng', feat_eng),
        ('feat_select', feat_sel),
        ('lgbm', lgbm_mdl),
    ]
)

In [21]:
# Fit the whole pipeline on the training sample.
mdl_pipe.fit(X_train, y_train)

# In-sample Gini, scored on the second column of predict_proba.
train_proba = mdl_pipe.predict_proba(X_train)[:, 1]
Gini(y_train, train_proba)

Gini: 0.7746717241072156


0.7746717241072156

In [22]:
# Out-of-sample Gini, scored on the second column of predict_proba.
test_proba = mdl_pipe.predict_proba(X_test)[:, 1]
Gini(y_test, test_proba)

Gini: 0.5843250390671957


0.5843250390671957

We got the same result!