In [143]:
import sgpp
import polars as pl
import pandas as pd
import numpy as np
import pickle as pkl

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [144]:
from sklearn.metrics import roc_auc_score

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, ShuffleSplit

from sklearn.preprocessing import StandardScaler, OneHotEncoder
# Shared cross-validation splitters, both seeded with 123 so the splits are repeatable.
# skf: 5 shuffled, stratified folds.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
# ss: a single random train/validation split (ShuffleSplit's default split sizes).
ss = ShuffleSplit(n_splits=1, random_state=123)

In [169]:
# NOTE(review): df_train and df_test are only assigned in the data-loading cell further
# down, so on a fresh kernel this cell has to be run after that one.
df_train.shape, df_test.shape

((593994, 15), (254569, 12))

In [145]:
import catboost as cb

class CatBoostFitProgressbar:
    """Training callback that shows boosting progress in a tqdm bar.

    The bar is refreshed every ``update_cycle`` iterations (and on the final
    iteration) with the latest value of every reported metric. When ``metric``
    is given, the best value seen so far and the boosting round at which it was
    recorded are shown as well.

    Parameters
    ----------
    n_estimators : int
        Total number of boosting rounds; used as the bar's total.
    precision : int
        Number of decimals used when formatting metric values.
    start_position : int
        ``position`` argument forwarded to tqdm.
    metric : str or None
        Metric to track, spelled ``'<dataset>_<metric name>'`` using the keys of
        ``info.metrics`` (for example ``'validation_AUC'`` -- the exact keys
        depend on what the trainer reports).
    greater_is_better : bool
        Whether the tracked metric is maximised (True) or minimised (False).
    update_cycle : int
        Number of iterations between two refreshes of the bar.

    Attributes
    ----------
    metric_hist : list
        Sampled values of the tracked metric. It is not cleared between fits.
    metric_rounds : list
        Boosting round at which the matching ``metric_hist`` entry was sampled.
    """

    def __init__(self, n_estimators, precision=5, start_position=0, metric=None, greater_is_better=True, update_cycle=10):
        self.start_position = start_position
        self.n_estimators = n_estimators
        self.fmt = '{:.' + str(precision) + 'f}'
        self.metric = metric
        self.metric_hist = list()
        # Parallel to metric_hist: the round each sample was taken at, so the
        # "best round" can be reported in boosting rounds, not sample indices.
        self.metric_rounds = list()
        self.greater_is_better = greater_is_better
        self.progress_bar = None
        self.update_cycle = update_cycle
        self.prog = 0

    def __repr__(self):
        return 'CatBoostFitProgressbar'

    def after_iteration(self, info):
        """Advance the bar by one boosting round.

        ``info.metrics`` is read as a mapping ``dataset -> metric name -> list
        of values`` whose last element is the most recent value.

        Always returns True (presumably the signal for the trainer to keep
        going -- confirm against the CatBoost callback documentation).
        """
        if self.progress_bar is None:
            # Imported lazily: the notebook's import cells never bring tqdm into
            # scope, so a module-level reference fails on a fresh kernel.
            from tqdm.auto import tqdm
            self.progress_bar = tqdm(
                total=self.n_estimators, desc='Round', position=self.start_position, leave=False)
            # A new bar means a new fit: restart the round counter.
            self.prog = 0

        self.prog += 1
        remainder = self.prog % self.update_cycle
        is_last = self.prog >= self.n_estimators
        # Only refresh on cycle boundaries, plus the very last round so the bar
        # completes even when n_estimators is not a multiple of update_cycle.
        if remainder != 0 and not is_last:
            return True
        self.progress_bar.update(remainder or self.update_cycle)

        results = list()
        if info.metrics is not None:
            for k, v in info.metrics.items():
                results_2 = list()
                for k2, v2 in v.items():
                    results_2.append('{}: {}'.format(
                        k2, self.fmt.format(v2[-1])))
                    if self.metric == f'{k}_{k2}':
                        self.metric_hist.append(v2[-1])
                        self.metric_rounds.append(self.prog)
                results.append('{}: {}'.format(k, ', '.join(results_2)))

        if self.metric is not None and self.metric_hist:
            if self.greater_is_better:
                best_idx = int(np.argmax(self.metric_hist))
            else:
                best_idx = int(np.argmin(self.metric_hist))
            best_round = self.metric_rounds[best_idx]
            best_value = self.metric_hist[best_idx]

            results.append(
                f'Best {self.metric}: {best_round}/{self.fmt.format(best_value)}')

        self.progress_bar.set_postfix_str(', '.join(results))
        if is_last:
            self.after_train()
        return True

    def after_train(self):
        """Close the bar (if any) and clear the cell output when IPython is available."""
        if self.progress_bar is not None:
            self.progress_bar.close()
            # Reset first so the attribute still exists if clearing the output fails.
            self.progress_bar = None
            try:
                from IPython.display import clear_output
            except ImportError:
                clear_output = None
            if clear_output is not None:
                clear_output()

In [146]:
# Loading pipeline built from the project's sgpp helpers. Judging by the arguments
# (TODO confirm against sgpp): read the CSV with `id` typed as Int64, cast the
# `loan_paid_back` column to Int8, then convert to pandas using `id` as the index.
p = make_pipeline(
    sgpp.PolarsProcessor(predefined_types={'id': pl.Int64}),
    sgpp.ExprProcessor({'loan_paid_back': pl.col('loan_paid_back').cast(pl.Int8)}),
    sgpp.PandasConverter(index_col='id'),
)

# The pipeline is fitted on the training file and re-applied to the test file.
df_train = p.fit_transform('data/train.csv')
df_test = p.transform('data/test.csv')

# Lookup used to turn grade_subgrade labels into integers.
# NOTE: pickle.load can execute arbitrary code; only load a trusted file here.
with open('grade_subgrade.pkl', 'rb') as f:
    c_map = pkl.load(f)

# Add the integer-coded grade column to both frames (in place).
for frame in (df_train, df_test):
    frame['grade_subgrade_no'] = frame['grade_subgrade'].map(c_map).astype('int')

In [147]:
# Name of the target column; eval_cb reads it as the label via df_train[y].
y = 'loan_paid_back'

In [167]:
# Abandoned feature ideas kept for reference; no modelling code in this cell runs.
# Idea 1 (commented out): ratio of debt_to_income_ratio to credit_score.
# df_train['d_c_ratio'] = df_train['debt_to_income_ratio'] / df_train['credit_score']
# Idea 2: the triple-quoted block below is a plain string expression, not executed code.
# Its first line is a Korean note meaning "not effective". The sketch adds `lr_prb`,
# cross-validated logistic-regression probabilities, as an extra column of df_train.
"""
유효하지 않음
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
ct = ColumnTransformer([
    ('ohe', OneHotEncoder(drop='first'), ['employment_status']),
    ('std', StandardScaler(), ['debt_to_income_ratio', 'credit_score'])
])

X_lr = ['debt_to_income_ratio', 'credit_score', 'employment_status']
clf_lr = make_pipeline(
    ct, LogisticRegression()
)
df_train = df_train.assign(
    lr_prb = lambda x: cross_val_predict(clf_lr, df_train[X_lr], df_train[y], cv = skf, method = 'predict_proba')[:, 1]
)
"""

''

''

In [172]:
def eval_cb(cb_params, X_num, X_cat, sp = ss, eval_size = 0.0):
    """Fit a CatBoost classifier on each split of ``df_train`` and score it with ROC AUC.

    Parameters
    ----------
    cb_params : dict
        Keyword arguments forwarded to ``cb.CatBoostClassifier``.
    X_num : list of str
        Numeric feature columns.
    X_cat : list of str
        Categorical feature columns (also passed as ``cat_features``).
    sp : splitter
        Object whose ``split(X, y)`` yields ``(train_idx, valid_idx)`` positions.
    eval_size : float
        When > 0, this share of each training fold is held out (stratified on
        the target) and passed to ``fit`` as ``eval_set``.

    Returns
    -------
    tuple
        ``(mean AUC, std AUC, validation predictions ordered by index,
        fitted classifiers, feature list)``. The feature list is ``X_num``
        followed by ``X_cat`` with duplicates removed, i.e. the column order
        the classifiers were trained on.
    """
    scores = list()
    prds = list()
    clfs = list()
    # Ordered de-duplication: a set gives a different column order from one
    # kernel to the next, which makes runs irreproducible.
    X_all = list(dict.fromkeys([*X_num, *X_cat]))
    for train_idx, valid_idx in sp.split(df_train[X_all], df_train[y]):
        clf_cb = cb.CatBoostClassifier(
            **cb_params, cat_features = X_cat, verbose = 0
        )
        df_cv_train, df_valid = df_train.iloc[train_idx], df_train.iloc[valid_idx]
        eval_set = None
        if eval_size > 0:
            # Imported here because the notebook's import cells never bring it in.
            from sklearn.model_selection import train_test_split
            # Stratify on the fold being split; using the full df_train target
            # has the wrong length and makes train_test_split raise.
            df_cv_train, df_eval = train_test_split(
                df_cv_train, test_size = eval_size, random_state = 123, stratify = df_cv_train[y]
            )
            eval_set = [(df_eval[X_all], df_eval[y])]
        clf_cb.fit(df_cv_train[X_all], df_cv_train[y], eval_set=eval_set)
        prd = pd.Series(clf_cb.predict_proba(df_valid[X_all])[:, 1], index = df_valid.index)
        prds.append(prd)
        scores.append(
            roc_auc_score(df_valid[y], prd)
        )
        clfs.append(clf_cb)
    return np.mean(scores), np.std(scores), pd.concat(prds).sort_index().values, clfs, X_all

In [181]:
# Baseline CatBoost run on the default splitter; results[0] is the mean ROC AUC.
results = eval_cb(
    dict(
        n_estimators=2000,
        task_type='CPU',
        max_depth=6,
        learning_rate=0.1,
        border_count=254,  # border_count raised from 128 to 254
        subsample=0.75,  # subsample of 0.75 improved the score
    ),
    [
        'annual_income', 'debt_to_income_ratio', 'credit_score',
        'loan_amount', 'interest_rate', 'grade_subgrade_no',
    ],
    ['gender', 'marital_status', 'education_level', 'employment_status', 'loan_purpose'],
)
results[0]

0.9248421854596397

In [182]:
# First ten rows of the first fitted model's pairwise interaction importances.
# The feature indices in v1/v2 are replaced by names from results[-1], the
# feature list returned by eval_cb.
(
    pd.DataFrame(
        results[3][0].get_feature_importance(type='Interaction'),
        columns=['v1', 'v2', 'importance'],
    )
    .assign(
        v1=lambda frame: frame['v1'].astype('int').map(lambda idx: results[-1][idx]),
        v2=lambda frame: frame['v2'].astype('int').map(lambda idx: results[-1][idx]),
    )
    .head(10)
)

Unnamed: 0,v1,v2,importance
0,debt_to_income_ratio,credit_score,9.104483
1,debt_to_income_ratio,loan_amount,7.272959
2,debt_to_income_ratio,annual_income,6.670799
3,debt_to_income_ratio,interest_rate,6.178732
4,credit_score,loan_amount,4.848106
5,credit_score,interest_rate,4.718051
6,credit_score,annual_income,4.226164
7,debt_to_income_ratio,employment_status,4.046364
8,interest_rate,loan_amount,3.582667
9,loan_amount,annual_income,3.558486


In [152]:
# NOTE(review): leftover interactive help lookup (IPython `?` syntax); it is not part of
# the analysis and can be dropped from the finished notebook.
pd.DataFrame?