In [11]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import confusion_matrix, classification_report, f1_score, fbeta_score, auc, roc_curve, precision_score, recall_score
from sklearn.model_selection import KFold
import mlflow
from mlflow import log_metric, log_param, log_artifact
import mlflow.sklearn
import mlflow.pyfunc
import json
import shap
from utils import *

In [2]:
# Names of the columns to be handled as categorical features, assembled in a
# fixed order: device fields, id_12..id_38, product/address/e-mail fields,
# card1..card6, then M1..M9.
cat_cols = [
    "DeviceType",
    "DeviceInfo",
    *(f"id_{n}" for n in range(12, 39)),
    "ProductCD",
    "addr1",
    "addr2",
    "P_emaildomain",
    "R_emaildomain",
    *(f"card{n}" for n in range(1, 7)),
    *(f"M{n}" for n in range(1, 10)),
]

In [4]:
# Directory holding the pickled inputs, relative to the working directory.
# The trailing slash is required: file names are appended to this value with
# plain string concatenation when the data is loaded.
input_dir = "data/prepped/"

In [5]:
# Load the feature table (X) and the target (y) from their pickle files under
# input_dir, in that order.
# NOTE: unpickling can execute arbitrary code, so only point input_dir at
# files that come from a trusted source.
X, y = (
    pd.read_pickle(input_dir + file_name)
    for file_name in ("X_train_.pkl", "y_train_.pkl")
)

In [9]:
# Translate each categorical column name into its position within X.columns,
# preserving the order of cat_cols. The lookup fails with a KeyError if a name
# is not present in X.
cat_col_idx = np.array(list(map(X.columns.get_loc, cat_cols)))
# Bare expression so the notebook displays the resulting index array.
cat_col_idx

array([394, 393, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416,
       417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429,
       430, 431, 432,  39, 383, 384,  38,  40, 385, 386, 387, 388, 389,
       390,  29,  30,  31,  32,  33,  34,  35,  36,  37])

In [8]:
# 3-fold cross-validation splitter. With shuffle=False the rows are not
# permuted, so each fold is a consecutive block of rows in X's existing order
# and the split is the same on every run.
kfold = KFold(n_splits=3, shuffle=False)

In [None]:
# Train one CatBoost classifier per cross-validation fold.
# `model` and `metrics` are keyed by fold number and keep the fitted classifier
# and whatever get_metrics returns for that fold.
model = {}
metrics = {}
for fold, (train_inds, test_inds) in enumerate(kfold.split(X)):
    print(f"Fold no {fold}")

    # kfold.split yields positional indices, hence .iloc for both X and y.
    X_train, X_val = X.iloc[train_inds], X.iloc[test_inds]
    y_train, y_val = y.iloc[train_inds], y.iloc[test_inds]

    cb = CatBoostClassifier(task_type="GPU", eval_metric="AUC", iterations=500)
    # NOTE(review): the held-out fold is used both as eval_set during fitting
    # and as the data scored below; if CatBoost selects its best iteration
    # from eval_set, the reported metrics will be optimistic -- confirm.
    cb.fit(
        X=X_train,
        y=y_train,
        cat_features=cat_col_idx,
        eval_set=(X_val, y_val),
        silent=True,
        plot=True,
    )

    # Probabilities for the held-out fold; passed unchanged to the helpers.
    preds = cb.predict_proba(X_val)
    model[fold] = cb

    # get_metrics and plot_roc are not defined in this notebook; presumably
    # they come from the `utils` star import -- verify their expected inputs.
    met = get_metrics(y_val, preds)
    print(met)
    metrics[fold] = met
    plot_roc(y_val, preds)

Fold no 0


In [None]:
# Build a SHAP tree explainer for `cb`. When the cells are run in order, `cb`
# is the classifier left over from the last iteration of the training loop
# (i.e. the final fold's model), not an ensemble of all folds.
explainer = shap.TreeExplainer(cb)