In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.append('../utilities')
from basic_utilities import *
from model_utilities import *
from pandas.plotting import scatter_matrix
from ml_utilities import *
import scipy.stats as stats
import pdb
import os
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import pickle
sys.path.append('../Data_prep')
from data_config import *
sys.path.append('../../../../infrastructure/tools')
from feature_engineering import feature_engineering
from utilities import utilities
import re
import statsmodels.api as sm
import statsmodels.formula.api as smf
import pdb
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

In [2]:
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
import torch
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler

In [3]:
from shap import KernelExplainer

In [4]:
train_sample = pd.read_csv('../../../data/processed_data/train_sample_segment0_logit.csv')

In [5]:
train_sample.shape

(23973, 135)

In [6]:
get_knots = read_yaml_file('../Data_prep/1d_knots_segment0.yaml')['get_knots']

In [7]:
knots_vars = []
for var in get_knots:
    matching_strings = [s for s in train_sample.columns.tolist() if re.search(f"{var}_", s)]
    knots_vars += matching_strings

In [8]:
get_dummy = read_yaml_file('../Data_prep/dummy_grouping_segment0_draft.yaml')['get_dummies']

In [9]:
dummy_vars = []
for var in get_dummy:
    matching_strings = [s for s in train_sample.columns.tolist() if re.search(f"{var}_", s)]
    dummy_vars += matching_strings

In [10]:
binning_features = read_yaml_file('../Data_prep/binning_features_segment0.yaml')['binning_features']

In [11]:
binning_features = [f.split(',')[0] for f in binning_features]

In [12]:
bin_vars = []
for var in binning_features:
    matching_strings = [s for s in train_sample.columns.tolist() if re.search(f"{var}_bin:", s)]
    bin_vars += matching_strings

In [13]:
bin_vars

[]

In [14]:
spec1 = knots_vars + dummy_vars + bin_vars
spec1 = sorted(list(set(spec1)))

In [15]:
Xtrain = train_sample[spec1].astype('float')
ytrain = train_sample['fraud_bool']
weights = train_sample['sample_weight']

In [16]:
test_sample = pd.read_csv('../../../data/processed_data/test_sample_segment0_logit.csv')
Xtest = test_sample[spec1].astype('float')
ytest = test_sample['fraud_bool']
weights = test_sample['sample_weight']

In [17]:
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(Xtrain)  

In [18]:
Xtrain = scaler.transform(Xtrain)
Xtest = scaler.transform(Xtest) #Apply same scaler to test data

In [19]:
# predictions = model.predict(Xtrain.values)

In [20]:
# accuracy = accuracy_score(ytrain, predictions)
# print(f"Accuracy: {accuracy}")

In [21]:
# auroc = roc_auc_score(ytrain, predictions)
# print(f"AUROC score: {auroc}")

In [22]:
Xtrain.shape

(23973, 63)

In [24]:
param_grid = {
    'n_d': [8, 12, 16],
    'n_a': [8, 12, 16],
    'optimizer_params': [{'lr': 0.02}, {'lr': 0.01}]
}

tabnet_model = TabNetClassifier(verbose=0)

clf = GridSearchCV(
    tabnet_model,
    param_grid,
    scoring=make_scorer(roc_auc_score, needs_proba=False),
    cv=3,
    verbose=2,
    n_jobs=-1,
    error_score='raise'
)

clf.fit(Xtrain, ytrain)

best_params = clf.best_params_
print(f"Best parameters found: {best_params}")


best_model = clf.best_estimator_


Fitting 3 folds for each of 18 candidates, totalling 54 fits




[CV] END ........n_a=8, n_d=8, optimizer_params={'lr': 0.02}; total time=  42.8s




[CV] END ........n_a=8, n_d=8, optimizer_params={'lr': 0.01}; total time=  43.4s
[CV] END ........n_a=8, n_d=8, optimizer_params={'lr': 0.02}; total time=  43.5s
[CV] END ........n_a=8, n_d=8, optimizer_params={'lr': 0.02}; total time=  43.5s




[CV] END ........n_a=8, n_d=8, optimizer_params={'lr': 0.01}; total time=  44.0s




[CV] END ........n_a=8, n_d=8, optimizer_params={'lr': 0.01}; total time=  44.5s




[CV] END .......n_a=8, n_d=12, optimizer_params={'lr': 0.02}; total time=  46.2s




[CV] END .......n_a=8, n_d=12, optimizer_params={'lr': 0.02}; total time=  46.7s




[CV] END .......n_a=8, n_d=12, optimizer_params={'lr': 0.02}; total time=  54.0s




[CV] END .......n_a=8, n_d=12, optimizer_params={'lr': 0.01}; total time=  53.7s




[CV] END .......n_a=8, n_d=12, optimizer_params={'lr': 0.01}; total time=  54.2s




[CV] END .......n_a=8, n_d=12, optimizer_params={'lr': 0.01}; total time=  54.5s




[CV] END .......n_a=8, n_d=16, optimizer_params={'lr': 0.02}; total time=  57.8s
[CV] END .......n_a=8, n_d=16, optimizer_params={'lr': 0.02}; total time=  58.5s




[CV] END .......n_a=8, n_d=16, optimizer_params={'lr': 0.01}; total time=  56.7s




[CV] END .......n_a=8, n_d=16, optimizer_params={'lr': 0.02}; total time=  58.3s




[CV] END .......n_a=12, n_d=8, optimizer_params={'lr': 0.02}; total time=  55.2s




[CV] END .......n_a=8, n_d=16, optimizer_params={'lr': 0.01}; total time=  57.1s
[CV] END .......n_a=8, n_d=16, optimizer_params={'lr': 0.01}; total time=  57.5s




[CV] END .......n_a=12, n_d=8, optimizer_params={'lr': 0.02}; total time=  56.5s




[CV] END .......n_a=12, n_d=8, optimizer_params={'lr': 0.01}; total time=  57.5s




[CV] END .......n_a=12, n_d=8, optimizer_params={'lr': 0.02}; total time=  58.8s




[CV] END .......n_a=12, n_d=8, optimizer_params={'lr': 0.01}; total time=  59.7s




[CV] END .......n_a=12, n_d=8, optimizer_params={'lr': 0.01}; total time=  58.8s




[CV] END ......n_a=12, n_d=12, optimizer_params={'lr': 0.02}; total time= 1.0min




[CV] END ......n_a=12, n_d=12, optimizer_params={'lr': 0.02}; total time=  59.8s




[CV] END ......n_a=12, n_d=12, optimizer_params={'lr': 0.02}; total time= 1.0min




[CV] END ......n_a=12, n_d=12, optimizer_params={'lr': 0.01}; total time= 1.0min




[CV] END ......n_a=12, n_d=12, optimizer_params={'lr': 0.01}; total time=  58.8s




[CV] END ......n_a=12, n_d=12, optimizer_params={'lr': 0.01}; total time= 1.1min




[CV] END ......n_a=12, n_d=16, optimizer_params={'lr': 0.02}; total time= 1.1min




[CV] END ......n_a=12, n_d=16, optimizer_params={'lr': 0.02}; total time= 1.1min




[CV] END ......n_a=12, n_d=16, optimizer_params={'lr': 0.01}; total time= 1.1min




[CV] END .......n_a=16, n_d=8, optimizer_params={'lr': 0.02}; total time= 1.0min




[CV] END ......n_a=12, n_d=16, optimizer_params={'lr': 0.02}; total time= 1.1min




[CV] END ......n_a=12, n_d=16, optimizer_params={'lr': 0.01}; total time= 1.1min
[CV] END ......n_a=12, n_d=16, optimizer_params={'lr': 0.01}; total time= 1.1min




[CV] END .......n_a=16, n_d=8, optimizer_params={'lr': 0.02}; total time= 1.1min
[CV] END .......n_a=16, n_d=8, optimizer_params={'lr': 0.02}; total time= 1.1min




[CV] END .......n_a=16, n_d=8, optimizer_params={'lr': 0.01}; total time= 1.2min




[CV] END .......n_a=16, n_d=8, optimizer_params={'lr': 0.01}; total time= 1.1min




[CV] END .......n_a=16, n_d=8, optimizer_params={'lr': 0.01}; total time= 1.2min




[CV] END ......n_a=16, n_d=12, optimizer_params={'lr': 0.02}; total time= 1.2min
[CV] END ......n_a=16, n_d=12, optimizer_params={'lr': 0.02}; total time= 1.2min




[CV] END ......n_a=16, n_d=12, optimizer_params={'lr': 0.02}; total time= 1.3min




[CV] END ......n_a=16, n_d=12, optimizer_params={'lr': 0.01}; total time= 1.2min




[CV] END ......n_a=16, n_d=12, optimizer_params={'lr': 0.01}; total time= 1.2min
[CV] END ......n_a=16, n_d=12, optimizer_params={'lr': 0.01}; total time= 1.2min
[CV] END ......n_a=16, n_d=16, optimizer_params={'lr': 0.02}; total time=  59.0s
[CV] END ......n_a=16, n_d=16, optimizer_params={'lr': 0.02}; total time=  59.2s
[CV] END ......n_a=16, n_d=16, optimizer_params={'lr': 0.02}; total time=  58.5s
[CV] END ......n_a=16, n_d=16, optimizer_params={'lr': 0.01}; total time=  59.2s
[CV] END ......n_a=16, n_d=16, optimizer_params={'lr': 0.01}; total time=  57.0s
[CV] END ......n_a=16, n_d=16, optimizer_params={'lr': 0.01}; total time=  52.4s
Best parameters found: {'n_a': 12, 'n_d': 12, 'optimizer_params': {'lr': 0.02}}


In [None]:
# y_pred_proba = best_model.predict_proba(Xtest)[:, 1]
# auc_score = roc_auc_score(ytest, y_pred_proba)
# print(f"Validation AUC score with best model: {auc_score}")

In [33]:
predictions = best_model.predict_proba(Xtest)

In [37]:
auroc = roc_auc_score(ytest, predictions[:,1])
print(f"AUROC score: {auroc}")

AUROC score: 0.8102080067234496


In [None]:
# start_time = time.time()


# explainer = KernelExplainer(model.predict_proba, Xtrain.values) 

# # Calculate SHAP values for a subset of the test data (e.g., first 100 rows)
# shap_values = explainer.shap_values(Xtrain.values[:100])

# # Visualize the explanation for a single instance (e.g., the first instance)


In [None]:
# shap.initjs()
# shap.force_plot(explainer.expected_value[1], shap_values[1][0,:], Xtrain.values.iloc[0,:])

# # Generate a summary plot to show feature importance across the dataset
# shap.summary_plot(shap_values, Xtrain.values) 