In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import sys
sys.path.append('../../../../infrastructure/tools')
from utilities import *
from plotting import *
import pdb
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
import time
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import time
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

In [2]:
data = pd.read_csv("../../../data/processed_data/training_user.csv")

In [3]:
data.columns.tolist()

['transaction_id',
 'date',
 'user_id',
 'card_id',
 'amount',
 'use_chip',
 'merchant_id',
 'zip',
 'target',
 'card_brand',
 'card_type',
 'card_number',
 'expires',
 'cvv',
 'has_chip',
 'num_cards_issued',
 'credit_limit',
 'acct_open_date',
 'year_pin_last_changed',
 'card_on_dark_web',
 'current_age',
 'retirement_age',
 'birth_year',
 'birth_month',
 'gender',
 'address',
 'latitude',
 'longitude',
 'per_capita_income',
 'yearly_income',
 'total_debt',
 'credit_score',
 'num_credit_cards',
 'sum_credit_limit',
 'sum_card_on_dark_web',
 'dti',
 'income_to_median',
 'geo_encoding',
 'date_year',
 'date_month',
 'date_weekend',
 'date_year_end',
 'date_year_start',
 'date_hour',
 'amount_24_hours_user_id',
 'amount_3_days_user_id',
 'amount_7_days_user_id',
 'amount_30_days_user_id',
 'amount_60_days_user_id',
 'amount_90_days_user_id',
 'merchant_state_nan',
 'merchant_state_0',
 'merchant_state_1',
 'merchant_state_2',
 'merchant_state_3',
 'merchant_state_4',
 'merchant_state_5'

In [4]:
x_vars = [
 'amount',
 'use_chip',
 'card_brand',
 'card_type',
 'has_chip',
 'num_cards_issued',
 'credit_limit',
 'year_pin_last_changed',
 'current_age',
 'retirement_age',
 'birth_year',
 'birth_month',
 'credit_score',
 'num_credit_cards',
 'sum_credit_limit',
 'sum_card_on_dark_web',
 'dti',
 'income_to_median',
 'geo_encoding',
 'date_year',
 'date_month',
 'date_weekend',
 'date_year_end',
 'date_year_start',
 'date_hour',
 'amount_24_hours_user_id',
 'amount_3_days_user_id',
 'amount_7_days_user_id',
 'amount_30_days_user_id',
 'amount_60_days_user_id',
 'amount_90_days_user_id',
 'merchant_state_nan',
 'merchant_state_0',
 'merchant_state_1',
 'merchant_state_2',
 'merchant_state_3',
 'merchant_state_4',
 'merchant_state_5',
 'merchant_state_6',
 'merchant_state_7',
 'merchant_state_8',
 'merchant_state_9',
 'merchant_state_10',
 'mcc_nan',
 'mcc_0',
 'mcc_1',
 'mcc_2',
 'mcc_3',
 'mcc_4',
 'mcc_5',
 'mcc_6',
 'mcc_7',
 'mcc_8',
 'errors_nan',
 'errors_0',
 'errors_1',
 'errors_2',
 'merchant_type_Cleaning and Maintenance Services',
 'merchant_type_nan',
 'merchant_type_0',
 'merchant_type_1',
 'merchant_type_2',
 'merchant_type_3',
 'merchant_type_4',
 'merchant_type_5',
 'merchant_type_6',
 'merchant_type_7',
 'merchant_type_8',
 'merchant_type_9',
 'merchant_city_Annandale',
 'merchant_city_Buchanan',
 'merchant_city_Buchanan Dam',
 'merchant_city_Chenango Forks',
 'merchant_city_Chittenango',
 'merchant_city_Ferdinand',
 'merchant_city_Fernandina Beach',
 'merchant_city_Hernando',
 'merchant_city_Kenansville',
 'merchant_city_Menan',
 'merchant_city_Newnan',
 'merchant_city_San Fernando',
 'merchant_city_Shenandoah',
 'merchant_city_Swannanoa',
 'merchant_city_Venango',
 'merchant_city_nan',
 'merchant_city_0',
 'merchant_city_1']

In [5]:
X = data[x_vars]

In [6]:
qcut_vars = ['dti','credit_score']

In [7]:
qcut_dict = {}
for var in qcut_vars:
    qcut_dict[var] = [i*0.1 for i in range(0,11)]

In [8]:
data, cps = utilities.binning_q(data, qcut_dict)

In [9]:
xy = utilities.pivot(data, varlist={'target':'logodds', 'dti':'mean'}, by_vars = ['dti_bin'])

In [10]:
xy

Unnamed: 0,dti_bin,count,logodds_target,mean_dti
0,0,55760,-6.416319,0.01091
1,1,55522,-6.516337,0.112421
2,2,54909,-6.357824,0.44243
3,3,55326,-6.246234,0.782573
4,4,55596,-6.935911,1.153461
5,5,55485,-6.828441,1.391611
6,6,55100,-6.62512,1.620061
7,7,55809,-5.622777,1.88754
8,8,54725,-6.7345,2.195413
9,9,55328,-6.476846,2.815094


In [11]:
figs = []
for var in qcut_vars:
    var_bin = f"{var}_bin"
    xy = utilities.pivot(data, varlist={'target':'logodds', var:'mean'}, by_vars = [var_bin])
    figs.append(plotting.px_scatter_plot(xy, f'mean_{var}', 'logodds_target', show=False))

In [12]:
if os.path.exists("logodds_plots.html"):
  os.remove("logodds_plots.html")
    
with open('logodds_plots.html', 'a') as file:
    for f in figs:
        file.write(f.to_html())

In [13]:
params = {
        'min_child_weight':100,
        'reg_alpha':0.3,
        'subsample': 0.7,
        'colsample_bytree': 0.6,
        'max_depth': 3,
        'learning_rate': 0.02
        }

In [14]:
for v in ['use_chip','card_brand','card_type','has_chip']:
    data[v] = data[v].astype('category')

In [15]:
X = data[x_vars]
y = data['target']

In [16]:
model = XGBClassifier(n_estimators=100, random_state=12, enable_categorical=True, params=params)

In [17]:
start_time = time.time()
model.fit(X,y)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")

Elapsed time: 1.947739839553833 seconds


In [18]:
y_pred = model.predict(X)
auroc = roc_auc_score(y, y_pred)
print(f"AUROC score: {auroc}")

AUROC score: 0.9233242539458771


In [19]:
importances = model.feature_importances_
feature_imp = pd.DataFrame(columns=['feature','importance'])

In [20]:
feature_imp.feature = X.columns
feature_imp.importance = importances

In [21]:
feature_imp = feature_imp.sort_values('importance', ascending=False)

In [None]:
test_data = pd.read_csv("../../../data/processed_data/testing_user.csv")
for v in ['use_chip','card_brand','card_type','has_chip']:
    test_data[v] = test_data[v].astype('category')

In [None]:
def choose_top_n(model,X,y,feature_list, nlist):

    metrics = pd.DataFrame(columns=['test_auc','auc_diff', 'top_n'])

    test_auc_list = []

    auc_diff_list = []
    
    for n in nlist:
        
        top_n = feature_list[0:n]
        X_top_n = X[top_n]
        start_time = time.time()
        model.fit(X_top_n,y)
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"Elapsed time: {elapsed_time} seconds")

        y_pred = model.predict(X_top_n)
        train_auroc = roc_auc_score(y, y_pred)
        print(f"training AUROC score: {train_auroc}")

        X_test = test_data[top_n]
        y_test = test_data['target']

        y_test_pred = model.predict(X_test)
        test_auroc = roc_auc_score(y_test, y_test_pred)
        print(f"testing AUROC score: {test_auroc}")

        auc_diff_list.append(train_auroc-test_auroc)

        test_auc_list.append(test_auroc)

    metrics.test_auc = test_auc_list
    metrics.auc_diff = auc_diff_list
    metrics.top_n = nlist

    return metrics.sort_values(by=['test_auc','auc_diff'], ascending=[False,True])
        

In [None]:
top = choose_top_n(model, X, y, feature_imp.feature.tolist(), nlist=range(10,20))

In [None]:
top.head(1)

In [None]:
x_vars = feature_imp.feature.head(15).tolist()

In [None]:
start_time = time.time()
model.fit(X[x_vars],y)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")

In [None]:
y_pred = model.predict(X[x_vars])

In [None]:
ConfusionMatrixDisplay.from_estimator(model, X[x_vars], y)

In [None]:
X_test = test_data[x_vars]
y_test = test_data['target']

ConfusionMatrixDisplay.from_estimator(model, X_test, y_test)