In [21]:
import os
from preprocessing.base_model_preprocessor import build_preprocessor
from preprocessing.cleaner import clean_data
import numpy as np
import pandas as pd
import sqlite3, math
import xgboost as xgb

#from xgboost import XGBClassifier
#from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_score
from sklearn.model_selection import GridSearchCV

# Data Loading and Preprocessing

In [2]:
# Load CSV data into Pandas DataFrame 
#train_identity_data = pd.read_csv('../data/raw_data/train_identity.csv') 
train_transaction_data = pd.read_csv('../data/raw_data/train_transaction.csv') 
#test_identity_data = pd.read_csv('../data/raw_data/test_identity.csv') 
test_transaction_data = pd.read_csv('../data/raw_data/test_transaction.csv') 

In [3]:
cat_cols = ["ProductCD", 
                    "card1", "card2", "card3", "card4", "card5", "card6",
                    "addr1", "addr2",
                    "P_emaildomain", "R_emaildomain",
                    "M1", "M2", "M3", "M4", "M5", "M6", "M7", "M8", "M9"
                   ]
union_list = list(set(cat_cols).union({"TransactionID", "isFraud", }))
sub_df = train_transaction_data.drop(columns=union_list)
numeric_cols = sub_df.columns

In [4]:
X_train, X_test, y_train, y_test = clean_data(train_transaction_data, split=True, cat_cols=cat_cols, y_col="isFraud")

In [5]:
prep = build_preprocessor(numeric_cols, cat_cols)
X_train_processed = prep.fit_transform(X_train)
X_test_processed = prep.transform(X_test)

In [18]:
feat_names = prep.get_feature_names_out()
feat_names

array(['cat__ProductCD_C', 'cat__ProductCD_H', 'cat__ProductCD_R',
       'cat__ProductCD_S', 'cat__ProductCD_W', 'cat__card1_Other',
       'cat__card2_Other', 'cat__card3_Other', 'cat__card4_Other',
       'cat__card4_american express', 'cat__card4_discover',
       'cat__card4_mastercard', 'cat__card4_visa', 'cat__card5_Other',
       'cat__card6_Other', 'cat__card6_credit', 'cat__card6_debit',
       'cat__addr1_Other', 'cat__addr2_Other', 'cat__P_emaildomain_Other',
       'cat__P_emaildomain_anonymous.com', 'cat__P_emaildomain_aol.com',
       'cat__P_emaildomain_att.net', 'cat__P_emaildomain_comcast.net',
       'cat__P_emaildomain_gmail.com', 'cat__P_emaildomain_hotmail.com',
       'cat__P_emaildomain_icloud.com', 'cat__P_emaildomain_msn.com',
       'cat__P_emaildomain_outlook.com', 'cat__P_emaildomain_yahoo.com',
       'cat__M1_Other', 'cat__M1_T', 'cat__M2_F', 'cat__M2_Other',
       'cat__M2_T', 'cat__M3_F', 'cat__M3_Other', 'cat__M3_T',
       'cat__M4_M0', 'cat__M4_M1',

In [19]:
df_new = pd.DataFrame(X_train_processed, columns=feat_names)

In [7]:
X_train_processed.shape

(472432, 262)

# Training a basic gradient boosted forest

In [8]:
# fit model no training data
model = xgb.XGBClassifier()
model.fit(X_train_processed, y_train)

In [9]:
# make predictions for test data
y_pred = model.predict(X_test_processed)
predictions = [round(value) for value in y_pred]
model.get_xgb_params()

{'objective': 'binary:logistic',
 'base_score': None,
 'booster': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'device': None,
 'eval_metric': None,
 'gamma': None,
 'grow_policy': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [10]:
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 97.91%


In [11]:
precision = precision_score(y_test, predictions)
print("Precision: %.2f%%" % (precision * 100.0))
recall = recall_score(y_test, predictions, average='binary')
print('Recall: %.3f' % (recall * 100.0))
score = f1_score(y_test, predictions, average='binary')
print('F-Measure: %.3f' % (score * 100.0))

Precision: 88.74%
Recall: 45.971
F-Measure: 60.567


In [12]:
# double check F1 measure calculation
(2*precision*recall)/(precision+recall)

0.6056742110296461

# Hyperparameter Tuning with Cross Validated Grid Search

In [13]:
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01],
    'subsample': [0.5, 0.7, 1]
}
#param_grid = {
#    'max_depth': [3, 5, 7],
#    'learning_rate': [0.1, 0.01, 0.001],
#    'subsample': [0.5, 0.7, 1]
#}

In [14]:
model_tuned = xgb.XGBClassifier()


# Create the GridSearchCV object
grid_search = GridSearchCV(model_tuned, param_grid, cv=5, scoring='accuracy')

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train_processed, y_train)

# Print the best set of hyperparameters and the corresponding score
print("Best set of hyperparameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

Best set of hyperparameters:  {'learning_rate': 0.1, 'max_depth': 7, 'subsample': 0.7}
Best score:  0.977787279671543


In [15]:
# retrain on whole dataset
model_tuned = xgb.XGBClassifier()
model_tuned.fit(X_train_processed, y_train)

predictions = model_tuned.predict(X_test_processed, y_test)
predictions



ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
precision = precision_score(y_test, predictions)
print("Precision: %.2f%%" % (precision * 100.0))
recall = recall_score(y_test, predictions, average='binary')
print('Recall: %.3f' % (recall * 100.0))
score = f1_score(y_test, predictions, average='binary')
print('F-Measure: %.3f' % (score * 100.0))

# Testing preprocessed data

In [24]:
DATA_DIRECTORY = "../data/processed_data"
train_path = os.path.join(DATA_DIRECTORY, "processed_X_train_transaction_data.csv")
test_path = os.path.join(DATA_DIRECTORY, "processed_X_test_transaction_data.csv")

In [28]:
df_train = pd.read_csv(train_path, index=False)
df_test = pd.read_csv(test_path)

TypeError: read_csv() got an unexpected keyword argument 'index'

In [29]:
df_train.isFraud

0         NaN
1         0.0
2         0.0
3         NaN
4         NaN
         ... 
472427    0.0
472428    0.0
472429    0.0
472430    0.0
472431    0.0
Name: isFraud, Length: 472432, dtype: float64