## ML using XGBoost
Installations to run:

$ pip install --user xgboost

---------------------
CPU only

$ conda install -c conda-forge py-xgboost-cpu

--Or--

Use NVIDIA GPU

$ conda install -c conda-forge py-xgboost-gpu

In [17]:
# Dependencies
import seaborn as sns

import pandas as pd

import numpy as np

from imblearn.under_sampling import RandomUnderSampler

import matplotlib.pyplot as plt

import warnings


warnings.filterwarnings("ignore")

In [18]:
#Load in dataset
df = pd.read_csv("data/test_100.csv")

# Review the DataFrame
df.head()

Unnamed: 0.1,Unnamed: 0,Amount,Use Chip,Merchant State,Errors?,Is Fraud?,Per Capita Income - Zipcode,Yearly Income - Person,Total Debt,FICO Score,...,Has Chip,Cards Issued,International,Online,Age_at_transaction,income_to_debt,day_of_week,timestamp,time_of_day,distances
0,0,134.09,Swipe Transaction,CA,No Error,0,29278,59696,127613,787,...,1,2,0,0,36,0.467789,6,1030861000.0,Morning,0.0
1,1,38.48,Swipe Transaction,CA,No Error,0,29278,59696,127613,787,...,1,2,0,0,36,0.467789,6,1030863000.0,Morning,33.540588
2,2,120.34,Swipe Transaction,CA,No Error,0,29278,59696,127613,787,...,1,2,0,0,36,0.467789,0,1030948000.0,Morning,33.540588
3,3,128.95,Swipe Transaction,CA,No Error,0,29278,59696,127613,787,...,1,2,0,0,36,0.467789,0,1030989000.0,Afternoon,33.540588
4,4,104.71,Swipe Transaction,CA,No Error,0,29278,59696,127613,787,...,1,2,0,0,36,0.467789,1,1031034000.0,Morning,0.0


In [19]:
df.shape

(1203635, 21)

In [20]:
df.describe()

Unnamed: 0.1,Unnamed: 0,Amount,Is Fraud?,Per Capita Income - Zipcode,Yearly Income - Person,Total Debt,FICO Score,Num Credit Cards,Has Chip,Cards Issued,International,Online,Age_at_transaction,income_to_debt,day_of_week,timestamp,distances
count,1203635.0,1203635.0,1203635.0,1203635.0,1203635.0,1203635.0,1203635.0,1203635.0,1203635.0,1203635.0,1203635.0,1203635.0,1203635.0,1203635.0,1203635.0,1203635.0,1203635.0
mean,627813.3,53.42784,0.001051814,26824.09,52035.43,69607.32,724.3838,3.537857,0.9066287,1.595465,0.006098194,0.1509419,47.23789,1523181.0,3.003065,1336919000.0,207.6576
std,363802.7,78.93761,0.03241463,19352.3,35760.85,64455.62,60.81448,1.618776,0.2909522,0.4909323,0.0778525,0.3579924,15.8015,8205016.0,1.999756,160870200.0,821.9707
min,0.0,0.02,0.0,10059.0,10869.0,0.0,566.0,1.0,0.0,1.0,0.0,0.0,18.0,0.3046351,0.0,828349600.0,0.0
25%,312737.5,12.81,0.0,17898.0,35602.0,21844.0,685.0,2.0,1.0,1.0,0.0,0.0,35.0,0.5123189,1.0,1224319000.0,0.0
50%,625689.0,35.4,0.0,21916.0,42509.0,63689.0,723.0,3.0,1.0,2.0,0.0,0.0,45.0,0.7407534,3.0,1353999000.0,0.0
75%,944451.5,70.17,0.0,26748.0,54537.0,95454.0,770.0,5.0,1.0,2.0,0.0,0.0,57.0,1.913292,5.0,1471149000.0,166.5762
max,1258232.0,6820.2,1.0,163145.0,249925.0,437533.0,850.0,8.0,1.0,3.0,1.0,1.0,91.0,216740000.0,6.0,1582934000.0,8148.505


In [21]:
df.describe(exclude=np.number)

Unnamed: 0,Use Chip,Merchant State,Errors?,time_of_day
count,1203635,1203635,1203635,1203635
unique,3,127,20,4
top,Swipe Transaction,Online,No Error,Morning
freq,728231,181679,1184197,481179


In [22]:
from sklearn.model_selection import train_test_split

# Extract feature and target arrays
X, y = df.drop('Is Fraud?', axis=1), df[['Is Fraud?']]

In [23]:
# Extract text features
cats = X.select_dtypes(exclude=np.number).columns.tolist()

# Convert to Pandas category
for col in cats:
   X[col] = X[col].astype('category')

In [24]:
X.dtypes

Unnamed: 0                        int64
Amount                          float64
Use Chip                       category
Merchant State                 category
Errors?                        category
Per Capita Income - Zipcode       int64
Yearly Income - Person            int64
Total Debt                        int64
FICO Score                        int64
Num Credit Cards                  int64
Has Chip                          int64
Cards Issued                      int64
International                     int64
Online                            int64
Age_at_transaction                int64
income_to_debt                  float64
day_of_week                       int64
timestamp                       float64
time_of_day                    category
distances                       float64
dtype: object

In [25]:
# Apply RandomUnderSampler
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

In [26]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=1)

In [27]:
import xgboost as xgb

# Create regression matrices
dtrain_reg = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test, y_test, enable_categorical=True)

In [28]:
# Define hyperparameters
params = {"objective": "binary:logistic", "tree_method": "gpu_hist"}

n = 100
model = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=n,
)

In [29]:
from sklearn.metrics import mean_squared_error

preds = model.predict(dtest_reg)
# Convert probabilities to binary class predictions
predictions = (preds > 0.5).astype(int)

In [30]:
rmse = mean_squared_error(y_test, preds, squared=False)

print(f"RMSE of the base model: {rmse:.3f}")


RMSE of the base model: 0.213


In [31]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
# Evaluate with classification metrics
y_test_np = y_test.to_numpy()
print("Accuracy:", accuracy_score(y_test_np, predictions))
print("Precision:", precision_score(y_test_np, predictions))
print("Recall:", recall_score(y_test_np, predictions))
print("F1 Score:", f1_score(y_test_np, predictions))
print("Confusion Matrix:\n", confusion_matrix(y_test_np, predictions))
print("Classification Report:\n", classification_report(y_test_np, predictions))

Accuracy: 0.9447077409162717
Precision: 0.9384615384615385
Recall: 0.953125
F1 Score: 0.9457364341085271
Confusion Matrix:
 [[293  20]
 [ 15 305]]
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.94      0.94       313
           1       0.94      0.95      0.95       320

    accuracy                           0.94       633
   macro avg       0.94      0.94      0.94       633
weighted avg       0.94      0.94      0.94       633

