## ML using XGBoost
Installations to run:

$ pip install --user xgboost

---------------------
CPU only

$ conda install -c conda-forge py-xgboost-cpu

--Or--

Use NVIDIA GPU

$ conda install -c conda-forge py-xgboost-gpu

In [1]:
# Dependencies
import seaborn as sns

import pandas as pd

import numpy as np

from imblearn.under_sampling import RandomUnderSampler

import matplotlib.pyplot as plt

import warnings


warnings.filterwarnings("ignore")

In [2]:
#Load in dataset
df = pd.read_csv("data/test_1000.csv")

# Review the DataFrame
df.head()

Unnamed: 0.1,Unnamed: 0,Amount,Use Chip,Merchant State,Errors?,Is Fraud?,Per Capita Income - Zipcode,Yearly Income - Person,Total Debt,FICO Score,...,Has Chip,Cards Issued,International,Online,Age_at_transaction,income_to_debt,day_of_week,timestamp,time_of_day,distances
0,0,134.09,Swipe Transaction,CA,No Error,0,29278,59696,127613,787,...,1,2,0,0,36,0.467789,6,1030861000.0,Morning,0.0
1,1,38.48,Swipe Transaction,CA,No Error,0,29278,59696,127613,787,...,1,2,0,0,36,0.467789,6,1030863000.0,Morning,33.540588
2,2,120.34,Swipe Transaction,CA,No Error,0,29278,59696,127613,787,...,1,2,0,0,36,0.467789,0,1030948000.0,Morning,33.540588
3,3,128.95,Swipe Transaction,CA,No Error,0,29278,59696,127613,787,...,1,2,0,0,36,0.467789,0,1030989000.0,Afternoon,33.540588
4,4,104.71,Swipe Transaction,CA,No Error,0,29278,59696,127613,787,...,1,2,0,0,36,0.467789,1,1031034000.0,Morning,0.0


In [3]:
df.shape

(11487390, 21)

In [4]:
df.describe()

Unnamed: 0.1,Unnamed: 0,Amount,Is Fraud?,Per Capita Income - Zipcode,Yearly Income - Person,Total Debt,FICO Score,Num Credit Cards,Has Chip,Cards Issued,International,Online,Age_at_transaction,income_to_debt,day_of_week,timestamp,distances
count,11487390.0,11487390.0,11487390.0,11487390.0,11487390.0,11487390.0,11487390.0,11487390.0,11487390.0,11487390.0,11487390.0,11487390.0,11487390.0,11487390.0,11487390.0,11487390.0,10003860.0
mean,6066054.0,52.70799,0.001214201,24543.09,47666.12,59827.85,709.2227,3.673632,0.8960742,1.522627,0.006491379,0.1226491,45.89595,2162980.0,3.003639,1340771000.0,165.7987
std,3508492.0,75.97164,0.03482423,12514.63,25014.69,52633.6,68.05891,1.569881,0.3051643,0.5179914,0.08030717,0.328034,15.62589,10338080.0,1.999734,159461800.0,605.0011
min,0.0,0.01,0.0,0.0,1.0,0.0,488.0,1.0,0.0,1.0,0.0,0.0,17.0,0.2008577,0.0,662800200.0,0.0
25%,3018895.0,12.17,0.0,17271.0,33566.0,18116.0,681.0,3.0,1.0,1.0,0.0,0.0,34.0,0.5291839,1.0,1232008000.0,0.0
50%,6067742.0,34.14,0.0,21639.0,41469.0,52419.0,714.0,4.0,1.0,2.0,0.0,0.0,44.0,0.7622536,3.0,1357837000.0,0.0
75%,9096439.0,69.12,0.0,28760.0,56605.0,85771.0,752.0,5.0,1.0,2.0,0.0,0.0,55.0,2.274873,5.0,1472105000.0,36.25127
max,12136590.0,12390.5,1.0,163145.0,280199.0,448929.0,850.0,9.0,1.0,3.0,1.0,1.0,102.0,216740000.0,6.0,1582934000.0,8317.257


In [5]:
df.describe(exclude=np.number)

Unnamed: 0,Use Chip,Merchant State,Errors?,time_of_day
count,11487390,11487390,11487390,11487390
unique,3,191,23,4
top,Swipe Transaction,Online,No Error,Morning
freq,7159943,1408918,11302080,4612490


In [6]:
from sklearn.model_selection import train_test_split

# Extract feature and target arrays
X, y = df.drop('Is Fraud?', axis=1), df[['Is Fraud?']]

In [7]:
# Extract text features
cats = X.select_dtypes(exclude=np.number).columns.tolist()

# Convert to Pandas category
for col in cats:
   X[col] = X[col].astype('category')

In [8]:
X.dtypes

Unnamed: 0                        int64
Amount                          float64
Use Chip                       category
Merchant State                 category
Errors?                        category
Per Capita Income - Zipcode       int64
Yearly Income - Person            int64
Total Debt                        int64
FICO Score                        int64
Num Credit Cards                  int64
Has Chip                          int64
Cards Issued                      int64
International                     int64
Online                            int64
Age_at_transaction                int64
income_to_debt                  float64
day_of_week                       int64
timestamp                       float64
time_of_day                    category
distances                       float64
dtype: object

In [9]:
# Apply RandomUnderSampler
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

In [10]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=1)

In [11]:
import xgboost as xgb

# Create regression matrices
dtrain_reg = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test, y_test, enable_categorical=True)

In [12]:
# Define hyperparameters
params = {"objective": "binary:logistic", "tree_method": "gpu_hist"}

n = 100
model = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=n,
)

In [13]:
from sklearn.metrics import mean_squared_error

# Make predictions
train_preds = model.predict(dtrain_reg)
test_preds = model.predict(dtest_reg)

# Convert probabilities to binary predictions
train_preds_binary = (train_preds > 0.5).astype(int)
test_preds_binary = (test_preds > 0.5).astype(int)

In [14]:
rmse = mean_squared_error(y_test, test_preds, squared=False)

print(f"RMSE of the base model: {rmse:.3f}")


RMSE of the base model: 0.180


In [15]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
# Evaluate with classification metrics
y_test_np = y_test.to_numpy()
print("Confusion Matrix (Test Set):")
print(confusion_matrix(y_test, test_preds_binary))
print("Classification Report (Test Set):")
print(classification_report(y_test, test_preds_binary))

Confusion Matrix (Test Set):
[[3342  197]
 [  99 3336]]
Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.97      0.94      0.96      3539
           1       0.94      0.97      0.96      3435

    accuracy                           0.96      6974
   macro avg       0.96      0.96      0.96      6974
weighted avg       0.96      0.96      0.96      6974



In [16]:
print("Confusion Matrix (Training Set):")
print(confusion_matrix(y_train, train_preds_binary))
print("Classification Report (Training Set):")
print(classification_report(y_train, train_preds_binary))

Confusion Matrix (Training Set):
[[10223   186]
 [   18 10495]]
Classification Report (Training Set):
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     10409
           1       0.98      1.00      0.99     10513

    accuracy                           0.99     20922
   macro avg       0.99      0.99      0.99     20922
weighted avg       0.99      0.99      0.99     20922

