###   Model Building and Training

In [1]:
# Importhing important Libraries 
import sys
import os
sys.path.append(os.path.abspath(".."))
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score, f1_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings("ignore")

#### 1. Data Preparation

In [2]:
credit_df = pd.read_csv("../data/creditcard.csv")
fraud_df = pd.read_csv("../data/Fraud_Data.csv")

In [5]:
credit_df.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')

In [6]:

fraud_df.columns

Index(['user_id', 'signup_time', 'purchase_time', 'purchase_value',
       'device_id', 'source', 'browser', 'sex', 'age', 'ip_address', 'class'],
      dtype='object')

In [7]:
# Preprocessing - Credit Card Data

X_credit = credit_df.drop(columns=['Class'])
y_credit = credit_df['Class']


In [10]:
# Standardize numeric features
scaler_credit = StandardScaler()
X_credit_scaled = scaler_credit.fit_transform(X_credit)


In [11]:
# Train-test split
Xc_train, Xc_test, yc_train, yc_test = train_test_split(
    X_credit_scaled, y_credit, test_size=0.3, stratify=y_credit, random_state=42)


In [12]:
# Handle imbalance
smote = SMOTE(random_state=42)
Xc_train_bal, yc_train_bal = smote.fit_resample(Xc_train, yc_train)

In [13]:
# 🧹 Preprocessing - Fraud_Data (E-Commerce)

fraud_df = fraud_df.dropna()
fraud_df['signup_time'] = pd.to_datetime(fraud_df['signup_time'])
fraud_df['purchase_time'] = pd.to_datetime(fraud_df['purchase_time'])


In [14]:
# Time difference feature
fraud_df['time_diff_sec'] = (fraud_df['purchase_time'] - fraud_df['signup_time']).dt.total_seconds()

In [15]:
# Drop ID/time columns
fraud_df = fraud_df.drop(columns=['user_id', 'signup_time', 'purchase_time', 'device_id', 'ip_address'])

In [16]:
# One-hot encode categoricals
fraud_df = pd.get_dummies(fraud_df, columns=['source', 'browser', 'sex'], drop_first=True)


In [17]:
X_fraud = fraud_df.drop(columns=['class'])
y_fraud = fraud_df['class']


In [18]:
# Scale
scaler_fraud = StandardScaler()
X_fraud_scaled = scaler_fraud.fit_transform(X_fraud)


In [19]:
# Train-test split
Xf_train, Xf_test, yf_train, yf_test = train_test_split(
    X_fraud_scaled, y_fraud, test_size=0.3, stratify=y_fraud, random_state=42)

In [20]:
# Handle imbalance
Xf_train_bal, yf_train_bal = smote.fit_resample(Xf_train, yf_train)


In [21]:
# 🔍 Evaluation Function

def evaluate_model(model, X_train, y_train, X_test, y_test, label):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    print(f"\n📊 Evaluation Results for {label}:")
    print("-" * 40)
    print("F1 Score:", f1_score(y_test, y_pred))
    print("ROC AUC Score:", roc_auc_score(y_test, y_prob))
    print("Average Precision (AUC-PR):", average_precision_score(y_test, y_prob))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))


In [22]:
# 🧪 Train and Compare Models
lr_model = LogisticRegression(solver='liblinear')

In [24]:
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)


In [25]:
# Credit Card
print("\n=== 🚀 Credit Card Dataset ===")
evaluate_model(lr_model, Xc_train_bal, yc_train_bal, Xc_test, yc_test, "Logistic Regression")
evaluate_model(xgb_model, Xc_train_bal, yc_train_bal, Xc_test, yc_test, "XGBoost")



=== 🚀 Credit Card Dataset ===

📊 Evaluation Results for Logistic Regression:
----------------------------------------
F1 Score: 0.12009237875288684
ROC AUC Score: 0.9670318671447107
Average Precision (AUC-PR): 0.7071301472238914

Confusion Matrix:
[[83408  1887]
 [   18   130]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     85295
           1       0.06      0.88      0.12       148

    accuracy                           0.98     85443
   macro avg       0.53      0.93      0.55     85443
weighted avg       1.00      0.98      0.99     85443


📊 Evaluation Results for XGBoost:
----------------------------------------
F1 Score: 0.7727272727272727
ROC AUC Score: 0.9738266873474096
Average Precision (AUC-PR): 0.8255596496077687

Confusion Matrix:
[[85254    41]
 [   29   119]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295


In [26]:
# E-Commerce Fraud Data
print("\n=== 🚀 E-Commerce Fraud Dataset ===")
evaluate_model(lr_model, Xf_train_bal, yf_train_bal, Xf_test, yf_test, "Logistic Regression")
evaluate_model(xgb_model, Xf_train_bal, yf_train_bal, Xf_test, yf_test, "XGBoost")



=== 🚀 E-Commerce Fraud Dataset ===

📊 Evaluation Results for Logistic Regression:
----------------------------------------
F1 Score: 0.26915717539863326
ROC AUC Score: 0.7567761681163193
Average Precision (AUC-PR): 0.46708325704003706

Confusion Matrix:
[[26338 14751]
 [ 1291  2954]]

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.64      0.77     41089
           1       0.17      0.70      0.27      4245

    accuracy                           0.65     45334
   macro avg       0.56      0.67      0.52     45334
weighted avg       0.88      0.65      0.72     45334


📊 Evaluation Results for XGBoost:
----------------------------------------
F1 Score: 0.6800419224434796
ROC AUC Score: 0.767079124200531
Average Precision (AUC-PR): 0.6076537089384364

Confusion Matrix:
[[40926   163]
 [ 1974  2271]]

Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97     4