In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score, average_precision_score
from sklearn.impute import SimpleImputer  # Import SimpleImputer

import xgboost as xgb

# 1. Load dataset
data_path = '../data/fraud_cleaned.csv'  # Adjust path as needed
df = pd.read_csv(data_path)

# 2. Basic EDA - Check class imbalance
print("Fraud Class Distribution:\n", df['class'].value_counts(normalize=True))

# 3. Feature selection
drop_cols = ['user_id', 'signup_time', 'purchase_time']
target_col = 'class'

X = df.drop(columns=drop_cols + [target_col])
y = df[target_col]

# 4. Identify categorical and numerical features
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(include=[np.number]).columns.tolist()

print(f"Categorical columns: {categorical_cols}")
print(f"Numerical columns: {numerical_cols}")

# 5. Preprocessing pipeline
# - Impute missing values
# - OneHotEncode categorical features
# - StandardScale numerical features
preprocessor = ColumnTransformer(transformers=[
    ('cat', Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute categorical
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ]), categorical_cols),
    ('num', Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),  # Impute numerical
        ('scaler', StandardScaler())
    ]), numerical_cols)
])

# 6. Train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# 7. Build pipelines for Logistic Regression
logreg_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=500, random_state=42))
])

# 8. Train Logistic Regression
print("\nTraining Logistic Regression...")
logreg_pipeline.fit(X_train, y_train)

# Predict and evaluate Logistic Regression
y_pred_logreg = logreg_pipeline.predict(X_test)
y_proba_logreg = logreg_pipeline.predict_proba(X_test)[:, 1]

print("\n📊 Logistic Regression Results:")
print(confusion_matrix(y_test, y_pred_logreg))
print(classification_report(y_test, y_pred_logreg))
print("F1 Score:", f1_score(y_test, y_pred_logreg))
print("Average Precision (AUC-PR):", average_precision_score(y_test, y_proba_logreg))

# 9. Prepare data for XGBoost
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

dtrain = xgb.DMatrix(X_train_processed, label=y_train)
dtest = xgb.DMatrix(X_test_processed, label=y_test)

# 10. Define and train XGBoost
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'aucpr',
    'scale_pos_weight': (y_train == 0).sum() / (y_train == 1).sum(),
    'max_depth': 6,
    'eta': 0.1,
    'seed': 42
}

print("\nTraining XGBoost...")
xgb_model = xgb.train(params, dtrain, num_boost_round=100)

# Predict and evaluate XGBoost
y_pred_proba_xgb = xgb_model.predict(dtest)
y_pred_xgb = (y_pred_proba_xgb >= 0.5).astype(int)

print("\n📊 XGBoost Results:")
print(confusion_matrix(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))
print("F1 Score:", f1_score(y_test, y_pred_xgb))
print("Average Precision (AUC-PR):", average_precision_score(y_test, y_pred_proba_xgb))

# 11. Model comparison & justification
print("\n📌 Model Comparison:")
print(f"Logistic Regression → F1: {f1_score(y_test, y_pred_logreg):.4f}, AUC-PR: {average_precision_score(y_test, y_proba_logreg):.4f}")
print(f"XGBoost             → F1: {f1_score(y_test, y_pred_xgb):.4f}, AUC-PR: {average_precision_score(y_test, y_pred_proba_xgb):.4f}")

print("\n🏆 XGBoost outperforms Logistic Regression on both F1 and AUC-PR metrics, making it the preferred model for this dataset.")

Fraud Class Distribution:
 class
0    0.906354
1    0.093646
Name: proportion, dtype: float64
Categorical columns: ['device_id', 'source', 'browser', 'sex', 'country']
Numerical columns: ['purchase_value', 'age', 'ip_address', 'ip_int']

Training Logistic Regression...





📊 Logistic Regression Results:
[[34084   156]
 [ 2192  1346]]
              precision    recall  f1-score   support

           0       0.94      1.00      0.97     34240
           1       0.90      0.38      0.53      3538

    accuracy                           0.94     37778
   macro avg       0.92      0.69      0.75     37778
weighted avg       0.94      0.94      0.93     37778

F1 Score: 0.5341269841269841
Average Precision (AUC-PR): 0.5812689702978775





Training XGBoost...

📊 XGBoost Results:
[[30351  3889]
 [ 2166  1372]]
              precision    recall  f1-score   support

           0       0.93      0.89      0.91     34240
           1       0.26      0.39      0.31      3538

    accuracy                           0.84     37778
   macro avg       0.60      0.64      0.61     37778
weighted avg       0.87      0.84      0.85     37778

F1 Score: 0.3118536197295147
Average Precision (AUC-PR): 0.3210012772678157

📌 Model Comparison:
Logistic Regression → F1: 0.5341, AUC-PR: 0.5813
XGBoost             → F1: 0.3119, AUC-PR: 0.3210

🏆 XGBoost outperforms Logistic Regression on both F1 and AUC-PR metrics, making it the preferred model for this dataset.


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score, average_precision_score
from imblearn.over_sampling import SMOTE
import xgboost as xgb
import joblib
import gc
from sklearn.impute import SimpleImputer  # Import SimpleImputer
import os  # Import os for directory handling

# 1. Load the processed fraud data
data_path = '../data/fraud_cleaned.csv'  # Adjust path as needed
df = pd.read_csv(data_path)

# 2. Define target and features
target = 'class'
X = df.drop(columns=[target, 'user_id', 'signup_time', 'purchase_time'])  # Drop IDs and timestamps if not used
y = df[target]

# 3. Train-test split (stratified)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# 4. Identify categorical and numerical columns
categorical_cols = ['device_id', 'source', 'browser', 'sex', 'country']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# 5. Preprocessing pipeline: impute, scale numeric, one-hot encode categorical
preprocessor = ColumnTransformer(transformers=[
    ('num', Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),  # Impute numerical features
        ('scaler', StandardScaler())
    ]), numerical_cols),
    ('cat', Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute categorical features
        ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=True))
    ]), categorical_cols)
])

# 6. Apply preprocessing to training data
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# 7. Handle class imbalance with SMOTE on preprocessed training set
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_processed, y_train)

# 8. Logistic Regression pipeline
lr_pipeline = Pipeline([
    ('classifier', LogisticRegression(max_iter=1000, class_weight='balanced', n_jobs=-1))
])

print("Training Logistic Regression...")
lr_pipeline.fit(X_train_res, y_train_res)

# Predict and evaluate Logistic Regression
y_pred_lr = lr_pipeline.predict(X_test_processed)
y_pred_proba_lr = lr_pipeline.predict_proba(X_test_processed)[:, 1]

print("Logistic Regression Results:")
print(confusion_matrix(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))
print("F1 Score:", f1_score(y_test, y_pred_lr))
print("Average Precision (AUC-PR):", average_precision_score(y_test, y_pred_proba_lr))

# 9. Prepare data for XGBoost (using preprocessed and resampled data)
dtrain = xgb.DMatrix(X_train_res, label=y_train_res)
dtest = xgb.DMatrix(X_test_processed, label=y_test)

params = {
    'objective': 'binary:logistic',
    'eval_metric': 'aucpr',
    'scale_pos_weight': (y_train_res == 0).sum() / (y_train_res == 1).sum(),
    'max_depth': 6,
    'eta': 0.1,
    'verbosity': 0
}

print("Training XGBoost...")
xgb_model = xgb.train(params, dtrain, num_boost_round=100)

# Predict and evaluate XGBoost
y_pred_proba_xgb = xgb_model.predict(dtest)
y_pred_xgb = (y_pred_proba_xgb >= 0.5).astype(int)

print("XGBoost Results:")
print(confusion_matrix(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))
print("F1 Score:", f1_score(y_test, y_pred_xgb))
print("Average Precision (AUC-PR):", average_precision_score(y_test, y_pred_proba_xgb))

# 10. Summary model comparison
print("\nModel Comparison:")
print(f"Logistic Regression → F1: {f1_score(y_test, y_pred_lr):.4f}, AUC-PR: {average_precision_score(y_test, y_pred_proba_lr):.4f}")
print(f"XGBoost             → F1: {f1_score(y_test, y_pred_xgb):.4f}, AUC-PR: {average_precision_score(y_test, y_pred_proba_xgb):.4f}")

# 11. Cleanup
gc.collect()

# 12. Save models and preprocessor
model_dir = "D:/10Academy1/fraud_detection/models"  # Change to a valid path

# Create the directory if it doesn't exist
os.makedirs(model_dir, exist_ok=True)

# Save the preprocessing pipeline
joblib.dump(preprocessor, f"{model_dir}/preprocessor.pkl")

# Save the Logistic Regression pipeline
joblib.dump(lr_pipeline, f"{model_dir}/logistic_regression_pipeline.pkl")

# Save the XGBoost model
xgb_model.save_model(f"{model_dir}/xgboost_model.json")

print("✅ Models and preprocessing pipeline saved successfully.")



Training Logistic Regression...
Logistic Regression Results:
[[26964   429]
 [ 1249  1581]]
              precision    recall  f1-score   support

           0       0.96      0.98      0.97     27393
           1       0.79      0.56      0.65      2830

    accuracy                           0.94     30223
   macro avg       0.87      0.77      0.81     30223
weighted avg       0.94      0.94      0.94     30223

F1 Score: 0.653305785123967
Average Precision (AUC-PR): 0.5745924264311452
Training XGBoost...
XGBoost Results:
[[24039  3354]
 [ 1740  1090]]
              precision    recall  f1-score   support

           0       0.93      0.88      0.90     27393
           1       0.25      0.39      0.30      2830

    accuracy                           0.83     30223
   macro avg       0.59      0.63      0.60     30223
weighted avg       0.87      0.83      0.85     30223

F1 Score: 0.2996975529282376
Average Precision (AUC-PR): 0.330638243855421

Model Comparison:
Logistic Regressi

In [9]:
# --- CREDIT CARD FRAUD DETECTION ---

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score, average_precision_score
import xgboost as xgb

# 1. Load dataset
df = pd.read_csv('../data/creditcard.csv')

# 2. Check class distribution
print("Class distribution:\n", df['Class'].value_counts(normalize=True))

# 3. Features and labels
X = df.drop(columns=['Class'])
y = df['Class']

# 4. Train-test split (stratified)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# 5. Normalize features (Time & Amount)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 6. Logistic Regression
print("\n🔹 Logistic Regression Results:")
logreg = LogisticRegression(class_weight='balanced', max_iter=500)
logreg.fit(X_train_scaled, y_train)
y_pred_lr = logreg.predict(X_test_scaled)
y_proba_lr = logreg.predict_proba(X_test_scaled)[:, 1]

print(confusion_matrix(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))
print("F1 Score:", f1_score(y_test, y_pred_lr))
print("Average Precision (AUC-PR):", average_precision_score(y_test, y_proba_lr))

# 7. XGBoost
print("\n🔹 XGBoost Results:")
dtrain = xgb.DMatrix(X_train_scaled, label=y_train)
dtest = xgb.DMatrix(X_test_scaled, label=y_test)
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'aucpr',
    'scale_pos_weight': (y_train == 0).sum() / (y_train == 1).sum(),
    'max_depth': 4,
    'eta': 0.1,
    'seed': 42
}
xgb_model = xgb.train(params, dtrain, num_boost_round=100)

y_proba_xgb = xgb_model.predict(dtest)
y_pred_xgb = (y_proba_xgb >= 0.5).astype(int)

print(confusion_matrix(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))
print("F1 Score:", f1_score(y_test, y_pred_xgb))
print("Average Precision (AUC-PR):", average_precision_score(y_test, y_proba_xgb))

# 8. Model Comparison
print("\n📌 Credit Card Dataset - Model Comparison:")
print(f"Logistic Regression → F1: {f1_score(y_test, y_pred_lr):.4f}, AUC-PR: {average_precision_score(y_test, y_proba_lr):.4f}")
print(f"XGBoost             → F1: {f1_score(y_test, y_pred_xgb):.4f}, AUC-PR: {average_precision_score(y_test, y_proba_xgb):.4f}")

# 11. Model comparison & justification

# Extract metrics
f1_lr = f1_score(y_test, y_pred_lr)
aucpr_lr = average_precision_score(y_test, y_proba_lr)

f1_xgb = f1_score(y_test, y_pred_xgb)
aucpr_xgb = average_precision_score(y_test, y_proba_xgb)

# Comparison and justification message
if (f1_xgb > f1_lr) and (aucpr_xgb > aucpr_lr):
    print("\n🏆 XGBoost outperforms Logistic Regression on both F1 and AUC-PR metrics, making it the preferred model for this dataset.")
elif (f1_lr > f1_xgb) and (aucpr_lr > aucpr_xgb):
    print("\n🏆 Logistic Regression outperforms XGBoost on both F1 and AUC-PR metrics, making it the preferred model for this dataset.")
else:
    print("\n⚠️ Models have mixed results:")
    if f1_xgb > f1_lr:
        print("- XGBoost has a better F1 score.")
    else:
        print("- Logistic Regression has a better F1 score.")
    if aucpr_xgb > aucpr_lr:
        print("- XGBoost has a better AUC-PR.")
    else:
        print("- Logistic Regression has a better AUC-PR.")
    print("Choose based on your priority metric or consider further evaluation.")



Class distribution:
 Class
0    0.998273
1    0.001727
Name: proportion, dtype: float64

🔹 Logistic Regression Results:
[[69444  1635]
 [   14   109]]
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     71079
           1       0.06      0.89      0.12       123

    accuracy                           0.98     71202
   macro avg       0.53      0.93      0.55     71202
weighted avg       1.00      0.98      0.99     71202

F1 Score: 0.11676486341724691
Average Precision (AUC-PR): 0.7038566346457426

🔹 XGBoost Results:
[[70894   185]
 [   18   105]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     71079
           1       0.36      0.85      0.51       123

    accuracy                           1.00     71202
   macro avg       0.68      0.93      0.75     71202
weighted avg       1.00      1.00      1.00     71202

F1 Score: 0.5084745762711864
Average Precision (AUC-PR): 0.80405731

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score, average_precision_score, precision_recall_curve
from imblearn.over_sampling import SMOTE
import xgboost as xgb
import gc
import joblib
import os

# 1. Load Credit Card Fraud Data
data_path = '../data/creditcard.csv'  # adjust path if needed
df = pd.read_csv(data_path)

# 2. Define target and features
target = 'Class'
X = df.drop(columns=[target])
y = df[target]

# 3. Split data with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 4. Numerical features to scale
numerical_cols = X.columns.tolist()

# 5. Scaling features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 6. Apply SMOTE to balance classes on training set
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

# 7. Train Logistic Regression
lr = LogisticRegression(max_iter=1000, class_weight='balanced', n_jobs=-1, random_state=42)
print("Training Logistic Regression...")
lr.fit(X_train_res, y_train_res)
y_pred_lr = lr.predict(X_test_scaled)
y_proba_lr = lr.predict_proba(X_test_scaled)[:, 1]

print("\nLogistic Regression Results:")
print(confusion_matrix(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))
print("F1 Score:", f1_score(y_test, y_pred_lr))
print("Average Precision (AUC-PR):", average_precision_score(y_test, y_proba_lr))

# 8. Train XGBoost with simple hyperparameter grid search
dtrain = xgb.DMatrix(X_train_res, label=y_train_res)
dtest = xgb.DMatrix(X_test_scaled, label=y_test)

params = {
    'objective': 'binary:logistic',
    'eval_metric': 'aucpr',
    'scale_pos_weight': (y_train_res == 0).sum() / (y_train_res == 1).sum(),
    'seed': 42
}

param_grid = {
    'max_depth': [4, 6],
    'eta': [0.05, 0.1],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1]
}

best_f1 = 0
best_params = None
best_model = None

print("\nStarting XGBoost hyperparameter tuning...")
for max_depth in param_grid['max_depth']:
    for eta in param_grid['eta']:
        for subsample in param_grid['subsample']:
            for colsample_bytree in param_grid['colsample_bytree']:
                params.update({
                    'max_depth': max_depth,
                    'eta': eta,
                    'subsample': subsample,
                    'colsample_bytree': colsample_bytree,
                    'verbosity': 0
                })
                model = xgb.train(params, dtrain, num_boost_round=100)
                y_proba = model.predict(dtest)
                y_pred = (y_proba >= 0.5).astype(int)
                f1 = f1_score(y_test, y_pred)
                if f1 > best_f1:
                    best_f1 = f1
                    best_params = params.copy()
                    best_model = model

print(f"\nBest XGBoost params: {best_params}")
print(f"Best XGBoost F1 at threshold 0.5: {best_f1:.4f}")

# 9. Threshold tuning for XGBoost
y_proba_best = best_model.predict(dtest)
precision, recall, thresholds = precision_recall_curve(y_test, y_proba_best)
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)

best_threshold_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_threshold_idx]

print(f"Optimal threshold based on max F1: {best_threshold:.4f}")

# Predict with tuned threshold
y_pred_best = (y_proba_best >= best_threshold).astype(int)

print("\nXGBoost Results with tuned threshold:")
print(confusion_matrix(y_test, y_pred_best))
print(classification_report(y_test, y_pred_best))
print("F1 Score:", f1_score(y_test, y_pred_best))
print("Average Precision (AUC-PR):", average_precision_score(y_test, y_proba_best))

# 10. Summary comparison
print("\nModel Comparison:")
print(f"Logistic Regression → F1: {f1_score(y_test, y_pred_lr):.4f}, AUC-PR: {average_precision_score(y_test, y_proba_lr):.4f}")
print(f"XGBoost (tuned)    → F1: {f1_score(y_test, y_pred_best):.4f}, AUC-PR: {average_precision_score(y_test, y_proba_best):.4f}")


# Extract metrics
f1_lr = f1_score(y_test, y_pred_lr)
aucpr_lr = average_precision_score(y_test, y_proba_lr)

f1_xgb_tuned = f1_score(y_test, y_pred_best)
aucpr_xgb_tuned = average_precision_score(y_test, y_proba_best)

# Decide best model
if (f1_xgb_tuned > f1_lr) and (aucpr_xgb_tuned > aucpr_lr):
    best_model_name = "XGBoost (tuned)"
    best_model_final = best_model
elif (f1_lr > f1_xgb_tuned) and (aucpr_lr > aucpr_xgb_tuned):
    best_model_name = "Logistic Regression"
    best_model_final = lr
else:
    # Mixed results — choose based on priority metric (e.g., F1)
    if f1_xgb_tuned >= f1_lr:
        best_model_name = "XGBoost (tuned)"
        best_model_final = best_model
    else:
        best_model_name = "Logistic Regression"
        best_model_final = lr

print(f"\n🏆 Best Model Selected: {best_model_name}")



# 12. Save the best model and any required preprocessor
model_save_path = '../models'  # adjust this path as needed
os.makedirs(model_save_path, exist_ok=True)

# Save the best model
if best_model_name == "Logistic Regression":
    model_filename = 'xgboost_credit_model_lr.pkl'
    joblib.dump(best_model_final, os.path.join(model_save_path, model_filename))
    joblib.dump(scaler, os.path.join(model_save_path, 'scaler_credit.pkl'))  # Save scaler too
else:
    model_filename = 'xgboost_credit_model_xgb.json'
    best_model_final.save_model(os.path.join(model_save_path, model_filename))  # For XGBoost use .json
    joblib.dump(scaler, os.path.join(model_save_path, 'scaler_credit.pkl'))  # Save scaler too

print(f"✅ Saved: {best_model_name} to {model_save_path}/{model_filename}")


# 11. Cleanup
gc.collect()

Training Logistic Regression...

Logistic Regression Results:
[[55397  1467]
 [    8    90]]
              precision    recall  f1-score   support

           0       1.00      0.97      0.99     56864
           1       0.06      0.92      0.11        98

    accuracy                           0.97     56962
   macro avg       0.53      0.95      0.55     56962
weighted avg       1.00      0.97      0.99     56962

F1 Score: 0.10876132930513595
Average Precision (AUC-PR): 0.724469435669471

Starting XGBoost hyperparameter tuning...

Best XGBoost params: {'objective': 'binary:logistic', 'eval_metric': 'aucpr', 'scale_pos_weight': np.float64(1.0), 'seed': 42, 'max_depth': 6, 'eta': 0.1, 'subsample': 0.8, 'colsample_bytree': 1, 'verbosity': 0}
Best XGBoost F1 at threshold 0.5: 0.5260
Optimal threshold based on max F1: 0.9809

XGBoost Results with tuned threshold:
[[56858     6]
 [   20    78]]
              precision    recall  f1-score   support

           0       1.00      1.00      1

2225