In [1]:
import os
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score
)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

print("Libraries imported.")

# Environment detection: the presence of the COLAB_GPU environment variable
# is treated as "running in Colab"; anything else is a local run.
running_in_colab = os.environ.get("COLAB_GPU") is not None

if not running_in_colab:
    current_dir = Path.cwd()

    # Probe order matters: a "data" folder next to the working directory
    # (notebook inside /notebooks/) wins over one inside the working directory.
    data_dir_candidates = (current_dir.parent / "data", current_dir / "data")
    BASE = next(
        (candidate for candidate in data_dir_candidates if candidate.exists()),
        None,
    )
    if BASE is None:
        # Neither location exists: fall back to the working directory itself.
        BASE = current_dir
        print("Warning: Could not find 'data' folder. Using current directory.")
else:
    # On Colab prefer /content/data and fall back to /content.
    colab_data_dir = Path("/content/data")
    BASE = colab_data_dir if colab_data_dir.exists() else Path("/content")

print("Using BASE directory:", BASE)

# The processed modelling dataset is expected directly under BASE.
data_path = BASE / "processed_data_ready_for_modeling.csv"

# Read the processed dataset into memory and report its dimensions.
df = pd.read_csv(data_path)
print(f"Data loaded. Shape: {df.shape}")


Libraries imported.
Using BASE directory: D:\fraud_detection_project\data
Data loaded. Shape: (5410, 22)


In [2]:
# Pull the label column out; every remaining column is a feature.
y = df["PotentialFraud"]
X = df.drop(columns="PotentialFraud")

print("Target value counts:")
print(y.value_counts())

# Stage 1: keep 60% for training and park the other 40% in a temporary pool.
# Stratifying on y keeps the class ratio the same on both sides.
X_train_full, X_temp, y_train_full, y_temp = train_test_split(
    X, y, test_size=0.4, stratify=y, random_state=42
)

# Stage 2: cut the temporary pool in half -> 20% validation, 20% test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

# Report the size and class balance of each partition.
print(f"Train shape: {X_train_full.shape}, class dist:\n{y_train_full.value_counts()}")
print(f"Val shape:   {X_val.shape}, class dist:\n{y_val.value_counts()}")
print(f"Test shape:  {X_test.shape}, class dist:\n{y_test.value_counts()}")

Target value counts:
PotentialFraud
0    4904
1     506
Name: count, dtype: int64
Train shape: (3246, 21), class dist:
PotentialFraud
0    2942
1     304
Name: count, dtype: int64
Val shape:   (1082, 21), class dist:
PotentialFraud
0    981
1    101
Name: count, dtype: int64
Test shape:  (1082, 21), class dist:
PotentialFraud
0    981
1    101
Name: count, dtype: int64


In [3]:
# Standardise features. Logistic Regression needs this; the tree ensembles
# (Random Forest, Gradient Boosting) do not depend on feature scale, but all
# models share the same scaled matrices so their results are comparable.
# The scaler is fitted on the training split only, then applied to val/test.
scaler = StandardScaler()
X_train_full_scaled = scaler.fit_transform(X_train_full)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Wrap the arrays back into DataFrames. Both the column names AND the original
# row index are restored: without `index=...` the frames would get a fresh
# 0..n-1 RangeIndex while y_train_full / y_val / y_test keep their shuffled
# original labels, and any index-aligned operation would silently misalign
# features and targets.
X_train_full_scaled = pd.DataFrame(
    X_train_full_scaled, columns=X.columns, index=X_train_full.index
)
X_val_scaled = pd.DataFrame(X_val_scaled, columns=X.columns, index=X_val.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X.columns, index=X_test.index)

print("Features scaled.")

# Initialize SMOTE and apply ONLY on training data
# (validation and test keep their original class distribution).
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_full_scaled, y_train_full)

print("\nTraining class distribution before SMOTE:")
print(y_train_full.value_counts())
print("\nTraining class distribution after SMOTE:")
print(y_train_smote.value_counts())

Features scaled.

Training class distribution before SMOTE:
PotentialFraud
0    2942
1     304
Name: count, dtype: int64

Training class distribution after SMOTE:
PotentialFraud
0    2942
1    2942
Name: count, dtype: int64


In [4]:
def evaluate_model(name, model, X_tr, y_tr, X_v, y_v):
    """
    Train `model` and score it on a held-out set.

    The estimator is fitted in place on (X_tr, y_tr) and then used to predict
    on X_v. Scores come from `predict_proba` (second column) when the
    estimator has it, otherwise from `decision_function`.

    Returns a 3-tuple `(metrics, y_pred, y_proba)` where `metrics` is a dict
    with keys "model", "precision", "recall", "f1", "roc_auc" and "pr_auc".
    """
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_v)

    # Continuous scores feed the ranking metrics (ROC AUC / PR AUC).
    if not hasattr(model, "predict_proba"):
        y_proba = model.decision_function(X_v)
    else:
        y_proba = model.predict_proba(X_v)[:, 1]

    # Metrics computed from hard labels vs. metrics computed from scores.
    label_metrics = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    score_metrics = (
        ("roc_auc", roc_auc_score),
        ("pr_auc", average_precision_score),
    )

    metrics = {"model": name}
    for metric_key, metric_fn in label_metrics:
        metrics[metric_key] = metric_fn(y_v, y_pred)
    for metric_key, metric_fn in score_metrics:
        metrics[metric_key] = metric_fn(y_v, y_proba)

    return metrics, y_pred, y_proba


print("Evaluation helper ready.")

Evaluation helper ready.


In [5]:
results = {}

# Candidate estimators. Each one is fitted on the SMOTE-balanced training set
# and scored on the scaled validation set, which keeps its original class mix.
rf_model = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
lr_model = LogisticRegression(max_iter=1000, random_state=42)
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)

# (results key, estimator, banner printed before fitting, label printed with the metrics)
candidate_runs = [
    (
        "RF_SMOTE",
        rf_model,
        "Training primary model: Random Forest (SMOTE)...",
        "RF_SMOTE results:",
    ),
    (
        "LogReg_SMOTE",
        lr_model,
        "\nTraining comparison model 1: Logistic Regression (SMOTE)...",
        "LogReg_SMOTE results:",
    ),
    (
        "GB_SMOTE",
        gb_model,
        "\nTraining comparison model 2: Gradient Boosting (SMOTE)...",
        "GB_SMOTE results:",
    ),
]

# Full (metrics, predictions, scores) tuple per model, keyed like `results`.
validation_runs = {}
for run_key, estimator, banner, result_label in candidate_runs:
    print(banner)
    validation_runs[run_key] = evaluate_model(
        run_key,
        estimator,
        X_train_smote, y_train_smote,
        X_val_scaled, y_val
    )
    results[run_key] = validation_runs[run_key][0]
    print(result_label, results[run_key])

# Expose the per-model outputs under the names later cells refer to.
rf_results, y_val_pred_rf, y_val_proba_rf = validation_runs["RF_SMOTE"]
lr_results, y_val_pred_lr, y_val_proba_lr = validation_runs["LogReg_SMOTE"]
gb_results, y_val_pred_gb, y_val_proba_gb = validation_runs["GB_SMOTE"]

print("\nValidation metrics summary:")
pd.DataFrame(results).T

Training primary model: Random Forest (SMOTE)...
RF_SMOTE results: {'model': 'RF_SMOTE', 'precision': 0.6410256410256411, 'recall': 0.7425742574257426, 'f1': 0.6880733944954128, 'roc_auc': 0.9551225764778313, 'pr_auc': 0.7577305178565125}

Training comparison model 1: Logistic Regression (SMOTE)...
LogReg_SMOTE results: {'model': 'LogReg_SMOTE', 'precision': 0.49710982658959535, 'recall': 0.8514851485148515, 'f1': 0.6277372262773723, 'roc_auc': 0.9577214602194163, 'pr_auc': 0.7905023148150148}

Training comparison model 2: Gradient Boosting (SMOTE)...
GB_SMOTE results: {'model': 'GB_SMOTE', 'precision': 0.5845070422535211, 'recall': 0.8217821782178217, 'f1': 0.6831275720164609, 'roc_auc': 0.9497229539467709, 'pr_auc': 0.7859825912449433}

Validation metrics summary:


Unnamed: 0,model,precision,recall,f1,roc_auc,pr_auc
RF_SMOTE,RF_SMOTE,0.641026,0.742574,0.688073,0.955123,0.757731
LogReg_SMOTE,LogReg_SMOTE,0.49711,0.851485,0.627737,0.957721,0.790502
GB_SMOTE,GB_SMOTE,0.584507,0.821782,0.683128,0.949723,0.785983


In [6]:
print("\nStarting lightweight RF_SMOTE hyperparameter tuning...")

rf_base_smote = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)

param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [None, 10],
    "min_samples_split": [2, 5],
}

# SMOTE must run INSIDE cross-validation. Searching over the pre-resampled
# X_train_smote puts synthetic minority rows (interpolated from real rows that
# end up in the held-out fold) into the training folds, which inflates the CV
# F1 score used to pick hyperparameters. With the sampler as a pipeline step,
# each split oversamples only its own training folds and is scored on
# untouched, originally-distributed rows.
smote_rf_pipeline = ImbPipeline([
    ("smote", SMOTE(random_state=42)),
    ("rf", rf_base_smote),
])

# Route every grid entry to the "rf" step of the pipeline.
pipeline_param_grid = {
    f"rf__{param_name}": values for param_name, values in param_grid.items()
}

grid_smote = GridSearchCV(
    smote_rf_pipeline,
    pipeline_param_grid,
    scoring="f1",
    cv=3,
    n_jobs=-1,
    verbose=1
)

# Tune on the scaled but NOT resampled training split; the pipeline applies SMOTE per fold.
grid_smote.fit(X_train_full_scaled, y_train_full)

# Keep exposing a plain RandomForestClassifier so downstream cells that call
# .fit / .predict on `best_rf_smote_tuned` work unchanged.
best_rf_smote_tuned = grid_smote.best_estimator_.named_steps["rf"]

# Strip the "rf__" step prefix so the report shows the bare forest parameters.
best_rf_params = {
    param_name.split("__", 1)[1]: value
    for param_name, value in grid_smote.best_params_.items()
}
print("Best params (RF_SMOTE):", best_rf_params)

# evaluate_model refits the selected forest on the SMOTE training set and
# scores it on the validation set, exactly like the baseline models.
rf_smote_tuned_results, y_val_pred_rf_tuned, y_val_proba_rf_tuned = evaluate_model(
    "RF_SMOTE_tuned",
    best_rf_smote_tuned,
    X_train_smote, y_train_smote,
    X_val_scaled, y_val
)

print("\nRF_SMOTE_tuned results:", rf_smote_tuned_results)

# Compare RF_SMOTE vs RF_SMOTE_tuned (plus the other baselines)
comparison = {
    "RF_SMOTE": rf_results,
    "RF_SMOTE_tuned": rf_smote_tuned_results,
    "GB_SMOTE": gb_results,
    "LogReg_SMOTE": lr_results
}
print("\nAll model variants (validation):")
pd.DataFrame(comparison).T


Starting lightweight RF_SMOTE hyperparameter tuning...
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best params (RF_SMOTE): {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}

RF_SMOTE_tuned results: {'model': 'RF_SMOTE_tuned', 'precision': 0.6440677966101694, 'recall': 0.7524752475247525, 'f1': 0.6940639269406392, 'roc_auc': 0.9560662488267175, 'pr_auc': 0.762651509749543}

All model variants (validation):


Unnamed: 0,model,precision,recall,f1,roc_auc,pr_auc
RF_SMOTE,RF_SMOTE,0.641026,0.742574,0.688073,0.955123,0.757731
RF_SMOTE_tuned,RF_SMOTE_tuned,0.644068,0.752475,0.694064,0.956066,0.762652
GB_SMOTE,GB_SMOTE,0.584507,0.821782,0.683128,0.949723,0.785983
LogReg_SMOTE,LogReg_SMOTE,0.49711,0.851485,0.627737,0.957721,0.790502


In [7]:
from joblib import dump

# The winner is hard-coded to the tuned Random Forest; change both lines
# together if a different model should be promoted.
final_model_name = "RF_SMOTE_tuned"
final_model = best_rf_smote_tuned

print(f"Final chosen model: {final_model_name}")

# Fold the validation rows back into the training data (train_full + val) ...
X_train_full_all = pd.concat([X_train_full_scaled, X_val_scaled])
y_train_full_all = pd.concat([y_train_full, y_val])

# ... and re-balance the combined set with SMOTE before the final fit.
X_train_full_smote, y_train_full_smote = smote.fit_resample(
    X_train_full_all, y_train_full_all
)
print("Full train after SMOTE:", y_train_full_smote.value_counts())

final_model.fit(X_train_full_smote, y_train_full_smote)

# Score the held-out TEST set (scaled, original class distribution).
y_test_pred = final_model.predict(X_test_scaled)
y_test_proba = final_model.predict_proba(X_test_scaled)[:, 1]

# (metric key, scoring function, predictions that function consumes)
test_metric_specs = (
    ("precision", precision_score, y_test_pred),
    ("recall", recall_score, y_test_pred),
    ("f1", f1_score, y_test_pred),
    ("roc_auc", roc_auc_score, y_test_proba),
    ("pr_auc", average_precision_score, y_test_proba),
)
test_metrics = {}
for metric_key, metric_fn, predicted in test_metric_specs:
    test_metrics[metric_key] = metric_fn(y_test, predicted)

print("\nFinal test metrics for", final_model_name, ":", test_metrics)

# Bundle everything Notebook 3 needs into one file: the fitted models,
# the scaled test split, the fitted scaler and the feature order.
artifacts = dict(
    rf_model=rf_model,
    lr_model=lr_model,
    gb_model=gb_model,
    best_model=final_model,
    X_test=X_test_scaled,
    y_test=y_test,
    scaler=scaler,
    feature_names=X.columns.tolist(),
)

model_filename = BASE / "trained_models_and_test_data.pkl"
joblib.dump(artifacts, model_filename)

print(f"\nAll models and test data saved to: {model_filename}")
print("Ready for Notebook 03 (Evaluation).")


Final chosen model: RF_SMOTE_tuned
Full train after SMOTE: PotentialFraud
0    3923
1    3923
Name: count, dtype: int64

Final test metrics for RF_SMOTE_tuned : {'precision': 0.5555555555555556, 'recall': 0.7425742574257426, 'f1': 0.635593220338983, 'roc_auc': 0.9431323866331588, 'pr_auc': 0.7103441119079906}

All models and test data saved to: D:\fraud_detection_project\data\trained_models_and_test_data.pkl
Ready for Notebook 03 (Evaluation).
