# Model Persistence — Save & Load Final Model

This notebook demonstrates how to save the final trained model and reload it
for inference, ensuring reproducibility and deployment readiness.


# Import & Path Setup

In [10]:
import os
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score

from imblearn.over_sampling import SMOTE


# Project Path



In [11]:
# The project root is taken to be the parent of the current working
# directory; every other location used by the notebook is derived from it.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
MODEL_DIR = os.path.join(PROJECT_ROOT, "models")
DATA_PATH = os.path.join(PROJECT_ROOT, "data", "raw", "creditcard.csv")

# Create the model output folder up front; an already-existing folder is fine.
os.makedirs(MODEL_DIR, exist_ok=True)

print("Project root:", PROJECT_ROOT)
print("Data exists:", os.path.exists(DATA_PATH))


Project root: /Users/waheedkehinde/Desktop/fraud-detection-app
Data exists: True


# Load Dataset

In [12]:
# Read the CSV found at DATA_PATH, then separate the "Class" label column
# (y) from the remaining feature columns (X).
df = pd.read_csv(DATA_PATH)

y = df["Class"]
X = df.drop(columns=["Class"])

# Show the frame dimensions and, as the cell output, the count per class.
print(df.shape)
y.value_counts()


(284807, 31)


Class
0    284315
1       492
Name: count, dtype: int64

# Train/Test Split

In [13]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# ratio the same in both splits, and the fixed seed makes the split repeatable.
split_options = {"test_size": 0.2, "stratify": y, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Cell output: class counts of the training labels.
y_train.value_counts()


Class
0    227451
1       394
Name: count, dtype: int64

# Apply SMOTE To Train Set Only

In [14]:
# Oversample with SMOTE on the training split only; X_test / y_test are not
# passed to the sampler and therefore stay untouched.
smote = SMOTE(random_state=42)

print("Before SMOTE:\n", y_train.value_counts())
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)
print("After SMOTE:\n", y_train_sm.value_counts())


Before SMOTE:
 Class
0    227451
1       394
Name: count, dtype: int64




After SMOTE:
 Class
0    227451
1    227451
Name: count, dtype: int64


# Train Final Random Forest Model

In [15]:
# Forest configuration: 200 trees, fixed seed for repeatability, and
# n_jobs=-1 to use every available CPU core.
rf_settings = dict(n_estimators=200, random_state=42, n_jobs=-1)
rf_model = RandomForestClassifier(**rf_settings)

# Train on the SMOTE-resampled training data. fit() is left as the last
# expression so the fitted estimator is rendered as the cell output.
rf_model.fit(X_train_sm, y_train_sm)


# Baseline Evaluation (threshold = 0.5)

In [16]:
# Column 1 of predict_proba is used as the positive-class score; a row is
# labelled 1 when that score reaches the default 0.5 cut-off.
y_probs = rf_model.predict_proba(X_test)[:, 1]
y_pred_default = (y_probs >= 0.5).astype(int)

# Compute every metric first, then print them together.
baseline_report = classification_report(y_test, y_pred_default)
baseline_auc = roc_auc_score(y_test, y_probs)
baseline_cm = confusion_matrix(y_test, y_pred_default)

print("Classification Report (threshold=0.5):\n")
print(baseline_report)
print("ROC-AUC:", baseline_auc)
print("Confusion Matrix:\n", baseline_cm)


Classification Report (threshold=0.5):

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.83      0.84      0.83        98

    accuracy                           1.00     56962
   macro avg       0.91      0.92      0.92     56962
weighted avg       1.00      1.00      1.00     56962

ROC-AUC: 0.9685208460142639
Confusion Matrix:
 [[56847    17]
 [   16    82]]


# Threshold Optimization (Maximize Fraud F1)

In [17]:
# Sweep 50 evenly spaced cut-offs in [0.01, 0.99] and record, for each one,
# the F1 score of the positive class (label 1) on the test predictions.
# NOTE(review): the cut-off is chosen using y_test, the same split that is
# used afterwards to report metrics at this threshold, so those metrics are
# optimistic. Consider tuning on a separate validation split.
thresholds = np.linspace(0.01, 0.99, 50)
results = [
    (cutoff, f1_score(y_test, (y_probs >= cutoff).astype(int), pos_label=1))
    for cutoff in thresholds
]

results_df = pd.DataFrame(results, columns=["threshold", "f1_fraud"])

# idxmax yields the first row holding the highest F1, so ties resolve to the
# lowest threshold among them.
best_row = results_df["f1_fraud"].idxmax()
optimal_threshold = results_df.loc[best_row, "threshold"]

print("Optimal threshold:", optimal_threshold)


Optimal threshold: 0.77


# Evaluation At Optimal Threshold

In [18]:
# Re-label the test rows using the tuned cut-off instead of 0.5.
y_pred_opt = (y_probs >= optimal_threshold).astype(int)

# ROC-AUC is computed from the scores themselves (y_probs), not from the
# thresholded labels, so it is the same value as in the baseline cell.
tuned_report = classification_report(y_test, y_pred_opt)
tuned_auc = roc_auc_score(y_test, y_probs)
tuned_cm = confusion_matrix(y_test, y_pred_opt)

print("Classification Report at Optimal Threshold:\n")
print(tuned_report)
print("ROC-AUC (unchanged):", tuned_auc)
print("Confusion Matrix:\n", tuned_cm)


Classification Report at Optimal Threshold:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.95      0.79      0.86        98

    accuracy                           1.00     56962
   macro avg       0.98      0.89      0.93     56962
weighted avg       1.00      1.00      1.00     56962

ROC-AUC (unchanged): 0.9685208460142639
Confusion Matrix:
 [[56860     4]
 [   21    77]]


# Save Model & Threshold

In [19]:
# Persist the fitted forest and its decision threshold as two separate
# joblib files inside MODEL_DIR (file name -> object to store).
artifacts = {
    "final_random_forest.pkl": rf_model,
    "optimal_threshold.pkl": optimal_threshold,
}
for file_name, artifact in artifacts.items():
    joblib.dump(artifact, os.path.join(MODEL_DIR, file_name))

print("Model and threshold saved successfully.")


Model and threshold saved successfully.


# Reload Model (Proof Of Persistence)

In [20]:
# Locations of the two artifacts written by the save cell.
model_path = os.path.join(MODEL_DIR, "final_random_forest.pkl")
threshold_path = os.path.join(MODEL_DIR, "optimal_threshold.pkl")

# NOTE: joblib.load unpickles arbitrary Python objects; only load artifact
# files that were produced by this project.
loaded_model = joblib.load(model_path)
loaded_threshold = joblib.load(threshold_path)

# Score the test set with the reloaded artifacts only.
loaded_probs = loaded_model.predict_proba(X_test)[:, 1]
loaded_preds = (loaded_probs >= loaded_threshold).astype(int)

# Actual proof of persistence: the reloaded artifacts must reproduce what the
# in-memory model and threshold produced earlier in the notebook. The
# probabilities are compared with a tolerance so that floating-point
# summation order inside the parallel forest cannot cause a spurious failure.
assert loaded_threshold == optimal_threshold, (
    "Reloaded threshold differs from the in-memory optimal_threshold."
)
assert np.allclose(loaded_probs, y_probs), (
    "Reloaded model probabilities differ from the in-memory model's."
)

print("Reloaded Model Report:\n")
print(classification_report(y_test, loaded_preds))


Reloaded Model Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.95      0.79      0.86        98

    accuracy                           1.00     56962
   macro avg       0.98      0.89      0.93     56962
weighted avg       1.00      1.00      1.00     56962

