# Import & Path Setup

In [1]:
import os
import pandas as pd
import numpy as np
import joblib


In [2]:
# Project layout, resolved relative to the parent of the current working directory.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
MODEL_DIR = os.path.join(PROJECT_ROOT, "models")
DATA_PATH = os.path.join(PROJECT_ROOT, "data", "raw", "creditcard.csv")

# Confirm every artifact the later cells read is on disk before loading anything.
model_file = os.path.join(MODEL_DIR, "final_random_forest.pkl")
threshold_file = os.path.join(MODEL_DIR, "optimal_threshold.pkl")

print("Data exists:", os.path.exists(DATA_PATH))
print("Model exists:", os.path.exists(model_file))
print("Threshold exists:", os.path.exists(threshold_file))


Data exists: True
Model exists: True
Threshold exists: True


# Load Saved Model & Threshold

In [3]:
# Restore the persisted artifacts instead of retraining.
# NOTE: joblib files are pickle-based, so only load artifacts from a trusted source.
model_path = os.path.join(MODEL_DIR, "final_random_forest.pkl")
threshold_path = os.path.join(MODEL_DIR, "optimal_threshold.pkl")

rf_model = joblib.load(model_path)
optimal_threshold = joblib.load(threshold_path)

print("Loaded threshold:", optimal_threshold)


Loaded threshold: 0.77


# This proves:

model persistence works

you don’t need to retrain to predict

# Load Dataset (To Simulate New Data)

In [4]:
# Read the raw transactions; here they stand in for "new" data arriving at inference time.
df = pd.read_csv(DATA_PATH)

# Split the label column ("Class") from the feature columns.
y = df["Class"]
X = df.drop(columns=["Class"])

X.head()


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99


# Note : In real life, this would come from:

an API request

a streaming system

a transaction database

# Predict Probabilities

In [5]:
# predict_proba returns one column per class; column 1 is kept as the fraud score
# (assumes the model's classes are ordered [0, 1] -- TODO confirm via rf_model.classes_).
class_probabilities = rf_model.predict_proba(X)
probs = class_probabilities[:, 1]

probs[:10]


array([0.   , 0.   , 0.005, 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   ])

# Note : Output meaning:

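Each value = the model's estimated probability of fraud (for a random forest, the class probability averaged over all trees; it is a score, not necessarily a calibrated probability)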

Example: 0.87 → 87% chance of fraud

# Apply Threshold For Final Decision

In [6]:
# A transaction is flagged (1) when its fraud score reaches the saved threshold, else 0.
meets_threshold = np.greater_equal(probs, optimal_threshold)
predictions = meets_threshold.astype(int)

predictions[:10]


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

# Note : Interpretation:

0 → allow transaction

1 → flag as fraud

# Build A Clean Prediction Output Table

In [7]:
# Build the output table as a new frame (X itself is left untouched):
# the original features plus the model score and the thresholded decision.
results = X.assign(
    fraud_probability=probs,
    fraud_prediction=predictions,
)

results.head()


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V22,V23,V24,V25,V26,V27,V28,Amount,fraud_probability,fraud_prediction
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.005,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0,0


This is essentially what a production system's prediction log looks like.

# Inspecting High-Risk Transactions

In [8]:
# Keep only the rows whose thresholded decision is 1 (flagged as fraud).
is_flagged = results["fraud_prediction"].eq(1)
high_risk = results.loc[is_flagged]

high_risk.head()


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V22,V23,V24,V25,V26,V27,V28,Amount,fraud_probability,fraud_prediction
4920,4462.0,-2.30335,1.759247,-0.359745,2.330243,-0.821628,-0.075788,0.56232,-0.399147,-0.238253,...,-0.932391,0.172726,-0.08733,-0.156114,-0.542628,0.039566,-0.153029,239.93,0.995,1
6108,6986.0,-4.397974,1.358367,-2.592844,2.679787,-1.128131,-1.706536,-3.496197,-0.248778,-0.247768,...,0.176968,-0.436207,-0.053502,0.252405,-0.657488,-0.827136,0.849573,59.0,0.9,1
6329,7519.0,1.234235,3.01974,-4.304597,4.732795,3.624201,-1.357746,1.713445,-0.496358,-1.282858,...,-0.704181,-0.656805,-1.632653,1.488901,0.566797,-0.010016,0.146793,1.0,1.0,1
6331,7526.0,0.00843,4.137837,-6.240697,6.675732,0.768307,-3.35306,-1.631735,0.154612,-2.795892,...,-0.608057,-0.539528,0.12894,1.488481,0.507963,0.735822,0.513574,1.0,1.0,1
6334,7535.0,0.026779,4.132464,-6.5606,6.348557,1.329666,-2.513479,-1.689102,0.303253,-3.139409,...,-0.576752,-0.669605,-0.759908,1.605056,0.540675,0.73704,0.496699,1.0,1.0,1


# Note : This is what:

fraud analysts review

alerts systems consume

dashboards visualize

# Predict A SINGLE Transaction (Most Important Cell)

In [9]:
# Score one transaction the same way the batch cell scores the full frame.
# Selecting with a list (`[[0]]`) keeps a one-row DataFrame, so the model receives the
# same column names and column order as in `rf_model.predict_proba(X)`. Converting the
# row to a bare NumPy array would drop the names (scikit-learn warns about that when
# the estimator was fitted with feature names).
single_transaction = X.iloc[[0]]

# Row 0 is the only row; column 1 is taken as the fraud class, as in the batch cell.
prob = rf_model.predict_proba(single_transaction)[0, 1]

# Apply the same persisted decision threshold used for the batch predictions.
decision = int(prob >= optimal_threshold)

print("Fraud probability:", prob)
print("Fraud decision:", decision)


Fraud probability: 0.0
Fraud decision: 0


