In [None]:
!pip install scikit-learn joblib

from collections import Counter

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier




In [None]:

# Read the transaction records from a CSV resolved relative to the current
# working directory. Edit the path below to analyse a different extract.
df = pd.read_csv("fraudTest.csv")

# Quick sanity check: the table's (rows, columns) tuple, followed by the mean
# of the is_fraud column, i.e. the share of rows flagged as fraud.
n_rows_cols = df.shape
print("Dataset shape:", n_rows_cols)
print("Fraud ratio:", df.loc[:, "is_fraud"].mean())


Dataset shape: (62233, 23)
Fraud ratio: 0.004258259416377427


In [None]:
# Rows without a label cannot be used for supervised training. Remove them
# before any imputation so the median fill further down never fabricates an
# is_fraud value, then store the label as an integer (1 marks fraud).
df = df.dropna(subset=["is_fraud"]).astype({"is_fraud": int})

# Drop identifier / free-text columns that are not used as features.
# Filtering against df.columns keeps this safe when a column is absent.
drop_cols = ["trans_num","cc_num","first","last","street","job","dob","zip","Unnamed: 0"]
df = df.drop(columns=[c for c in drop_cols if c in df.columns])

# Parse the transaction timestamp so calendar parts can be extracted from it.
df["trans_date_trans_time"] = pd.to_datetime(df["trans_date_trans_time"])

# Time-of-transaction features. pandas numbers weekdays Monday=0 .. Sunday=6,
# so values >= 5 are Saturday and Sunday.
df["hour"] = df["trans_date_trans_time"].dt.hour
df["dayofweek"] = df["trans_date_trans_time"].dt.dayofweek
df["is_weekend"] = (df["dayofweek"] >= 5).astype(int)

# Frequency-encode merchant and category: each value is replaced by the number
# of rows in which it occurs.
# NOTE(review): the counts are computed over the whole table, i.e. before the
# train/test split done later in the notebook — confirm this is acceptable.
for col in ["merchant", "category"]:
    freq = df[col].value_counts()
    df[col + "_freq"] = df[col].map(freq)
df = df.drop(columns=["merchant", "category", "trans_date_trans_time"])

# Encode gender as 0/1; any value other than "M"/"F" becomes NaN and is
# imputed by the median fill below.
df["gender"] = df["gender"].map({"M": 0, "F": 1})

# Fill missing values in the numeric feature columns with each column's median.
# The label is left out of the median table so it can never be imputed.
feature_medians = df.drop(columns=["is_fraud"]).median(numeric_only=True)
df = df.fillna(feature_medians)


In [None]:
# Partition the rows by label and stack the two groups back together.
# Every row of both classes is kept here — nothing is down-sampled.
frauds = df[df["is_fraud"] == 1]
non_frauds = df[df["is_fraud"] == 0]
sampled_df = pd.concat([frauds, non_frauds])

# Target vector, then the feature matrix with 'city' and 'state' expanded into
# one-hot indicator columns (first level of each dropped).
y = sampled_df["is_fraud"]
X = pd.get_dummies(
    sampled_df.drop(columns="is_fraud"),
    columns=["city", "state"],
    drop_first=True,
)

print("Sampled data shape:", sampled_df.shape)
print("Class distribution:", Counter(y))

# 70/30 hold-out split; stratifying on y keeps the class proportions the same
# in both partitions, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

Sampled data shape: (62233, 16)
Class distribution: Counter({0.0: 61968, 1.0: 265})


In [None]:
# Candidate classifiers. class_weight="balanced" re-weights the classes
# inversely to their frequency so the rare fraud class is not ignored.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight="balanced"),
    "Decision Tree": DecisionTreeClassifier(max_depth=10, class_weight="balanced", random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, max_depth=10, class_weight="balanced", random_state=42)
}

# Track the pipeline with the highest fraud-class F1 on the hold-out set.
# best_f1 starts below the metric's [0, 1] range so the first fitted pipeline
# is always accepted and best_model can never be left as None.
best_model = None
best_name = None
best_f1 = -1.0

for name, model in models.items():
    # Scale features, then fit the classifier, as one estimator so the saved
    # artifact carries its own preprocessing.
    pipe = Pipeline([("scaler", StandardScaler()), ("clf", model)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    print("\n", "="*40)
    print(f"{name} Report:")
    print(classification_report(y_test, y_pred, digits=3))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

    # Score the positive (fraud) class directly. Looking the value up in the
    # classification_report dict is fragile because its keys are the string
    # form of the labels ("1" for int labels, "1.0" for float labels).
    f1_fraud = f1_score(y_test, y_pred, pos_label=1, zero_division=0)
    if f1_fraud > best_f1:
        best_f1 = f1_fraud
        best_name = name
        best_model = pipe

print(f"\nSelected model: {best_name} (fraud-class F1 = {best_f1:.3f})")


Logistic Regression Report:
              precision    recall  f1-score   support

         0.0      1.000     0.979     0.989     18590
         1.0      0.168     0.975     0.286        80

    accuracy                          0.979     18670
   macro avg      0.584     0.977     0.638     18670
weighted avg      0.996     0.979     0.986     18670

Confusion Matrix:
 [[18203   387]
 [    2    78]]

Decision Tree Report:
              precision    recall  f1-score   support

         0.0      0.999     0.979     0.989     18590
         1.0      0.147     0.850     0.250        80

    accuracy                          0.978     18670
   macro avg      0.573     0.914     0.620     18670
weighted avg      0.996     0.978     0.986     18670

Confusion Matrix:
 [[18195   395]
 [   12    68]]

Random Forest Report:
              precision    recall  f1-score   support

         0.0      0.999     0.996     0.997     18590
         1.0      0.447     0.787     0.570        80

    acc

In [None]:
import joblib

# Refuse to export when model selection produced nothing: serialising None
# would write a file that looks valid but cannot predict.
if best_model is None:
    raise RuntimeError(
        "No fitted model was selected (best_model is None); "
        "re-run the model comparison cell before exporting."
    )

# Persist the selected pipeline to the working directory.
joblib.dump(best_model, "best_fraud_model.joblib")
print(f"✅ Best model saved as best_fraud_model.joblib (F1 for fraud: {best_f1:.3f})")

# Download to your local machine. The google.colab package is only importable
# inside Colab; elsewhere the file is simply left in the working directory.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download("best_fraud_model.joblib")

✅ Best model saved as best_fraud_model.joblib (F1 for fraud: 0.000)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>