In [None]:
import os
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
import joblib
import datetime

# Define paths
# Input CSVs and the output directory that receives every persisted artifact
# written by this notebook (model, scaler, label encoders, metrics).
# All paths are relative to the notebook's working directory.
TRAIN_DATA_PATH = "../datasets/fraudTrain.csv"
TEST_DATA_PATH = "../datasets/fraudTest.csv"
MODEL_DIR = "../models/"
# exist_ok=True keeps the cell idempotent on re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(MODEL_DIR, exist_ok=True)

# **1-Read and Truncate Data**

In [None]:
# Sample-size caps used to keep the notebook fast; raise them for a fuller run.
TRAIN_SAMPLE_SIZE = 10000
TEST_SAMPLE_SIZE = 2000

# Read training data
train_data = pd.read_csv(TRAIN_DATA_PATH)

# Truncate training data to at most TRAIN_SAMPLE_SIZE rows for faster processing.
# DataFrame.sample raises ValueError when n exceeds the number of rows, so the
# request is capped at the frame length.
train_data = train_data.sample(
    n=min(TRAIN_SAMPLE_SIZE, len(train_data)), random_state=42
).reset_index(drop=True)
print(f"Training data shape after truncation: {train_data.shape}")

# Read test data
test_data = pd.read_csv(TEST_DATA_PATH)

# Truncate test data to at most TEST_SAMPLE_SIZE rows (same capping as above)
test_data = test_data.sample(
    n=min(TEST_SAMPLE_SIZE, len(test_data)), random_state=42
).reset_index(drop=True)
print(f"Test data shape after truncation: {test_data.shape}")

# **2-Preprocessing**

In [None]:
# Preprocessing: drop identifying/unused columns, drop rows with missing
# values, integer-encode the categorical columns, and standardise the numeric
# ones. Every fitted transformer is persisted under MODEL_DIR for later reuse.

# Date columns are dropped below rather than parsed.
# If 'trans_date_trans_time' or 'dob' were retained, features such as
# transaction hour or customer age could be extracted from them.

# Drop PII and unnecessary columns (errors='ignore' tolerates absent columns)
columns_to_drop = ['Unnamed: 0', 'cc_num', 'first', 'last', 'street', 'city', 'state', 'zip', 'dob', 'trans_num', 'trans_date_trans_time']
train_data.drop(columns=columns_to_drop, inplace=True, errors='ignore')
test_data.drop(columns=columns_to_drop, inplace=True, errors='ignore')

# Handle missing values
train_data.dropna(inplace=True, ignore_index=True)
test_data.dropna(inplace=True, ignore_index=True)

print(f"Training data shape after dropping columns and NaNs: {train_data.shape}")
print(f"Test data shape after dropping columns and NaNs: {test_data.shape}")

# Encode categorical variables using LabelEncoder
categorical_features = ['merchant', 'category', 'gender', 'job']
label_encoders = {}

for col in categorical_features:
    le = LabelEncoder()
    train_data[col] = le.fit_transform(train_data[col])
    # LabelEncoder.transform raises ValueError on any label it did not see
    # during fit, which is likely here because both frames are random
    # subsamples. Known labels get the same code the encoder would assign
    # (their index in le.classes_); unseen labels get the sentinel -1.
    code_by_label = {label: code for code, label in enumerate(le.classes_)}
    test_codes = test_data[col].map(code_by_label)
    n_unseen = int(test_codes.isna().sum())
    if n_unseen:
        print(f"{col}: {n_unseen} test rows have labels unseen in training; encoded as -1.")
    test_data[col] = test_codes.fillna(-1).astype("int64")
    label_encoders[col] = le  # Save the encoder for later use

# Save label encoders (one pickle per categorical column)
label_encoders_dir = os.path.join(MODEL_DIR, 'label_encoders')
os.makedirs(label_encoders_dir, exist_ok=True)

for col, le in label_encoders.items():
    joblib.dump(le, os.path.join(label_encoders_dir, f"{col}_encoder.pkl"))

print("Categorical features encoded.")

# Feature scaling
scaler = StandardScaler()
# NOTE(review): nothing visible in this notebook creates a 'time' column, so it
# is only scaled when it actually exists -- TODO confirm the intended source column.
requested_numerical_features = ['amt', 'time']
numerical_features = [
    col for col in requested_numerical_features
    if col in train_data.columns and col in test_data.columns
]
missing_numerical_features = [
    col for col in requested_numerical_features if col not in numerical_features
]
if missing_numerical_features:
    print(f"Skipping scaling for missing columns: {missing_numerical_features}")

if numerical_features:
    # Fit on the training data only, then apply the same transform to the test data.
    train_data[numerical_features] = scaler.fit_transform(train_data[numerical_features])
    test_data[numerical_features] = scaler.transform(test_data[numerical_features])

    # Save scaler
    joblib.dump(scaler, os.path.join(MODEL_DIR, 'scaler.pkl'))
    print("Numerical features scaled.")
else:
    print("No numerical features available to scale; scaler not saved.")

# **3-EDA**

In [None]:
# Visualize class imbalance
# value_counts() orders rows by frequency and omits classes with no rows, so the
# result is re-indexed to the fixed class order [0, 1]. That keeps each wedge
# aligned with its label and keeps the label count valid when a class is absent.
# Assumes 'is_fraud' is encoded as 0 (no fraud) / 1 (fraud) -- TODO confirm.
exit_counts = train_data["is_fraud"].value_counts().reindex([0, 1], fill_value=0)
plt.figure(figsize=(6,6))
plt.pie(exit_counts, labels=["No Fraud", "Fraud"], autopct="%0.1f%%", colors=['skyblue', 'salmon'])
plt.title("Fraudulent Transactions Percentage")
plt.show()

In [None]:
# Heatmap of the pairwise correlations between all columns of train_data
corr_matrix = train_data.corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

In [None]:
# Histogram (50 bins) with a KDE overlay of the 'amt' column
amounts = train_data['amt']
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(amounts, bins=50, kde=True, ax=ax)
ax.set_title("Transaction Amount Distribution")
plt.show()

In [None]:
# Time distribution (only drawn when a 'time' column is present)
# Indexing train_data['time'] unconditionally raises KeyError when the column
# is absent, so the plot is skipped with a message instead.
if 'time' in train_data.columns:
    plt.figure(figsize=(8,6))
    sns.histplot(train_data['time'], bins=50, kde=True)
    plt.title("Transaction Time Distribution")
    plt.show()
else:
    print("Column 'time' not found in train_data; skipping time distribution plot.")

# **4-Train the Model**

In [None]:
# Split each frame into predictors (every column except the label) and the label
target_col = "is_fraud"
X, Y = train_data.drop(columns=[target_col]), train_data[target_col]
X_test, Y_test = test_data.drop(columns=[target_col]), test_data[target_col]

# Fit a 100-tree random forest with a fixed seed
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X, Y)

# Accuracy on the same data the model was fitted on
train_accuracy = accuracy_score(Y, model.predict(X))
print(f"Training Accuracy: {train_accuracy:.4f}")

# Persist the fitted model under a versioned file name in MODEL_DIR
model_version = "v1"
model_filename = f"model_{model_version}.pkl"
model_path = os.path.join(MODEL_DIR, model_filename)
joblib.dump(model, model_path)
print(f"Model saved to {model_path}")

# **5-Test the Model**

In [None]:
# Evaluate the fitted model on the held-out test frame, print the metrics, and
# save the test rows together with their predicted label.

# Predict on test data
y_pred = model.predict(X_test)

# Calculate evaluation metrics
# zero_division=0 yields the same 0.0 that scikit-learn falls back to when a
# metric is undefined (e.g. no predicted positives), without the warning.
accuracy = accuracy_score(Y_test, y_pred)
precision = precision_score(Y_test, y_pred, zero_division=0)
recall = recall_score(Y_test, y_pred, zero_division=0)
f1 = f1_score(Y_test, y_pred, zero_division=0)

# ROC AUC is a ranking metric and must be computed from scores/probabilities;
# feeding it hard 0/1 predictions collapses the curve to a single threshold.
# It is undefined when the test labels contain a single class or the model
# never saw the positive class, so NaN is reported in those cases.
if Y_test.nunique() > 1 and 1 in model.classes_:
    fraud_column = list(model.classes_).index(1)
    y_score = model.predict_proba(X_test)[:, fraud_column]
    roc_auc = roc_auc_score(Y_test, y_score)
else:
    roc_auc = float("nan")

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1 Score: {f1:.4f}")
print(f"Test ROC AUC: {roc_auc:.4f}")

# Save test predictions
test_predictions = test_data.copy()
test_predictions['predicted_fraud'] = y_pred
predictions_path = "../data/test_predictions.csv"
# Nothing else in this notebook creates "../data", so make sure it exists
# before writing; to_csv does not create parent directories.
os.makedirs(os.path.dirname(predictions_path), exist_ok=True)
test_predictions.to_csv(predictions_path, index=False)
print("Test predictions saved.")

In [None]:
# Plot confusion matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# labels=[0, 1] pins the matrix to 2x2 in class order, so it always matches the
# two display labels even when the test sample or the predictions contain only
# one class (otherwise the matrix would shrink to 1x1 and plotting would fail).
cm = confusion_matrix(Y_test, y_pred, labels=[0, 1])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["No Fraud", "Fraud"])
disp.plot(cmap='Blues')
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Collect the test metrics into a one-row table, stamped with the model version
# and the current local time, and write it next to the model for MLOps tracking
metrics = dict(
    model_version=model_version,
    accuracy=accuracy,
    precision=precision,
    recall=recall,
    f1_score=f1,
    roc_auc=roc_auc,
    timestamp=datetime.datetime.now().isoformat(),
)

metrics_path = os.path.join(MODEL_DIR, f"metrics_{model_version}.csv")
metrics_df = pd.DataFrame([metrics])
metrics_df.to_csv(metrics_path, index=False)
print("Evaluation metrics saved.")