# In [None]:
# ==============================================================================
# Notebook 3: Model Training and Evaluation
#
# This notebook combines the original dataset with the synthetic fraud data,
# trains a RandomForestClassifier, and evaluates its performance on a test set.
# ==============================================================================

import pandas as pd
import numpy as np
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

# --- 1. Load Datasets ---------------------------------------------------------
# Both input CSVs are read from the same folder, so the folder is named once.
data_dir = 'C:\\Users\\pvgre\\Desktop\\Proposals\\GNCIPL_mini_projects\\Synthetic_Fraud_AI_Project\\Data'
data_path = os.path.join(data_dir, 'creditcard.csv')
synthetic_data_path = os.path.join(data_dir, 'synthetic_fraud.csv')

try:
    original_data = pd.read_csv(data_path)
    synthetic_fraud_data = pd.read_csv(synthetic_data_path)
except FileNotFoundError as e:
    print(f"Error: One of the required files was not found: {e}")
    print("Please ensure 'creditcard.csv' and 'synthetic_fraud.csv' exist in the 'Data' folder.")
    # Stop with a non-zero status: nothing below can run without the data.
    # SystemExit is raised directly because the bare exit() helper is injected
    # by the site module (not always available) and reports success (status 0).
    raise SystemExit(1)
else:
    print("Datasets loaded successfully.")

# --- 2. Augment the Dataset ---------------------------------------------------
# Append the synthetic fraud rows to the original rows. ignore_index=True
# renumbers the combined frame instead of keeping each input's own index.
augmented_data = pd.concat([original_data, synthetic_fraud_data], ignore_index=True)

print(f"\nOriginal dataset shape: {original_data.shape}")
print(f"Synthetic data shape: {synthetic_fraud_data.shape}")
print(f"Augmented dataset shape: {augmented_data.shape}")

# Save the augmented dataset for future use.
# NOTE(review): unlike the input and model paths in this notebook, this folder
# has no 'Synthetic_Fraud_AI_Project' component. The location is kept as-is so
# existing consumers are not broken -- confirm whether it is intentional.
augmented_data_dir = 'C:\\Users\\pvgre\\Desktop\\Proposals\\GNCIPL_mini_projects\\Data'
augmented_data_path = os.path.join(augmented_data_dir, 'augmented_dataset.csv')

# to_csv does not create missing parent folders, and nothing earlier in the
# script reads from or creates this one, so make sure it exists before writing.
os.makedirs(augmented_data_dir, exist_ok=True)
augmented_data.to_csv(augmented_data_path, index=False)
print(f"\nAugmented dataset saved to '{augmented_data_path}'.")

# --- 3. Train the Classifier --------------------------------------------------
# The 'Class' column is the label; every other column is used as a feature.
y = augmented_data['Class']
X = augmented_data.drop(columns='Class')

# Hold out 30% of the rows for evaluation. stratify=y keeps the class
# proportions the same in both partitions; random_state fixes the shuffle.
# NOTE(review): the split is taken from augmented_data, so the held-out
# partition can contain synthetic rows as well as original ones -- confirm
# that this is the intended evaluation set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Fit a 100-tree random forest on the training partition.
print("\nTraining Random Forest Classifier on the augmented data...")
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
print("Model training complete.")

# --- 4. Evaluate Model Performance --------------------------------------------
print("\nEvaluating the model...")

# Hard label predictions for the held-out rows (used by the report below).
y_pred = clf.predict(X_test)

# Per-class probabilities; the second column is kept as the score for ROC AUC.
# Presumably that column corresponds to Class == 1 -- verify via clf.classes_.
class_probabilities = clf.predict_proba(X_test)
y_prob = class_probabilities[:, 1]

# Per-class precision / recall / F1 summary.
print("Classification Report:")
report_text = classification_report(y_test, y_pred)
print(report_text)

# Area under the ROC curve, computed from the scores rather than the labels.
auc_value = roc_auc_score(y_test, y_prob)
print(f"ROC AUC Score: {auc_value:.4f}")

# --- 5. Save the Trained Model ------------------------------------------------
model_dir = 'C:\\Users\\pvgre\\Desktop\\Proposals\\GNCIPL_mini_projects\\Synthetic_Fraud_AI_Project\\Model'

# Create the folder the model is actually written to. The check used to target
# a relative 'Model' folder in the current working directory, which is only
# the same place when the script happens to be run from the project root.
os.makedirs(model_dir, exist_ok=True)

# Serialize the fitted classifier with joblib.
model_path = os.path.join(model_dir, 'fraud_model_rf.pkl')
joblib.dump(clf, model_path)
print(f"\nTrained model saved to '{model_path}'.")

