In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

# ---------------------------
# 1. DATA PREPROCESSING & BASELINE MODEL TRAINING
# ---------------------------

# Read the raw credit-card transactions.
# NOTE: this is an absolute, machine-specific location; point DATA_PATH at
# your own copy of the CSV when running the notebook elsewhere.
DATA_PATH = '/Users/yasserjaghoori/Desktop/Grad School Classes/Spring 2025/AIT 736/Final Project/Credit Card Dataset.csv'
df = pd.read_csv(DATA_PATH)

# Target first, then features: 'Class' is 1 for fraud and 0 for non-fraud,
# and every remaining column is treated as a model input.
y = df['Class']
X = df.drop(columns='Class')

# 'Time' and 'Amount' are the only columns standardized here; the remaining
# features are described as already PCA-transformed and scaled, so these two
# are brought onto a comparable scale to keep their magnitude from
# dominating the model.
SCALED_COLUMNS = ['Time', 'Amount']
scaler = StandardScaler()
X[SCALED_COLUMNS] = scaler.fit_transform(X[SCALED_COLUMNS])

# Class imbalance: fraud is rare, so a model trained on the raw data would
# mostly learn to answer "not fraud". We shrink the non-fraud class instead.
is_fraud = df['Class'] == 1
df_minority = df.loc[is_fraud]             # fraud rows (kept in full)
df_majority = df.loc[df['Class'] == 0]     # non-fraud rows (to be downsampled)

# Draw, without replacement, five non-fraud rows for every fraud row.
majority_target_size = 5 * len(df_minority)
df_majority_downsampled = resample(
    df_majority,
    n_samples=majority_target_size,
    replace=False,
    random_state=42,
)

# Stack the sampled non-fraud rows on top of all fraud rows. The order
# (majority first, minority second) is kept because the seeded split that
# follows depends on row order.
df_balanced = pd.concat([df_majority_downsampled, df_minority])

# Features / target for the smaller, more balanced dataset.
y_balanced = df_balanced['Class']
X_balanced = df_balanced.drop(columns='Class')

# Split the data into training and testing sets (80% train, 20% test) BEFORE
# scaling. Stratify ensures the fraud-to-nonfraud ratio is maintained in both
# sets. The split indices depend only on the row count, the labels and the
# seed, so the train/test membership is the same as when scaling came first.
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced, y_balanced,
    test_size=0.2,
    random_state=42,
    stratify=y_balanced
)

# Work on independent copies so the column assignments below cannot trigger
# chained-assignment warnings or touch X_balanced.
X_train = X_train.copy()
X_test = X_test.copy()

# Scale 'Time' and 'Amount' using statistics learned from the TRAINING rows
# only. Fitting the scaler on the full balanced frame (train + test) leaks
# test-set mean/std into preprocessing and makes the evaluation optimistic.
# The test rows are transformed with the already-fitted scaler.
# X_balanced itself is intentionally left unscaled.
X_train[['Time', 'Amount']] = scaler.fit_transform(X_train[['Time', 'Amount']])
X_test[['Time', 'Amount']] = scaler.transform(X_test[['Time', 'Amount']])

# NOTE(review): the test set is drawn from the downsampled 5:1 data, so the
# reported precision reflects that ratio rather than the original class
# distribution of the full dataset.

# Baseline estimators, keyed by the display name used in the result tables.
#   Logistic Regression -> interpretable linear baseline
#   Decision Tree       -> captures non-linear patterns
#   Random Forest       -> ensemble method for better generalization
models = {}
models["Logistic Regression"] = LogisticRegression(random_state=42, max_iter=1000)
models["Decision Tree"] = DecisionTreeClassifier(random_state=42)
models["Random Forest"] = RandomForestClassifier(random_state=42, n_estimators=100)

# Per-model metrics: the classification_report dict plus a 'roc_auc' entry.
results = {}

# Fit each baseline on the training split and score it on the held-out split.
for name, model in models.items():
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)               # hard class labels
    y_proba = model.predict_proba(X_test)[:, 1]  # probability of the fraud class

    # Per-class precision/recall/F1 as a nested dict, extended with ROC-AUC
    # computed from the fraud-class probabilities.
    auc_score = roc_auc_score(y_test, y_proba)
    report = classification_report(y_test, y_pred, output_dict=True)
    report.update(roc_auc=auc_score)

    results[name] = report

# Build one summary row per baseline model from its fraud-class (label '1')
# entry in the classification report.
#   Precision -> of the predicted frauds, how many were correct
#   Recall    -> of the actual frauds, how many were caught
#   F1-Score  -> balance of precision and recall
#   ROC-AUC   -> overall probability ranking quality
fraud_report_keys = {
    'Precision': 'precision',
    'Recall': 'recall',
    'F1-Score': 'f1-score',
}

summary_data = []
for model_name, metrics in results.items():
    fraud_metrics = metrics['1']  # Class 1 = Fraud
    row = {'Model': model_name}
    for column, report_key in fraud_report_keys.items():
        row[column] = round(fraud_metrics[report_key], 3)
    row['ROC-AUC'] = round(metrics['roc_auc'], 3)
    summary_data.append(row)

# Tabulate and print the baseline comparison.
summary_df = pd.DataFrame(summary_data)
print("=== Baseline Model Evaluation Summary ===")
print(summary_df)



=== Baseline Model Evaluation Summary ===
                 Model  Precision  Recall  F1-Score  ROC-AUC
0  Logistic Regression      0.928   0.918     0.923    0.979
1        Decision Tree      0.827   0.878     0.851    0.921
2        Random Forest      0.967   0.898     0.931    0.981


In [4]:

# ========== ENSEMBLE MODEL USING VOTING CLASSIFIER + SMOTE + GRIDSEARCH ==========
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

# Base learners for the ensemble. Each uses class_weight='balanced' so the
# fraud class is up-weighted in addition to the resampling done by SMOTE.
log_reg = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
tree = DecisionTreeClassifier(random_state=42, class_weight='balanced')
forest = RandomForestClassifier(class_weight='balanced', n_estimators=100, random_state=42)

# Soft voting: the ensemble averages the members' predicted probabilities.
# The short names ('lr', 'dt', 'rf') are what the parameter grid refers to.
base_estimators = [('lr', log_reg), ('dt', tree), ('rf', forest)]
voting_clf = VotingClassifier(estimators=base_estimators, voting='soft')

# imbalanced-learn pipeline: SMOTE is applied as a pipeline step ahead of the
# classifier, so during cross-validation it is fitted per training fold.
pipeline_steps = [
    ('smote', SMOTE(random_state=42)),
    ('classifier', voting_clf),
]
ensemble_pipeline = ImbPipeline(pipeline_steps)

# Shuffled, stratified 5-fold CV keeps the class ratio the same in every fold.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Search space: two values for each of three knobs -> 8 candidates.
# Keys follow the <pipeline step>__<voting member>__<parameter> convention.
param_grid = {
    'classifier__lr__C': [0.1, 1.0],
    'classifier__dt__max_depth': [None, 10],
    'classifier__rf__n_estimators': [100, 200],
}

# Exhaustive search over the grid, ranked by F1, using all CPU cores.
grid_search = GridSearchCV(
    ensemble_pipeline,
    param_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Score the best pipeline found by the search on the held-out test split.
best_ensemble = grid_search.best_estimator_
y_pred_ensemble = best_ensemble.predict(X_test)
y_proba_ensemble = best_ensemble.predict_proba(X_test)[:, 1]

# Text classification report plus ROC-AUC from the fraud-class probabilities.
ensemble_auc = roc_auc_score(y_test, y_proba_ensemble)
ensemble_report = classification_report(y_test, y_pred_ensemble)

print("\n=== Best Ensemble Voting Classifier Report ===")
print(ensemble_report)
print("ROC-AUC Score:", round(ensemble_auc, 3))


# ---------------------------
# 4. EXTENDED EVALUATION: CONFUSION MATRIX, AUPRC, AND AUC-ROC
# ---------------------------
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_curve, auc

def extended_evaluation(model, X_test, y_test):
    """
    Score a fitted classifier on held-out data.

    Parameters:
      model  - fitted estimator exposing predict() and predict_proba()
      X_test - feature matrix to score
      y_test - true labels; the fraud class must appear under the key '1'
               in the classification report

    Returns a 6-tuple, in this order:
      (confusion matrix,
       precision for Class 1,
       recall for Class 1,
       F1-score for Class 1,
       AUPRC  - area under the precision-recall curve, via auc(),
       AUC-ROC - area under the ROC curve, via auc())
    """
    # Hard labels drive the confusion matrix and report; the second
    # predict_proba column (fraud probability) drives both curves.
    predicted_labels = model.predict(X_test)
    fraud_scores = model.predict_proba(X_test)[:, 1]

    # Only the fraud-class row of the report is needed.
    fraud_row = classification_report(
        y_test, predicted_labels, output_dict=True
    )['1']

    # Curve points; the thresholds returned as the third item are unused.
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, fraud_scores)
    pr_precisions, pr_recalls, _ = precision_recall_curve(y_test, fraud_scores)

    return (
        confusion_matrix(y_test, predicted_labels),
        fraud_row['precision'],
        fraud_row['recall'],
        fraud_row['f1-score'],
        auc(pr_recalls, pr_precisions),       # recall on x, precision on y
        auc(false_pos_rate, true_pos_rate),   # FPR on x, TPR on y
    )

def _as_result_entry(cm, prec, rec, f1, auprc, auc_roc_val):
    """Pack one extended_evaluation() tuple into a result dict (scores rounded to 3 places)."""
    return {
        'Confusion Matrix': cm,
        'Precision': round(prec, 3),
        'Recall': round(rec, 3),
        'F1-Score': round(f1, 3),
        'AUPRC': round(auprc, 3),
        'AUC-ROC': round(auc_roc_val, 3)
    }

# Extended metrics for every baseline model, keyed by display name
evaluation_results = {}
for model_name, model in models.items():
    cm, prec, rec, f1, auprc, auc_roc_val = extended_evaluation(model, X_test, y_test)
    evaluation_results[model_name] = _as_result_entry(cm, prec, rec, f1, auprc, auc_roc_val)

# Same metrics for the tuned ensemble, stored alongside the baselines
cm, prec, rec, f1, auprc, auc_roc_val = extended_evaluation(best_ensemble, X_test, y_test)
evaluation_results['Ensemble (Voting)'] = _as_result_entry(cm, prec, rec, f1, auprc, auc_roc_val)

# One table row per model: the display name followed by the scalar metrics.
# The confusion matrix is left out here and printed separately below.
scalar_metric_columns = ['Precision', 'Recall', 'F1-Score', 'AUPRC', 'AUC-ROC']
summary_data_extended = [
    {'Model': model_name, **{column: metrics[column] for column in scalar_metric_columns}}
    for model_name, metrics in evaluation_results.items()
]

summary_df_extended = pd.DataFrame(summary_data_extended)
print("\n=== Extended Evaluation Metrics for All Models ===")
print(summary_df_extended)

# Confusion matrix for each model, under its name
print("\n--- Confusion Matrices ---")
for model_name, metrics in evaluation_results.items():
    print(f"\n{model_name}:", metrics['Confusion Matrix'], sep="\n")


Fitting 5 folds for each of 8 candidates, totalling 40 fits

=== Best Ensemble Voting Classifier Report ===
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       493
           1       0.89      0.92      0.90        98

    accuracy                           0.97       591
   macro avg       0.94      0.95      0.94       591
weighted avg       0.97      0.97      0.97       591

ROC-AUC Score: 0.983

=== Extended Evaluation Metrics for All Models ===
                 Model  Precision  Recall  F1-Score  AUPRC  AUC-ROC
0  Logistic Regression      0.928   0.918     0.923  0.951    0.979
1        Decision Tree      0.827   0.878     0.851  0.862    0.921
2        Random Forest      0.967   0.898     0.931  0.951    0.981
3    Ensemble (Voting)      0.891   0.918     0.905  0.949    0.983

--- Confusion Matrices ---

Logistic Regression:
[[486   7]
 [  8  90]]

Decision Tree:
[[475  18]
 [ 12  86]]

Random Forest:
[[490   3]
 [ 10  88]]

