
## Model Evaluation


In [1]:
import numpy as np
import pandas as pd
import shap
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from imblearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from sklearn.metrics import (precision_recall_curve, average_precision_score,
                             roc_curve, roc_auc_score, confusion_matrix,
                             classification_report, f1_score)
import joblib

def train_models_enhanced(X_train, y_train, X_test, y_test, dataset_name):
    """
    Train Logistic Regression and XGBoost models with imputation, SMOTE,
    cross-validation, evaluation, and SHAP analysis.
    """

    # Calculate scale_pos_weight for XGBoost
    neg, pos = np.bincount(y_train)
    scale_pos_weight = neg / pos

    # Imputer to handle missing values before SMOTE
    imputer = SimpleImputer(strategy='mean')
    smote = SMOTE(random_state=42, sampling_strategy=0.3)

    # Logistic Regression pipeline
    lr_pipe = make_pipeline(
        imputer,
        smote,
        LogisticRegression(
            max_iter=1000,
            class_weight='balanced',
            solver='liblinear',
            random_state=42
        )
    )

    # XGBoost pipeline
    xgb_pipe = make_pipeline(
        imputer,
        smote,
        XGBClassifier(
            scale_pos_weight=scale_pos_weight,
            eval_metric='logloss',
            use_label_encoder=False,
            n_estimators=200,
            max_depth=5,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42
        )
    )

    print(f"\n=== Training Models for {dataset_name} ===")

    # Cross-validation AUC scores
    lr_scores = cross_val_score(lr_pipe, X_train, y_train, cv=3, scoring='roc_auc')
    print(f"Logistic Regression CV AUC: {lr_scores.mean():.3f} (±{lr_scores.std():.3f})")

    xgb_scores = cross_val_score(xgb_pipe, X_train, y_train, cv=3, scoring='roc_auc')
    print(f"XGBoost CV AUC: {xgb_scores.mean():.3f} (±{xgb_scores.std():.3f})")

    # Fit final models on full training data
    lr_pipe.fit(X_train, y_train)
    xgb_pipe.fit(X_train, y_train)

    def evaluate_model(model, X, y, model_name):
        y_pred = model.predict(X)
        y_proba = model.predict_proba(X)[:, 1]

        metrics = {
            'roc_auc': roc_auc_score(y, y_proba),
            'pr_auc': average_precision_score(y, y_proba),
            'f1': f1_score(y, y_pred),
            'precision': classification_report(y, y_pred, output_dict=True)['1']['precision'],
            'recall': classification_report(y, y_pred, output_dict=True)['1']['recall'],
            'confusion_matrix': confusion_matrix(y, y_pred),
            'classification_report': classification_report(y, y_pred, output_dict=True)
        }

        # Plot ROC and PR curves
        plt.figure(figsize=(12, 5))

        plt.subplot(1, 2, 1)
        fpr, tpr, _ = roc_curve(y, y_proba)
        plt.plot(fpr, tpr, label=f"AUC = {metrics['roc_auc']:.3f}")
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'ROC Curve - {model_name}')
        plt.legend()

        plt.subplot(1, 2, 2)
        precision, recall, _ = precision_recall_curve(y, y_proba)
        plt.plot(recall, precision, label=f"AP = {metrics['pr_auc']:.3f}")
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title(f'PR Curve - {model_name}')
        plt.legend()

        plt.tight_layout()
        plt.savefig(f'../reports/figures/{dataset_name}_{model_name}_curves.png')
        plt.close()

        return metrics

    print("\n=== Evaluation Results ===")
    lr_metrics = evaluate_model(lr_pipe, X_test, y_test, "LogisticRegression")
    xgb_metrics = evaluate_model(xgb_pipe, X_test, y_test, "XGBoost")

    # SHAP analysis
    shap_results = {}
    try:
        explainer = shap.Explainer(xgb_pipe.named_steps['xgbclassifier'])
        shap_values = explainer(X_train.iloc[:1000])

        plt.figure()
        shap.summary_plot(shap_values, X_train.iloc[:1000], show=False)
        plt.savefig(f'../reports/figures/{dataset_name}_shap_summary.png')
        plt.close()

        top_features = np.abs(shap_values.values).mean(0).argsort()[-3:][::-1]
        for feat_idx in top_features:
            feat_name = X_train.columns[feat_idx]
            plt.figure()
            shap.dependence_plot(feat_idx, shap_values.values, X_train.iloc[:1000],
                               interaction_index=None, show=False)
            plt.savefig(f'../reports/figures/{dataset_name}_shap_{feat_name}.png')
            plt.close()

        shap_results['shap_values'] = shap_values
    except Exception as e:
        print(f"SHAP analysis failed: {str(e)}")
        shap_results['error'] = str(e)

    return {
        'logistic_regression': (lr_pipe, lr_metrics),
        'xgboost': (xgb_pipe, xgb_metrics),
        'shap': shap_results
    }

def compare_models(results_dict, dataset_name):
    lr_metrics = results_dict['logistic_regression'][1]
    xgb_metrics = results_dict['xgboost'][1]

    metrics_df = pd.DataFrame({
        'Logistic Regression': lr_metrics,
        'XGBoost': xgb_metrics
    }).T

    print(f"\nModel Comparison for {dataset_name}:")
    print(metrics_df[['roc_auc', 'pr_auc', 'f1', 'precision', 'recall']])

    # Select best model based on F1 score
    best_model = 'XGBoost' if xgb_metrics['f1'] > lr_metrics['f1'] else 'Logistic Regression'
    print(f"\nRecommended model for {dataset_name}: {best_model}")

    return best_model

# ===========================
# Example Usage - Replace with your actual data loading
# ===========================

try:
    fraud_data_fe = pd.read_csv('../data/fraud_data_fe.csv.gz', compression='gzip')
    credit_data_clean = pd.read_csv('../data/credit_data_clean.csv')
    # Drop unwanted columns and separate features and target for ecommerce dataset
    columns_to_drop = ['user_id', 'device_id', 'signup_time', 'purchase_time', 'ip_address', 'class']
    X_fraud = fraud_data_fe.drop(columns=[col for col in columns_to_drop if col in fraud_data_fe.columns])
    y_fraud = fraud_data_fe['class']

    # Split
    X_train_f, X_test_f, y_train_f, y_test_f = train_test_split(
        X_fraud, y_fraud, test_size=0.2, random_state=42, stratify=y_fraud
    )

    # Train ecommerce models
    print("\n" + "="*50)
    print("Training E-commerce Fraud Models")
    print("="*50)
    fraud_results = train_models_enhanced(X_train_f, y_train_f, X_test_f, y_test_f, 'ecommerce')

    # Credit card dataset - adjust column names if necessary
    X_credit = credit_data_clean.drop('Class', axis=1)
    y_credit = credit_data_clean['Class']

    X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
        X_credit, y_credit, test_size=0.2, random_state=42, stratify=y_credit
    )

    # Train credit card models
    print("\n" + "="*50)
    print("Training Credit Card Fraud Models")
    print("="*50)
    credit_results = train_models_enhanced(X_train_c, y_train_c, X_test_c, y_test_c, 'creditcard')

    # Compare models and select best
    best_ecommerce = compare_models(fraud_results, 'E-commerce')
    best_credit = compare_models(credit_results, 'Credit Card')

    # Save best models
    joblib.dump(fraud_results[best_ecommerce.lower().replace(' ', '_')][0], f'../models/ecommerce/{best_ecommerce.lower().replace(" ", "_")}.pkl')
    joblib.dump(credit_results[best_credit.lower().replace(' ', '_')][0], f'../models/creditcard/{best_credit.lower().replace(" ", "_")}.pkl')

    # Generate final report
    final_report = f"""
# Fraud Detection Model Evaluation Report

## E-commerce Transactions
- Best Model: {best_ecommerce}
- Key Metrics:
  - F1 Score: {fraud_results[best_ecommerce.lower().replace(' ', '_')][1]['f1']:.4f}
  - Precision: {fraud_results[best_ecommerce.lower().replace(' ', '_')][1]['precision']:.4f}
  - Recall: {fraud_results[best_ecommerce.lower().replace(' ', '_')][1]['recall']:.4f}

## Credit Card Transactions
- Best Model: {best_credit}
- Key Metrics:
  - F1 Score: {credit_results[best_credit.lower().replace(' ', '_')][1]['f1']:.4f}
  - Precision: {credit_results[best_credit.lower().replace(' ', '_')][1]['precision']:.4f}
  - Recall: {credit_results[best_credit.lower().replace(' ', '_')][1]['recall']:.4f}

## Recommendations
1. For e-commerce fraud detection, the {best_ecommerce} model provides the best balance between precision and recall.
2. For credit card fraud detection, the {best_credit} model performs best.
3. Both models should be monitored regularly as fraud patterns evolve.
4. Consider implementing a hybrid approach with rule-based systems for high-risk transactions.
"""

    with open('../reports/final_report.md', 'w') as f:
        f.write(final_report)

except Exception as e:
    print(f"Error during model training or evaluation: \n{str(e)}")

  from .autonotebook import tqdm as notebook_tqdm



Training E-commerce Fraud Models

=== Training Models for ecommerce ===




Logistic Regression CV AUC: 0.844 (±0.005)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost CV AUC: 0.843 (±0.004)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Evaluation Results ===




SHAP analysis failed: [21:32:54] C:\actions-runner\_work\xgboost\xgboost\src\c_api\c_api_utils.h:129: Check failed: std::accumulate(shape.cbegin(), shape.cend(), static_cast<bst_ulong>(1), std::multiplies<>{}) == chunksize * rows (17000 vs. 16000) : 

Training Credit Card Fraud Models

=== Training Models for creditcard ===
Logistic Regression CV AUC: 0.972 (±0.005)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost CV AUC: 0.970 (±0.010)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Evaluation Results ===

Model Comparison for E-commerce:
                      roc_auc    pr_auc        f1 precision    recall
Logistic Regression   0.84334  0.669174  0.627122  0.609058   0.64629
XGBoost              0.843203  0.718822  0.612544  0.539081  0.709187

Recommended model for E-commerce: Logistic Regression

Model Comparison for Credit Card:
                      roc_auc    pr_auc        f1 precision    recall
Logistic Regression  0.971496  0.711308  0.143897  0.078139  0.908163
XGBoost              0.987122  0.860784  0.461538  0.311828  0.887755

Recommended model for Credit Card: XGBoost
Error during model training or evaluation: 
[Errno 2] No such file or directory: '../models/ecommerce/logistic_regression.pkl'


<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>