
## Model Training


In [1]:


import numpy as np
import shap
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from imblearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer  # <--- added

def train_models_enhanced(X_train, y_train, X_test, y_test, dataset_name):
    """
    Enhanced model training with better imbalance handling and error checking
    
    Args:
        X_train: Training features
        y_train: Training labels
        X_test: Test features
        y_test: Test labels
        dataset_name: Name for saving results
        
    Returns:
        Dictionary containing trained models and metrics
    """
    
    # =============================================
    # 1. Improved Class Imbalance Handling
    # =============================================
    
    neg, pos = np.bincount(y_train)
    scale_pos_weight = neg / pos  # Recommended for XGBoost
    
    # =============================================
    # 2. Model Pipelines with Imputer before SMOTE
    # =============================================
    
    lr_pipe = make_pipeline(
        SimpleImputer(strategy='median'),  # <--- impute missing values first
        SMOTE(random_state=42, sampling_strategy=0.3),
        LogisticRegression(
            max_iter=1000,
            class_weight='balanced',
            solver='liblinear',
            random_state=42
        )
    )
    
    xgb_pipe = make_pipeline(
        SimpleImputer(strategy='median'),  # <--- impute missing values first
        SMOTE(random_state=42, sampling_strategy=0.3),
        XGBClassifier(
            scale_pos_weight=scale_pos_weight,
            eval_metric='logloss',
            use_label_encoder=False,
            n_estimators=200,
            max_depth=5,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42
        )
    )
    
    # =============================================
    # 3. Training with Cross-Validation
    # =============================================
    
    from sklearn.model_selection import cross_val_score
    
    print(f"\n=== Training Models for {dataset_name} ===")
    
    lr_scores = cross_val_score(lr_pipe, X_train, y_train, cv=3, scoring='roc_auc')
    print(f"Logistic Regression CV AUC: {lr_scores.mean():.3f} (±{lr_scores.std():.3f})")
    
    xgb_scores = cross_val_score(xgb_pipe, X_train, y_train, cv=3, scoring='roc_auc')
    print(f"XGBoost CV AUC: {xgb_scores.mean():.3f} (±{xgb_scores.std():.3f})")
    
    # =============================================
    # 4. Final Model Training
    # =============================================
    
    lr_pipe.fit(X_train, y_train)
    xgb_pipe.fit(X_train, y_train)
    
    # =============================================
    # 5. Enhanced Evaluation
    # =============================================
    
    def evaluate_model(model, X, y, model_name):
        from sklearn.metrics import (precision_recall_curve, average_precision_score,
                                     roc_curve, roc_auc_score,
                                     confusion_matrix, classification_report)
        
        y_pred = model.predict(X)
        y_proba = model.predict_proba(X)[:, 1]
        
        metrics = {
            'roc_auc': roc_auc_score(y, y_proba),
            'pr_auc': average_precision_score(y, y_proba),
            'confusion_matrix': confusion_matrix(y, y_pred),
            'classification_report': classification_report(y, y_pred, output_dict=True)
        }
        
        plt.figure(figsize=(12, 5))
        
        # ROC Curve
        plt.subplot(1, 2, 1)
        fpr, tpr, _ = roc_curve(y, y_proba)
        plt.plot(fpr, tpr, label=f"AUC = {metrics['roc_auc']:.3f}")
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'ROC Curve - {model_name}')
        plt.legend()
        
        # PR Curve
        plt.subplot(1, 2, 2)
        precision, recall, _ = precision_recall_curve(y, y_proba)
        plt.plot(recall, precision, label=f"AP = {metrics['pr_auc']:.3f}")
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title(f'PR Curve - {model_name}')
        plt.legend()
        
        plt.tight_layout()
        plt.savefig(f'../reports/figures/{dataset_name}_{model_name}_curves.png')
        plt.close()
        
        return metrics
    
    print("\n=== Evaluation Results ===")
    lr_metrics = evaluate_model(lr_pipe, X_test, y_test, "LogisticRegression")
    xgb_metrics = evaluate_model(xgb_pipe, X_test, y_test, "XGBoost")
    
    # =============================================
    # 6. SHAP Analysis (Conditional Import)
    # =============================================
    
    shap_results = {}
    try:
        explainer = shap.Explainer(xgb_pipe.named_steps['xgbclassifier'])
        shap_values = explainer(X_train.iloc[:1000])
        
        plt.figure()
        shap.summary_plot(shap_values, X_train.iloc[:1000], show=False)
        plt.savefig(f'../reports/figures/{dataset_name}_shap_summary.png')
        plt.close()
        
        top_features = np.abs(shap_values.values).mean(0).argsort()[-3:][::-1]
        for feat_idx in top_features:
            feat_name = X_train.columns[feat_idx]
            plt.figure()
            shap.dependence_plot(feat_idx, shap_values.values, X_train.iloc[:1000],
                                interaction_index=None, show=False)
            plt.savefig(f'../reports/figures/{dataset_name}_shap_{feat_name}.png')
            plt.close()
            
        shap_results['shap_values'] = shap_values
    except Exception as e:
        print(f"SHAP analysis failed: {str(e)}")
        shap_results['error'] = str(e)
    
    return {
        'logistic_regression': (lr_pipe, lr_metrics),
        'xgboost': (xgb_pipe, xgb_metrics),
        'shap': shap_results
    }

# =============================================
# Execute Training with Error Handling
# =============================================

try:
    fraud_data_fe = pd.read_csv('../data/fraud_data_fe.csv.gz', compression='gzip')
    credit_data_clean = pd.read_csv('../data/credit_data_clean.csv')
    columns_to_drop = ['user_id', 'device_id', 'signup_time', 'purchase_time', 'ip_address', 'class']
    X_fraud = fraud_data_fe.drop(columns=[col for col in columns_to_drop if col in fraud_data_fe.columns])

    y_fraud = fraud_data_fe['class']
    
    # Split data
    X_train_f, X_test_f, y_train_f, y_test_f = train_test_split(
        X_fraud, y_fraud, test_size=0.2, random_state=42, stratify=y_fraud)
    
    print("\n" + "="*50)
    print("Training E-commerce Fraud Models")
    print("="*50)
    fraud_results = train_models_enhanced(X_train_f, y_train_f, X_test_f, y_test_f, 'ecommerce')
    
    # Credit card data
    X_credit = credit_data_clean.drop('Class', axis=1)
    y_credit = credit_data_clean['Class']
    X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
        X_credit, y_credit, test_size=0.2, random_state=42, stratify=y_credit)
    
    print("\n" + "="*50)
    print("Training Credit Card Fraud Models")
    print("="*50)
    credit_results = train_models_enhanced(X_train_c, y_train_c, X_test_c, y_test_c, 'creditcard')
    
except Exception as e:
    print(f"Error during model training: {str(e)}")

  from .autonotebook import tqdm as notebook_tqdm



Training E-commerce Fraud Models

=== Training Models for ecommerce ===




Logistic Regression CV AUC: 0.844 (±0.005)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost CV AUC: 0.843 (±0.004)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Evaluation Results ===




SHAP analysis failed: [21:40:23] C:\actions-runner\_work\xgboost\xgboost\src\c_api\c_api_utils.h:129: Check failed: std::accumulate(shape.cbegin(), shape.cend(), static_cast<bst_ulong>(1), std::multiplies<>{}) == chunksize * rows (17000 vs. 16000) : 

Training Credit Card Fraud Models

=== Training Models for creditcard ===
Logistic Regression CV AUC: 0.972 (±0.005)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost CV AUC: 0.970 (±0.010)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Evaluation Results ===


<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>