# Fraud Detection Model Training

This notebook implements a comprehensive fraud detection model training pipeline.

## Overview
- Data preprocessing and feature engineering
- Model training with multiple algorithms
- Hyperparameter tuning
- Comprehensive evaluation with fraud-specific metrics
- Model interpretation and feature importance

## 1. Setup and Imports

In [None]:
# Import libraries with error handling
import os
import sys
import warnings
warnings.filterwarnings('ignore')

print("Starting imports...")

# Basic libraries
try:
    import pandas as pd
    import numpy as np
    import matplotlib
    matplotlib.use('Agg')  # Use non-interactive backend for Docker
    import matplotlib.pyplot as plt
    import seaborn as sns
    print("✅ Basic libraries imported successfully")
except Exception as e:
    print(f"❌ Error importing basic libraries: {e}")
    raise

# Machine learning libraries
try:
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import (
        classification_report, confusion_matrix, roc_auc_score, 
        precision_recall_curve, roc_curve, average_precision_score,
        precision_score, recall_score, f1_score
    )
    from sklearn.model_selection import cross_val_score, StratifiedKFold
    print("✅ Scikit-learn imported successfully")
except Exception as e:
    print(f"❌ Error importing scikit-learn: {e}")
    raise

try:
    from imblearn.over_sampling import SMOTE
    from imblearn.under_sampling import RandomUnderSampler
    from imblearn.pipeline import Pipeline as ImbPipeline
    print("✅ Imbalanced-learn imported successfully")
except Exception as e:
    print(f"❌ Error importing imbalanced-learn: {e}")
    raise

try:
    import xgboost as xgb
    print("✅ XGBoost imported successfully")
except Exception as e:
    print(f"❌ Error importing XGBoost: {e}")
    raise

try:
    import lightgbm as lgb
    print("✅ LightGBM imported successfully")  
except Exception as e:
    print(f"❌ Error importing LightGBM: {e}")
    raise

try:
    import joblib
    import optuna
    from tqdm import tqdm
    print("✅ Utility libraries imported successfully")
except Exception as e:
    print(f"❌ Error importing utility libraries: {e}")
    raise

# Custom modules with proper path handling
try:
    # Add src directory to path
    src_path = '/app/src'
    if src_path not in sys.path:
        sys.path.insert(0, src_path)
    
    # Check if we can access the module file
    module_path = os.path.join(src_path, 'data_preprocessing.py')
    if not os.path.exists(module_path):
        raise FileNotFoundError(f"Module file not found: {module_path}")
    
    from data_preprocessing import FraudDataProcessor
    print("✅ Custom data preprocessing module imported successfully")
except Exception as e:
    print(f"❌ Error importing custom modules: {e}")
    print(f"Current working directory: {os.getcwd()}")
    print(f"Python path: {sys.path[:3]}...")
    if os.path.exists('/app/src'):
        print(f"Contents of /app/src: {os.listdir('/app/src')}")
    raise

# Settings
pd.set_option('display.max_columns', None)
plt.style.use('default')  # Use default style for better Docker compatibility
np.random.seed(42)

print("✅ Setup completed successfully!")
print("✅ All imports working correctly!")
print(f"Working directory: {os.getcwd()}")

# The total-memory line is purely informational. os.sysconf only exists on
# POSIX systems and raises for configuration names the host does not know,
# so a failed lookup is reported instead of aborting an otherwise successful
# setup cell.
try:
    total_memory_gb = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES') / (1024.**3)
except (AttributeError, ValueError, OSError):
    print("Available memory: unavailable on this platform")
else:
    print(f"Available memory: {total_memory_gb:.1f} GB")

## 2. Data Loading and Preprocessing

In [None]:
# Load the raw data and run the preprocessing pipeline, reporting resource usage
import gc
import psutil

GIB = 1024**3

print("Checking system resources...")
mem_before = psutil.virtual_memory()
print(f"Available memory: {mem_before.available / GIB:.1f} GB")
print(f"Memory usage: {mem_before.percent:.1f}%")

try:
    # Build the processor that owns the whole preprocessing pipeline
    processor = FraudDataProcessor()
    print("✅ Data processor initialized successfully")

    # Fail early with a clear message when the CSV is missing
    data_path = "/app/data/fraud_mock.csv"
    if not os.path.exists(data_path):
        raise FileNotFoundError(f"Data file not found: {data_path}")

    # Report the on-disk size in megabytes
    size_mb = os.path.getsize(data_path) / (1024 * 1024)
    print(f"Data file size: {size_mb:.1f} MB")

    # Temporal split (recommended for fraud detection)
    print("Starting data preprocessing pipeline...")
    print("This may take a few minutes for large datasets...")

    pipeline_options = {
        'file_path': data_path,
        'test_size': 0.2,
        'val_size': 0.1,
        'temporal_split': True,
    }
    train_df, val_df, test_df = processor.process_pipeline(**pipeline_options)

    print("✅ Data preprocessing completed successfully!")
    print(f"Training set shape: {train_df.shape}")
    print(f"Validation set shape: {val_df.shape}")
    print(f"Test set shape: {test_df.shape}")

    # Memory pressure once the three splits are in memory
    mem_after = psutil.virtual_memory()
    print(f"Memory usage after loading: {mem_after.percent:.1f}%")

    # Reclaim whatever temporaries the pipeline left behind
    gc.collect()
    print("✅ Garbage collection completed")

except FileNotFoundError as err:
    print(f"❌ Data file error: {err}")
    raise
except MemoryError as err:
    print(f"❌ Not enough memory to load the full dataset: {err}")
    print("Try reducing the dataset size or increasing Docker memory limits")
    raise
except Exception as err:
    # Anything else: print the likely causes, then let the error propagate
    print(f"❌ Error during data preprocessing: {err}")
    print("This might be due to:")
    for hint in ("- Insufficient memory", "- Corrupted data file", "- Missing dependencies"):
        print(hint)
    raise

In [None]:
# Split every dataset into a feature matrix (X_*) and the fraud label (y_*).
# Both label-like columns are kept out of the feature list.
label_columns = ('is_fraud', 'is_flagged_fraud')
feature_columns = [name for name in train_df.columns if name not in label_columns]

X_train, y_train = train_df[feature_columns], train_df['is_fraud']
X_val, y_val = val_df[feature_columns], val_df['is_fraud']
X_test, y_test = test_df[feature_columns], test_df['is_fraud']

# The mean of a 0/1 label is the share of fraud rows in that split
print(f"Number of features: {len(feature_columns)}")
print(f"Training fraud rate: {y_train.mean():.4f}")
print(f"Validation fraud rate: {y_val.mean():.4f}")
print(f"Test fraud rate: {y_test.mean():.4f}")

## 3. Class Imbalance Analysis and Handling

In [None]:
# Quantify how rare the fraud class is in the training split
fraud_counts = y_train.value_counts()
fraud_ratio = fraud_counts[1] / fraud_counts[0]

print(f"Non-fraud transactions: {fraud_counts[0]:,}")
print(f"Fraud transactions: {fraud_counts[1]:,}")
print(f"Fraud ratio: 1:{fraud_counts[0]//fraud_counts[1]}")
print(f"Imbalance ratio: {fraud_ratio:.6f}")

# Side-by-side bar charts: absolute counts on the left, percentages on the right
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

panels = [
    (y_train.value_counts(), 'Training Set Class Distribution', 'Count'),
    (y_train.value_counts(normalize=True) * 100, 'Training Set Class Distribution (%)', 'Percentage'),
]
for panel_ax, (distribution, title, y_label) in zip(axes, panels):
    distribution.plot(kind='bar', ax=panel_ax)
    panel_ax.set_title(title)
    panel_ax.set_xlabel('Class (0: Non-fraud, 1: Fraud)')
    panel_ax.set_ylabel(y_label)

plt.tight_layout()
plt.show()

## 4. Feature Analysis and Selection

In [None]:
# Correlate every feature with the fraud label and rank by absolute strength
fraud_label = train_df['is_fraud']
correlations = [train_df[name].corr(fraud_label) for name in feature_columns]

feature_target_corr = pd.DataFrame(
    {'feature': feature_columns, 'correlation': correlations}
).sort_values('correlation', key=abs, ascending=False)

print("Top 15 features by correlation with fraud:")
print(feature_target_corr.head(15))

# Horizontal bar chart of the 20 strongest correlations (sign preserved)
top_features = feature_target_corr.head(20)
bar_positions = range(len(top_features))

plt.figure(figsize=(10, 8))
plt.barh(bar_positions, top_features['correlation'])
plt.yticks(bar_positions, top_features['feature'])
plt.xlabel('Correlation with Fraud')
plt.title('Top 20 Features by Correlation with Fraud')
# Zero line separates positive from negative correlations
plt.axvline(x=0, color='black', linestyle='-', alpha=0.3)
plt.tight_layout()
plt.show()

## 5. Model Training and Evaluation Functions

In [None]:
def evaluate_model(y_true, y_pred, y_pred_proba=None, model_name="Model"):
    """
    Comprehensive model evaluation for fraud detection.

    Computes precision, recall and F1 from the hard predictions, optionally
    AUC-ROC and average precision from scores, derives the confusion-matrix
    counts and a fixed-weight misclassification cost, prints a summary and
    draws the confusion matrix as a heatmap.

    Args:
        y_true: True binary labels (0 = non-fraud, 1 = fraud).
        y_pred: Predicted binary labels.
        y_pred_proba: Optional scores for the positive class. When ``None``,
            the ``auc_roc`` and ``auc_pr`` entries are omitted.
        model_name: Label used in the printed report and the plot title.

    Returns:
        dict: ``precision``, ``recall``, ``f1``, optionally ``auc_roc`` and
        ``auc_pr``, plus ``true_negatives``, ``false_positives``,
        ``false_negatives``, ``true_positives`` and ``total_cost``.
    """
    results = {}

    # Basic metrics
    results['precision'] = precision_score(y_true, y_pred)
    results['recall'] = recall_score(y_true, y_pred)
    results['f1'] = f1_score(y_true, y_pred)

    if y_pred_proba is not None:
        results['auc_roc'] = roc_auc_score(y_true, y_pred_proba)
        results['auc_pr'] = average_precision_score(y_true, y_pred_proba)

    # Confusion matrix. The label order is pinned so the matrix is always
    # 2x2; without it, data containing a single class yields a 1x1 matrix
    # and the four-way unpacking below fails.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    results['true_negatives'] = tn
    results['false_positives'] = fp
    results['false_negatives'] = fn
    results['true_positives'] = tp

    # Cost-sensitive metrics (assuming FP costs 1 and FN costs 10)
    fp_cost = 1
    fn_cost = 10
    results['total_cost'] = fp * fp_cost + fn * fn_cost

    print(f"\n{model_name} Evaluation Results:")
    print(f"Precision: {results['precision']:.4f}")
    print(f"Recall: {results['recall']:.4f}")
    print(f"F1-Score: {results['f1']:.4f}")
    if 'auc_roc' in results:
        print(f"AUC-ROC: {results['auc_roc']:.4f}")
        print(f"AUC-PR: {results['auc_pr']:.4f}")
    print(f"Total Cost (FP:1, FN:10): {results['total_cost']}")

    # Confusion matrix visualization
    fig = plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'{model_name} - Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()
    # Under a non-interactive backend show() does not release the figure;
    # close it explicitly so repeated evaluations do not accumulate figures.
    plt.close(fig)

    return results

def plot_precision_recall_curve(y_true, y_pred_proba, model_name="Model"):
    """
    Plot precision-recall curve.

    Args:
        y_true: True binary labels.
        y_pred_proba: Scores for the positive class.
        model_name: Label used in the legend and the plot title.

    Returns:
        None. The curve is drawn with the average precision in the legend.
    """
    precision, recall, _ = precision_recall_curve(y_true, y_pred_proba)
    auc_pr = average_precision_score(y_true, y_pred_proba)

    fig = plt.figure(figsize=(8, 6))
    plt.plot(recall, precision, label=f'{model_name} (AUC-PR = {auc_pr:.4f})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'{model_name} - Precision-Recall Curve')
    plt.legend()
    plt.grid(True)
    plt.show()
    # Under a non-interactive backend show() does not release the figure;
    # close it explicitly so repeated calls do not accumulate open figures.
    plt.close(fig)

def plot_roc_curve(y_true, y_pred_proba, model_name="Model"):
    """
    Plot ROC curve.

    Args:
        y_true: True binary labels.
        y_pred_proba: Scores for the positive class.
        model_name: Label used in the legend and the plot title.

    Returns:
        None. The curve is drawn next to the diagonal of a random classifier,
        with the AUC-ROC value in the legend.
    """
    fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
    auc_roc = roc_auc_score(y_true, y_pred_proba)

    fig = plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'{model_name} (AUC-ROC = {auc_roc:.4f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'{model_name} - ROC Curve')
    plt.legend()
    plt.grid(True)
    plt.show()
    # Under a non-interactive backend show() does not release the figure;
    # close it explicitly so repeated calls do not accumulate open figures.
    plt.close(fig)

print("Evaluation functions defined successfully!")

## 6. Baseline Model - Logistic Regression

In [None]:
# Baseline: logistic regression trained on SMOTE-oversampled data
print("Training Baseline Logistic Regression...")

# SMOTE is a pipeline step, followed by the classifier
lr_steps = [
    ('smote', SMOTE(random_state=42)),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000)),
]
smote_lr_pipeline = ImbPipeline(lr_steps)
smote_lr_pipeline.fit(X_train, y_train)

# Hard labels and fraud-class probabilities on the validation split
y_val_pred_lr = smote_lr_pipeline.predict(X_val)
y_val_pred_proba_lr = smote_lr_pipeline.predict_proba(X_val)[:, 1]

# Report metrics and draw both curves under one shared label
lr_label = "Logistic Regression (SMOTE)"
lr_results = evaluate_model(y_val, y_val_pred_lr, y_val_pred_proba_lr, lr_label)
plot_precision_recall_curve(y_val, y_val_pred_proba_lr, lr_label)
plot_roc_curve(y_val, y_val_pred_proba_lr, lr_label)

## 7. Random Forest Model

In [None]:
# Random Forest with class-weight balancing
print("Training Random Forest...")

rf_settings = {
    'n_estimators': 100,
    'max_depth': 10,
    'class_weight': 'balanced',
    'random_state': 42,
    'n_jobs': -1,
}
rf_model = RandomForestClassifier(**rf_settings)
rf_model.fit(X_train, y_train)

# Hard labels and fraud-class probabilities on the validation split
y_val_pred_rf = rf_model.predict(X_val)
y_val_pred_proba_rf = rf_model.predict_proba(X_val)[:, 1]

rf_results = evaluate_model(y_val, y_val_pred_rf, y_val_pred_proba_rf, "Random Forest")
plot_precision_recall_curve(y_val, y_val_pred_proba_rf, "Random Forest")

# Rank the features by the forest's importance scores
feature_importance = pd.DataFrame(
    {'feature': feature_columns, 'importance': rf_model.feature_importances_}
)
feature_importance = feature_importance.sort_values('importance', ascending=False)

print("\nTop 15 Most Important Features (Random Forest):")
print(feature_importance.head(15))

# Horizontal bar chart of the 20 highest-ranked features
top_features_rf = feature_importance.head(20)
rf_bar_positions = range(len(top_features_rf))

plt.figure(figsize=(10, 8))
plt.barh(rf_bar_positions, top_features_rf['importance'])
plt.yticks(rf_bar_positions, top_features_rf['feature'])
plt.xlabel('Feature Importance')
plt.title('Top 20 Features by Random Forest Importance')
plt.tight_layout()
plt.show()

## 8. XGBoost Model with Hyperparameter Tuning

In [None]:
# XGBoost hyperparameter tuning with Optuna
def objective_xgb(trial):
    """
    Optuna objective: fit one XGBoost candidate and score it on validation.

    Hyperparameters are drawn from ``trial``; the model is fitted on the
    module-level training split and the AUC-ROC of its fraud-class
    probabilities on the validation split is returned.
    """
    # Sampled hyperparameters (the suggestion order is kept stable)
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 1),
    }

    # Negative-to-positive ratio, passed as scale_pos_weight for class imbalance
    negatives = (y_train == 0).sum()
    positives = (y_train == 1).sum()

    candidate = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='auc',
        random_state=42,
        scale_pos_weight=negatives / positives,
        **sampled,
    )
    candidate.fit(X_train, y_train)

    val_scores = candidate.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_scores)

print("Starting XGBoost hyperparameter tuning...")
# Seed the sampler explicitly: Optuna draws from its own random generator, so
# the study is not covered by np.random.seed or by the models' random_state.
# TPE is the default single-objective sampler, so only the seeding changes.
study_xgb = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_xgb.optimize(objective_xgb, n_trials=50, show_progress_bar=True)

print(f"Best AUC-ROC: {study_xgb.best_value:.4f}")
print(f"Best parameters: {study_xgb.best_params}")

In [None]:
# Refit XGBoost on the training split with the tuned hyperparameters
print("Training final XGBoost model...")

# Tuned values plus the fixed imbalance weight and seed
best_params_xgb = {
    **study_xgb.best_params,
    'scale_pos_weight': (y_train == 0).sum() / (y_train == 1).sum(),
    'random_state': 42,
}

xgb_model = xgb.XGBClassifier(**best_params_xgb)
xgb_model.fit(X_train, y_train)

# Hard labels and fraud-class probabilities on the validation split
y_val_pred_xgb = xgb_model.predict(X_val)
y_val_pred_proba_xgb = xgb_model.predict_proba(X_val)[:, 1]

xgb_results = evaluate_model(y_val, y_val_pred_xgb, y_val_pred_proba_xgb, "XGBoost (Tuned)")
plot_precision_recall_curve(y_val, y_val_pred_proba_xgb, "XGBoost (Tuned)")

# Rank the features by XGBoost's importance scores
xgb_feature_importance = pd.DataFrame(
    {'feature': feature_columns, 'importance': xgb_model.feature_importances_}
)
xgb_feature_importance = xgb_feature_importance.sort_values('importance', ascending=False)

print("\nTop 15 Most Important Features (XGBoost):")
print(xgb_feature_importance.head(15))

# Horizontal bar chart of the 20 highest-ranked features
top_features_xgb = xgb_feature_importance.head(20)
xgb_bar_positions = range(len(top_features_xgb))

plt.figure(figsize=(10, 8))
plt.barh(xgb_bar_positions, top_features_xgb['importance'])
plt.yticks(xgb_bar_positions, top_features_xgb['feature'])
plt.xlabel('Feature Importance')
plt.title('Top 20 Features by XGBoost Importance')
plt.tight_layout()
plt.show()

## 9. LightGBM Model

In [None]:
# LightGBM with fixed hyperparameters and an imbalance weight
print("Training LightGBM...")

# Negative-to-positive ratio of the training labels
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

lgb_settings = {
    'n_estimators': 200,
    'max_depth': 8,
    'learning_rate': 0.1,
    'scale_pos_weight': scale_pos_weight,
    'random_state': 42,
    'verbosity': -1,
}
lgb_model = lgb.LGBMClassifier(**lgb_settings)
lgb_model.fit(X_train, y_train)

# Hard labels and fraud-class probabilities on the validation split
y_val_pred_lgb = lgb_model.predict(X_val)
y_val_pred_proba_lgb = lgb_model.predict_proba(X_val)[:, 1]

lgb_results = evaluate_model(y_val, y_val_pred_lgb, y_val_pred_proba_lgb, "LightGBM")
plot_precision_recall_curve(y_val, y_val_pred_proba_lgb, "LightGBM")

## 10. Model Comparison

In [None]:
# Gather the validation results of all four models into one table
validation_results = {
    'Logistic Regression (SMOTE)': lr_results,
    'Random Forest': rf_results,
    'XGBoost (Tuned)': xgb_results,
    'LightGBM': lgb_results,
}

# Table column -> key in the per-model result dictionaries
column_to_key = {
    'Precision': 'precision',
    'Recall': 'recall',
    'F1-Score': 'f1',
    'AUC-ROC': 'auc_roc',
    'AUC-PR': 'auc_pr',
    'Total Cost': 'total_cost',
}

comparison_columns = {'Model': list(validation_results)}
for column, key in column_to_key.items():
    comparison_columns[column] = [scores[key] for scores in validation_results.values()]
model_comparison = pd.DataFrame(comparison_columns)

print("Model Comparison on Validation Set:")
print(model_comparison.round(4))

# 2x3 grid: five score panels plus one cost panel
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

metrics = ['Precision', 'Recall', 'F1-Score', 'AUC-ROC', 'AUC-PR']
# axes.flat walks the grid row by row, so the five metrics fill the first five cells
for ax, metric in zip(axes.flat, metrics):
    model_comparison.plot(x='Model', y=metric, kind='bar', ax=ax, legend=False)
    ax.set_title(f'{metric} Comparison')
    ax.tick_params(axis='x', rotation=45)

# Last cell: total cost (lower is better), drawn in red
ax = axes[1, 2]
model_comparison.plot(x='Model', y='Total Cost', kind='bar', ax=ax, legend=False, color='red')
ax.set_title('Total Cost Comparison (Lower is Better)')
ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 11. Final Model Selection and Test Set Evaluation

In [None]:
# Fitted estimators, keyed by the names used in the comparison table
models_dict = {
    'Logistic Regression (SMOTE)': smote_lr_pipeline,
    'Random Forest': rf_model,
    'XGBoost (Tuned)': xgb_model,
    'LightGBM': lgb_model
}

# Pick the winner by AUC-PR (most important for imbalanced fraud detection)
best_model_idx = model_comparison['AUC-PR'].idxmax()
best_model_name = model_comparison.loc[best_model_idx, 'Model']

print(f"Best model based on AUC-PR: {best_model_name}")

best_model = models_dict[best_model_name]

# Score the held-out test split with the selected model
print(f"\nEvaluating {best_model_name} on test set...")
y_test_pred = best_model.predict(X_test)
y_test_pred_proba = best_model.predict_proba(X_test)[:, 1]

test_label = f"{best_model_name} (Test Set)"
final_results = evaluate_model(y_test, y_test_pred, y_test_pred_proba, test_label)
plot_precision_recall_curve(y_test, y_test_pred_proba, test_label)
plot_roc_curve(y_test, y_test_pred_proba, test_label)

## 12. Business Impact Analysis

In [None]:
# Translate the test-set confusion matrix into business terms
print("Business Impact Analysis:")
print("=" * 50)

# Composition of the test split
total_transactions = len(y_test)
actual_frauds = y_test.sum()
actual_legit = total_transactions - actual_frauds

print(f"Total test transactions: {total_transactions:,}")
print(f"Actual fraud transactions: {actual_frauds:,} ({actual_frauds/total_transactions*100:.2f}%)")
print(f"Actual legitimate transactions: {actual_legit:,}")

# Outcome counts of the selected model on the test split
test_confusion = confusion_matrix(y_test, y_test_pred)
tn, fp, fn, tp = test_confusion.ravel()

print(f"\nModel Performance:")
print(f"True Positives (Correctly identified frauds): {tp:,}")
print(f"False Negatives (Missed frauds): {fn:,}")
print(f"True Negatives (Correctly identified legitimate): {tn:,}")
print(f"False Positives (Incorrectly flagged legitimate): {fp:,}")

# Rates default to 0 when their denominator is empty
fraud_detection_rate = 0
if (tp + fn) > 0:
    fraud_detection_rate = tp / (tp + fn)

false_alarm_rate = 0
if (fp + tn) > 0:
    false_alarm_rate = fp / (fp + tn)

print(f"\nKey Business Metrics:")
print(f"Fraud Detection Rate: {fraud_detection_rate:.2%} ({tp} out of {tp+fn} frauds caught)")
print(f"False Alarm Rate: {false_alarm_rate:.2%} ({fp} out of {fp+tn} legitimate transactions flagged)")

# Monetary assumptions used by the estimate
avg_fraud_amount = 1000  # Assumed average fraud amount
review_cost = 10  # Cost to manually review a flagged transaction

# Every flagged transaction (true or false alarm) is reviewed manually;
# caught frauds are savings, missed frauds are losses
flagged_transactions = tp + fp
fraud_savings = tp * avg_fraud_amount
review_costs = flagged_transactions * review_cost
missed_fraud_cost = fn * avg_fraud_amount

net_benefit = fraud_savings - review_costs - missed_fraud_cost

print(f"\nCost-Benefit Analysis (Estimated):")
print(f"Fraud amount prevented: ${fraud_savings:,.2f}")
print(f"Manual review costs: ${review_costs:,.2f}")
print(f"Missed fraud losses: ${missed_fraud_cost:,.2f}")
print(f"Net benefit: ${net_benefit:,.2f}")

# What the numbers mean for customers and operations
print(f"\nImplications:")
print(f"• False Positives: {fp:,} legitimate customers may experience inconvenience")
print(f"• False Negatives: {fn:,} fraudulent transactions will go undetected")
print(f"• The model catches {fraud_detection_rate:.1%} of all fraud attempts")
print(f"• Only {false_alarm_rate:.1%} of legitimate transactions are incorrectly flagged")

## 13. Model Interpretation and Feature Insights

In [None]:
# Feature importance analysis for the best model
def _importance_values(model):
    """Return per-feature importance scores for *model*, or None if it exposes none."""
    if hasattr(model, 'feature_importances_'):
        # Tree-based model
        return model.feature_importances_
    if hasattr(model, 'named_steps') and 'classifier' in model.named_steps:
        # Pipeline model (like SMOTE + LogisticRegression): inspect the final classifier
        classifier = model.named_steps['classifier']
        if hasattr(classifier, 'feature_importances_'):
            return classifier.feature_importances_
        if hasattr(classifier, 'coef_'):
            # Magnitude of the linear coefficients, sign dropped
            return np.abs(classifier.coef_[0])
    return None


importance_values = _importance_values(best_model)
if importance_values is None:
    # Fail loudly: otherwise a `feature_importance` table left over from an
    # earlier cell would be reported and saved as if it belonged to this model.
    raise TypeError(
        f"Cannot derive feature importance for {best_model_name} "
        f"({type(best_model).__name__}): no feature_importances_ or coef_ found"
    )

feature_importance = pd.DataFrame({
    'feature': feature_columns,
    'importance': importance_values
}).sort_values('importance', ascending=False)

print(f"Top 20 Most Important Features for {best_model_name}:")
print(feature_importance.head(20))

# Plot feature importance
plt.figure(figsize=(12, 10))
top_20_features = feature_importance.head(20)
plt.barh(range(len(top_20_features)), top_20_features['importance'])
plt.yticks(range(len(top_20_features)), top_20_features['feature'])
plt.xlabel('Feature Importance')
plt.title(f'Top 20 Features by Importance - {best_model_name}')
# Put the most important feature at the top of the chart
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

# Feature insights
print("\nKey Feature Insights:")
print("=" * 30)
for i, (_, row) in enumerate(feature_importance.head(10).iterrows()):
    print(f"{i+1}. {row['feature']}: {row['importance']:.4f}")

    # Provide business interpretation for key features, matched by
    # case-insensitive keywords in the feature name (first match wins)
    feature_name = row['feature']
    lowered_name = feature_name.lower()
    if 'amount' in lowered_name:
        print("   → Transaction amount patterns are key indicators of fraud")
    elif 'balance' in lowered_name:
        print("   → Account balance changes help identify suspicious activity")
    elif 'time' in lowered_name or 'hour' in lowered_name or 'day' in lowered_name:
        print("   → Temporal patterns reveal fraud timing preferences")
    elif 'acc' in lowered_name and 'frequency' in lowered_name:
        print("   → Account usage frequency indicates normal vs suspicious behavior")
    elif 'cash_out' in lowered_name or 'transfer' in lowered_name:
        print("   → Transaction type is a strong fraud predictor")
    print()

## 14. Model Persistence and Deployment Preparation

In [None]:
# Persist the selected model together with everything needed to reuse it
import os

model_dir = '../models'
os.makedirs(model_dir, exist_ok=True)

# Destination of each artifact inside the model directory
model_path = os.path.join(model_dir, 'best_fraud_model.joblib')
processor_path = os.path.join(model_dir, 'data_processor.joblib')
feature_names_path = os.path.join(model_dir, 'feature_names.joblib')
metadata_path = os.path.join(model_dir, 'model_metadata.joblib')

# Fitted estimator
joblib.dump(best_model, model_path)
print(f"Best model saved to: {model_path}")

# Data processor (with fitted scalers and encoders)
joblib.dump(processor, processor_path)
print(f"Data processor saved to: {processor_path}")

# Ordered list of the feature columns the model was trained on
joblib.dump(feature_columns, feature_names_path)
print(f"Feature names saved to: {feature_names_path}")

# Descriptive metadata: model identity, timestamp, test scores and data sizes
top_importances = feature_importance.head(10).to_dict('records')
trained_at = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
model_metadata = {
    'model_name': best_model_name,
    'model_type': type(best_model).__name__,
    'training_date': trained_at,
    'features_count': len(feature_columns),
    'test_performance': final_results,
    'feature_importance': top_importances,
    'training_data_size': len(X_train),
    'validation_data_size': len(X_val),
    'test_data_size': len(X_test)
}

joblib.dump(model_metadata, metadata_path)
print(f"Model metadata saved to: {metadata_path}")

print("\nModel artifacts saved successfully!")
print(f"All files saved in: {model_dir}")

## 15. Summary and Recommendations

In [None]:
# Final report: prints a recap of the dataset, methodology, selected model,
# key findings, recommendations and business impact. It only reads values
# computed by earlier cells and produces no new results.
print("FRAUD DETECTION MODEL TRAINING SUMMARY")
print("=" * 50)

# Dataset size and overall fraud rate, aggregated over all three splits
print(f"\n1. DATASET:")
print(f"   • Total transactions: {len(train_df) + len(val_df) + len(test_df):,}")
print(f"   • Fraud rate: {(y_train.sum() + y_val.sum() + y_test.sum()) / (len(y_train) + len(y_val) + len(y_test)):.4%}")
print(f"   • Features engineered: {len(feature_columns)}")

# Static description of the approach taken in this notebook
print(f"\n2. METHODOLOGY:")
print(f"   • Temporal train/test split (recommended for fraud detection)")
print(f"   • Class imbalance handled with SMOTE and class weights")
print(f"   • Multiple algorithms tested: LR, RF, XGBoost, LightGBM")
print(f"   • Hyperparameter optimization with Optuna")

# Test-set scores of the selected model, taken from final_results
print(f"\n3. BEST MODEL: {best_model_name}")
print(f"   • Test Set Performance:")
print(f"     - Precision: {final_results['precision']:.4f}")
print(f"     - Recall: {final_results['recall']:.4f}")
print(f"     - F1-Score: {final_results['f1']:.4f}")
print(f"     - AUC-ROC: {final_results['auc_roc']:.4f}")
print(f"     - AUC-PR: {final_results['auc_pr']:.4f}")

# Top three rows of the current feature_importance table, plus the rates
# and net benefit computed in the business impact cell
print(f"\n4. KEY FINDINGS:")
print(f"   • Top fraud indicators: {', '.join(feature_importance.head(3)['feature'].tolist())}")
print(f"   • Model catches {fraud_detection_rate:.1%} of fraud attempts")
print(f"   • False alarm rate: {false_alarm_rate:.1%}")
print(f"   • Estimated net benefit: ${net_benefit:,.2f}")

# Static follow-up recommendations
print(f"\n5. RECOMMENDATIONS:")
print(f"   • Deploy model with real-time scoring capability")
print(f"   • Implement model monitoring and periodic retraining")
print(f"   • Set up feedback loop for false positive/negative learning")
print(f"   • Consider ensemble methods for improved performance")
print(f"   • Regularly update features based on new fraud patterns")

# tp / fn / fp are the test-set confusion-matrix counts from the business
# impact cell
print(f"\n6. BUSINESS IMPACT:")
print(f"   • {tp:,} fraudulent transactions will be caught")
print(f"   • {fn:,} fraudulent transactions will be missed")
print(f"   • {fp:,} legitimate transactions will require manual review")
print(f"   • Significant cost savings expected from fraud prevention")

print(f"\nTraining completed successfully!")
print(f"Model artifacts saved for deployment.")