# Fraud Detection System with Basel II Compliance
**Objective**: Develop an advanced fraud detection system that:
1. Identifies fraudulent transactions with high accuracy
2. Quantifies risk according to Basel II regulations
3. Provides actionable business insights
**Methodology**:
- Comprehensive EDA and feature engineering
- Advanced modeling with XGBoost and Random Forest
- Risk quantification using Basel II framework
- Business impact analysis

## 1. Configuration and Data Loading
Initial setup: library imports, global constants, and the Basel II / RFM parameters used throughout the notebook

In [1]:
# Enhanced Configuration
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import datetime
from scipy import stats
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import mutual_info_classif, SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (roc_auc_score, precision_recall_curve, 
                           average_precision_score, classification_report)
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.decomposition import PCA
import shap

# Global constants and configuration shared by the cells below
DATA_PATH = '../data/raw/transactions.csv'
RANDOM_STATE = 42  # Seed passed to the splits, SMOTE and both classifiers
TEST_SIZE = 0.2  # Share of all rows held out as the test set
VAL_SIZE = 0.2  # Share of all rows held out as the validation set
N_TOP_FEATURES = 15  # For feature selection
SMOTE_SAMPLING_STRATEGY = 0.5  # For handling class imbalance

# Basel II parameters
# The PD floor was 0.0001 (0.01%), which is below the 0.03% minimum that the
# Basel II framework prescribes and that the capital calculation cell cites.
MIN_PD = 0.0003  # Minimum probability of default (Basel II floor: 0.03%)
LGD = 0.45  # Loss given default assumption
EAD_FACTOR = 1.1  # Exposure at default multiplier

# RFM analysis parameters
RFM_WINDOW = '30D'  # Time window for RFM calculations

ModuleNotFoundError: No module named 'imblearn'

### Data Loading with Enhanced Preprocessing
- Time-based feature extraction
- RFM (Recency, Frequency, Monetary) feature creation
- Basel II risk metrics calculation
- Robust outlier detection

In [2]:
# Load the raw transactions, derive time / RFM / Basel II columns, impute
# missing values and winsorize numeric outliers. Leaves the result in `df`.
try:
    print("Loading data with enhanced preprocessing...")
    df = pd.read_csv(DATA_PATH, parse_dates=['TransactionStartTime'])

    # Add time-based features
    txn_time = df['TransactionStartTime'].dt
    df['TransactionHour'] = txn_time.hour
    df['TransactionDay'] = txn_time.day
    df['TransactionDayOfWeek'] = txn_time.dayofweek
    df['TransactionMonth'] = txn_time.month

    # Create RFM features
    print("\nCalculating RFM metrics...")
    current_date = df['TransactionStartTime'].max()

    # Recency: Days between each customer's last transaction and the newest
    # transaction in the data set
    recency = df.groupby('CustomerId')['TransactionStartTime'].max()
    recency = (current_date - recency).dt.days.reset_index()
    recency.columns = ['CustomerId', 'Recency']

    # Frequency: Transaction count
    frequency = df.groupby('CustomerId')['TransactionId'].count().reset_index()
    frequency.columns = ['CustomerId', 'Frequency']

    # Monetary: Average transaction amount
    monetary = df.groupby('CustomerId')['Amount'].mean().reset_index()
    monetary.columns = ['CustomerId', 'Monetary']

    # Merge RFM features back onto every transaction of the customer
    rfm = recency.merge(frequency, on='CustomerId').merge(monetary, on='CustomerId')
    df = df.merge(rfm, on='CustomerId', how='left')

    # Create Basel II relevant features (placeholder EL using the PD floor)
    print("\nCalculating Basel II risk metrics...")
    df['ExpectedLoss'] = MIN_PD * LGD * (df['Amount'] * EAD_FACTOR)

    # Enhanced data quality checks
    print("\nRunning enhanced data quality checks...")
    if df.isnull().sum().sum() > 0:
        print("Warning: Missing values detected. Implementing enhanced imputation...")
        # For numeric columns, fill with median (more robust than mean)
        numeric_cols = df.select_dtypes(include=np.number).columns
        df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

        # For categorical columns, fill with mode
        categorical_cols = df.select_dtypes(include=['object']).columns
        for col in categorical_cols:
            modes = df[col].mode()
            # mode() is empty for an all-NaN column; indexing [0] would raise
            if not modes.empty:
                df[col] = df[col].fillna(modes.iloc[0])

    # Outlier detection with enhanced methods
    print("\nRunning enhanced outlier detection...")
    numeric_cols = df.select_dtypes(include=np.number).columns
    for col in numeric_cols:
        if col in ('FraudResult', 'CountryCode'):  # Skip target and constant columns
            continue

        # Robust z-score: (x - median) / MAD. scale='normal' already rescales
        # the MAD to be comparable with a standard deviation, so the extra
        # 0.6745 factor used previously scaled the score twice.
        median = df[col].median()
        mad = stats.median_abs_deviation(df[col], scale='normal', nan_policy='omit')
        if np.isfinite(mad) and mad > 0:
            df[f'{col}_RobustZ'] = (df[col] - median) / mad
        else:
            # Zero/undefined spread (e.g. more than half the values equal):
            # a z-score is undefined, so emit 0 instead of inf/NaN that
            # would break scaling and resampling later on.
            df[f'{col}_RobustZ'] = 0.0

        # Winsorize extreme outliers (top/bottom 0.5%)
        lower = df[col].quantile(0.005)
        upper = df[col].quantile(0.995)
        df[col] = df[col].clip(lower=lower, upper=upper)

    print("\nData loaded and preprocessed successfully!")

except Exception as e:
    # Report the failure in the cell output, then let the notebook stop
    print(f"Error during data loading: {str(e)}")
    raise

Loading data with enhanced preprocessing...
Error during data loading: name 'DATA_PATH' is not defined


NameError: name 'DATA_PATH' is not defined

## 2. Exploratory Data Analysis
Comprehensive data overview with:
- Data structure analysis
- Statistical summaries
- Fraud pattern visualization

In [3]:
# Enhanced Data Structure Overview
def enhanced_data_overview(df):
    """Print and display a structural and statistical overview of ``df``.

    Shows the shape and memory footprint, a per-column summary (dtype,
    unique/missing/zero/negative counts), descriptive statistics for the
    numeric columns, and the numeric features most correlated with
    ``FraudResult`` together with a heatmap of the top five.

    Args:
        df: Transaction DataFrame. The correlation section is skipped when
            it has no numeric ``FraudResult`` column.

    Returns:
        None. Everything is emitted through ``print``/``display``/matplotlib.
    """

    print("\nEnhanced Data Structure Overview:")

    # Basic info
    print(f"\nData Shape: {df.shape}")
    print(f"Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

    # Data types
    print("\nEnhanced Data Types:")
    n_rows = len(df)
    unique_counts = df.nunique()
    missing_counts = df.isnull().sum()
    # `df < 0` raises TypeError on string/datetime columns, so negatives are
    # counted on the numeric columns only and reported as 0 elsewhere.
    numeric_df = df.select_dtypes(include=np.number)
    negative_counts = (numeric_df < 0).sum().reindex(df.columns, fill_value=0)
    dtype_df = pd.DataFrame({
        'Column': df.columns,
        'Data Type': df.dtypes,
        'Unique Values': unique_counts,
        'Missing Values': missing_counts,
        'Missing %': (missing_counts / n_rows) * 100,
        'Cardinality': unique_counts / n_rows,  # Share of distinct values
        'Zero Values': (df == 0).sum(),  # Count of zeros
        'Negative Values': negative_counts  # Count of negatives
    }).reset_index(drop=True)

    display(dtype_df)

    # Enhanced descriptive statistics
    print("\nEnhanced Descriptive Statistics:")
    desc_stats = df.describe(percentiles=[.01, .05, .25, .5, .75, .95, .99]).T
    desc_stats['skewness'] = df.skew(numeric_only=True)
    desc_stats['kurtosis'] = df.kurtosis(numeric_only=True)
    desc_stats['IQR'] = desc_stats['75%'] - desc_stats['25%']  # Interquartile range
    # Coefficient of variation; undefined (NaN) rather than inf for a zero mean
    desc_stats['CV'] = desc_stats['std'] / desc_stats['mean'].replace(0, np.nan)

    display(desc_stats)

    # Enhanced correlation analysis
    print("\nEnhanced Correlation Matrix (Top 10 Correlated Features with Fraud):")
    corr_matrix = df.corr(numeric_only=True)
    if 'FraudResult' not in corr_matrix.columns:
        print("FraudResult is not a numeric column; skipping correlation analysis.")
        return
    fraud_corr = corr_matrix['FraudResult'].abs().sort_values(ascending=False)
    display(fraud_corr.head(10))

    # Plot top correlations (the first entry is FraudResult itself)
    top_features = fraud_corr.index[:5]
    plt.figure(figsize=(10, 6))
    sns.heatmap(corr_matrix.loc[top_features, top_features],
                annot=True, cmap='coolwarm', center=0)
    plt.title('Top 5 Features Correlated with Fraud')
    plt.show()

# Run the overview on the preprocessed frame (prints tables and shows the heatmap)
enhanced_data_overview(df)

NameError: name 'df' is not defined

In [4]:
# Enhanced Fraud Analysis
def enhanced_fraud_analysis(df):
    """Summarise fraud prevalence, amounts, segments and RFM behaviour.

    Prints the overall fraud rate and case counts, the total and mean
    amount of fraudulent rows, a per-``ProductCategory`` table, an hourly
    fraud-rate line chart and RFM summaries/boxplots split by fraud status.
    Nothing is returned.
    """

    print("\nEnhanced Fraud Analysis:")

    labels = df['FraudResult']
    fraud_rows = df[labels == 1]

    # Headline numbers
    fraud_rate = labels.mean()
    legit_count = int((labels == 0).sum())
    fraud_count = len(fraud_rows)
    print(f"\nOverall Fraud Rate: {fraud_rate:.4%}")
    print(f"Non-Fraud Cases: {legit_count:,}")
    print(f"Fraud Cases: {fraud_count:,}")

    # Money involved in the fraudulent rows
    fraud_amounts = fraud_rows['Amount']
    fraud_amount = fraud_amounts.sum()
    avg_fraud_amount = fraud_amounts.mean()
    print(f"\nEstimated Total Fraud Amount: ${fraud_amount:,.2f}")
    print(f"Average Fraud Amount: ${avg_fraud_amount:,.2f}")

    # Per-category breakdown; categories without fraud get NaN FraudAmount
    print("\nFraud Rate by Product Category:")
    fraud_by_category = df.groupby('ProductCategory')['FraudResult'].agg(
        FraudRate='mean', TransactionCount='count')
    fraud_by_category['FraudAmount'] = fraud_rows.groupby('ProductCategory')['Amount'].sum()
    display(fraud_by_category.sort_values('FraudRate', ascending=False))

    # Hour-of-day pattern against the overall rate
    print("\nFraud Rate by Hour of Day:")
    hourly_rate = df.groupby('TransactionHour')['FraudResult'].mean().reset_index()
    plt.figure(figsize=(12, 6))
    sns.lineplot(x='TransactionHour', y='FraudResult', data=hourly_rate)
    plt.title('Fraud Rate by Hour of Day')
    plt.ylabel('Fraud Rate')
    plt.axhline(fraud_rate, color='red', linestyle='--', label='Overall Fraud Rate')
    plt.legend()
    plt.show()

    # RFM profile of the fraudulent rows
    rfm_metrics = ['Recency', 'Frequency', 'Monetary']
    print("\nRFM Analysis for Fraud Cases:")
    rfm_fraud = fraud_rows[rfm_metrics].describe().T
    display(rfm_fraud)

    # One boxplot per RFM metric, split by fraud status
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    for axis, metric in zip(axes, rfm_metrics):
        sns.boxplot(x='FraudResult', y=metric, data=df, ax=axis)
        axis.set_title(f'{metric} Distribution by Fraud Status')
    plt.tight_layout()
    plt.show()

# Run the fraud analysis on the preprocessed frame (prints tables and shows plots)
enhanced_fraud_analysis(df)

NameError: name 'df' is not defined

## 3. Feature Engineering
Creating advanced features for modeling:
- Interaction features
- Behavioral patterns
- Rolling statistics
- Risk flags

In [5]:
# Feature Engineering for Modeling
def enhanced_feature_engineering(df, count_window='24h'):
    """Create interaction, behavioural, rolling and flag features.

    The frame is sorted by customer and transaction time *before* any
    order-dependent feature is derived, so per-customer differences and
    rolling windows follow chronological order. Sorting returns a new
    frame, so the caller's DataFrame is not modified.

    Args:
        df: Transactions with at least ``CustomerId``,
            ``TransactionStartTime`` (datetime), ``Amount``, ``Recency``,
            ``Frequency``, ``TransactionHour`` and ``ExpectedLoss``.
        count_window: Time-offset window used for ``Rolling_24Hr_Count``,
            the number of the customer's transactions in the window ending
            at (and including) each transaction. Defaults to 24 hours to
            match the column name.

    Returns:
        A new DataFrame, sorted by customer and time, with ten extra
        columns. ``Time_Since_Last_Txn`` and ``Amount_Velocity`` are NaN
        for each customer's first transaction.
    """

    print("\nRunning Enhanced Feature Engineering...")

    # Sort first: diff()/rolling() below depend on row order within a customer
    df = df.sort_values(['CustomerId', 'TransactionStartTime'])
    by_customer = df.groupby('CustomerId')

    # Create interaction features
    df['Amount_Recency_Interaction'] = df['Amount'] * df['Recency']
    df['Amount_Frequency_Interaction'] = df['Amount'] * df['Frequency']

    # Create behavioral features
    df['Amount_to_Avg_Amount_Ratio'] = df['Amount'] / by_customer['Amount'].transform('mean')
    # Hours elapsed since the customer's previous transaction
    df['Time_Since_Last_Txn'] = by_customer['TransactionStartTime'].diff().dt.total_seconds() / 3600

    # Create rolling features (windowed statistics)
    df['Rolling_3Txn_Avg'] = by_customer['Amount'].transform(
        lambda x: x.rolling(3, min_periods=1).mean())

    def _count_in_window(times):
        # Offset windows need a datetime index, so count a series of ones
        # indexed by the transaction timestamps themselves.
        ones = pd.Series(1.0, index=pd.DatetimeIndex(times))
        return pd.Series(ones.rolling(count_window).count().to_numpy(), index=times.index)

    df['Rolling_24Hr_Count'] = by_customer['TransactionStartTime'].transform(_count_in_window)

    # Create velocity features (relative change versus the previous amount;
    # a previous amount of 0 is replaced by 1 to avoid division by zero)
    df['Amount_Velocity'] = by_customer['Amount'].transform(
        lambda x: x.diff() / x.shift().where(x.shift() != 0, 1))

    # Create flags for unusual activity
    df['Large_Transaction_Flag'] = (df['Amount'] > df['Amount'].quantile(0.95)).astype(int)
    df['After_Hours_Flag'] = ((df['TransactionHour'] < 8) | (df['TransactionHour'] > 20)).astype(int)

    # Create Basel II relevant features
    df['Risk_Score'] = df['ExpectedLoss'] * df['Recency']  # Simple risk score example

    print("Feature engineering completed. Added 10 new features.")

    return df

# Apply feature engineering; rebind df because the function returns the sorted, extended frame
df = enhanced_feature_engineering(df)

NameError: name 'df' is not defined

## 4. Model Preparation
Data preparation steps:
- Categorical encoding
- Train-test-validation split
- Class imbalance handling with SMOTE
- Feature scaling and selection

In [6]:
# Model Preparation
def prepare_model_data(df):
    """Encode, split, impute, resample, scale and select features.

    Steps, in order:
      1. One-hot encode low-cardinality (<= 10 levels) object columns.
      2. Stratified train / validation / test split.
      3. Smoothed target encoding of high-cardinality object columns,
         fitted on the training rows only so that validation and test
         labels never influence the encoding. Unseen categories fall
         back to the training fraud rate.
      4. Replace inf with NaN and impute with training medians (engineered
         features such as time-since-last-transaction are NaN for a
         customer's first row, which SMOTE and the F-test cannot handle).
      5. SMOTE oversampling of the training set only.
      6. Standard scaling and univariate (ANOVA F) feature selection,
         both fitted on the resampled training set.

    Args:
        df: Feature-engineered transactions containing ``FraudResult``,
            ``TransactionId`` and ``TransactionStartTime``.

    Returns:
        Tuple ``(X_train_res, X_val, X_test, y_train_res, y_val, y_test,
        selected_features)`` where the X items are NumPy arrays restricted
        to the selected columns and ``selected_features`` is an Index of
        their names.
    """

    print("\nPreparing Data for Modeling...")

    # Select features and target
    X = df.drop(['FraudResult', 'TransactionId', 'TransactionStartTime'], axis=1)
    y = df['FraudResult']

    categorical_cols = list(X.select_dtypes(include=['object']).columns)
    high_card_cols = [col for col in categorical_cols if X[col].nunique() > 10]
    low_card_cols = [col for col in categorical_cols if col not in high_card_cols]

    # Low cardinality - one-hot encoding does not use the target, so it is
    # safe to apply before splitting and keeps columns aligned across splits
    if low_card_cols:
        X = pd.get_dummies(X, columns=low_card_cols, drop_first=True, dtype=float)

    # Train-test split with stratification
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE)

    # Further split train into train and validation
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=VAL_SIZE/(1-TEST_SIZE), stratify=y_train, random_state=RANDOM_STATE)

    # Independent copies so the column assignments below are unambiguous
    X_train, X_val, X_test = X_train.copy(), X_val.copy(), X_test.copy()

    # High cardinality - smoothed target encoding learned from training rows
    global_mean = y_train.mean()
    smoothing = 100  # Smoothing parameter
    for col in high_card_cols:
        grouped = y_train.groupby(X_train[col])
        fraud_rate = grouped.mean()
        counts = grouped.count()
        encoding = (fraud_rate * counts + global_mean * smoothing) / (counts + smoothing)
        for part in (X_train, X_val, X_test):
            part[col] = part[col].map(encoding).fillna(global_mean).astype(float)

    # Impute using training statistics only; 0.0 covers all-NaN columns
    X_train = X_train.astype(float).replace([np.inf, -np.inf], np.nan)
    train_medians = X_train.median()
    X_train = X_train.fillna(train_medians).fillna(0.0)
    X_val = X_val.astype(float).replace([np.inf, -np.inf], np.nan).fillna(train_medians).fillna(0.0)
    X_test = X_test.astype(float).replace([np.inf, -np.inf], np.nan).fillna(train_medians).fillna(0.0)
    feature_names = X_train.columns

    # Handle class imbalance with SMOTE
    print("\nOriginal class distribution:")
    print(y_train.value_counts())

    smote = SMOTE(sampling_strategy=SMOTE_SAMPLING_STRATEGY, random_state=RANDOM_STATE)
    X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

    print("\nResampled class distribution:")
    print(y_train_res.value_counts())

    # Feature scaling
    scaler = StandardScaler()
    X_train_res = scaler.fit_transform(X_train_res)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    # Feature selection (k may not exceed the number of available columns)
    selector = SelectKBest(f_classif, k=min(N_TOP_FEATURES, X_train_res.shape[1]))
    X_train_res = selector.fit_transform(X_train_res, y_train_res)
    X_val = selector.transform(X_val)
    X_test = selector.transform(X_test)

    # Get selected feature names
    selected_features = feature_names[selector.get_support()]
    print("\nTop Selected Features:")
    print(selected_features)

    return X_train_res, X_val, X_test, y_train_res, y_val, y_test, selected_features

# Prepare model data; X_train / y_train receive the SMOTE-resampled training set
X_train, X_val, X_test, y_train, y_val, y_test, selected_features = prepare_model_data(df)

NameError: name 'df' is not defined

## 5. Model Training and Evaluation
Training and evaluating:
- Random Forest
- XGBoost
With comprehensive metrics:
- AUC-ROC
- Precision-Recall
- Feature importance
- SHAP explanations

In [7]:
# Model Training and Evaluation
def train_and_evaluate_models(X_train, X_val, X_test, y_train, y_val, y_test,
                              feature_names=None):
    """Train Random Forest and XGBoost and report validation/test metrics.

    For each model: fit on the training data, score AUC-ROC and average
    precision on validation and test, choose the probability threshold
    that maximises F1 on the validation set, apply it to the test set,
    and show feature importances. SHAP explanations are produced for the
    Random Forest only.

    Args:
        X_train, X_val, X_test: 2-D feature matrices with identical columns.
        y_train, y_val, y_test: Binary labels (1 = fraud).
        feature_names: Optional column names for the importance/SHAP
            output. When omitted, the notebook-level ``selected_features``
            is used if it exists, otherwise generic ``feature_<i>`` names.

    Returns:
        Dict keyed by model name holding the metrics, the chosen threshold,
        the classification report (as a dict) and the fitted model.

    Raises:
        ValueError: If ``y_train`` has no positive samples.
    """

    print("\nTraining and Evaluating Models...")

    # Resolve feature names without relying solely on a hidden global
    if feature_names is None:
        feature_names = globals().get('selected_features')
    if feature_names is None:
        feature_names = [f'feature_{i}' for i in range(np.shape(X_train)[1])]
    feature_names = list(feature_names)

    y_train_arr = np.asarray(y_train)
    n_negative = int((y_train_arr == 0).sum())
    n_positive = int((y_train_arr == 1).sum())
    if n_positive == 0:
        raise ValueError("y_train contains no positive (fraud) samples.")

    # Initialize models
    models = {
        'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE, class_weight='balanced'),
        'XGBoost': XGBClassifier(random_state=RANDOM_STATE, scale_pos_weight=n_negative / n_positive)
    }

    results = {}

    for name, model in models.items():
        print(f"\nTraining {name}...")

        # Train model
        model.fit(X_train, y_train)

        # Predict probabilities
        y_val_proba = model.predict_proba(X_val)[:, 1]
        y_test_proba = model.predict_proba(X_test)[:, 1]

        # Calculate metrics
        val_auc = roc_auc_score(y_val, y_val_proba)
        test_auc = roc_auc_score(y_test, y_test_proba)

        val_ap = average_precision_score(y_val, y_val_proba)
        test_ap = average_precision_score(y_test, y_test_proba)

        # Get optimal threshold from precision-recall curve. precision and
        # recall carry one trailing point with no threshold, so it is
        # dropped to keep the F1 scores aligned with `thresholds`.
        precision, recall, thresholds = precision_recall_curve(y_val, y_val_proba)
        f1_scores = 2 * (precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1] + 1e-8)
        optimal_threshold = thresholds[int(np.argmax(f1_scores))]

        # Apply optimal threshold to test set
        y_test_pred = (y_test_proba >= optimal_threshold).astype(int)

        # Classification report
        report = classification_report(y_test, y_test_pred, output_dict=True)

        # Store results
        results[name] = {
            'Validation AUC': val_auc,
            'Test AUC': test_auc,
            'Validation Average Precision': val_ap,
            'Test Average Precision': test_ap,
            'Optimal Threshold': optimal_threshold,
            'Classification Report': report,
            'Model': model
        }

        # Print summary
        print(f"\n{name} Performance:")
        print(f"Validation AUC: {val_auc:.4f}")
        print(f"Test AUC: {test_auc:.4f}")
        print(f"Validation Average Precision: {val_ap:.4f}")
        print(f"Test Average Precision: {test_ap:.4f}")
        print(f"Optimal Threshold: {optimal_threshold:.4f}")
        print("\nClassification Report:")
        print(classification_report(y_test, y_test_pred))

        # Feature importance
        if hasattr(model, 'feature_importances_'):
            print("\nFeature Importances:")
            importances = pd.DataFrame({
                'Feature': feature_names,
                'Importance': model.feature_importances_
            }).sort_values('Importance', ascending=False)
            display(importances.head(10))

            # Plot feature importance
            plt.figure(figsize=(10, 6))
            sns.barplot(x='Importance', y='Feature', data=importances.head(10))
            plt.title(f'{name} - Top Feature Importances')
            plt.show()

        # SHAP values for model interpretation (Random Forest only)
        if name == 'Random Forest':
            print("\nCalculating SHAP values for model interpretation...")
            explainer = shap.TreeExplainer(model)
            shap_values = explainer.shap_values(X_test)

            # Depending on the SHAP version, a binary classifier yields a
            # list with one array per class or a single 3-D array with
            # classes on the last axis; pick the fraud class in both cases.
            if isinstance(shap_values, list):
                fraud_shap = shap_values[1]
            elif np.ndim(shap_values) == 3:
                fraud_shap = shap_values[:, :, 1]
            else:
                fraud_shap = shap_values
            expected = np.atleast_1d(explainer.expected_value)
            fraud_base = expected[1] if len(expected) > 1 else expected[0]

            # Summary plot; show=False so the title lands on this figure
            plt.figure()
            shap.summary_plot(fraud_shap, X_test, feature_names=feature_names,
                              plot_type="bar", show=False)
            plt.title(f'{name} - SHAP Feature Importance')
            plt.show()

            # Force plot for the first fraud case in the test set, if any
            fraud_positions = np.flatnonzero(np.asarray(y_test) == 1)
            if fraud_positions.size:
                sample_idx = fraud_positions[0]
                # matplotlib=True renders a static figure; the default
                # output is a JS widget that is dropped inside a function.
                shap.force_plot(fraud_base, fraud_shap[sample_idx],
                                np.asarray(X_test)[sample_idx], feature_names=feature_names,
                                matplotlib=True, show=False)
                plt.title(f'{name} - SHAP Explanation for Fraud Case')
                plt.show()
            else:
                print("No fraud case in the test set; skipping SHAP force plot.")

    return results

# Train and evaluate both models; the returned dict is keyed by model name
model_results = train_and_evaluate_models(X_train, X_val, X_test, y_train, y_val, y_test)

NameError: name 'X_train' is not defined

## 6. Basel II Capital Calculation
Implementing Basel II framework:
- Probability of Default (PD) estimation
- Loss Given Default (LGD)
- Exposure at Default (EAD)
- Capital requirement calculation

In [8]:
# Basel II Capital Calculation
def calculate_basel_capital(df, model_results, selected_model='Random Forest',
                            scaler=None, selector=None):
    """Estimate PD, expected loss and Basel II capital per transaction.

    The model's fraud probability is used as a proxy for PD (floored at
    ``MIN_PD``), combined with the fixed ``LGD`` and
    ``EAD = Amount * EAD_FACTOR``. The capital ratio K follows the IRB
    formula quoted in the comments below with maturity M = 1.

    Args:
        df: Feature-engineered transactions. It is not modified; the
            results are written to a copy.
        model_results: Output of ``train_and_evaluate_models``.
        selected_model: Key of the model to score with.
        scaler: Fitted scaler from training. If omitted, a new
            ``StandardScaler`` is fitted on ``df``.
        selector: Fitted feature selector from training. If omitted, a new
            ``SelectKBest`` is fitted on ``df``.

    Returns:
        A copy of ``df`` with ``PD_Estimate``, ``LGD``, ``EAD``,
        ``ExpectedLoss``, ``R``, ``b``, ``CapitalRatio`` (K per unit of
        exposure) and ``CapitalRequirement`` (K x EAD, in currency units).

    Note:
        Without the training-time ``scaler``/``selector`` the columns fed
        to the model are not guaranteed to match those it was trained on.
        The target encoding below is also recomputed from ``df``'s own
        labels. Treat the figures as illustrative in that case.
    """

    print("\nCalculating Basel II Capital Requirements...")

    # Get the selected model
    model = model_results[selected_model]['Model']

    # Work on a copy: the columns added below must not leak into the
    # caller's frame, where later cells would pick them up as features.
    df = df.copy()
    target = df['FraudResult']

    # Predict PD (Probability of Default) for all transactions
    # Note: In practice, we'd want to predict PD at customer level over a time horizon
    # This is a simplified transaction-level example

    # Prepare full dataset for prediction
    X_full = df.drop(['FraudResult', 'TransactionId', 'TransactionStartTime'], axis=1)

    # Convert categorical variables (same rules as during training)
    categorical_cols = X_full.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        if X_full[col].nunique() > 10:  # Target encoding
            fraud_rate = target.groupby(X_full[col]).mean()
            counts = target.groupby(X_full[col]).count()
            global_mean = target.mean()
            smoothing = 100
            X_full[col] = X_full[col].map((fraud_rate * counts + global_mean * smoothing) / (counts + smoothing))
        else:  # One-hot encoding
            X_full = pd.get_dummies(X_full, columns=[col], drop_first=True)

    # inf/NaN (e.g. first-transaction features) would make the F-test fail
    X_full = X_full.astype(float).replace([np.inf, -np.inf], np.nan)
    X_full = X_full.fillna(X_full.median()).fillna(0.0)

    # Scale features
    if scaler is None:
        print("Warning: no fitted scaler supplied; fitting a new one on the full data.")
        scaler = StandardScaler().fit(X_full)
    X_full_scaled = scaler.transform(X_full)

    # Select features
    if selector is None:
        print("Warning: no fitted selector supplied; fitting a new one on the full data.")
        selector = SelectKBest(f_classif, k=min(N_TOP_FEATURES, X_full_scaled.shape[1]))
        selector.fit(X_full_scaled, target)
    X_full_scaled = selector.transform(X_full_scaled)

    # Get predictions (as proxy for PD) and apply the configured PD floor
    pd_estimates = np.maximum(model.predict_proba(X_full_scaled)[:, 1], MIN_PD)

    df['PD_Estimate'] = pd_estimates
    df['LGD'] = LGD  # Using fixed LGD for simplicity
    df['EAD'] = df['Amount'] * EAD_FACTOR  # Exposure at default

    # Expected Loss (EL) = PD × LGD × EAD
    df['ExpectedLoss'] = df['PD_Estimate'] * df['LGD'] * df['EAD']

    # Basel II capital requirement formula (simplified)
    # K = [LGD × N((1 - R)^-0.5 × G(PD) + (R / (1 - R))^0.5 × G(0.999)) - PD × LGD] × (1 - 1.5 × b(PD))^-1 × (1 + (M - 2.5) × b(PD))
    # Where:
    # R = 0.12 × (1 - exp(-50 × PD)) / (1 - exp(-50)) + 0.24 × [1 - (1 - exp(-50 × PD)) / (1 - exp(-50))]
    # b(PD) = (0.11852 - 0.05478 × ln(PD))^2
    # M = 1 year (maturity)
    maturity = 1.0
    pd_col = df['PD_Estimate']

    # Asset correlation R interpolates between 0.24 (low PD) and 0.12 (high PD)
    weight = (1 - np.exp(-50 * pd_col)) / (1 - np.exp(-50))
    df['R'] = 0.12 * weight + 0.24 * (1 - weight)

    # Maturity adjustment b(PD)
    df['b'] = (0.11852 - 0.05478 * np.log(pd_col))**2

    # N = standard normal CDF, G = its inverse
    conditional_pd = stats.norm.cdf(
        (1 - df['R'])**-0.5 * stats.norm.ppf(pd_col) +
        (df['R'] / (1 - df['R']))**0.5 * stats.norm.ppf(0.999))
    maturity_factor = (1 + (maturity - 2.5) * df['b']) / (1 - 1.5 * df['b'])

    # K is capital per unit of exposure; multiply by EAD for an amount.
    # Summing K directly would add up ratios and report them as dollars.
    df['CapitalRatio'] = (df['LGD'] * conditional_pd - pd_col * df['LGD']) * maturity_factor
    df['CapitalRequirement'] = df['CapitalRatio'] * df['EAD']

    # Total capital required
    total_capital = df['CapitalRequirement'].sum()
    total_exposure = df['EAD'].sum()
    capital_ratio = total_capital / total_exposure if total_exposure != 0 else float('nan')

    print("\nBasel II Capital Calculation Results:")
    print(f"Total Exposure at Default (EAD): ${total_exposure:,.2f}")
    print(f"Total Capital Required: ${total_capital:,.2f}")
    print(f"Capital Ratio: {capital_ratio:.2%}")
    print("\nNote: This is a simplified calculation. Actual Basel II implementation requires more complex modeling.")

    return df

# Calculate Basel II capital with the default 'Random Forest' model
df_with_capital = calculate_basel_capital(df, model_results)

NameError: name 'df' is not defined

## 7. Business Impact Analysis
Quantifying the financial impact:
- Fraud prevented vs. missed
- Operational costs
- ROI calculation
- Customer impact

In [9]:
# Business Impact Analysis
def business_impact_analysis(df, model_results, selected_model='Random Forest',
                             scaler=None, selector=None):
    """Estimate the financial, ROI and customer impact of the model.

    Scores every row of ``df`` with the selected model at its stored
    optimal threshold, builds a confusion matrix and converts the counts
    into money using average transaction amounts.

    Args:
        df: Feature-engineered transactions including ``FraudResult``,
            ``Amount`` and ``CustomerId``.
        model_results: Output of ``train_and_evaluate_models``.
        selected_model: Key of the model to evaluate.
        scaler: Fitted scaler from training. If omitted, a new
            ``StandardScaler`` is fitted on ``df``.
        selector: Fitted feature selector from training. If omitted, a new
            ``SelectKBest`` is fitted on ``df``.

    Returns:
        Dict with ``fraud_prevented``, ``fraud_loss``, ``operational_cost``,
        ``net_savings``, ``first_year_roi`` and ``ongoing_roi``.

    Note:
        The evaluation covers all rows of ``df``, including rows the model
        was trained on. Without the training-time ``scaler``/``selector``
        the scored columns may differ from the trained ones. Treat the
        figures as illustrative.
    """

    print("\nBusiness Impact Analysis:")

    # Get model predictions and optimal threshold
    model = model_results[selected_model]['Model']
    threshold = model_results[selected_model]['Optimal Threshold']

    # Prepare features. Columns produced by the Basel II capital step are
    # derived from model output and must never be fed back in as features.
    basel_outputs = ['PD_Estimate', 'LGD', 'EAD', 'R', 'b', 'CapitalRatio', 'CapitalRequirement']
    X = df.drop(['FraudResult', 'TransactionId', 'TransactionStartTime'], axis=1)
    X = X.drop(columns=basel_outputs, errors='ignore')
    y = df['FraudResult']

    # Convert categorical variables
    categorical_cols = X.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        if X[col].nunique() > 10:
            fraud_rate = y.groupby(X[col]).mean()
            counts = y.groupby(X[col]).count()
            global_mean = y.mean()
            smoothing = 100
            X[col] = X[col].map((fraud_rate * counts + global_mean * smoothing) / (counts + smoothing))
        else:
            X = pd.get_dummies(X, columns=[col], drop_first=True)

    # inf/NaN (e.g. first-transaction features) would make the F-test fail
    X = X.astype(float).replace([np.inf, -np.inf], np.nan)
    X = X.fillna(X.median()).fillna(0.0)

    # Scale and select features
    if scaler is None:
        print("Warning: no fitted scaler supplied; fitting a new one on the full data.")
        scaler = StandardScaler().fit(X)
    X_scaled = scaler.transform(X)
    if selector is None:
        print("Warning: no fitted selector supplied; fitting a new one on the full data.")
        selector = SelectKBest(f_classif, k=min(N_TOP_FEATURES, X_scaled.shape[1]))
        selector.fit(X_scaled, y)
    X_scaled = selector.transform(X_scaled)

    # Get predictions
    y_proba = model.predict_proba(X_scaled)[:, 1]
    y_pred = (y_proba >= threshold).astype(int)

    # Confusion matrix; reindex so both classes exist on both axes even
    # when the model predicts a single class (otherwise .loc raises KeyError)
    cm = pd.crosstab(y, y_pred, rownames=['Actual'], colnames=['Predicted'])
    cm = cm.reindex(index=[0, 1], columns=[0, 1], fill_value=0)
    display(cm)

    # Financial impact metrics
    fraud_caught = cm.loc[1, 1]
    fraud_missed = cm.loc[1, 0]
    false_positives = cm.loc[0, 1]

    avg_fraud_amount = df[df['FraudResult']==1]['Amount'].mean()
    avg_transaction_amount = df['Amount'].mean()

    fraud_prevented = fraud_caught * avg_fraud_amount
    fraud_loss = fraud_missed * avg_fraud_amount
    operational_cost = false_positives * avg_transaction_amount * 0.10  # Assuming 10% of transaction amount as cost

    net_savings = fraud_prevented - fraud_loss - operational_cost

    print("\nFinancial Impact Estimation:")
    print(f"Fraud Prevented: ${fraud_prevented:,.2f} ({fraud_caught} transactions)")
    print(f"Fraud Loss (Missed): ${fraud_loss:,.2f} ({fraud_missed} transactions)")
    print(f"Operational Cost (False Positives): ${operational_cost:,.2f} ({false_positives} transactions)")
    print(f"Net Savings: ${net_savings:,.2f}")

    # ROI calculation (simplified)
    development_cost = 50000  # Estimated model development cost
    implementation_cost = 20000  # Estimated implementation cost
    annual_maintenance = 10000  # Estimated annual maintenance

    upfront_cost = development_cost + implementation_cost
    first_year_roi = (net_savings - upfront_cost) / upfront_cost
    ongoing_roi = (net_savings - annual_maintenance) / annual_maintenance

    print("\nReturn on Investment (ROI):")
    print(f"First Year ROI: {first_year_roi:.0%}")
    print(f"Ongoing Annual ROI: {ongoing_roi:.0%}")

    # Customer impact analysis: distinct customers with at least one alert
    customers_affected = df.loc[y_pred == 1, 'CustomerId'].nunique()
    total_customers = df['CustomerId'].nunique()

    print("\nCustomer Impact:")
    print(f"Customers Affected by Fraud Alerts: {customers_affected:,} ({customers_affected/total_customers:.1%})")

    return {
        'fraud_prevented': fraud_prevented,
        'fraud_loss': fraud_loss,
        'operational_cost': operational_cost,
        'net_savings': net_savings,
        'first_year_roi': first_year_roi,
        'ongoing_roi': ongoing_roi
    }

# Run business impact analysis with the default 'Random Forest' model
impact_results = business_impact_analysis(df, model_results)

NameError: name 'df' is not defined

## 8. Deployment Recommendations
Strategic recommendations for production:
1. Implementation strategy
2. Risk threshold optimization
3. Operational integration
4. Monitoring framework
5. Regulatory compliance
6. Expected benefits

In [10]:
# Model Deployment Recommendations
def deployment_recommendations(model_results, impact_results, selected_model='Random Forest'):
    """Print strategic recommendations for deploying the fraud model.

    Args:
        model_results: Dict keyed by model name; the entry for
            ``selected_model`` must provide ``'Optimal Threshold'``.
        impact_results: Dict with ``fraud_prevented``, ``operational_cost``,
            ``net_savings``, ``first_year_roi`` and ``ongoing_roi``.
        selected_model: Model whose threshold is reported. Defaults to
            ``'Random Forest'``, matching the other analysis functions.

    Returns:
        None. The recommendations are written to standard output.

    Raises:
        KeyError: If ``selected_model`` or a required key is missing.
    """

    print("\nModel Deployment Recommendations:")

    # 1. Implementation Strategy
    print("\n1. Implementation Strategy:")
    print("- Phase 1: Pilot implementation with 10% of transactions to validate performance")
    print("- Phase 2: Full deployment with monitoring for concept drift")
    print("- Implement as a real-time scoring system integrated with transaction processing")

    # 2. Risk Threshold Optimization
    optimal_threshold = model_results[selected_model]['Optimal Threshold']
    print(f"\n2. Risk Threshold Optimization (Current: {optimal_threshold:.2f}):")
    print("- Establish threshold tuning process based on changing fraud patterns")
    print("- Create multiple thresholds for different customer segments/products")

    # 3. Operational Integration
    print("\n3. Operational Integration:")
    print("- Integrate with case management system for fraud analysts")
    print("- Implement automated alerts for high-risk transactions")
    print("- Create escalation procedures based on risk scores")

    # 4. Monitoring Framework
    print("\n4. Monitoring Framework:")
    print("- Track model performance metrics weekly (AUC, precision, recall)")
    print("- Monitor feature distributions for data drift")
    print("- Establish feedback loop from fraud investigation teams")

    # 5. Regulatory Compliance
    print("\n5. Regulatory Compliance:")
    print("- Document model development process for audit purposes")
    print("- Validate model meets Basel II requirements for risk quantification")
    print("- Implement governance process for model updates")

    # 6. Expected Benefits
    print("\n6. Expected Benefits:")
    print(f"- Annual fraud prevention: ${impact_results['fraud_prevented']:,.2f}")
    print(f"- Operational cost: ${impact_results['operational_cost']:,.2f}")
    print(f"- Net savings: ${impact_results['net_savings']:,.2f}")
    print(f"- First year ROI: {impact_results['first_year_roi']:.0%}")
    print(f"- Ongoing ROI: {impact_results['ongoing_roi']:.0%}")

# Print the deployment recommendations from the model and impact results
deployment_recommendations(model_results, impact_results)

NameError: name 'model_results' is not defined

In [11]:
# Model Deployment Recommendations
def deployment_recommendations(model_results, impact_results):
    """Provide strategic recommendations for model deployment"""
    
    print("\nModel Deployment Recommendations:")
    
    # 1. Implementation Strategy
    print("\n1. Implementation Strategy:")
    print("- Phase 1: Pilot implementation with 10% of transactions to validate performance")
    print("- Phase 2: Full deployment with monitoring for concept drift")
    print("- Implement as a real-time scoring system integrated with transaction processing")
    
    # 2. Risk Threshold Optimization
    optimal_threshold = model_results['Random Forest']['Optimal Threshold']
    print(f"\n2. Risk Threshold Optimization (Current: {optimal_threshold:.2f}):")
    print("- Establish threshold tuning process based on changing fraud patterns")
    print("- Create multiple thresholds for different customer segments/products")
    
    # 3. Operational Integration
    print("\n3. Operational Integration:")
    print("- Integrate with case management system for fraud analysts")
    print("- Implement automated alerts for high-risk transactions")
    print("- Create escalation procedures based on risk scores")
    
    # 4. Monitoring Framework
    print("\n4. Monitoring Framework:")
    print("- Track model performance metrics weekly (AUC, precision, recall)")
    print("- Monitor feature distributions for data drift")
    print("- Establish feedback loop from fraud investigation teams")
    
    # 5. Regulatory Compliance
    print("\n5. Regulatory Compliance:")
    print("- Document model development process for audit purposes")
    print("- Validate model meets Basel II requirements for risk quantification")
    print("- Implement governance process for model updates")
    
    # 6. Expected Benefits
    print("\n6. Expected Benefits:")
    print(f"- Annual fraud prevention: ${impact_results['fraud_prevented']:,.2f}")
    print(f"- Operational cost: ${impact_results['operational_cost']:,.2f}")
    print(f"- Net savings: ${impact_results['net_savings']:,.2f}")
    print(f"- First year ROI: {impact_results['first_year_roi']:.0%}")
    print(f"- Ongoing ROI: {impact_results['ongoing_roi']:.0%}")

# Print the deployment recommendations from the model and impact results
deployment_recommendations(model_results, impact_results)

NameError: name 'model_results' is not defined

In [12]:
# Model Deployment Recommendations
def deployment_recommendations(model_results, impact_results):
    """Print the six-part deployment playbook to stdout.

    Args:
        model_results: Mapping of model name to result dict. Only
            ``model_results['Random Forest']['Optimal Threshold']`` is read.
        impact_results: Mapping providing ``fraud_prevented``,
            ``operational_cost``, ``net_savings``, ``first_year_roi`` and
            ``ongoing_roi`` (all numeric).

    Returns:
        None. Everything is written with ``print``.
    """

    def emit(heading, bullets):
        # One heading line followed by its bullet lines, in order.
        print(heading)
        for bullet in bullets:
            print(bullet)

    print("\nModel Deployment Recommendations:")

    # 1. Implementation Strategy
    emit("\n1. Implementation Strategy:", (
        "- Phase 1: Pilot implementation with 10% of transactions to validate performance",
        "- Phase 2: Full deployment with monitoring for concept drift",
        "- Implement as a real-time scoring system integrated with transaction processing",
    ))

    # 2. Risk Threshold Optimization (threshold is looked up only now, after
    #    section 1 has been printed)
    optimal_threshold = model_results['Random Forest']['Optimal Threshold']
    emit(f"\n2. Risk Threshold Optimization (Current: {optimal_threshold:.2f}):", (
        "- Establish threshold tuning process based on changing fraud patterns",
        "- Create multiple thresholds for different customer segments/products",
    ))

    # 3. Operational Integration
    emit("\n3. Operational Integration:", (
        "- Integrate with case management system for fraud analysts",
        "- Implement automated alerts for high-risk transactions",
        "- Create escalation procedures based on risk scores",
    ))

    # 4. Monitoring Framework
    emit("\n4. Monitoring Framework:", (
        "- Track model performance metrics weekly (AUC, precision, recall)",
        "- Monitor feature distributions for data drift",
        "- Establish feedback loop from fraud investigation teams",
    ))

    # 5. Regulatory Compliance
    emit("\n5. Regulatory Compliance:", (
        "- Document model development process for audit purposes",
        "- Validate model meets Basel II requirements for risk quantification",
        "- Implement governance process for model updates",
    ))

    # 6. Expected Benefits (each figure is read and printed one line at a time)
    print("\n6. Expected Benefits:")
    print(f"- Annual fraud prevention: ${impact_results['fraud_prevented']:,.2f}")
    print(f"- Operational cost: ${impact_results['operational_cost']:,.2f}")
    print(f"- Net savings: ${impact_results['net_savings']:,.2f}")
    print(f"- First year ROI: {impact_results['first_year_roi']:.0%}")
    print(f"- Ongoing ROI: {impact_results['ongoing_roi']:.0%}")

# Provide deployment recommendations
deployment_recommendations(model_results, impact_results)

NameError: name 'model_results' is not defined

# Fraud Detection System with Basel II Compliance
**Objective**: Develop an advanced fraud detection system that:
1. Identifies fraudulent transactions with high accuracy
2. Quantifies risk according to Basel II regulations
3. Provides actionable business insights
**Methodology**:
- Comprehensive EDA and feature engineering
- Advanced modeling with XGBoost and Random Forest
- Risk quantification using Basel II framework
- Business impact analysis

## 1. Configuration and Data Loading
Initial setup with enhanced error handling and data quality checks

In [13]:
# Enhanced Configuration
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import datetime
from scipy import stats
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import mutual_info_classif, SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (roc_auc_score, precision_recall_curve, 
                           average_precision_score, classification_report)
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.decomposition import PCA
import shap

# Enhanced constants and configurations
# These module-level values are read by the later notebook cells.
DATA_PATH = '../data/raw/transactions.csv'  # CSV read by pd.read_csv in the data-loading cell
RANDOM_STATE = 42  # Seed passed to train_test_split, SMOTE and both classifiers
TEST_SIZE = 0.2  # Share of all rows held out as the test set
VAL_SIZE = 0.2  # Validation share; applied as VAL_SIZE / (1 - TEST_SIZE) on the remaining rows
N_TOP_FEATURES = 15  # k for SelectKBest feature selection
SMOTE_SAMPLING_STRATEGY = 0.5  # sampling_strategy passed to SMOTE for class imbalance

# Basel II parameters
# MIN_PD is used twice: as a flat PD for the load-time ExpectedLoss column and
# as the floor applied to model PD estimates in the capital calculation.
# NOTE(review): the capital-calculation cell's comment cites a 0.03% Basel II
# floor, while this value is 0.01% -- confirm which one is intended.
MIN_PD = 0.0001  # Minimum probability of default
LGD = 0.45  # Loss given default assumption (fixed for every transaction)
EAD_FACTOR = 1.1  # Exposure at default multiplier applied to Amount

# RFM analysis parameters
# Also used as the rolling window for the 'Rolling_24Hr_Count' feature.
RFM_WINDOW = '30D'  # Time window for RFM calculations

ModuleNotFoundError: No module named 'imblearn'

### Data Loading with Enhanced Preprocessing
- Time-based feature extraction
- RFM (Recency, Frequency, Monetary) feature creation
- Basel II risk metrics calculation
- Robust outlier detection

In [None]:
# Load data with enhanced error handling
# Steps: read the CSV -> calendar features -> per-customer RFM -> flat-PD
# expected loss -> imputation -> robust z-scores and winsorisation.
# Any failure is reported and re-raised so later cells never run on a
# partially built `df`.
try:
    print("Loading data with enhanced preprocessing...")
    df = pd.read_csv(DATA_PATH, parse_dates=['TransactionStartTime'])

    # Add time-based features
    txn_time = df['TransactionStartTime'].dt
    df['TransactionHour'] = txn_time.hour
    df['TransactionDay'] = txn_time.day
    df['TransactionDayOfWeek'] = txn_time.dayofweek
    df['TransactionMonth'] = txn_time.month

    # Create RFM features
    print("\nCalculating RFM metrics...")
    # The most recent timestamp in the data is the reference point for recency.
    current_date = df['TransactionStartTime'].max()

    # Recency: Days since last transaction
    recency = df.groupby('CustomerId')['TransactionStartTime'].max()
    recency = (current_date - recency).dt.days.reset_index()
    recency.columns = ['CustomerId', 'Recency']

    # Frequency: Transaction count
    frequency = df.groupby('CustomerId')['TransactionId'].count().reset_index()
    frequency.columns = ['CustomerId', 'Frequency']

    # Monetary: Average transaction amount
    monetary = df.groupby('CustomerId')['Amount'].mean().reset_index()
    monetary.columns = ['CustomerId', 'Monetary']

    # Merge RFM features back onto every transaction of the customer
    rfm = recency.merge(frequency, on='CustomerId').merge(monetary, on='CustomerId')
    df = df.merge(rfm, on='CustomerId', how='left')

    # Create Basel II relevant features
    print("\nCalculating Basel II risk metrics...")
    # Flat-PD expected loss: EL = PD * LGD * EAD with PD fixed at MIN_PD.
    df['ExpectedLoss'] = MIN_PD * LGD * (df['Amount'] * EAD_FACTOR)

    # Enhanced data quality checks
    print("\nRunning enhanced data quality checks...")
    if df.isnull().values.any():
        print("Warning: Missing values detected. Implementing enhanced imputation...")
        # For numeric columns, fill with median (more robust than mean)
        numeric_cols = df.select_dtypes(include=np.number).columns
        df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

        # For categorical columns, fill with mode
        categorical_cols = df.select_dtypes(include=['object']).columns
        for col in categorical_cols:
            modes = df[col].mode()
            # An all-missing column has no mode; leave it untouched instead of
            # failing on modes[0].
            if not modes.empty:
                df[col] = df[col].fillna(modes.iloc[0])

    # Outlier detection with enhanced methods
    print("\nRunning enhanced outlier detection...")
    numeric_cols = df.select_dtypes(include=np.number).columns
    for col in numeric_cols:
        if col in ('FraudResult', 'CountryCode'):  # Skip target and constant columns
            continue

        values = df[col]
        median = values.median()

        # scale='normal' already rescales the MAD to be comparable with a
        # standard deviation, so no extra 0.6745 factor is applied here
        # (applying both would shrink every z-score by ~0.67).
        spread = stats.median_abs_deviation(values, scale='normal', nan_policy='omit')
        if not np.isfinite(spread) or spread == 0:
            # More than half the values are identical: fall back to the
            # ordinary standard deviation to avoid dividing by zero.
            spread = values.std(ddof=0)

        if np.isfinite(spread) and spread > 0:
            df[f'{col}_RobustZ'] = (values - median) / spread
        else:
            # Constant (or empty) column: nothing deviates from the median.
            df[f'{col}_RobustZ'] = 0.0

        # Winsorize extreme outliers (top/bottom 0.5%)
        lower = values.quantile(0.005)
        upper = values.quantile(0.995)
        df[col] = values.clip(lower=lower, upper=upper)

    print("\nData loaded and preprocessed successfully!")

except Exception as e:
    print(f"Error during data loading: {str(e)}")
    raise

: 

## 2. Exploratory Data Analysis
Comprehensive data overview with:
- Data structure analysis
- Statistical summaries
- Fraud pattern visualization

In [None]:
# Enhanced Data Structure Overview
def enhanced_data_overview(df):
    """Print and display a structural and statistical overview of ``df``.

    Shows, in order: shape and memory usage, a per-column profile table,
    descriptive statistics of the numeric columns (with skewness, kurtosis,
    IQR and coefficient of variation), the ten numeric features with the
    largest absolute correlation to ``FraudResult``, and a heatmap of the top
    five.

    Args:
        df: Transaction frame. Must contain a numeric ``FraudResult`` column;
            it may also contain text and datetime columns.

    Returns:
        None. Output goes through ``print``, ``display`` and matplotlib.
    """

    print("\nEnhanced Data Structure Overview:")

    # Basic info
    print(f"\nData Shape: {df.shape}")
    print(f"Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

    n_rows = len(df)
    unique_counts = df.nunique()
    missing_counts = df.isnull().sum()
    numeric_df = df.select_dtypes(include=np.number)

    # `< 0` is undefined for text and timestamp columns and raises TypeError,
    # so negatives are counted on numeric columns only; every other column
    # reports 0.
    negative_counts = (numeric_df < 0).sum().reindex(df.columns, fill_value=0)

    # Data types
    print("\nEnhanced Data Types:")
    dtype_df = pd.DataFrame({
        'Column': df.columns,
        'Data Type': df.dtypes,
        'Unique Values': unique_counts,
        'Missing Values': missing_counts,
        'Missing %': (missing_counts / n_rows) * 100,
        'Cardinality': unique_counts / n_rows,  # Share of distinct values
        'Zero Values': (df == 0).sum(),  # Count of zeros
        'Negative Values': negative_counts  # Count of negatives
    }).reset_index(drop=True)

    display(dtype_df)

    # Enhanced descriptive statistics
    # Restricted to numeric columns: some pandas versions also describe
    # datetime columns, whose timestamp "mean" cannot be used in std / mean.
    print("\nEnhanced Descriptive Statistics:")
    desc_stats = numeric_df.describe(percentiles=[.01, .05, .25, .5, .75, .95, .99]).T
    desc_stats['skewness'] = df.skew(numeric_only=True)
    desc_stats['kurtosis'] = df.kurtosis(numeric_only=True)
    desc_stats['IQR'] = desc_stats['75%'] - desc_stats['25%']  # Interquartile range
    # Coefficient of variation; undefined (NaN) when the mean is exactly zero.
    desc_stats['CV'] = desc_stats['std'] / desc_stats['mean'].replace(0, np.nan)

    display(desc_stats)

    # Enhanced correlation analysis
    print("\nEnhanced Correlation Matrix (Top 10 Correlated Features with Fraud):")
    corr_matrix = df.corr(numeric_only=True)
    fraud_corr = corr_matrix['FraudResult'].abs().sort_values(ascending=False)
    display(fraud_corr.head(10))

    # Plot top correlations (FraudResult itself ranks first with |r| = 1)
    top_features = fraud_corr.index[:5]
    plt.figure(figsize=(10, 6))
    sns.heatmap(corr_matrix.loc[top_features, top_features],
                annot=True, cmap='coolwarm', center=0)
    plt.title('Top 5 Features Correlated with Fraud')
    plt.show()

# Run enhanced overview
enhanced_data_overview(df)

: 

In [None]:
# Enhanced Fraud Analysis
def enhanced_fraud_analysis(df):
    """Summarise how much fraud there is and where it concentrates.

    Prints the overall fraud rate, case counts and fraud amounts, then shows
    the fraud rate per product category, the hourly fraud-rate curve, and the
    Recency/Frequency/Monetary profile of fraud cases with box plots by
    fraud status.

    Args:
        df: Transaction frame with ``FraudResult``, ``Amount``,
            ``ProductCategory``, ``TransactionHour``, ``Recency``,
            ``Frequency`` and ``Monetary`` columns.

    Returns:
        None. Output goes through ``print``, ``display`` and matplotlib.
    """

    print("\nEnhanced Fraud Analysis:")

    labels = df['FraudResult']
    fraud_rows = df[labels == 1]
    rfm_metrics = ['Recency', 'Frequency', 'Monetary']

    # Fraud distribution with business context
    fraud_rate = labels.mean()
    print(f"\nOverall Fraud Rate: {fraud_rate:.4%}")
    print(f"Non-Fraud Cases: {len(df[labels == 0]):,}")
    print(f"Fraud Cases: {len(fraud_rows):,}")

    # Financial impact estimation
    fraud_amounts = fraud_rows['Amount']
    fraud_amount = fraud_amounts.sum()
    avg_fraud_amount = fraud_amounts.mean()
    print(f"\nEstimated Total Fraud Amount: ${fraud_amount:,.2f}")
    print(f"Average Fraud Amount: ${avg_fraud_amount:,.2f}")

    # Fraud by category (business segmentation)
    print("\nFraud Rate by Product Category:")
    fraud_by_category = (
        df.groupby('ProductCategory')['FraudResult']
        .agg(['mean', 'count'])
        .rename(columns={'mean': 'FraudRate', 'count': 'TransactionCount'})
    )
    # Aligned on category; categories without any fraud row get NaN here.
    fraud_by_category['FraudAmount'] = fraud_rows.groupby('ProductCategory')['Amount'].sum()
    display(fraud_by_category.sort_values('FraudRate', ascending=False))

    # Temporal patterns
    print("\nFraud Rate by Hour of Day:")
    fraud_by_hour = df.groupby('TransactionHour')['FraudResult'].mean().reset_index()
    plt.figure(figsize=(12, 6))
    sns.lineplot(x='TransactionHour', y='FraudResult', data=fraud_by_hour)
    plt.title('Fraud Rate by Hour of Day')
    plt.ylabel('Fraud Rate')
    # Dashed reference line at the overall rate for comparison.
    plt.axhline(fraud_rate, color='red', linestyle='--', label='Overall Fraud Rate')
    plt.legend()
    plt.show()

    # RFM analysis for fraud
    print("\nRFM Analysis for Fraud Cases:")
    rfm_fraud = fraud_rows[rfm_metrics].describe().T
    display(rfm_fraud)

    # Visualization of RFM metrics: one box plot per metric, side by side
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    for axis, metric in zip(axes, rfm_metrics):
        sns.boxplot(x='FraudResult', y=metric, data=df, ax=axis)
        axis.set_title(f'{metric} Distribution by Fraud Status')
    plt.tight_layout()
    plt.show()

# Run enhanced fraud analysis
enhanced_fraud_analysis(df)

: 

## 3. Feature Engineering
Creating advanced features for modeling:
- Interaction features
- Behavioral patterns
- Rolling statistics
- Risk flags

In [None]:
# Feature Engineering for Modeling
def enhanced_feature_engineering(df, rolling_window=None):
    """Create interaction, behavioural, rolling, velocity and flag features.

    The frame is sorted by customer and transaction time *before* any
    order-dependent feature (time gaps, rolling windows, velocity) is
    computed. The sort returns a new frame, so the caller's ``df`` is not
    modified.

    Args:
        df: Transaction frame with ``CustomerId``, ``TransactionStartTime``
            (datetime), ``Amount``, ``Recency``, ``Frequency``,
            ``TransactionHour`` and ``ExpectedLoss`` columns.
        rolling_window: Pandas offset string for the time window behind
            ``Rolling_24Hr_Count``. Defaults to the module-level
            ``RFM_WINDOW``.
            NOTE(review): the column name says 24 hours but the default
            window is RFM_WINDOW -- confirm which is intended.

    Returns:
        A new DataFrame sorted by ``CustomerId`` and ``TransactionStartTime``
        with ten added feature columns. The first transaction of each
        customer has NaN in ``Time_Since_Last_Txn`` and ``Amount_Velocity``.
    """

    print("\nRunning Enhanced Feature Engineering...")

    if rolling_window is None:
        rolling_window = RFM_WINDOW

    # Sort first: diff(), rolling() and shift() all depend on row order.
    df = df.sort_values(['CustomerId', 'TransactionStartTime'])

    # Create interaction features
    df['Amount_Recency_Interaction'] = df['Amount'] * df['Recency']
    df['Amount_Frequency_Interaction'] = df['Amount'] * df['Frequency']

    # Create behavioral features
    df['Amount_to_Avg_Amount_Ratio'] = df['Amount'] / df.groupby('CustomerId')['Amount'].transform('mean')
    # Hours elapsed since the customer's previous transaction.
    df['Time_Since_Last_Txn'] = (
        df.groupby('CustomerId')['TransactionStartTime'].diff().dt.total_seconds() / 3600
    )

    # Create rolling features (windowed statistics)
    df['Rolling_3Txn_Avg'] = df.groupby('CustomerId')['Amount'].transform(
        lambda x: x.rolling(3, min_periods=1).mean())

    def _window_counts(times):
        # An offset window needs a datetime index, so count a column of ones
        # indexed by the transaction timestamps of one customer.
        ones = pd.Series(1.0, index=pd.DatetimeIndex(times))
        counts = ones.rolling(rolling_window).count()
        return pd.Series(counts.to_numpy(), index=times.index)

    df['Rolling_24Hr_Count'] = df.groupby('CustomerId')['TransactionStartTime'].transform(
        _window_counts)

    # Create velocity features (change over time); a zero previous amount is
    # replaced by 1 in the denominator to avoid division by zero.
    df['Amount_Velocity'] = df.groupby('CustomerId')['Amount'].transform(
        lambda x: x.diff() / x.shift().where(x.shift() != 0, 1))

    # Create flags for unusual activity
    df['Large_Transaction_Flag'] = (df['Amount'] > df['Amount'].quantile(0.95)).astype(int)
    df['After_Hours_Flag'] = ((df['TransactionHour'] < 8) | (df['TransactionHour'] > 20)).astype(int)

    # Create Basel II relevant features
    df['Risk_Score'] = df['ExpectedLoss'] * df['Recency']  # Simple risk score example

    print("Feature engineering completed. Added 10 new features.")

    return df

# Apply feature engineering
df = enhanced_feature_engineering(df)

: 

## 4. Model Preparation
Data preparation steps:
- Categorical encoding
- Train-test-validation split
- Class imbalance handling with SMOTE
- Feature scaling and selection

In [None]:
# Model Preparation
def prepare_model_data(df):
    """Split, encode, rebalance, scale and select features for modelling.

    Processing order:
      1. One-hot encode low-cardinality (<= 10 levels) text columns.
      2. Stratified train / validation / test split.
      3. Smoothed target encoding of high-cardinality text columns, fitted on
         the training fold only so validation/test labels never leak into
         the features.
      4. Replace inf with NaN and impute with training-fold medians (SMOTE
         and ``f_classif`` reject non-finite input).
      5. SMOTE oversampling of the training fold.
      6. Standard scaling and ``SelectKBest`` selection, both fitted on the
         resampled training fold.

    Args:
        df: Feature-engineered frame containing ``FraudResult``,
            ``TransactionId`` and ``TransactionStartTime`` plus predictors.

    Returns:
        Tuple ``(X_train_res, X_val, X_test, y_train_res, y_val, y_test,
        selected_features)``. The X parts are numpy arrays restricted to the
        selected features; ``selected_features`` is an Index of their names.
    """

    print("\nPreparing Data for Modeling...")

    # Select features and target
    X = df.drop(['FraudResult', 'TransactionId', 'TransactionStartTime'], axis=1)
    y = df['FraudResult']

    # Split text columns by cardinality (uses feature values only, no labels)
    categorical_cols = list(X.select_dtypes(include=['object']).columns)
    high_card_cols = [col for col in categorical_cols if X[col].nunique() > 10]
    low_card_cols = [col for col in categorical_cols if col not in high_card_cols]

    # Low cardinality - use one-hot encoding
    if low_card_cols:
        X = pd.get_dummies(X, columns=low_card_cols, drop_first=True)

    # Train-test split with stratification
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE)

    # Further split train into train and validation
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=VAL_SIZE/(1-TEST_SIZE), stratify=y_train, random_state=RANDOM_STATE)

    # Work on independent copies so column assignment below is unambiguous.
    X_train, X_val, X_test = X_train.copy(), X_val.copy(), X_test.copy()

    # High cardinality - smoothed target encoding learned from the training
    # fold only; categories unseen in training fall back to the global mean.
    global_mean = y_train.mean()
    smoothing = 100  # Smoothing parameter
    for col in high_card_cols:
        per_level = y_train.groupby(X_train[col]).agg(['sum', 'count'])
        encoding = (per_level['sum'] + global_mean * smoothing) / (per_level['count'] + smoothing)
        for part in (X_train, X_val, X_test):
            part[col] = part[col].map(encoding).astype(float).fillna(global_mean)

    # Make every column finite: inf -> NaN, then training-fold medians
    # (0.0 for columns that are entirely missing in training).
    def _to_finite_candidates(frame):
        return frame.astype(float).replace([np.inf, -np.inf], np.nan)

    X_train = _to_finite_candidates(X_train)
    train_medians = X_train.median().fillna(0.0)
    X_train = X_train.fillna(train_medians)
    X_val = _to_finite_candidates(X_val).fillna(train_medians)
    X_test = _to_finite_candidates(X_test).fillna(train_medians)

    # Handle class imbalance with SMOTE
    print("\nOriginal class distribution:")
    print(y_train.value_counts())

    smote = SMOTE(sampling_strategy=SMOTE_SAMPLING_STRATEGY, random_state=RANDOM_STATE)
    X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

    print("\nResampled class distribution:")
    print(y_train_res.value_counts())

    # Feature scaling
    scaler = StandardScaler()
    X_train_res = scaler.fit_transform(X_train_res)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    # Feature selection (k is capped at the number of available columns)
    feature_names = X_train.columns
    selector = SelectKBest(f_classif, k=min(N_TOP_FEATURES, len(feature_names)))
    X_train_res = selector.fit_transform(X_train_res, y_train_res)
    X_val = selector.transform(X_val)
    X_test = selector.transform(X_test)

    # Get selected feature names
    selected_features = feature_names[selector.get_support()]
    print("\nTop Selected Features:")
    print(selected_features)

    return X_train_res, X_val, X_test, y_train_res, y_val, y_test, selected_features

# Prepare model data
X_train, X_val, X_test, y_train, y_val, y_test, selected_features = prepare_model_data(df)

: 

## 5. Model Training and Evaluation
Training and evaluating:
- Random Forest
- XGBoost
With comprehensive metrics:
- AUC-ROC
- Precision-Recall
- Feature importance
- SHAP explanations

In [None]:
# Model Training and Evaluation
def train_and_evaluate_models(X_train, X_val, X_test, y_train, y_val, y_test):
    """Fit Random Forest and XGBoost, tune a threshold, and report metrics.

    For each model: fit on the training data, score validation and test sets
    (ROC AUC and average precision), pick the probability threshold that
    maximises F1 on the validation set, apply it to the test set, and show
    feature importances. SHAP explanations are produced for the Random
    Forest only.

    Feature names for the importance table and SHAP plots are taken from the
    module-level ``selected_features`` variable, which must already exist.

    Args:
        X_train, X_val, X_test: 2-D numeric feature arrays.
        y_train, y_val, y_test: Binary labels (1 = fraud).

    Returns:
        Dict keyed by model name. Each value holds ``'Validation AUC'``,
        ``'Test AUC'``, ``'Validation Average Precision'``,
        ``'Test Average Precision'``, ``'Optimal Threshold'``,
        ``'Classification Report'`` (dict form) and the fitted ``'Model'``.
    """

    print("\nTraining and Evaluating Models...")

    # Class ratio for XGBoost's scale_pos_weight; guard against a training
    # set without positives.
    y_train_arr = np.asarray(y_train)
    n_negative = int((y_train_arr == 0).sum())
    n_positive = int((y_train_arr == 1).sum())
    pos_weight = n_negative / max(n_positive, 1)

    # Initialize models
    models = {
        'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE, class_weight='balanced'),
        'XGBoost': XGBClassifier(random_state=RANDOM_STATE, scale_pos_weight=pos_weight)
    }

    results = {}

    for name, model in models.items():
        print(f"\nTraining {name}...")

        # Train model
        model.fit(X_train, y_train)

        # Predict probabilities of the positive (fraud) class
        y_val_proba = model.predict_proba(X_val)[:, 1]
        y_test_proba = model.predict_proba(X_test)[:, 1]

        # Calculate metrics
        val_auc = roc_auc_score(y_val, y_val_proba)
        test_auc = roc_auc_score(y_test, y_test_proba)

        val_ap = average_precision_score(y_val, y_val_proba)
        test_ap = average_precision_score(y_test, y_test_proba)

        # Get optimal threshold from precision-recall curve.
        # precision/recall have one more entry than thresholds (the final
        # point has no threshold), so it is excluded before the argmax.
        precision, recall, thresholds = precision_recall_curve(y_val, y_val_proba)
        f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)
        optimal_idx = np.argmax(f1_scores[:-1])
        optimal_threshold = thresholds[optimal_idx]

        # Apply optimal threshold to test set
        y_test_pred = (y_test_proba >= optimal_threshold).astype(int)

        # Classification report
        report = classification_report(y_test, y_test_pred, output_dict=True)

        # Store results
        results[name] = {
            'Validation AUC': val_auc,
            'Test AUC': test_auc,
            'Validation Average Precision': val_ap,
            'Test Average Precision': test_ap,
            'Optimal Threshold': optimal_threshold,
            'Classification Report': report,
            'Model': model
        }

        # Print summary
        print(f"\n{name} Performance:")
        print(f"Validation AUC: {val_auc:.4f}")
        print(f"Test AUC: {test_auc:.4f}")
        print(f"Validation Average Precision: {val_ap:.4f}")
        print(f"Test Average Precision: {test_ap:.4f}")
        print(f"Optimal Threshold: {optimal_threshold:.4f}")
        print("\nClassification Report:")
        print(classification_report(y_test, y_test_pred))

        # Feature importance
        if hasattr(model, 'feature_importances_'):
            print("\nFeature Importances:")
            importances = pd.DataFrame({
                'Feature': selected_features,
                'Importance': model.feature_importances_
            }).sort_values('Importance', ascending=False)
            display(importances.head(10))

            # Plot feature importance
            plt.figure(figsize=(10, 6))
            sns.barplot(x='Importance', y='Feature', data=importances.head(10))
            plt.title(f'{name} - Top Feature Importances')
            plt.show()

        # SHAP values for model interpretation
        if name == 'Random Forest':  # SHAP works best with tree-based models
            print("\nCalculating SHAP values for model interpretation...")
            explainer = shap.TreeExplainer(model)
            raw_shap = explainer.shap_values(X_test)

            # Depending on the SHAP version, a classifier yields either a
            # list with one (samples, features) array per class or a single
            # (samples, features, classes) array. Extract the fraud class.
            if isinstance(raw_shap, list):
                fraud_shap = raw_shap[1]
            else:
                raw_shap = np.asarray(raw_shap)
                fraud_shap = raw_shap[..., 1] if raw_shap.ndim == 3 else raw_shap

            base_values = np.ravel(explainer.expected_value)
            fraud_base = base_values[1] if base_values.size > 1 else base_values[0]
            feature_names = list(selected_features)

            # Summary plot; show=False so the title is applied before display
            plt.figure()
            shap.summary_plot(fraud_shap, X_test, feature_names=feature_names,
                              plot_type="bar", show=False)
            plt.title(f'{name} - SHAP Feature Importance')
            plt.show()

            # Force plot for a specific example (first fraud case, if any).
            # matplotlib=True draws a figure; the default returns a JS object
            # that would be discarded inside this function.
            fraud_positions = np.where(np.asarray(y_test) == 1)[0]
            if fraud_positions.size:
                sample_idx = fraud_positions[0]
                shap.force_plot(fraud_base, fraud_shap[sample_idx],
                                X_test[sample_idx], feature_names=feature_names,
                                matplotlib=True, show=False)
                plt.title(f'{name} - SHAP Explanation for Fraud Case')
                plt.show()

    return results

# Train and evaluate models
model_results = train_and_evaluate_models(X_train, X_val, X_test, y_train, y_val, y_test)

: 

## 6. Basel II Capital Calculation
Implementing Basel II framework:
- Probability of Default (PD) estimation
- Loss Given Default (LGD)
- Exposure at Default (EAD)
- Capital requirement calculation

In [None]:
# Basel II Capital Calculation
def calculate_basel_capital(df, model_results, selected_model='Random Forest'):
    """Estimate transaction-level Basel II expected loss and capital.

    Uses the chosen model's fraud probability as a proxy for PD, a fixed
    ``LGD`` and ``EAD = Amount * EAD_FACTOR``, then applies the supervisory
    capital formula with maturity M = 1.

    The calculation runs on a copy: the caller's ``df`` is left untouched so
    the added risk columns cannot leak into feature matrices built later from
    the same frame.

    NOTE(review): the scaler and feature selector are re-fitted here on the
    full data rather than reused from training, so the columns fed to the
    model are not guaranteed to match the ones it was trained on. Passing
    the fitted transformers in would be needed to remove that risk.

    Args:
        df: Feature-engineered frame containing ``FraudResult``,
            ``TransactionId``, ``TransactionStartTime`` and ``Amount``.
        model_results: Mapping with ``model_results[selected_model]['Model']``
            holding a fitted classifier exposing ``predict_proba``.
        selected_model: Key of the model to use.

    Returns:
        Copy of ``df`` with ``PD_Estimate``, ``LGD``, ``EAD``,
        ``ExpectedLoss`` (overwritten with the model-based value), ``R``,
        ``b``, ``CapitalRequirement`` (K, per unit of exposure) and
        ``CapitalAmount`` (K * EAD) columns.
    """

    print("\nCalculating Basel II Capital Requirements...")

    # Never write the risk columns back into the caller's frame.
    df = df.copy()

    # Get the best model
    model = model_results[selected_model]['Model']

    # Predict PD (Probability of Default) for all transactions
    # Note: In practice, we'd want to predict PD at customer level over a time horizon
    # This is a simplified transaction-level example

    # Prepare full dataset for prediction
    target = df['FraudResult']
    X_full = df.drop(['FraudResult', 'TransactionId', 'TransactionStartTime'], axis=1)

    # Convert categorical variables (smoothed target / one-hot encoding)
    categorical_cols = X_full.select_dtypes(include=['object']).columns
    global_mean = target.mean()
    smoothing = 100
    for col in categorical_cols:
        if X_full[col].nunique() > 10:  # Target encoding
            fraud_rate = target.groupby(X_full[col]).mean()
            counts = target.groupby(X_full[col]).count()
            X_full[col] = X_full[col].map(
                (fraud_rate * counts + global_mean * smoothing) / (counts + smoothing))
        else:  # One-hot encoding
            X_full = pd.get_dummies(X_full, columns=[col], drop_first=True)

    # Scaling and f_classif need finite input: inf -> NaN -> column median
    # (0.0 when a column is entirely missing).
    X_full = X_full.astype(float).replace([np.inf, -np.inf], np.nan)
    X_full = X_full.fillna(X_full.median()).fillna(0.0)

    # Scale features
    scaler = StandardScaler()
    X_full_scaled = scaler.fit_transform(X_full)

    # Select features
    selector = SelectKBest(f_classif, k=min(N_TOP_FEATURES, X_full.shape[1]))
    X_full_scaled = selector.fit_transform(X_full_scaled, target)

    # Get predictions (as proxy for PD)
    pd_estimates = model.predict_proba(X_full_scaled)[:, 1]

    # Floor PD at MIN_PD so ln(PD) and the inverse normal stay finite.
    # NOTE(review): Basel II quotes a 0.03% PD floor; confirm MIN_PD matches.
    pd_estimates = np.maximum(pd_estimates, MIN_PD)

    # Calculate Expected Loss (EL) and Capital Requirement (K)
    df['PD_Estimate'] = pd_estimates
    df['LGD'] = LGD  # Using fixed LGD for simplicity
    df['EAD'] = df['Amount'] * EAD_FACTOR  # Exposure at default

    # Expected Loss (EL) = PD × LGD × EAD
    df['ExpectedLoss'] = df['PD_Estimate'] * df['LGD'] * df['EAD']

    # Basel II capital requirement formula (simplified)
    # K = [LGD × N((1 - R)^-0.5 × G(PD) + (R / (1 - R))^0.5 × G(0.999)) - PD × LGD]
    #     × (1 - 1.5 × b(PD))^-1 × (1 + (M - 2.5) × b(PD))
    # Where:
    # R = 0.12 × w + 0.24 × (1 - w),  w = (1 - exp(-50 × PD)) / (1 - exp(-50))
    # b(PD) = (0.11852 - 0.05478 × ln(PD))^2
    # M = 1 year (maturity)
    maturity = 1
    pd_weight = (1 - np.exp(-50 * df['PD_Estimate'])) / (1 - np.exp(-50))
    df['R'] = 0.12 * pd_weight + 0.24 * (1 - pd_weight)

    df['b'] = (0.11852 - 0.05478 * np.log(df['PD_Estimate']))**2

    # Standard normal inverse function G(.)
    def g(p):
        return stats.norm.ppf(p)

    # PD conditional on a 99.9th-percentile systematic shock
    stressed_pd = stats.norm.cdf(
        (1 - df['R'])**-0.5 * g(df['PD_Estimate']) +
        (df['R'] / (1 - df['R']))**0.5 * g(0.999))

    # K is a fraction of exposure, not a currency amount.
    df['CapitalRequirement'] = (df['LGD'] * stressed_pd - df['PD_Estimate'] * df['LGD']) * \
        (1 - 1.5 * df['b'])**-1 * (1 + (maturity - 2.5) * df['b'])

    # Currency amount of capital per transaction: K × EAD
    df['CapitalAmount'] = df['CapitalRequirement'] * df['EAD']

    # Total capital required (sum of amounts, not of per-unit ratios)
    total_capital = df['CapitalAmount'].sum()
    total_exposure = df['EAD'].sum()
    capital_ratio = total_capital / total_exposure if total_exposure else float('nan')

    print("\nBasel II Capital Calculation Results:")
    print(f"Total Exposure at Default (EAD): ${total_exposure:,.2f}")
    print(f"Total Capital Required: ${total_capital:,.2f}")
    print(f"Capital Ratio: {capital_ratio:.2%}")
    print("\nNote: This is a simplified calculation. Actual Basel II implementation requires more complex modeling.")

    return df

# Calculate Basel II capital
df_with_capital = calculate_basel_capital(df, model_results)

: 

## 7. Business Impact Analysis
Quantifying the financial impact:
- Fraud prevented vs. missed
- Operational costs
- ROI calculation
- Customer impact

In [None]:
# Business Impact Analysis
def business_impact_analysis(df, model_results, selected_model='Random Forest'):
    """Estimate the financial and customer impact of deploying the model.

    Scores every row of ``df`` with the chosen model, applies its stored
    optimal threshold, and converts the resulting confusion-matrix counts
    into money using average transaction amounts and fixed cost assumptions.

    NOTE(review): predictions are made on the full frame (training rows
    included) with a scaler and feature selector re-fitted here, so the
    figures are optimistic and the selected columns are not guaranteed to
    match the ones the model was trained on.

    Args:
        df: Feature-engineered frame containing ``FraudResult``,
            ``TransactionId``, ``TransactionStartTime``, ``Amount`` and
            ``CustomerId``.
        model_results: Mapping with ``'Model'`` and ``'Optimal Threshold'``
            under ``model_results[selected_model]``.
        selected_model: Key of the model to evaluate.

    Returns:
        Dict with ``fraud_prevented``, ``fraud_loss``, ``operational_cost``,
        ``net_savings``, ``first_year_roi`` and ``ongoing_roi``.
    """

    print("\nBusiness Impact Analysis:")

    # Get model predictions and optimal threshold
    model = model_results[selected_model]['Model']
    threshold = model_results[selected_model]['Optimal Threshold']

    # Prepare feature matrix for prediction
    X = df.drop(['FraudResult', 'TransactionId', 'TransactionStartTime'], axis=1)
    y = df['FraudResult']

    # Convert categorical variables
    categorical_cols = X.select_dtypes(include=['object']).columns
    global_mean = y.mean()
    smoothing = 100
    for col in categorical_cols:
        if X[col].nunique() > 10:
            fraud_rate = y.groupby(X[col]).mean()
            counts = y.groupby(X[col]).count()
            X[col] = X[col].map((fraud_rate * counts + global_mean * smoothing) / (counts + smoothing))
        else:
            X = pd.get_dummies(X, columns=[col], drop_first=True)

    # Scaling and f_classif need finite input: inf -> NaN -> column median
    # (0.0 when a column is entirely missing).
    X = X.astype(float).replace([np.inf, -np.inf], np.nan)
    X = X.fillna(X.median()).fillna(0.0)

    # Scale and select features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    selector = SelectKBest(f_classif, k=min(N_TOP_FEATURES, X.shape[1]))
    X_scaled = selector.fit_transform(X_scaled, y)

    # Get predictions
    y_proba = model.predict_proba(X_scaled)[:, 1]
    y_pred = (y_proba >= threshold).astype(int)

    # Confusion matrix; reindexed so all four cells exist even when the model
    # flags nothing (or everything) or one class is absent from the data.
    cm = pd.crosstab(y, y_pred, rownames=['Actual'], colnames=['Predicted'])
    cm = cm.reindex(index=[0, 1], columns=[0, 1], fill_value=0)
    display(cm)

    # Financial impact metrics
    fraud_caught = cm.loc[1, 1]
    fraud_missed = cm.loc[1, 0]
    false_positives = cm.loc[0, 1]

    avg_fraud_amount = df[df['FraudResult']==1]['Amount'].mean()
    avg_transaction_amount = df['Amount'].mean()

    fraud_prevented = fraud_caught * avg_fraud_amount
    fraud_loss = fraud_missed * avg_fraud_amount
    operational_cost = false_positives * avg_transaction_amount * 0.10  # Assuming 10% of transaction amount as cost

    net_savings = fraud_prevented - fraud_loss - operational_cost

    print("\nFinancial Impact Estimation:")
    print(f"Fraud Prevented: ${fraud_prevented:,.2f} ({fraud_caught} transactions)")
    print(f"Fraud Loss (Missed): ${fraud_loss:,.2f} ({fraud_missed} transactions)")
    print(f"Operational Cost (False Positives): ${operational_cost:,.2f} ({false_positives} transactions)")
    print(f"Net Savings: ${net_savings:,.2f}")

    # ROI calculation (simplified)
    development_cost = 50000  # Estimated model development cost
    implementation_cost = 20000  # Estimated implementation cost
    annual_maintenance = 10000  # Estimated annual maintenance
    upfront_cost = development_cost + implementation_cost

    first_year_roi = (net_savings - upfront_cost) / upfront_cost
    ongoing_roi = (net_savings - annual_maintenance) / annual_maintenance

    print("\nReturn on Investment (ROI):")
    print(f"First Year ROI: {first_year_roi:.0%}")
    print(f"Ongoing Annual ROI: {ongoing_roi:.0%}")

    # Customer impact analysis: distinct customers with at least one alert
    customers_affected = df.loc[y_pred == 1, 'CustomerId'].nunique()
    total_customers = df['CustomerId'].nunique()

    print("\nCustomer Impact:")
    print(f"Customers Affected by Fraud Alerts: {customers_affected:,} ({customers_affected/total_customers:.1%})")

    return {
        'fraud_prevented': fraud_prevented,
        'fraud_loss': fraud_loss,
        'operational_cost': operational_cost,
        'net_savings': net_savings,
        'first_year_roi': first_year_roi,
        'ongoing_roi': ongoing_roi
    }

# Run business impact analysis
impact_results = business_impact_analysis(df, model_results)

: 

## 8. Deployment Recommendations
Strategic recommendations for production:
1. Implementation strategy
2. Risk threshold optimization
3. Operational integration
4. Monitoring framework
5. Regulatory compliance
6. Expected benefits

In [None]:
# Model Deployment Recommendations
def deployment_recommendations(model_results, impact_results):
    """Print the six-part deployment playbook to stdout.

    Args:
        model_results: Mapping of model name to result dict. Only
            ``model_results['Random Forest']['Optimal Threshold']`` is read.
        impact_results: Mapping providing ``fraud_prevented``,
            ``operational_cost``, ``net_savings``, ``first_year_roi`` and
            ``ongoing_roi`` (all numeric).

    Returns:
        None. Everything is written with ``print``.
    """

    def emit(heading, bullets):
        # One heading line followed by its bullet lines, in order.
        print(heading)
        for bullet in bullets:
            print(bullet)

    print("\nModel Deployment Recommendations:")

    # 1. Implementation Strategy
    emit("\n1. Implementation Strategy:", (
        "- Phase 1: Pilot implementation with 10% of transactions to validate performance",
        "- Phase 2: Full deployment with monitoring for concept drift",
        "- Implement as a real-time scoring system integrated with transaction processing",
    ))

    # 2. Risk Threshold Optimization (threshold is looked up only now, after
    #    section 1 has been printed)
    optimal_threshold = model_results['Random Forest']['Optimal Threshold']
    emit(f"\n2. Risk Threshold Optimization (Current: {optimal_threshold:.2f}):", (
        "- Establish threshold tuning process based on changing fraud patterns",
        "- Create multiple thresholds for different customer segments/products",
    ))

    # 3. Operational Integration
    emit("\n3. Operational Integration:", (
        "- Integrate with case management system for fraud analysts",
        "- Implement automated alerts for high-risk transactions",
        "- Create escalation procedures based on risk scores",
    ))

    # 4. Monitoring Framework
    emit("\n4. Monitoring Framework:", (
        "- Track model performance metrics weekly (AUC, precision, recall)",
        "- Monitor feature distributions for data drift",
        "- Establish feedback loop from fraud investigation teams",
    ))

    # 5. Regulatory Compliance
    emit("\n5. Regulatory Compliance:", (
        "- Document model development process for audit purposes",
        "- Validate model meets Basel II requirements for risk quantification",
        "- Implement governance process for model updates",
    ))

    # 6. Expected Benefits (each figure is read and printed one line at a time)
    print("\n6. Expected Benefits:")
    print(f"- Annual fraud prevention: ${impact_results['fraud_prevented']:,.2f}")
    print(f"- Operational cost: ${impact_results['operational_cost']:,.2f}")
    print(f"- Net savings: ${impact_results['net_savings']:,.2f}")
    print(f"- First year ROI: {impact_results['first_year_roi']:.0%}")
    print(f"- Ongoing ROI: {impact_results['ongoing_roi']:.0%}")

# Provide deployment recommendations
deployment_recommendations(model_results, impact_results)

: 