In [34]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.metrics import (classification_report, roc_auc_score,
                             average_precision_score, confusion_matrix,
                             accuracy_score, f1_score, precision_score, recall_score)
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
import logging
import warnings

In [35]:
# --- CONFIGURATION & LOGGING ---
# Root logger emits INFO and above as "<timestamp> - <LEVEL> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Silence two warning categories for the whole session.
# Note: this hides every UserWarning / FutureWarning, not only layout ones.
warnings.filterwarnings(action='ignore', category=UserWarning)
warnings.filterwarnings(action='ignore', category=FutureWarning)


In [36]:
# Constants
# Seed handed to the train/test split and to every estimator.
RANDOM_STATE = 42
# -1 means "use all available cores".
N_JOBS = -1

def load_and_clean_data(filepath):
    """
    Load the transactions CSV, coerce the numeric columns, drop incomplete
    rows and keep only TRANSFER / CASH_OUT transactions.

    Args:
        filepath: Location of the CSV, passed straight to ``pd.read_csv``.

    Returns:
        pd.DataFrame: An independent copy holding only complete rows whose
        ``type`` is ``TRANSFER`` or ``CASH_OUT``.

    Raises:
        FileNotFoundError: Propagated from ``pd.read_csv`` when the file is missing.
        KeyError: If one of the expected columns is absent from the CSV.
    """
    logging.info(f"Loading dataset from {filepath}...")

    df = pd.read_csv(filepath, low_memory=False)

    # Handle 'mixed types' errors: 'amount' or the balance columns may contain
    # strings. Coercion turns anything unparsable into NaN so that the row is
    # removed by the missing-value step below.
    numeric_cols = ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Handling Missing Values (rebinding instead of inplace=True keeps the
    # step free of in-place mutation).
    if df.isnull().values.any():
        logging.info("Missing values detected. Dropping incomplete rows...")
        df = df.dropna()

    # Domain Knowledge Filtering
    logging.info("Filtering dataset for TRANSFER and CASH_OUT transactions...")
    # .copy() detaches the result so later column assignments do not act on a view.
    df = df.loc[df['type'].isin(['TRANSFER', 'CASH_OUT'])].copy()

    logging.info(f"Data Cleaned. Shape: {df.shape}")
    return df


In [37]:
def feature_engineering(df):
    """
    Derive fraud-oriented features and split the frame into predictors and target.

    The engineered columns (``errorBalanceOrig``, ``errorBalanceDest``,
    ``hour_of_day``, ``amount_to_oldbalance_ratio``) are written onto ``df``
    itself, so the caller's frame gains them as well.

    Args:
        df: Transactions frame containing the amount, balance, ``step``,
            name, ``isFraud`` and ``isFlaggedFraud`` columns.

    Returns:
        tuple: ``(X, y)`` where ``X`` is ``df`` without the step, name and
        fraud-flag columns and ``y`` is the ``isFraud`` column.
    """
    logging.info("Starting Feature Engineering...")

    amount = df['amount']
    sender_balance_before = df['oldbalanceOrg']

    # Balance discrepancies on each side of the transaction.
    df['errorBalanceOrig'] = df['newbalanceOrig'] + amount - sender_balance_before
    df['errorBalanceDest'] = df['oldbalanceDest'] + amount - df['newbalanceDest']

    # Time feature: step folded onto a 24-value cycle.
    df['hour_of_day'] = df['step'] % 24

    # Amount relative to the sender's prior balance; +1.0 avoids division by zero.
    df['amount_to_oldbalance_ratio'] = amount / (sender_balance_before + 1.0)

    target = df['isFraud']
    predictors = df.drop(
        columns=['step', 'nameOrig', 'nameDest', 'isFlaggedFraud', 'isFraud']
    )
    return predictors, target

In [38]:
def get_pipeline(classifier):
    """
    Wrap ``classifier`` in a preprocessing + estimator pipeline.

    Numeric columns are median-imputed and scaled with ``RobustScaler``; the
    categorical ``type`` column is filled with the constant ``'missing'`` and
    one-hot encoded. Columns not listed here are dropped, and the preprocessor
    is configured to output pandas frames.

    Args:
        classifier: Estimator placed as the final ``'classifier'`` step.

    Returns:
        Pipeline: Unfitted pipeline with steps ``'preprocessor'`` and
        ``'classifier'``.
    """
    categorical_features = ['type']
    numeric_features = [
        'amount',
        'oldbalanceOrg', 'newbalanceOrig',
        'oldbalanceDest', 'newbalanceDest',
        'errorBalanceOrig', 'errorBalanceDest',
        'hour_of_day',
        'amount_to_oldbalance_ratio',
    ]

    numeric_branch = (
        'num',
        Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', RobustScaler()),
        ]),
        numeric_features,
    )
    categorical_branch = (
        'cat',
        Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            # Unknown categories at predict time encode as all-zero rows.
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
        ]),
        categorical_features,
    )

    preprocessor = ColumnTransformer(
        transformers=[numeric_branch, categorical_branch],
        remainder='drop',
        verbose_feature_names_out=False,  # no "num__"/"cat__" prefixes on output names
    )
    preprocessor.set_output(transform="pandas")

    return Pipeline(steps=[('preprocessor', preprocessor), ('classifier', classifier)])


In [39]:

def train_and_evaluate(X, y):
    """
    Train LightGBM, XGBoost and RandomForest pipelines and score each one on
    a stratified 20% hold-out set.

    The positive-class weight handed to the boosters is computed from the
    training labels only, so the held-out labels play no part in configuring
    the models.

    Args:
        X: Feature frame; must contain the columns selected by ``get_pipeline``.
        y: Binary target (0 = not fraud, 1 = fraud) aligned with ``X``.

    Returns:
        tuple: ``(results, X_test)``. ``results`` maps each model name to a
        dict with keys ``y_test``, ``y_pred``, ``y_prob``, ``auc``, ``pr_auc``,
        ``accuracy``, ``f1``, ``precision``, ``recall``, ``confusion_matrix``
        and ``pipeline`` (the fitted pipeline). ``X_test`` is the held-out
        feature frame.

    Raises:
        ValueError: If the training labels are not binary or one of the two
            classes is absent.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
    )

    # minlength=2 guarantees a slot for class 1 even when it never occurs, so
    # the problem surfaces as a clear error instead of an unpacking failure.
    class_counts = np.bincount(y_train, minlength=2)
    if len(class_counts) != 2 or class_counts.min() == 0:
        raise ValueError(
            "train_and_evaluate expects binary labels with both class 0 and "
            f"class 1 present in the training split; got counts {class_counts.tolist()}"
        )
    neg, pos = (int(count) for count in class_counts)
    scale_weight = neg / pos
    logging.info(f"Class Imbalance Ratio: 1:{int(scale_weight)}")

    models = {
        'LightGBM': lgb.LGBMClassifier(
            n_estimators=200,
            scale_pos_weight=scale_weight,
            n_jobs=N_JOBS,
            random_state=RANDOM_STATE,
            verbosity=-1
        ),
        'XGBoost': xgb.XGBClassifier(
            n_estimators=200,
            scale_pos_weight=scale_weight,
            tree_method='hist',
            n_jobs=N_JOBS,
            random_state=RANDOM_STATE,
            eval_metric='aucpr'
        ),
        'RandomForest': RandomForestClassifier(
            n_estimators=100,
            class_weight='balanced',
            max_depth=10,
            n_jobs=N_JOBS,
            random_state=RANDOM_STATE
        )
    }

    results = {}

    for name, model in models.items():
        logging.info(f"Training {name}...")
        pipeline = get_pipeline(model)
        pipeline.fit(X_train, y_train)

        y_pred = pipeline.predict(X_test)
        # Column 1 of predict_proba is the score for the positive class.
        y_prob = pipeline.predict_proba(X_test)[:, 1]

        # --- Metrics Calculation ---
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred)
        rec = recall_score(y_test, y_pred)
        cm = confusion_matrix(y_test, y_pred)
        pr_auc = average_precision_score(y_test, y_prob)

        results[name] = {
            'y_test': y_test,
            'y_pred': y_pred,
            'y_prob': y_prob,
            'auc': roc_auc_score(y_test, y_prob),
            'pr_auc': pr_auc,
            'accuracy': acc,
            'f1': f1,
            'precision': prec,
            'recall': rec,
            'confusion_matrix': cm,
            'pipeline': pipeline
        }

        logging.info(f"{name} Results:")
        logging.info(f"  Accuracy:  {acc:.4f}")
        logging.info(f"  Precision: {prec:.4f}")
        logging.info(f"  Recall:    {rec:.4f}")
        logging.info(f"  F1 Score:  {f1:.4f}")
        logging.info(f"  PR-AUC:    {pr_auc:.4f}")
        logging.info(f"  Confusion Matrix:\n{cm}")

    return results, X_test

In [40]:

def create_visualizations(results, df_sample):
    """
    Render the interactive Plotly report for a set of trained models.

    Figures shown, in order:
      1. Correlation heatmap of the numeric columns of ``df_sample``.
      2. One confusion-matrix heatmap per model in ``results``.
      3. Grouped bar chart comparing accuracy, precision, recall and F1.
      4. Top-15 feature importances of the model with the highest PR-AUC
         (skipped when its classifier has no ``feature_importances_``).

    Args:
        results: Mapping of model name to a dict providing at least
            ``confusion_matrix``, ``accuracy``, ``precision``, ``recall``,
            ``f1``, ``pr_auc`` and ``pipeline``.
        df_sample: Frame whose numeric columns feed the correlation heatmap.
    """
    # 1. Feature correlation heatmap
    correlation = df_sample.corr(numeric_only=True)
    px.imshow(correlation, text_auto=True, title="Feature Correlation Matrix").show()

    # 2. Confusion matrix heatmaps, one figure per model
    for model_name, outcome in results.items():
        heatmap = px.imshow(
            outcome['confusion_matrix'],
            text_auto=True,
            labels=dict(x="Predicted", y="Actual", color="Count"),
            x=['Not Fraud', 'Fraud'],
            y=['Not Fraud', 'Fraud'],
            title=f"Confusion Matrix: {model_name}",
            color_continuous_scale='Blues'
        )
        heatmap.update_layout(width=600, height=500)
        heatmap.show()

    # 3. Metrics comparison: long-format table, one row per (model, metric)
    metric_keys = (
        ('Accuracy', 'accuracy'),
        ('Precision', 'precision'),
        ('Recall', 'recall'),
        ('F1 Score', 'f1'),
    )
    df_metrics = pd.DataFrame([
        {'Model': model_name, 'Metric': label, 'Score': outcome[key]}
        for model_name, outcome in results.items()
        for label, key in metric_keys
    ])

    fig_metrics = px.bar(
        df_metrics, x="Metric", y="Score", color="Model", barmode="group",
        text_auto='.3f', title="Model Evaluation Metrics Comparison",
        color_discrete_sequence=px.colors.qualitative.Pastel
    )
    fig_metrics.update_layout(yaxis_title="Score (0-1)")
    fig_metrics.show()

    # 4. Feature importance of the best model, ranked by PR-AUC
    best_model_name = max(results, key=lambda k: results[k]['pr_auc'])
    steps = results[best_model_name]['pipeline'].named_steps
    feature_names = steps['preprocessor'].get_feature_names_out()
    classifier = steps['classifier']

    if not hasattr(classifier, 'feature_importances_'):
        return

    ranked = (
        pd.DataFrame({
            'Feature': feature_names,
            'Importance': classifier.feature_importances_,
        })
        .sort_values('Importance', ascending=False)
        .head(15)
    )

    fig_imp = px.bar(
        ranked, x='Importance', y='Feature', orientation='h',
        title=f"Top 15 Feature Importance ({best_model_name})",
        color='Importance', color_continuous_scale='Viridis'
    )
    fig_imp.update_layout(yaxis={'categoryorder':'total ascending'}, height=600)
    fig_imp.show()

In [41]:

if __name__ == "__main__":
    # --- EXECUTION ---
    # Replace with your actual path
    filepath = '/content/Fraud.csv'

    try:
        df = load_and_clean_data(filepath)
        X, y = feature_engineering(df)

        results, X_test = train_and_evaluate(X, y)

        logging.info("Generating Visualizations...")
        # Seed the sample so the correlation heatmap is the same on every run.
        df_sample = df.sample(min(10000, len(df)), random_state=RANDOM_STATE)
        create_visualizations(results, df_sample)

        logging.info("Pipeline Completed Successfully.")
    except FileNotFoundError as e:
        # Only a missing file warrants the "check the path" hint.
        logging.error(f"Execution failed: {e}")
        print(f"Note: Ensure the dataset exists at '{filepath}'.")
    except Exception as e:
        # Still best-effort (no re-raise), but keep the traceback so failures
        # in training or plotting can be diagnosed.
        logging.exception(f"Execution failed: {e}")