In [84]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
import warnings
import os

# Global notebook setup.
# NOTE(review): filterwarnings('ignore') silences *every* warning for the rest
# of the session, including deprecation and convergence warnings raised by the
# libraries used below.
warnings.filterwarnings('ignore')
# Seed NumPy's global RNG so the synthetic fallback data generated further down
# (np.random.randint / exponential / choice) is reproducible between runs.
np.random.seed(42)

print("üöÄ Starting Hybrid AI Fraud Detection System...")

# ==========================================
# 1. LOAD & ENGINEER DATA
# ==========================================
print("Step 1: Loading & Engineering Data...")

def engineer_features(df, categories=None):
    """Derive the model features from a raw transaction frame.

    The input is copied, never mutated. Rows of the result are ordered by
    ``customer_id`` then ``timestamp`` and carry a fresh ``0..n-1`` index, so
    the output is NOT in the same row order as ``df``.

    Args:
        df: Transactions with at least ``customer_id``, ``amount`` and
            ``merchant_category`` columns. ``timestamp`` is optional; when
            absent, synthetic one-minute-spaced timestamps are assigned in
            input row order.
        categories: Optional iterable of merchant-category labels defining a
            fixed label -> code mapping (code = position in the iterable).
            Labels not listed get code ``-1``. Pass the categories seen at
            training time when encoding a test frame so both frames share the
            same codes. When ``None`` (default) the codes are the positions of
            the sorted unique labels found in ``df``.

    Returns:
        A new DataFrame holding every input column plus ``timestamp``,
        ``prev_time``, ``seconds_diff``, ``mean``, ``std``, ``z_score``,
        ``hour`` and ``cat_code``.

    Raises:
        KeyError: If a required column is missing from ``df``.
    """
    required = ('customer_id', 'amount', 'merchant_category')
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"engineer_features: missing required column(s): {missing}")

    data = df.copy()

    # Ensure timestamp exists
    if 'timestamp' not in data.columns:
        # Create dummy timestamps if missing to prevent crash.
        # 'min' is the supported spelling of the deprecated 'T' alias.
        data['timestamp'] = pd.date_range('2024-01-01', periods=len(data), freq='min')
    else:
        data['timestamp'] = pd.to_datetime(data['timestamp'])

    # Chronological order per customer is required for the shift() below.
    data = data.sort_values(['customer_id', 'timestamp']).reset_index(drop=True)

    # 1. Velocity (Behavioral): seconds since the customer's previous
    #    transaction; 0 for a customer's first transaction.
    data['prev_time'] = data.groupby('customer_id')['timestamp'].shift(1)
    data['seconds_diff'] = (data['timestamp'] - data['prev_time']).dt.total_seconds().fillna(0)

    # 2. Z-Score (Statistical): amount relative to the customer's own mean/std.
    #    transform() broadcasts the per-customer statistic back onto each row,
    #    so an input that already has 'mean'/'std' columns cannot collide the
    #    way a merge would.
    per_customer_amount = data.groupby('customer_id')['amount']
    data['mean'] = per_customer_amount.transform('mean')
    # std is NaN for customers with a single transaction -> fall back to 1.
    data['std'] = per_customer_amount.transform('std').fillna(1)
    # The epsilon guards against division by zero when all amounts are equal.
    data['z_score'] = (data['amount'] - data['mean']) / (data['std'] + 1e-5)

    # 3. Time & Category
    data['hour'] = data['timestamp'].dt.hour
    data['merchant_category'] = data['merchant_category'].astype(str)

    # 4. Encoding: integer code per category. With categories=None pandas
    #    infers the sorted unique labels of this frame; otherwise the caller's
    #    fixed list is used and unknown labels become -1.
    if categories is not None:
        categories = [str(label) for label in categories]
    codes = pd.Categorical(data['merchant_category'], categories=categories).codes
    data['cat_code'] = codes.astype('int64')

    return data

# Train the hybrid (autoencoder + XGBoost) detector and write the submission.
# Everything runs inside one try block so that any failure is reported as a
# single "Error" line instead of a traceback.

# Load Training Data
try:
    if os.path.exists("fraud_dataset.csv"):
        df_raw = pd.read_csv("fraud_dataset.csv")
    else:
        print("‚ö†Ô∏è 'fraud_dataset.csv' not found. Generating SYNTHETIC training data...")
        df_raw = pd.DataFrame({
            'transaction_id': range(5000),
            'customer_id': np.random.randint(1000, 1100, 5000),
            'amount': np.random.exponential(50, 5000),
            'merchant_category': np.random.choice(['food','travel','tech'], 5000),
            'is_fraud': 0
        })
        df_raw.loc[0:100, 'is_fraud'] = 1 # Inject fraud

    df_proc = engineer_features(df_raw)
    FEATURES = ['amount', 'seconds_diff', 'z_score', 'hour', 'cat_code']
    X = df_proc[FEATURES]
    y = df_proc['is_fraud']

    # Freeze the category -> code mapping learned on the training data
    # (sorted unique labels, i.e. the same codes the training frame carries)
    # so the test frame can be encoded with it instead of a freshly fitted one.
    train_categories = sorted(df_proc['merchant_category'].dropna().unique())
    category_codes = {label: code for code, label in enumerate(train_categories)}

    # Scale Data (Crucial for Neural Networks)
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)

    # ==========================================
    # 2. TRAIN HYBRID SYSTEM
    # ==========================================
    print("Step 2: Training Hybrid Engines...")

    # A. Autoencoder (Unsupervised Anomaly Detection)
    # Train ONLY on Normal transactions to learn "Normality"
    X_normal = X_scaled[y == 0]
    autoencoder = MLPRegressor(hidden_layer_sizes=(16, 8, 16), random_state=42, max_iter=200)
    autoencoder.fit(X_normal, X_normal)
    print("   ‚úÖ Autoencoder Trained (Anomaly Engine)")

    # B. XGBoost (Supervised Pattern Recognition)
    # Use SMOTE to fix Class Imbalance
    smote = SMOTE(random_state=42)
    X_res, y_res = smote.fit_resample(X_scaled, y)

    xgb_model = xgb.XGBClassifier(n_estimators=100, max_depth=4, use_label_encoder=False, eval_metric='logloss')
    xgb_model.fit(X_res, y_res)
    print("   ‚úÖ XGBoost Trained with SMOTE (Pattern Engine)")

    # ==========================================
    # 3. GENERATE SUBMISSION (Traffic Light Logic)
    # ==========================================
    print("Step 3: Generating Self-Correcting Submission...")

    # Load Test Data (UPDATED FILENAME: test.csv)
    if os.path.exists("test.csv"):
        df_test = pd.read_csv("test.csv")
        print("   üìÇ Loaded 'test.csv'")
    else:
        print("‚ö†Ô∏è 'test.csv' not found. Generating mock test data...")
        df_test = df_raw.iloc[:2000].drop('is_fraud', axis=1).copy()

    # Process
    # engineer_features() re-sorts rows by customer and time, so tag every row
    # with its position in the test file; the tag travels through the feature
    # engineering and lets the submission be put back in file order.
    df_test_tagged = df_test.copy()
    df_test_tagged['_orig_row'] = np.arange(len(df_test_tagged))
    df_test_proc = engineer_features(df_test_tagged)

    # Re-encode categories with the TRAINING mapping; a label never seen in
    # training gets -1 rather than silently borrowing another label's code.
    df_test_proc['cat_code'] = (
        df_test_proc['merchant_category'].map(category_codes).fillna(-1).astype(int)
    )
    X_test_scaled = scaler.transform(df_test_proc[FEATURES])

    # 1. Get Raw Scores
    pred_ae = autoencoder.predict(X_test_scaled)
    mse = np.mean(np.power(X_test_scaled - pred_ae, 2), axis=1) # Anomaly Score
    prob_xgb = xgb_model.predict_proba(X_test_scaled)[:, 1]     # Pattern Score

    # Hybrid Fusion (40% Anomaly + 60% Pattern)
    # NOTE(review): mse is a reconstruction error on min-max-scaled features
    # while prob_xgb is a probability in [0, 1]; the two are on different
    # scales, so the 40/60 weights apply to raw, un-normalised values.
    raw_risk = (0.4 * mse) + (0.6 * prob_xgb)

    # 2. QUANTILE NORMALIZATION (Traffic Light Logic)
    # Force Operational Stability: Top 2% Block, Next 5% Review
    cutoff_block = np.percentile(raw_risk, 98)
    cutoff_review = np.percentile(raw_risk, 93)

    def assign_action(score):
        """Map a risk score to BLOCK / REVIEW / APPROVE using the cutoffs."""
        if score >= cutoff_block: return "BLOCK"
        elif score >= cutoff_review: return "REVIEW"
        else: return "APPROVE"

    actions = [assign_action(s) for s in raw_risk]

    # Output
    # raw_risk is in df_test_proc row order, so the ids must come from the
    # same frame; pairing it with df_test['transaction_id'] would attach
    # scores to the wrong transactions.
    submission = pd.DataFrame({
        'transaction_id': df_test_proc['transaction_id'].to_numpy(),
        'is_fraud': (raw_risk >= cutoff_review).astype(int), # 1 if Review or Block
        'risk_score': np.round(raw_risk, 4),
        'action': actions
    })

    # Put the rows back in the order of the test file.
    file_order = np.argsort(df_test_proc['_orig_row'].to_numpy(), kind='stable')
    submission = submission.iloc[file_order].reset_index(drop=True)

    submission.to_csv("submission_final.csv", index=False)
    print("üéâ SUCCESS: 'submission_final.csv' generated.")
    print("\nüìä Final Operational Distribution:")
    print(submission['action'].value_counts())

except Exception as e:
    print(f"‚ùå Error: {e}")

üöÄ Starting Hybrid AI Fraud Detection System...
Step 1: Loading & Engineering Data...
Step 2: Training Hybrid Engines...
   ‚úÖ Autoencoder Trained (Anomaly Engine)
   ‚úÖ XGBoost Trained with SMOTE (Pattern Engine)
Step 3: Generating Self-Correcting Submission...
   üìÇ Loaded 'test.csv'
üéâ SUCCESS: 'submission_final.csv' generated.

üìä Final Operational Distribution:
action
APPROVE    46
REVIEW      3
BLOCK       1
Name: count, dtype: int64
