In [1]:
import warnings
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve, average_precision_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.svm import OneClassSVM
# Silence library warnings so the cell outputs stay readable.
warnings.filterwarnings('ignore')

# Read the processed feature table, its labels, and the unlabeled test table.
DATA_DIR = "../data/processed"
trainX = pd.read_csv(f"{DATA_DIR}/IEEE_Train.csv")
y = pd.read_csv(f"{DATA_DIR}/IEEE_Target.csv")
testX = pd.read_csv(f"{DATA_DIR}/IEEE_Test.csv")
# Left join: every feature row is kept, matched to its label by TransactionID.
train = trainX.merge(y, on="TransactionID", how="left")

banner = "=" * 60
print(banner)
print("ONE-CLASS SVM FOR FRAUD DETECTION")
print(banner)
print("Shapes -> train:", train.shape, "test:", testX.shape)

# Model inputs are every column except the two identifiers and the label.
X = train.drop(columns=['TransactionID', 'uid', 'isFraud'])
y_target = train['isFraud']

# Report how imbalanced the labels are.
class_counts = y_target.value_counts()
print("\nOriginal Class Distribution:")
print(class_counts)
fraud_rate = y_target.mean()
print(f"Fraud Rate: {fraud_rate:.4f} ({fraud_rate*100:.2f}%)")
print(f"Imbalance Ratio: {class_counts[0] / class_counts[1]:.2f}:1")

# Positional 80/20 split: the first 80% of rows train, the remainder validates.
# NOTE(review): this is only a chronological split if the rows are already in
# time order in the CSV — confirm against the preprocessing step.
split_point = int(0.8 * len(X))
X_train, X_val = X.iloc[:split_point], X.iloc[split_point:]
y_train, y_val = y_target.iloc[:split_point], y_target.iloc[split_point:]

print(f"\nTraining set class distribution:")
print(f"Class 0 (Normal): {(y_train == 0).sum()}")
print(f"Class 1 (Fraud): {(y_train == 1).sum()}")

ONE-CLASS SVM FOR FRAUD DETECTION
Shapes -> train: (590540, 293) test: (506691, 292)

Original Class Distribution:
isFraud
0    569877
1     20663
Name: count, dtype: int64
Fraud Rate: 0.0350 (3.50%)
Imbalance Ratio: 27.58:1

Training set class distribution:
Class 0 (Normal): 455833
Class 1 (Fraud): 16599


In [2]:
# =============================================================================
# DATA PREPROCESSING FOR ONE-CLASS SVM
# =============================================================================

print("\n" + "="*50)
print("DATA PREPROCESSING")
print("="*50)

# Two candidate scalers are compared later; each is fit on the training split
# only and then applied unchanged to the validation split.
scalers = dict(StandardScaler=StandardScaler(), RobustScaler=RobustScaler())

# scaled_data[name] holds the scaled train/val arrays plus the fitted scaler.
scaled_data = {}
for scaler_name, scaler in scalers.items():
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    scaled_data[scaler_name] = dict(
        X_train=X_train_scaled,
        X_val=X_val_scaled,
        scaler=scaler,
    )
    print(f"{scaler_name} - Train shape: {X_train_scaled.shape}, Val shape: {X_val_scaled.shape}")


DATA PREPROCESSING
StandardScaler - Train shape: (472432, 290), Val shape: (118108, 290)
RobustScaler - Train shape: (472432, 290), Val shape: (118108, 290)


In [3]:
# =============================================================================
# ONE-CLASS SVM TRAINING WITH DIFFERENT STRATEGIES
# =============================================================================

print("\n" + "="*50)
print("ONE-CLASS SVM TRAINING STRATEGIES")
print("="*50)

# Strategy 1: fit the detector on rows labeled normal only, then treat every
# validation row predicted as an outlier (-1) as fraud.
print("\n--- Strategy 1: Train on Normal Transactions Only ---")
normal_indices = y_train == 0
X_train_normal = X_train[normal_indices]

# nu values to sweep. The list does not depend on the scaler, so it is built once.
nu_values = [0.01, 0.05, 0.1, 0.15, 0.2]

# Keyed by "<scaler>_nu_<nu>"; each entry keeps the model, metrics and raw scores.
strategy1_results = {}
for scaler_name, data in scaled_data.items():
    # Fit an unfitted copy of the scaler on the normal-only rows. Re-fitting
    # data['scaler'] itself would silently change the object stored in
    # scaled_data, which would then no longer match data['X_train'] / data['X_val'].
    normal_scaler = clone(data['scaler'])
    X_train_normal_scaled = normal_scaler.fit_transform(X_train_normal)
    X_val_scaled = normal_scaler.transform(X_val)

    for nu in nu_values:
        print(f"\n{scaler_name} with nu={nu}")

        # Train One-Class SVM
        oc_svm = OneClassSVM(
            kernel='rbf',
            gamma='scale',
            nu=nu
        )

        oc_svm.fit(X_train_normal_scaled)

        # Predict on validation set (-1 for anomaly/fraud, 1 for normal)
        val_predictions = oc_svm.predict(X_val_scaled)
        val_scores = oc_svm.decision_function(X_val_scaled)

        # Convert to binary (1 for fraud, 0 for normal)
        val_pred_binary = (val_predictions == -1).astype(int)

        # zero_division=0 yields 0 when nothing is flagged, while still
        # reporting the real precision/recall when every row is flagged.
        precision = precision_score(y_val, val_pred_binary, zero_division=0)
        recall = recall_score(y_val, val_pred_binary, zero_division=0)
        f1 = f1_score(y_val, val_pred_binary, zero_division=0)

        # Fraction of validation rows the model flags as fraud.
        fraud_detection_rate = np.mean(val_pred_binary)

        strategy1_results[f"{scaler_name}_nu_{nu}"] = {
            'scaler': scaler_name,
            'nu': nu,
            'model': oc_svm,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'fraud_rate': fraud_detection_rate,
            'predictions': val_pred_binary,
            'scores': val_scores
        }

        print(f"  Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")
        print(f"  Detected fraud rate: {fraud_detection_rate:.4f}")

# Strategy 2: fit on the full training split (normal and fraud rows together)
# and rely on nu to leave a fraction of the training rows outside the boundary.
print("\n--- Strategy 2: Train on All Data with Contamination ---")

# Keyed by "<scaler>_cont_<value>"; entries use a 'contamination' key instead of
# 'nu', which is how later cells tell the two strategies apart.
strategy2_results = {}
contamination_values = [fraud_rate, fraud_rate * 1.5, fraud_rate * 2.0, 0.05, 0.1]

for scaler_name, data in scaled_data.items():
    for contamination in contamination_values:
        print(f"\n{scaler_name} with contamination={contamination:.4f}")

        # nu is an upper bound on the fraction of training errors (and a lower
        # bound on the fraction of support vectors), so it is used here as a
        # stand-in for the assumed contamination level rather than an exact rate.
        oc_svm = OneClassSVM(
            kernel='rbf',
            gamma='scale',
            nu=contamination
        )

        oc_svm.fit(data['X_train'])

        # Predict on validation set (-1 means outlier, mapped to fraud = 1)
        val_predictions = oc_svm.predict(data['X_val'])
        val_scores = oc_svm.decision_function(data['X_val'])
        val_pred_binary = (val_predictions == -1).astype(int)

        # zero_division=0 yields 0 when nothing is flagged, while still
        # reporting the real precision/recall when every row is flagged.
        precision = precision_score(y_val, val_pred_binary, zero_division=0)
        recall = recall_score(y_val, val_pred_binary, zero_division=0)
        f1 = f1_score(y_val, val_pred_binary, zero_division=0)

        # Fraction of validation rows the model flags as fraud.
        fraud_detection_rate = np.mean(val_pred_binary)

        strategy2_results[f"{scaler_name}_cont_{contamination:.4f}"] = {
            'scaler': scaler_name,
            'contamination': contamination,
            'model': oc_svm,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'fraud_rate': fraud_detection_rate,
            'predictions': val_pred_binary,
            'scores': val_scores
        }

        print(f"  Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")
        print(f"  Detected fraud rate: {fraud_detection_rate:.4f}")


ONE-CLASS SVM TRAINING STRATEGIES

--- Strategy 1: Train on Normal Transactions Only ---

StandardScaler with nu=0.01
  Precision: 0.1329, Recall: 0.1206, F1: 0.1264
  Detected fraud rate: 0.0312

StandardScaler with nu=0.05


In [None]:
# =============================================================================
# HYPERPARAMETER OPTIMIZATION
# =============================================================================

print("\n" + "="*50)
print("HYPERPARAMETER OPTIMIZATION")
print("="*50)

# Pool the results of both strategies and pick the entry with the highest
# validation F1 (first one wins on ties, in insertion order).
all_results = {}
all_results.update(strategy1_results)
all_results.update(strategy2_results)
best_config_key, best_config = max(all_results.items(), key=lambda item: item[1]['f1'])

print(f"Best configuration: {best_config_key}")
print(f"Best F1 Score: {best_config['f1']:.4f}")

# Further tuning reuses the scaler that produced the best configuration.
best_scaler_name = best_config['scaler']
best_scaler_data = scaled_data[best_scaler_name]

print(f"\nAdvanced tuning with {best_scaler_name}...")

# Full candidate grid, kept for reference. The loop further down in this cell
# iterates over its own smaller lists rather than this dict.
param_grid = dict(
    kernel=['rbf', 'poly', 'sigmoid'],
    gamma=['scale', 'auto', 0.001, 0.01, 0.1, 1],
    nu=[0.01, 0.05, 0.1, 0.15, 0.2],
)

# Scorer with the (estimator, X, y_true) call shape, for outlier detectors.
def custom_f1_scorer(estimator, X, y_true):
    """Return the F1 of an outlier detector's predictions against fraud labels.

    ``estimator.predict(X)`` is expected to return -1 for outliers; those rows
    are mapped to the positive class (1) and everything else to 0. If the
    mapped predictions contain a single class only, 0 is returned without
    computing F1.
    """
    flagged = np.where(estimator.predict(X) == -1, 1, 0)
    if np.unique(flagged).size <= 1:
        return 0
    return f1_score(y_true, flagged)

# Manual grid search: fit each candidate on the tuning matrix and score it on
# the validation split. No subsampling is applied here, so every candidate is
# fit on the full tuning matrix.
print("Performing grid search optimization...")

# Strategy 1 results carry a 'nu' key, Strategy 2 results a 'contamination' key.
if 'nu' in best_config:
    # Strategy 1 approach: scale with statistics from the normal rows only.
    # The validation split must go through the SAME fitted scaler; reusing
    # best_scaler_data['X_val'] (scaled by a scaler fit on all training rows)
    # would put train and validation on different scales.
    tuning_scaler = clone(best_scaler_data['scaler'])
    X_train_for_tuning = tuning_scaler.fit_transform(X_train[y_train == 0])
    X_val_for_tuning = tuning_scaler.transform(X_val)
else:
    # Strategy 2 approach: both arrays come from the same all-rows scaler fit.
    X_train_for_tuning = best_scaler_data['X_train']
    X_val_for_tuning = best_scaler_data['X_val']

# Fallback parameters used if no candidate produces a usable score.
best_params = {'kernel': 'rbf', 'gamma': 'scale', 'nu': 0.1}
best_score = -1

# Reduced set of combinations actually evaluated.
kernels = ['rbf', 'poly']
gammas = ['scale', 0.01, 0.1]
nus = [0.05, 0.1, 0.15]

for kernel in kernels:
    for gamma in gammas:
        for nu in nus:
            try:
                oc_svm_test = OneClassSVM(kernel=kernel, gamma=gamma, nu=nu)
                oc_svm_test.fit(X_train_for_tuning)

                val_pred = oc_svm_test.predict(X_val_for_tuning)
                val_pred_binary = (val_pred == -1).astype(int)

                # Candidates that predict a single class are not scored.
                if len(np.unique(val_pred_binary)) > 1:
                    f1 = f1_score(y_val, val_pred_binary)
                    if f1 > best_score:
                        best_score = f1
                        best_params = {'kernel': kernel, 'gamma': gamma, 'nu': nu}

            except Exception as e:
                # Keep the sweep going, but report which candidate failed and why
                # instead of dropping the error silently.
                print(f"  Skipping kernel={kernel}, gamma={gamma}, nu={nu}: {e}")
                continue

print(f"Best parameters found: {best_params}")
print(f"Best F1 score: {best_score:.4f}")


HYPERPARAMETER OPTIMIZATION


NameError: name 'strategy2_results' is not defined

In [None]:
# =============================================================================
# FINAL OPTIMIZED MODEL
# =============================================================================

print("\n" + "="*50)
print("FINAL OPTIMIZED ONE-CLASS SVM MODEL")
print("="*50)

# Fit an unfitted copy of the chosen scaler on the normal-only training rows.
# Cloning avoids re-fitting the shared object that scaled_data also references.
final_scaler = clone(scalers[best_scaler_name])
X_train_final_scaled = final_scaler.fit_transform(X_train[y_train == 0])  # Train on normal only
X_val_final_scaled = final_scaler.transform(X_val)

# Final detector with the parameters chosen by the tuning cell.
final_oc_svm = OneClassSVM(
    kernel=best_params['kernel'],
    gamma=best_params['gamma'],
    nu=best_params['nu']
)

final_oc_svm.fit(X_train_final_scaled)

# Hard predictions (-1 outlier / 1 inlier) and the continuous decision scores.
final_val_predictions = final_oc_svm.predict(X_val_final_scaled)
final_val_scores = final_oc_svm.decision_function(X_val_final_scaled)
final_val_pred_binary = (final_val_predictions == -1).astype(int)

# Threshold-based metrics at the model's default boundary (score < 0 => fraud).
final_precision = precision_score(y_val, final_val_pred_binary)
final_recall = recall_score(y_val, final_val_pred_binary)
final_f1 = f1_score(y_val, final_val_pred_binary)

# decision_function is positive for inliers and negative for outliers, so the
# score is negated before the sigmoid: higher values must mean "more likely
# fraud" for ROC-AUC / average precision against isFraud = 1. This is a
# monotone squashing of the score, not a calibrated probability.
final_val_proba = 1 / (1 + np.exp(final_val_scores))  # sigmoid(-score)
final_auc = roc_auc_score(y_val, final_val_proba)
final_ap = average_precision_score(y_val, final_val_proba)

print(f"Final Model Performance:")
print(f"Precision: {final_precision:.4f}")
print(f"Recall: {final_recall:.4f}")
print(f"F1-Score: {final_f1:.4f}")
print(f"ROC-AUC: {final_auc:.4f}")
print(f"PR-AUC: {final_ap:.4f}")

# Confusion Matrix (rows: actual, columns: predicted)
cm = confusion_matrix(y_val, final_val_pred_binary)
print(f"\nConfusion Matrix:")
print(cm)

print(f"\nClassification Report:")
print(classification_report(y_val, final_val_pred_binary))

In [None]:
# =============================================================================
# THRESHOLD OPTIMIZATION
# =============================================================================

print("\n" + "="*50)
print("THRESHOLD OPTIMIZATION")
print("="*50)

# Sweep cut-offs placed at low percentiles of the validation decision scores;
# rows scoring below the cut-off are flagged as fraud.
percentiles = [1, 2, 3, 4, 5, 10, 15, 20, 25]
threshold_results = []

for percentile in percentiles:
    threshold = np.percentile(final_val_scores, percentile)
    pred_threshold = np.where(final_val_scores < threshold, 1, 0)

    # Metrics are only computed when both classes appear in the predictions.
    if np.unique(pred_threshold).size <= 1:
        precision = recall = f1 = 0
    else:
        precision = precision_score(y_val, pred_threshold)
        recall = recall_score(y_val, pred_threshold)
        f1 = f1_score(y_val, pred_threshold)

    detected_fraud_rate = pred_threshold.mean()

    row = {
        'Percentile': percentile,
        'Threshold': threshold,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'Detected_Fraud_Rate': detected_fraud_rate,
    }
    threshold_results.append(row)

threshold_df = pd.DataFrame(threshold_results)
print("Threshold Optimization Results:")
print(threshold_df.round(4))

# The row with the highest F1 defines the "optimal" cut-off used later on.
optimal_idx = threshold_df['F1-Score'].idxmax()
optimal_row = threshold_df.loc[optimal_idx]
optimal_threshold = threshold_df.loc[optimal_idx, 'Threshold']
optimal_percentile = threshold_df.loc[optimal_idx, 'Percentile']

print(f"\nOptimal threshold: {optimal_threshold:.4f} (at {optimal_percentile}th percentile)")
print(f"Optimal F1-Score: {optimal_row['F1-Score']:.4f}")

In [None]:
# =============================================================================
# KERNEL COMPARISON
# =============================================================================

print("\n" + "="*50)
print("KERNEL COMPARISON")
print("="*50)

# Each kernel is fit on the same normal-only scaled training matrix, with the
# tuned nu, and scored on the validation split.
kernels_to_test = ['rbf', 'linear', 'poly', 'sigmoid']
kernel_results = {}

for kernel in kernels_to_test:
    print(f"\nTesting {kernel} kernel...")

    try:
        oc_svm_kernel = OneClassSVM(
            kernel=kernel,
            gamma='scale' if kernel != 'linear' else 'auto',
            nu=best_params['nu']
        )

        oc_svm_kernel.fit(X_train_final_scaled)

        # Predictions (-1 outlier / 1 inlier) and continuous decision scores
        kernel_pred = oc_svm_kernel.predict(X_val_final_scaled)
        kernel_scores = oc_svm_kernel.decision_function(X_val_final_scaled)
        kernel_pred_binary = (kernel_pred == -1).astype(int)

        # zero_division=0 yields 0 when nothing is flagged, while still
        # reporting the real precision/recall when every row is flagged.
        precision = precision_score(y_val, kernel_pred_binary, zero_division=0)
        recall = recall_score(y_val, kernel_pred_binary, zero_division=0)
        f1 = f1_score(y_val, kernel_pred_binary, zero_division=0)

        # decision_function is positive for inliers, so negate it before the
        # sigmoid: the ranking score must increase with fraud likelihood for
        # ROC-AUC against isFraud = 1. AUC depends only on the scores, so it is
        # computed regardless of how the hard predictions turned out.
        kernel_proba = 1 / (1 + np.exp(kernel_scores))  # sigmoid(-score)
        auc = roc_auc_score(y_val, kernel_proba)

        kernel_results[kernel] = {
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'auc': auc,
            'model': oc_svm_kernel,
            'scores': kernel_scores,
            'predictions': kernel_pred_binary
        }

        print(f"  Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, AUC: {auc:.4f}")

    except Exception as e:
        # A failed kernel is recorded with f1 = 0 so it can never be selected
        # ahead of a kernel that produced a positive score.
        print(f"  Error with {kernel} kernel: {str(e)}")
        kernel_results[kernel] = {'f1': 0}

# Find best kernel by validation F1
best_kernel = max(kernel_results.keys(), key=lambda k: kernel_results[k].get('f1', 0))
print(f"\nBest kernel: {best_kernel} (F1: {kernel_results[best_kernel]['f1']:.4f})")

In [None]:
# =============================================================================
# VISUALIZATIONS
# =============================================================================

print("\n" + "="*50)
print("CREATING VISUALIZATIONS")
print("="*50)

# One 2x3 figure; each panel below fills one axes object.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# 1. Confusion Matrix for the kernel that won the kernel comparison
best_model_results = kernel_results[best_kernel]
cm = confusion_matrix(y_val, best_model_results['predictions'])
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[0,0])
axes[0,0].set_title(f'Confusion Matrix\n({best_kernel} kernel)')
axes[0,0].set_xlabel('Predicted')
axes[0,0].set_ylabel('Actual')

# 2. Threshold optimization plot (metrics per percentile cut-off)
axes[0,1].plot(threshold_df['Percentile'], threshold_df['Precision'], 'o-', label='Precision')
axes[0,1].plot(threshold_df['Percentile'], threshold_df['Recall'], 's-', label='Recall')
axes[0,1].plot(threshold_df['Percentile'], threshold_df['F1-Score'], '^-', label='F1-Score')
axes[0,1].axvline(x=optimal_percentile, color='red', linestyle='--', alpha=0.7)
axes[0,1].set_xlabel('Percentile Threshold')
axes[0,1].set_ylabel('Score')
axes[0,1].set_title('Threshold Optimization')
axes[0,1].legend()
axes[0,1].grid(True, alpha=0.3)

# 3. Kernel comparison (kernels that failed carry only 'f1' and plot as 0)
kernel_comparison_df = pd.DataFrame({
    'Kernel': list(kernel_results.keys()),
    'Precision': [kernel_results[k].get('precision', 0) for k in kernel_results.keys()],
    'Recall': [kernel_results[k].get('recall', 0) for k in kernel_results.keys()],
    'F1-Score': [kernel_results[k].get('f1', 0) for k in kernel_results.keys()]
})

x_pos = np.arange(len(kernel_comparison_df))
width = 0.25  # bar width, also reused by panel 6

axes[0,2].bar(x_pos - width, kernel_comparison_df['Precision'], width, label='Precision', alpha=0.8)
axes[0,2].bar(x_pos, kernel_comparison_df['Recall'], width, label='Recall', alpha=0.8)
axes[0,2].bar(x_pos + width, kernel_comparison_df['F1-Score'], width, label='F1-Score', alpha=0.8)

axes[0,2].set_xlabel('Kernel')
axes[0,2].set_ylabel('Score')
axes[0,2].set_title('Kernel Performance Comparison')
axes[0,2].set_xticks(x_pos)
axes[0,2].set_xticklabels(kernel_comparison_df['Kernel'])
axes[0,2].legend()
axes[0,2].grid(True, alpha=0.3)

# 4. Decision Function Score Distribution, split by true label
fraud_scores = best_model_results['scores'][y_val == 1]
normal_scores = best_model_results['scores'][y_val == 0]

axes[1,0].hist(normal_scores, bins=50, alpha=0.7, label='Normal', density=True, color='blue')
axes[1,0].hist(fraud_scores, bins=50, alpha=0.7, label='Fraud', density=True, color='red')
axes[1,0].axvline(x=0, color='black', linestyle='-', alpha=0.8, label='Default Threshold')
# NOTE: optimal_threshold was derived from the final model's validation scores.
axes[1,0].axvline(x=optimal_threshold, color='green', linestyle='--', alpha=0.8, label='Optimal Threshold')
axes[1,0].set_xlabel('Decision Function Score')
axes[1,0].set_ylabel('Density')
axes[1,0].set_title('Score Distribution by Class')
axes[1,0].legend()

# 5. Precision-Recall Curve
# decision_function is positive for inliers, so it is negated before the
# sigmoid: the curve needs a score that increases with fraud likelihood.
best_proba = 1 / (1 + np.exp(best_model_results['scores']))  # sigmoid(-score)
precision_curve, recall_curve, _ = precision_recall_curve(y_val, best_proba)
ap_score = average_precision_score(y_val, best_proba)

axes[1,1].plot(recall_curve, precision_curve, marker='.')
axes[1,1].set_xlabel('Recall')
axes[1,1].set_ylabel('Precision')
axes[1,1].set_title(f'Precision-Recall Curve\n(AP Score: {ap_score:.4f})')
axes[1,1].grid(True, alpha=0.3)

# 6. Feature scaling impact: best-F1 result per scaler across both strategies
scaler_comparison = []
for scaler_name in ['StandardScaler', 'RobustScaler']:
    # Get best result for each scaler
    scaler_results = [result for key, result in all_results.items() if result['scaler'] == scaler_name]
    if scaler_results:
        best_scaler_result = max(scaler_results, key=lambda x: x['f1'])
        scaler_comparison.append({
            'Scaler': scaler_name,
            'Best_F1': best_scaler_result['f1'],
            'Best_Precision': best_scaler_result['precision'],
            'Best_Recall': best_scaler_result['recall']
        })

if scaler_comparison:
    scaler_comp_df = pd.DataFrame(scaler_comparison)
    x_pos = np.arange(len(scaler_comp_df))

    axes[1,2].bar(x_pos - width/2, scaler_comp_df['Best_Precision'], width, label='Precision', alpha=0.8)
    axes[1,2].bar(x_pos + width/2, scaler_comp_df['Best_Recall'], width, label='Recall', alpha=0.8)

    axes[1,2].set_xlabel('Scaler Type')
    axes[1,2].set_ylabel('Best Score')
    axes[1,2].set_title('Scaler Impact on Performance')
    axes[1,2].set_xticks(x_pos)
    axes[1,2].set_xticklabels(scaler_comp_df['Scaler'], rotation=45)
    axes[1,2].legend()
    axes[1,2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# =============================================================================
# MODEL COMPARISON WITH OTHER METHODS
# =============================================================================

print("\n" + "="*50)
print("COMPARISON WITH OTHER METHODS")
print("="*50)

# Supervised baselines trained on the same split, for reference.
print("Training comparison models...")

# Fixed seed so the random forest (and therefore the comparison table) is
# reproducible across Restart & Run All.
RANDOM_STATE = 42

# Logistic Regression with class balancing, on the scaled features
lr = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=RANDOM_STATE)
lr.fit(best_scaler_data['X_train'], y_train)
lr_pred = lr.predict(best_scaler_data['X_val'])
lr_proba = lr.predict_proba(best_scaler_data['X_val'])[:, 1]

# Random Forest with class balancing
rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=RANDOM_STATE)
rf.fit(X_train, y_train)  # Use original unscaled data for RF
rf_pred = rf.predict(X_val)
rf_proba = rf.predict_proba(X_val)[:, 1]

# One row per model; all metrics are computed on the validation split.
comparison_results = []

# One-Class SVM (metrics computed in the final-model cell)
comparison_results.append({
    'Model': 'One-Class SVM',
    'Precision': final_precision,
    'Recall': final_recall,
    'F1-Score': final_f1,
    'ROC-AUC': final_auc,
    'PR-AUC': final_ap
})

# Logistic Regression
lr_precision = precision_score(y_val, lr_pred)
lr_recall = recall_score(y_val, lr_pred)
lr_f1 = f1_score(y_val, lr_pred)
lr_auc = roc_auc_score(y_val, lr_proba)
lr_ap = average_precision_score(y_val, lr_proba)

comparison_results.append({
    'Model': 'Logistic Regression',
    'Precision': lr_precision,
    'Recall': lr_recall,
    'F1-Score': lr_f1,
    'ROC-AUC': lr_auc,
    'PR-AUC': lr_ap
})

# Random Forest
rf_precision = precision_score(y_val, rf_pred)
rf_recall = recall_score(y_val, rf_pred)
rf_f1 = f1_score(y_val, rf_pred)
rf_auc = roc_auc_score(y_val, rf_proba)
rf_ap = average_precision_score(y_val, rf_proba)

comparison_results.append({
    'Model': 'Random Forest',
    'Precision': rf_precision,
    'Recall': rf_recall,
    'F1-Score': rf_f1,
    'ROC-AUC': rf_auc,
    'PR-AUC': rf_ap
})

# Display comparison
comparison_df = pd.DataFrame(comparison_results)
print("\nModel Performance Comparison:")
print(comparison_df.round(4))

# Highlight best performing model for each metric.
# Loop-local names are used so the tuning cell's `best_score` is not overwritten.
print("\nBest performing model for each metric:")
for metric in ['Precision', 'Recall', 'F1-Score', 'ROC-AUC', 'PR-AUC']:
    metric_best_model = comparison_df.loc[comparison_df[metric].idxmax(), 'Model']
    metric_best_score = comparison_df[metric].max()
    print(f"{metric}: {metric_best_model} ({metric_best_score:.4f})")

# =============================================================================
# PREDICTIONS ON TEST SET
# =============================================================================

print("\n" + "="*50)
print("PREDICTIONS ON TEST SET")
print("="*50)

# Drop the identifier columns and scale with the final model's scaler.
X_test = testX.drop(columns=['TransactionID', 'uid'])
X_test_scaled = final_scaler.transform(X_test)

# Continuous scores and hard labels (-1 outlier / 1 inlier) from the final model.
test_scores = final_oc_svm.decision_function(X_test_scaled)
test_predictions = final_oc_svm.predict(X_test_scaled)

# Two labelings: the model's own boundary, and the tuned score cut-off.
test_pred_default = np.where(test_predictions == -1, 1, 0)
test_pred_optimized = np.where(test_scores < optimal_threshold, 1, 0)

print(f"Test set shape: {X_test.shape}")
print(f"Default threshold fraud detection rate: {test_pred_default.mean():.4f}")
print(f"Optimized threshold fraud detection rate: {test_pred_optimized.mean():.4f}")

# Submission frames: TransactionID plus the 0/1 fraud flag. They are only
# built in memory here; nothing is written to disk.
submission_default = testX[['TransactionID']].assign(isFraud=test_pred_default)
submission_optimized = testX[['TransactionID']].assign(isFraud=test_pred_optimized)

print(f"\nSample predictions (first 10 rows - optimized threshold):")
print(submission_optimized.head(10))

# =============================================================================
# DETAILED ANALYSIS AND INSIGHTS
# =============================================================================

print("\n" + "="*50)
print("DETAILED ANALYSIS AND INSIGHTS")
print("="*50)

# Analyze prediction confidence.
# decision_function is positive for inliers, so it is negated before the
# sigmoid: the bins then describe how strongly a row looks like fraud
# ('Extreme' = most anomalous). This is not a calibrated probability.
test_proba = 1 / (1 + np.exp(test_scores))  # sigmoid(-score)
# include_lowest keeps a value of exactly 0 (sigmoid underflow) in 'Very Low'
# instead of turning it into NaN.
confidence_bins = pd.cut(test_proba, bins=[0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
                        labels=['Very Low', 'Low', 'Medium', 'High', 'Very High', 'Extreme'],
                        include_lowest=True)

print("Prediction Confidence Distribution:")
print(confidence_bins.value_counts().sort_index())

# Analyze score statistics
print(f"\nDecision Function Score Statistics:")
print(f"Mean: {test_scores.mean():.4f}")
print(f"Std: {test_scores.std():.4f}")
print(f"Min: {test_scores.min():.4f}")
print(f"Max: {test_scores.max():.4f}")
print(f"25th percentile: {np.percentile(test_scores, 25):.4f}")
print(f"50th percentile: {np.percentile(test_scores, 50):.4f}")
print(f"75th percentile: {np.percentile(test_scores, 75):.4f}")

# Most anomalous transactions (potential fraud): the ten lowest scores.
most_anomalous_indices = np.argsort(test_scores)[:10]
print(f"\nTop 10 most anomalous transactions (lowest scores):")
for i, idx in enumerate(most_anomalous_indices):
    score = test_scores[idx]
    # Select the column first, then the position: this keeps the id's own dtype
    # and avoids materialising a full row just to read one value.
    transaction_id = testX['TransactionID'].iloc[idx]
    print(f"  {i+1}. TransactionID: {transaction_id}, Score: {score:.4f}")

# =============================================================================
# ADVANCED ONE-CLASS SVM TECHNIQUES
# =============================================================================

print("\n" + "="*50)
print("ADVANCED ONE-CLASS SVM TECHNIQUES")
print("="*50)

# 1. Ensemble of One-Class SVMs with different parameters
print("\n--- Ensemble One-Class SVM ---")

ensemble_models = []
ensemble_params = [
    {'kernel': 'rbf', 'gamma': 'scale', 'nu': 0.05},
    {'kernel': 'rbf', 'gamma': 'scale', 'nu': 0.1},
    {'kernel': 'rbf', 'gamma': 'scale', 'nu': 0.15},
    {'kernel': 'rbf', 'gamma': 0.01, 'nu': 0.1},
    {'kernel': 'rbf', 'gamma': 0.1, 'nu': 0.1}
]

# Per-model decision scores on the validation and test matrices.
ensemble_val_scores = []
ensemble_test_scores = []

for i, params in enumerate(ensemble_params, start=1):
    # The total is taken from the list so the message stays right if it changes.
    print(f"Training ensemble model {i}/{len(ensemble_params)}...")

    oc_svm_ensemble = OneClassSVM(**params)
    oc_svm_ensemble.fit(X_train_final_scaled)

    # Get scores
    val_scores_ens = oc_svm_ensemble.decision_function(X_val_final_scaled)
    test_scores_ens = oc_svm_ensemble.decision_function(X_test_scaled)

    ensemble_val_scores.append(val_scores_ens)
    ensemble_test_scores.append(test_scores_ens)
    ensemble_models.append(oc_svm_ensemble)

# Average ensemble scores across models (element-wise per transaction).
# NOTE(review): raw decision scores from models with different gamma/nu are
# averaged unnormalised — confirm that is intended.
ensemble_val_avg = np.mean(ensemble_val_scores, axis=0)
ensemble_test_avg = np.mean(ensemble_test_scores, axis=0)

# The cut-off must live on the ensemble's own score scale. optimal_threshold is
# a score value of the single final model, so instead the tuned *percentile* is
# re-applied to the averaged validation scores.
ensemble_threshold = np.percentile(ensemble_val_avg, optimal_percentile)
ensemble_val_pred = (ensemble_val_avg < ensemble_threshold).astype(int)
ensemble_test_pred = (ensemble_test_avg < ensemble_threshold).astype(int)

# Ensemble metrics
ensemble_precision = precision_score(y_val, ensemble_val_pred)
ensemble_recall = recall_score(y_val, ensemble_val_pred)
ensemble_f1 = f1_score(y_val, ensemble_val_pred)
# decision_function is positive for inliers, so negate before the sigmoid to
# get a score that increases with fraud likelihood for ROC-AUC.
ensemble_proba = 1 / (1 + np.exp(ensemble_val_avg))  # sigmoid(-score)
ensemble_auc = roc_auc_score(y_val, ensemble_proba)

print(f"\nEnsemble One-Class SVM Results:")
print(f"Precision: {ensemble_precision:.4f}")
print(f"Recall: {ensemble_recall:.4f}")
print(f"F1-Score: {ensemble_f1:.4f}")
print(f"ROC-AUC: {ensemble_auc:.4f}")

# 2. Feature Selection for One-Class SVM
print("\n--- Feature Selection Analysis ---")

# Compare each feature's distribution between normal and fraud training rows.
normal_transactions = X_train[y_train == 0]
fraud_transactions = X_train[y_train == 1]

feature_analysis = []
for feature in X_train.columns[:20]:  # Analyze first 20 features
    normal_col = normal_transactions[feature]
    fraud_col = fraud_transactions[feature]
    normal_mean, fraud_mean = normal_col.mean(), fraud_col.mean()
    normal_std, fraud_std = normal_col.std(), fraud_col.std()

    # Absolute mean difference divided by the root-mean-square of the two
    # standard deviations; 0 when that denominator is not positive.
    combined_std = np.sqrt((normal_std**2 + fraud_std**2) / 2)
    effect_size = abs(normal_mean - fraud_mean) / combined_std if combined_std > 0 else 0

    feature_analysis.append(dict(
        Feature=feature,
        Normal_Mean=normal_mean,
        Fraud_Mean=fraud_mean,
        Effect_Size=effect_size,
    ))

# Largest effect sizes first.
feature_analysis_df = pd.DataFrame(feature_analysis).sort_values('Effect_Size', ascending=False)
print("\nTop 10 features with largest effect sizes (Normal vs Fraud):")
print(feature_analysis_df.head(10).round(4))

# 3. Outlier Detection on Training Set
print("\n--- Outlier Analysis on Training Set ---")

# Score the same normal-only matrix the final model was fit on.
train_scores_final = final_oc_svm.decision_function(X_train_final_scaled)
train_predictions_final = final_oc_svm.predict(X_train_final_scaled)

# Share of the normal training rows that the model itself places outside its
# boundary (prediction == -1), printed next to the nu it was trained with.
train_normal_pred = np.where(train_predictions_final == -1, 1, 0)
outlier_rate_in_normal = train_normal_pred.mean()

print(f"Outlier rate in 'normal' training data: {outlier_rate_in_normal:.4f}")
print(f"Expected outlier rate (nu parameter): {best_params['nu']:.4f}")

if outlier_rate_in_normal > 0:
    print(f"Number of normal transactions flagged as outliers: {train_normal_pred.sum()}")

# =============================================================================
# BUSINESS IMPACT ANALYSIS
# =============================================================================

print("\n" + "="*50)
print("BUSINESS IMPACT ANALYSIS")
print("="*50)

# Illustrative unit economics (placeholders to adjust for the real domain).
avg_transaction_value = 100  # Average transaction value in dollars
fraud_investigation_cost = 50  # Cost to investigate each flagged transaction
fraud_loss_prevented = 200  # Average loss prevented per caught fraud

# Confusion-matrix cells for the ensemble predictions on the validation split.
is_fraud, is_normal = (y_val == 1), (y_val == 0)
is_flagged, is_passed = (ensemble_val_pred == 1), (ensemble_val_pred == 0)

true_positives = (is_fraud & is_flagged).sum()
false_positives = (is_normal & is_flagged).sum()
false_negatives = (is_fraud & is_passed).sum()
true_negatives = (is_normal & is_passed).sum()

# Every flagged row is investigated; caught fraud saves a loss, missed fraud
# costs the same amount.
total_investigations = true_positives + false_positives
investigation_cost = total_investigations * fraud_investigation_cost
fraud_prevented_value = true_positives * fraud_loss_prevented
fraud_missed_cost = false_negatives * fraud_loss_prevented

net_benefit = fraud_prevented_value - investigation_cost - fraud_missed_cost

print(f"Business Impact Analysis (Validation Set):")
print(f"True Positives (Fraud Caught): {true_positives}")
print(f"False Positives (Normal Flagged): {false_positives}")
print(f"False Negatives (Fraud Missed): {false_negatives}")
print(f"True Negatives (Normal Passed): {true_negatives}")
print(f"\nCost Analysis:")
print(f"Investigation Cost: ${investigation_cost:,.2f}")
print(f"Fraud Loss Prevented: ${fraud_prevented_value:,.2f}")
print(f"Fraud Loss from Missed Cases: ${fraud_missed_cost:,.2f}")
print(f"Net Benefit: ${net_benefit:,.2f}")

# =============================================================================
# FINAL RECOMMENDATIONS AND SUMMARY
# =============================================================================

print("\n" + "="*60)
print("FINAL RECOMMENDATIONS AND SUMMARY")
print("="*60)

# The metrics below (final_*) were produced by final_oc_svm, which was built
# from best_params. The configuration section therefore reports best_params'
# kernel and gamma; the kernel-comparison winner is shown separately because it
# may be a different kernel. The net benefit was computed from the ensemble
# predictions and is labelled as such.
print(f"""
ONE-CLASS SVM FRAUD DETECTION SUMMARY:

Best Configuration:
- Kernel: {best_params['kernel']} (kernel comparison winner: {best_kernel})
- Gamma: {best_params['gamma']}
- Scaler: {best_scaler_name}
- Nu parameter: {best_params['nu']:.4f}
- Optimal threshold: {optimal_threshold:.4f}

Performance Metrics:
- Precision: {final_precision:.4f} (of flagged transactions, {final_precision*100:.1f}% are actually fraud)
- Recall: {final_recall:.4f} (catches {final_recall*100:.1f}% of all fraud cases)
- F1-Score: {final_f1:.4f}
- ROC-AUC: {final_auc:.4f}

Strengths of One-Class SVM for Fraud Detection:
✓ Excellent for highly imbalanced datasets
✓ No need for fraud examples during training
✓ Robust to noise and outliers
✓ Works well with high-dimensional data
✓ Can detect novel fraud patterns not seen in training

Considerations:
⚠ Computationally intensive for large datasets
⚠ Sensitive to hyperparameter selection
⚠ Less interpretable than tree-based models
⚠ Performance depends on quality of 'normal' training data

Recommendations:
1. Use One-Class SVM as a complement to supervised methods
2. Regularly retrain on recent normal transaction patterns
3. Consider ensemble approaches for improved robustness
4. Monitor performance over time due to concept drift
5. Implement threshold monitoring and adjustment based on business feedback

Business Value:
- Net benefit of ${net_benefit:,.2f} on validation set (ensemble predictions)
- Efficient resource allocation for fraud investigations
- Proactive fraud prevention capabilities
""")

# Pointers to the in-memory objects produced above (nothing is saved to disk here).
print(f"\nModels and predictions ready for deployment!")
print(f"- Best One-Class SVM model: final_oc_svm")
print(f"- Ensemble predictions available: ensemble_test_pred")
print(f"- Submission files: submission_default, submission_optimized")

In [None]:
# =============================================================================
# EXPORT FUNCTIONS FOR PRODUCTION USE
# =============================================================================

def predict_fraud_ocsvm(new_transactions, model=final_oc_svm, scaler=final_scaler, threshold=optimal_threshold):
    """
    Score new transactions with a fitted One-Class SVM and flag likely fraud.

    Note: the default `model`, `scaler` and `threshold` are bound when this
    function is defined, so re-run this cell after refitting any of them.

    Parameters:
    - new_transactions: DataFrame with the same feature columns as the training
      data; 'TransactionID', 'uid' and 'isFraud' are dropped if present
    - model: Fitted model exposing `decision_function`
    - scaler: Fitted scaler exposing `transform`
    - threshold: Decision threshold; scores strictly below it are flagged as fraud

    Returns:
    - predictions: Binary predictions (1 for fraud, 0 for normal)
    - scores: Anomaly scores (lower = more anomalous)
    - probabilities: Uncalibrated fraud likelihood in [0, 1], computed as
      sigmoid(-score) so that it increases as the score becomes more anomalous
    """
    # Remove ID/label columns if present so only model features remain
    features = new_transactions.drop(['TransactionID', 'uid', 'isFraud'], axis=1, errors='ignore')

    # Scale features with the already-fitted scaler (no refitting here)
    features_scaled = scaler.transform(features)

    # A single pass through the model: the thresholded decision function
    # replaces model.predict, whose result was never used.
    scores = model.decision_function(features_scaled)

    # Apply threshold: low score == anomalous == fraud
    binary_predictions = (scores < threshold).astype(int)

    # Convert to a fraud pseudo-probability. Because lower scores mean "more
    # anomalous", the sigmoid must be taken of -scores; sigmoid(scores) would
    # give the most suspicious rows the smallest values.
    # exp(-log(1 + e^s)) == 1 / (1 + e^s), written this way to avoid overflow.
    probabilities = np.exp(-np.logaddexp(0.0, scores))

    return binary_predictions, scores, probabilities

def get_fraud_confidence(scores, thresholds_dict):
    """
    Label each decision-function score with a fraud-confidence tier.

    The tiers are checked in the order high, medium, low and the first one
    whose cut-off lies strictly above the score wins; a score that is below
    none of the cut-offs is labelled 'Normal Transaction'.
    """
    # (threshold key, label) pairs in priority order
    tiers = (
        ('high_confidence', 'High Confidence Fraud'),
        ('medium_confidence', 'Medium Confidence Fraud'),
        ('low_confidence', 'Low Confidence Fraud'),
    )
    below_cutoff = [scores < thresholds_dict[key] for key, _ in tiers]
    labels = [label for _, label in tiers]

    return np.select(below_cutoff, labels, default='Normal Transaction')

# Define confidence thresholds based on percentiles of the validation scores:
# the lowest 1% / 5% / 10% of scores map to high / medium / low confidence.
confidence_thresholds = {
    'high_confidence': np.percentile(final_val_scores, 1),
    'medium_confidence': np.percentile(final_val_scores, 5),
    'low_confidence': np.percentile(final_val_scores, 10)
}

# Apply to test set
test_confidence = get_fraud_confidence(test_scores, confidence_thresholds)
confidence_distribution = pd.Series(test_confidence).value_counts()

print("\nTest Set Fraud Confidence Distribution:")
print(confidence_distribution)

# Create detailed submission with confidence levels
submission_detailed = pd.DataFrame({
    'TransactionID': testX['TransactionID'],
    'isFraud': test_pred_optimized,
    'fraud_score': test_scores,
    # Lower scores are more anomalous (the confidence tiers above use
    # `score < cut-off`), so the fraud pseudo-probability is sigmoid(-score);
    # sigmoid(score) would give the most suspicious rows the smallest values.
    # exp(-log(1 + e^s)) == 1 / (1 + e^s), written this way to avoid overflow.
    'fraud_probability': np.exp(-np.logaddexp(0.0, test_scores)),
    'confidence_level': test_confidence
})

print("\nDetailed submission sample (first 10 high-risk transactions):")
high_risk_sample = submission_detailed[submission_detailed['isFraud'] == 1].head(10)
print(high_risk_sample[['TransactionID', 'fraud_probability', 'confidence_level']].round(4))

print(f"\n{'='*60}")
print("ONE-CLASS SVM ANALYSIS COMPLETE!")
print(f"{'='*60}")
print(f"Total fraudulent transactions detected: {np.sum(test_pred_optimized)}")
print(f"Detection rate: {np.mean(test_pred_optimized)*100:.2f}%")
print("Ready for production deployment!")

# Optional: Uncomment to save results
# submission_default.to_csv('one_class_svm_predictions_default.csv', index=False)
# submission_optimized.to_csv('one_class_svm_predictions_optimized.csv', index=False)
# submission_detailed.to_csv('one_class_svm_predictions_detailed.csv', index=False)