# Autoencoder-Based Anomaly Detection for CMS Open Payments

**Project:** AAI-540 Machine Learning Operations - Final Team Project  
**Context:** Continuation of notebook 03 - Feature Engineering & Model Preparation  
**Objective:** Train an Autoencoder model to detect anomalous payment patterns, using the reconstruction error as the anomaly score.

---

## Table of Contents
1. [Setup & Data Loading](#setup)
2. [Load Data from Stored Variables](#loading)
3. [Data Preparation & Normalization](#preparation)
4. [Autoencoder Architecture Design](#architecture)
5. [Model Training with Early Stopping](#training)
6. [Performance Evaluation](#evaluation)
7. [Anomaly Score Calculation](#scoring)
8. [Visualizations & Metrics](#visualizations)
9. [Confusion Matrix & ROC Analysis](#confusion)
10. [Summary & Outputs](#summary)

---

## 1. Setup & Data Loading

Load dependencies and restore configuration from notebook 03 (Feature Engineering).

In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, roc_auc_score
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Sequential
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
import time
import boto3

# Apply matplotlib's 'seaborn-v0_8-darkgrid' style sheet to the figures drawn below.
plt.style.use('seaborn-v0_8-darkgrid')
# Make seaborn's "husl" palette the default colour palette.
sns.set_palette("husl")

In [None]:
%store -r cms_payments_feature_group_name
%store -r record_identifier_feature_name
%store -r region
%store -r bucket
%store -r s3_athena_staging
%store -r database_name
%store -r table_name_parquet
%store -r offline_s3_uri
%store -r df

if 'df' not in dir() or df is None:
    raise NameError("Missing required variable 'df'. Run notebook 03 first.")

## 2. Load Data from Stored Variables

Use the engineered dataset from notebook 03.

In [None]:
# Work on an independent (deep) copy so the restored `df` is left untouched.
df_payments = df.copy(deep=True)

# Show the first three rows as a quick sanity check of the loaded data.
preview_rows = df_payments.iloc[:3]
display(preview_rows)

## 3. Data Preparation & Normalization

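Prepare features for Autoencoder training: keep the numeric, non-identifier columns, drop features with more than 50% missing values, clip extreme values to 3×IQR beyond the quartiles, and impute the remaining gaps with the column median. The result is min-max scaled (feature range 0–1) and split 80/20 into train and test sets.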

In [None]:
# Candidate model inputs: every numeric column of the engineered dataset.
numeric_cols = df_payments.select_dtypes(include=[np.number]).columns.tolist()

# Columns excluded by exact name.
cols_to_exclude = [
    'EventTime', 'covered_recipient_profile_id', 'index',
    'teaching_hospital_id', 'covered_recipient_npi',
    'covered_recipient_first_name', 'covered_recipient_middle_name',
    'covered_recipient_last_name', 'covered_recipient_name_suffix',
    'recipient_primary_business_street_address_line2',
    'recipient_zip_code', 'recipient_province', 'recipient_postal_code',
    'submitting_applicable_manufacturer_or_applicable_gpo_name',
    'applicable_manufacturer_or_applicable_gpo_making_payment_id',
    'applicable_manufacturer_or_applicable_gpo_making_payment_name'
]

# Any column whose lower-cased name contains one of these markers is dropped too.
excluded_markers = ['_id', '_name', '_address', '_code', '_province', '_postal']

numeric_features = [
    col for col in numeric_cols
    if col not in cols_to_exclude
    and not any(marker in col.lower() for marker in excluded_markers)
]

# Cast to float and turn +/-inf into NaN so they go through the same
# missing-value handling as ordinary gaps.
X = df_payments[numeric_features].copy().astype(float)
X = X.replace([np.inf, -np.inf], np.nan)

# Keep only features with at most 50% missing values. The explicit copy makes
# the column assignments below operate on an independent frame rather than on a
# slice of the previous one.
missing_pct = (X.isnull().sum() / len(X)) * 100
cols_to_keep = missing_pct[missing_pct <= 50].index.tolist()
X = X[cols_to_keep].copy()

# Clip extreme values to [Q1 - 3*IQR, Q3 + 3*IQR], column by column.
for col in X.columns:
    q1, q3 = X[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    if not iqr > 0:
        # When the quartiles coincide (e.g. a column dominated by one value)
        # both bounds equal Q1, so clipping would overwrite every value with
        # that constant and erase the feature. Leave such columns unclipped.
        continue
    X[col] = X[col].clip(lower=q1 - 3*iqr, upper=q3 + 3*iqr)

# Impute the remaining gaps with each column's median.
# NOTE(review): quartiles and medians are computed on all rows, before the
# train/test split that happens later in the notebook.
X = X.fillna(X.median())

print(f"Data prepared: {X.shape} | Range: [{X.min().min():.2f}, {X.max().max():.2f}]")

In [None]:
# Decide the train/test membership first so the scaler is fitted on training
# rows only. Fitting it on the full dataset would leak the test set's min/max
# into the preprocessing. Splitting row positions with the same test_size and
# random_state is intended to select the same rows as splitting the frame.
row_positions = np.arange(len(X))
train_pos, test_pos = train_test_split(row_positions, test_size=0.2, random_state=42)

scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(X.iloc[train_pos])

# Transform every row with the train-fitted scaler. Test rows may therefore
# fall slightly outside [0, 1] when they exceed the range seen in training.
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns)

print(f"Scaled: {X_scaled.shape} | Range: [{X_scaled.min().min():.4f}, {X_scaled.max().max():.4f}]")

X_train, X_test = X_scaled.iloc[train_pos], X_scaled.iloc[test_pos]

print(f"Train: {len(X_train):,} | Test: {len(X_test):,}")

## 4. Autoencoder Architecture Design

Design a deep Autoencoder with a bottleneck layer for feature compression.

In [None]:
# Layer widths follow the number of input features: each stage divides the
# width further (by 2, 4, 8) but never goes below a fixed floor (32, 16, 8).
input_dim = X_scaled.shape[1]
encoding_dim_1 = max(32, input_dim // 2)
encoding_dim_2 = max(16, input_dim // 4)
bottleneck_dim = max(8, input_dim // 8)

print(f"Architecture: {input_dim} -> {encoding_dim_1} -> {encoding_dim_2} -> {bottleneck_dim} (bottleneck)")

hidden_activation = 'relu'
dropout_rate = 0.2

# Encoder: two dense layers, each followed by dropout, then the bottleneck.
encoder_stack = [
    layers.Dense(encoding_dim_1, activation=hidden_activation, input_shape=(input_dim,), name='encoder_input'),
    layers.Dropout(dropout_rate),
    layers.Dense(encoding_dim_2, activation=hidden_activation, name='encoder_middle'),
    layers.Dropout(dropout_rate),
    layers.Dense(bottleneck_dim, activation=hidden_activation, name='bottleneck'),
]

# Decoder: mirrors the encoder widths; the sigmoid output keeps every
# reconstructed feature within (0, 1).
decoder_stack = [
    layers.Dense(encoding_dim_2, activation=hidden_activation, name='decoder_middle'),
    layers.Dropout(dropout_rate),
    layers.Dense(encoding_dim_1, activation=hidden_activation, name='decoder_layer'),
    layers.Dropout(dropout_rate),
    layers.Dense(input_dim, activation='sigmoid', name='decoder_output'),
]

autoencoder = Sequential(encoder_stack + decoder_stack, name='Autoencoder')

autoencoder.summary()

## 5. Model Training with Early Stopping

Train the Autoencoder with the Adam optimizer and MSE loss, halving the learning rate when the validation loss plateaus and stopping early to prevent overfitting.

In [None]:
# Configure training: Adam at a 1e-3 learning rate, mean-squared-error as the
# reconstruction loss, and mean-absolute-error tracked as an extra metric.
optimizer = Adam(learning_rate=1e-3)
compile_settings = {
    'optimizer': optimizer,
    'loss': 'mse',
    'metrics': ['mae'],
}
autoencoder.compile(**compile_settings)

In [None]:
# Both callbacks react to the validation loss.
monitored_quantity = 'val_loss'

# Stop after 10 epochs without an improvement of at least 1e-5, and roll the
# model back to the weights of the best epoch.
early_stopping = EarlyStopping(
    monitor=monitored_quantity,
    min_delta=1e-5,
    patience=10,
    restore_best_weights=True,
    verbose=0,
)

# Halve the learning rate after 5 stagnant epochs, never going below 1e-6.
reduce_lr = ReduceLROnPlateau(
    monitor=monitored_quantity,
    factor=0.5,
    patience=5,
    min_lr=1e-6,
    verbose=0,
)

In [None]:
# Training configuration: up to 100 epochs in mini-batches of 32, holding out
# 20% of the training rows for validation.
fit_options = dict(
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping, reduce_lr],
    verbose=1,
)

start_time = time.time()

# The autoencoder learns to reproduce its input, so the training data is
# passed as both input and target.
history = autoencoder.fit(X_train, X_train, **fit_options)

training_time = time.time() - start_time
epochs_completed = len(history.history['loss'])
print(f"Training: {epochs_completed} epochs in {training_time:.2f}s")

## 6. Performance Evaluation

Evaluate model performance on training and test sets.

In [None]:
def _row_mse(original, reconstructed):
    """Return the mean squared reconstruction error of each row."""
    return np.mean(np.square(original - reconstructed), axis=1)


# Reconstruct both splits and score every row by its reconstruction error.
train_predictions = autoencoder.predict(X_train, verbose=0)
train_mse = _row_mse(X_train, train_predictions)

test_predictions = autoencoder.predict(X_test, verbose=0)
test_mse = _row_mse(X_test, test_predictions)

# Losses recorded for the last epoch that was run.
epoch_log = history.history
train_final_loss = epoch_log['loss'][-1]
val_final_loss = epoch_log['val_loss'][-1]

print(f"MSE - Train: {train_mse.mean():.6f} | Test: {test_mse.mean():.6f}")
print(f"Loss - Train: {train_final_loss:.6f} | Val: {val_final_loss:.6f}")

## 7. Anomaly Score Calculation

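Generate anomaly scores (per-record reconstruction error) for all records and flag as anomalies those whose score exceeds a threshold set at the 95th percentile of the training reconstruction error.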

In [None]:
# Score every record (train rows first, then test rows) by its mean squared
# reconstruction error.
all_data = np.vstack([X_train, X_test])
all_predictions = autoencoder.predict(all_data, verbose=0)
all_reconstruction_errors = np.mean(np.square(all_data - all_predictions), axis=1)

# The cut-off is a percentile of the training reconstruction errors; records
# scoring strictly above it are flagged as anomalies (label 1).
threshold_percentile = 95
threshold = np.percentile(train_mse, threshold_percentile)

anomaly_labels = (all_reconstruction_errors > threshold).astype(int)
anomaly_count = anomaly_labels.sum()
anomaly_percentage = (anomaly_count / len(anomaly_labels)) * 100

# Report the percentile actually used so the message cannot drift from the
# `threshold_percentile` setting above.
print(f"Threshold ({threshold_percentile}th percentile): {threshold:.6f}")
print(f"Anomalies: {anomaly_count:,}/{len(anomaly_labels):,} ({anomaly_percentage:.2f}%)")
print(f"Score: Mean={all_reconstruction_errors.mean():.6f} | Median={np.median(all_reconstruction_errors):.6f} | Std={all_reconstruction_errors.std():.6f}")

## 8. Visualizations & Metrics

Visualize training history, loss distributions, and anomaly scores.

In [None]:
def _decorate_panel(panel, xlabel, ylabel, title, grid_axis='both'):
    """Apply the shared axis labels, title, legend and grid styling to a panel."""
    panel.set_xlabel(xlabel, fontsize=11, fontweight='bold')
    panel.set_ylabel(ylabel, fontsize=11, fontweight='bold')
    panel.set_title(title, fontsize=12, fontweight='bold')
    panel.legend(fontsize=10)
    panel.grid(True, alpha=0.3, axis=grid_axis)


fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax1, ax2), (ax3, ax4) = axes

# Top-left: training vs. validation loss per epoch.
for history_key, curve_label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
    ax1.plot(history.history[history_key], label=curve_label, linewidth=2)
_decorate_panel(ax1, 'Epoch', 'Loss (MSE)', 'Model Training History')

# Top-right: training vs. validation MAE per epoch.
for history_key, curve_label in (('mae', 'Training MAE'), ('val_mae', 'Validation MAE')):
    ax2.plot(history.history[history_key], label=curve_label, linewidth=2)
_decorate_panel(ax2, 'Epoch', 'MAE', 'Model Training MAE')

# Bottom-left: reconstruction-error histograms of both splits with the threshold.
for split_errors, split_label, split_color in ((train_mse, 'Training', 'blue'), (test_mse, 'Test', 'red')):
    ax3.hist(split_errors, bins=50, alpha=0.7, label=split_label, color=split_color, edgecolor='black')
ax3.axvline(threshold, color='green', linestyle='--', linewidth=2.5, label='Threshold')
_decorate_panel(ax3, 'Reconstruction Error (MSE)', 'Frequency', 'Reconstruction Error Distribution')

# Bottom-right: anomaly scores of all records with the threshold and mean marked.
mean_score = all_reconstruction_errors.mean()
ax4.hist(all_reconstruction_errors, bins=100, alpha=0.8, color='purple', edgecolor='black')
ax4.axvline(threshold, color='red', linestyle='--', linewidth=2.5, label='Threshold')
ax4.axvline(mean_score, color='orange', linestyle=':', linewidth=2.5, label='Mean')
_decorate_panel(ax4, 'Anomaly Score', 'Frequency', 'Anomaly Score Distribution', grid_axis='y')

plt.tight_layout()
plt.savefig('autoencoder_training_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(12, 6))

# Anomaly score at every whole percentile from 1 to 100, computed once and
# reused for both the line and the shaded area.
percentiles = np.arange(1, 101)
score_at_percentile = np.percentile(all_reconstruction_errors, percentiles)

ax.plot(percentiles, score_at_percentile,
        linewidth=2.5, color='darkblue', marker='o', markersize=3)
# Label the threshold with the percentile setting actually in use rather than
# a hard-coded "95th".
ax.axhline(y=threshold, color='red', linestyle='--', linewidth=2.5,
           label=f'{threshold_percentile}th Percentile: {threshold:.6f}')
ax.fill_between(percentiles, 0, score_at_percentile,
                alpha=0.2, color='blue')
ax.set_xlabel('Percentile', fontsize=12, fontweight='bold')
ax.set_ylabel('Anomaly Score', fontsize=12, fontweight='bold')
ax.set_title('Anomaly Score Percentile Distribution', fontsize=13, fontweight='bold')
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 9. Confusion Matrix & ROC Analysis

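Summarize the flagged records with a confusion matrix and ROC analysis. The dataset has no ground-truth anomaly labels, so every record is treated as normal here; the resulting metrics describe how many records the threshold flags rather than true detection performance.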

In [None]:
# No ground-truth anomaly labels are available, so every record is assigned the
# "normal" label (0). The metrics below therefore describe how the threshold
# partitions the data, not real detection quality.
y_train = np.zeros(len(X_train))
y_test = np.zeros(len(X_test))

# Same ordering as the anomaly scores: train rows first, then test rows.
y_true = np.hstack([y_train, y_test])
y_pred = anomaly_labels

# Pin the label set so the matrix is always 2x2. Without it, a run in which no
# record is flagged yields a 1x1 matrix and the four-way unpacking below fails.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

# Guard every ratio against an empty denominator.
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
f1 = 2 * (precision * sensitivity) / (precision + sensitivity) if (precision + sensitivity) > 0 else 0

print(f"Confusion Matrix:\n{cm}")
print(f"\nMetrics: Sensitivity={sensitivity:.4f} | Specificity={specificity:.4f} | Precision={precision:.4f} | F1={f1:.4f}")

# ROC-AUC is undefined when y_true holds a single class; scikit-learn may
# reject that input with a ValueError. Catch only that error and report why,
# instead of silently swallowing every exception.
try:
    roc_auc = roc_auc_score(y_true, all_reconstruction_errors)
    print(f"ROC-AUC: {roc_auc:.4f}")
except ValueError as err:
    print(f"ROC-AUC: Unable to compute ({err})")

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes[0], axes[1]

# Left panel: confusion matrix rendered as an annotated heatmap.
class_names = ['Normal', 'Anomaly']
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=True,
    ax=ax1,
    xticklabels=class_names,
    yticklabels=class_names,
)
ax1.set_ylabel('True Label', fontsize=11, fontweight='bold')
ax1.set_xlabel('Predicted Label', fontsize=11, fontweight='bold')
ax1.set_title('Confusion Matrix', fontsize=12, fontweight='bold')

# Right panel: one bar per summary metric.
metrics = ['Sensitivity', 'Specificity', 'Precision', 'F1-Score']
values = [sensitivity, specificity, precision, f1]
colors_bar = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']

bars = ax2.bar(metrics, values, color=colors_bar, edgecolor='black', linewidth=1.5)
ax2.set_ylabel('Score', fontsize=11, fontweight='bold')
ax2.set_title('Performance Metrics', fontsize=12, fontweight='bold')
ax2.set_ylim([0, 1.1])
ax2.grid(True, alpha=0.3, axis='y')

# Print each metric's value just above its bar, horizontally centred.
for rect, score in zip(bars, values):
    x_center = rect.get_x() + rect.get_width()/2.
    ax2.text(x_center, rect.get_height(), f'{score:.3f}',
             ha='center', va='bottom', fontsize=10, fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Best-effort ROC plot: any failure while computing or drawing the curve is
# reported as a short message instead of interrupting the notebook.
try:
    fpr, tpr, thresholds = roc_curve(y_true, all_reconstruction_errors)
    roc_auc = auc(fpr, tpr)

    fig, ax = plt.subplots(figsize=(10, 8))

    # Model curve plus the diagonal of a random classifier for reference.
    curve_label = f'ROC Curve (AUC = {roc_auc:.3f})'
    diagonal = [0, 1]
    ax.plot(fpr, tpr, color='darkorange', lw=2.5, label=curve_label)
    ax.plot(diagonal, diagonal, color='navy', lw=2, linestyle='--', label='Random Classifier')

    ax.set_xlim([-0.01, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate', fontsize=12, fontweight='bold')
    ax.set_ylabel('True Positive Rate', fontsize=12, fontweight='bold')
    ax.set_title('ROC Curve - Autoencoder Anomaly Detection', fontsize=13, fontweight='bold')
    ax.legend(loc="lower right", fontsize=11)
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
except Exception as err:
    # Keep the report short: only the first 80 characters of the error text.
    error_text = str(err)[:80]
    print(f"ROC curve: {error_text}")

## 10. Summary & Outputs

Save the trained model and the fitted feature scaler for downstream use, and display a summary of the anomaly detection run.

In [None]:
import pickle

# Persist the trained model. Newer Keras releases (Keras 3) reject a save path
# without a `.keras` or `.h5` suffix by raising ValueError. Try the original
# extension-less path first so existing environments keep producing the same
# artifact, then fall back to the native `.keras` format.
model_path = 'cms_autoencoder_model'
try:
    autoencoder.save(model_path)
except ValueError:
    model_path = f'{model_path}.keras'
    autoencoder.save(model_path)
print(f"Model saved: {model_path}")

# Persist the fitted scaler so new data can be scaled exactly as the training
# data was.
scaler_path = 'feature_scaler.pkl'
with open(scaler_path, 'wb') as f:
    pickle.dump(scaler, f)
print(f"Scaler saved: {scaler_path}")

# Collect the headline numbers of this run into a two-column table.
results_summary = pd.DataFrame({
    'Metric': ['Total Records', 'Anomalies Detected', 'Anomaly Percentage',
               'Threshold', 'Mean Score', 'Training Time (sec)',
               'Epochs', 'Final Train Loss', 'Final Val Loss'],
    'Value': [len(anomaly_labels), anomaly_count, f'{anomaly_percentage:.2f}%',
              f'{threshold:.6f}', f'{all_reconstruction_errors.mean():.6f}',
              f'{training_time:.2f}', len(history.history['loss']),
              f'{train_final_loss:.6f}', f'{val_final_loss:.6f}']
})

print("\nExecution Summary:")
display(results_summary)