In [1]:
# Core libraries for data processing and machine learning
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (confusion_matrix, accuracy_score, precision_score,
                             recall_score, f1_score, roc_auc_score)
from sklearn.decomposition import PCA
import time
from sklearn.neural_network import MLPRegressor
from sklearn.exceptions import ConvergenceWarning
# Ignore ConvergenceWarning. The training cell fits MLPRegressor with
# max_iter=1 on every call (one epoch at a time); scikit-learn warns whenever
# max_iter is reached before convergence, so the filter keeps the per-epoch
# log readable.
import warnings
warnings.filterwarnings("ignore", category=ConvergenceWarning)


print("Libraries imported successfully!")

Libraries imported successfully!


In [2]:
# ==========================================
# Data Loading and Preprocessing Pipeline
# ==========================================

# Read the preprocessed credit card fraud table; the "Class" column holds the
# label and every remaining column is treated as a feature.
df = pd.read_csv("preprocessed-creditcard.csv")
y = df["Class"].values                  # Target labels (0: normal, 1: fraud)
X = df.drop(columns="Class").values     # Feature matrix

print(f"Dataset loaded: {len(X)} samples, {X.shape[1]} features")
print(f"Fraud rate: {y.mean():.4f} ({y.sum()} fraud cases)")

# Hold out 20% for testing; stratify so both splits keep the class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Z-score standardization: statistics are learned on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Project onto 4 principal components to match quantum register size;
# the projection is likewise fitted on the training split only.
pca = PCA(n_components=4, random_state=42)
X_train_4d = pca.fit_transform(X_train)
X_test_4d = pca.transform(X_test)

print(f"\nTraining set: {X_train_4d.shape}")
print(f"Test set: {X_test_4d.shape}")
print(f"PCA explained variance ratio: {pca.explained_variance_ratio_}")
print(f"Total variance explained: {pca.explained_variance_ratio_.sum():.4f}")

Dataset loaded: 946 samples, 30 features
Fraud rate: 0.5000 (473 fraud cases)

Training set: (756, 4)
Test set: (190, 4)
PCA explained variance ratio: [0.38421646 0.10954544 0.06067923 0.05752846]
Total variance explained: 0.6120


In [3]:
# ==========================================
# Configuration
# ==========================================


# Hyper-parameters consumed by the classical autoencoder training cell
TRAINING_CONFIG = dict(
    epochs_classical=100,       # number of single-epoch fit calls
    batch_size_classical=16,    # mini-batch size passed to MLPRegressor
    learning_rate=0.001,        # Adam optimizer stepsize
)

print("\nTraining Configuration: " + str(TRAINING_CONFIG))
print("=" * 80)


Training Configuration: {'epochs_classical': 100, 'batch_size_classical': 16, 'learning_rate': 0.001}


In [4]:
def compute_classical_batch_cost(samples, model):
    """
    Compute the reconstruction losses of a classical autoencoder on a batch.

    Args:
        samples: Batch of original data, shape = (batch_size, n_features).
        model: Fitted autoencoder exposing ``predict`` (an MLPRegressor in
            this notebook).

    Returns:
        linear_loss: Mean absolute error (MAE) over all entries.
        squared_loss: Mean squared error (MSE) over all entries.

    Raises:
        ValueError: If the reconstruction does not contain the same number of
            elements as ``samples``.
    """
    # The model receives the caller's object unchanged; only the arithmetic
    # below works on an ndarray view of it.
    recon = np.asarray(model.predict(samples))
    targets = np.asarray(samples)
    # A regressor may return a 1-D array when there is a single output column.
    # Subtracting (n,) from (n, 1) would broadcast to an (n, n) matrix and
    # silently produce wrong losses, so align the shapes explicitly.
    recon = recon.reshape(targets.shape)
    errors = recon - targets
    linear_loss = np.mean(np.abs(errors))
    squared_loss = np.mean(errors**2)
    return linear_loss, squared_loss

In [19]:
def _per_sample_mse(model, X):
    """Return the reconstruction MSE of every row of ``X`` under ``model``."""
    return np.mean((X - model.predict(X))**2, axis=1)


def train_classical_ae_strategy():
    """
    Train the classical autoencoder baseline and flag test-set anomalies.

    Reads the notebook-level ``TRAINING_CONFIG``, ``X_train_4d`` and
    ``X_test_4d``. An MLPRegressor with a 2-unit hidden layer is fitted to
    reproduce its own input, one epoch per ``fit`` call, and the MAE/MSE on
    the full training set is recorded after every epoch. Test samples whose
    reconstruction MSE exceeds a percentile of the training reconstruction
    MSE (``TRAINING_CONFIG['threshold_percentile']``, default 95) are
    predicted as anomalies.

    Returns:
        dict with keys:
            'strategy'       : the string 'classical_ae'
            'model'          : the fitted MLPRegressor
            'threshold'      : MSE cut-off derived from the training set
            'y_pred'         : int array, 1 where test MSE > threshold
            'linear_losses'  : per-epoch MAE on the training set
            'squared_losses' : per-epoch MSE on the training set
    """
    print(f"\n{'='*60}")
    print("TRAINING: CLASSICAL AUTOENCODER STRATEGY")
    print(f"{'='*60}")

    n_epochs = TRAINING_CONFIG['epochs_classical']
    percentile = TRAINING_CONFIG.get('threshold_percentile', 95)

    ae = MLPRegressor(
        hidden_layer_sizes=(2,),   # one hidden layer with 2 units (bottleneck)
        activation='relu',
        solver='adam',
        learning_rate_init=TRAINING_CONFIG['learning_rate'],
        max_iter=1,           # train one epoch per fit() call
        warm_start=True,      # continue from the previous call's weights
        batch_size=TRAINING_CONFIG['batch_size_classical'],
        random_state=42,
        verbose=False
    )
    # NOTE(review): warm_start keeps the learned weights between fit() calls;
    # whether the Adam moment estimates also persist depends on scikit-learn
    # internals. If optimizer state must carry over, partial_fit may be the
    # better fit here. TODO confirm.

    linear_losses  = []
    squared_losses = []

    for epoch in range(1, n_epochs + 1):
        # Train for a single epoch (input is also the target)
        ae.fit(X_train_4d, X_train_4d)

        # Losses over the whole training set
        lin_loss, sq_loss = compute_classical_batch_cost(X_train_4d, ae)
        linear_losses.append(lin_loss)
        squared_losses.append(sq_loss)

        # Report the first epoch and every 5th epoch
        if epoch == 1 or epoch % 5 == 0:
            print(f"  Epoch {epoch:2d}/{n_epochs} — "
                  f"Linear Loss: {lin_loss:.6f}, "
                  f"Squared Loss: {sq_loss:.6f}")

    # Test-set evaluation: the threshold comes from the training error
    # distribution, then is applied to the per-sample test errors.
    test_errors = _per_sample_mse(ae, X_test_4d)
    threshold   = np.percentile(_per_sample_mse(ae, X_train_4d), percentile)
    y_pred = (test_errors > threshold).astype(int)

    # Report the percentile actually used rather than a hard-coded "95".
    print(f"\n  - Threshold ({percentile:g}th percentile of train MSE): {threshold:.6f}")
    print(f"  - Test set anomaly rate: {y_pred.mean():.4f}")

    return {
        'strategy'       : 'classical_ae',
        'model'          : ae,
        'threshold'      : threshold,
        'y_pred'         : y_pred,
        'linear_losses'  : linear_losses,
        'squared_losses' : squared_losses
    }

In [20]:
# Train the classical autoencoder baseline and record its outcome in `results`
results = {}
total_start_time = time.time()
try:
    classical_result = train_classical_ae_strategy()
    results['classical'] = classical_result
    print("✓ classical strategy completed successfully")
except Exception as exc:
    # Best-effort: report the failure and store its message instead of
    # aborting the notebook run.
    failure_message = str(exc)
    print("✗ classical strategy failed: " + failure_message)
    results['classical'] = {'error': failure_message}



TRAINING: CLASSICAL AUTOENCODER STRATEGY
  Epoch  1/100 — Linear Loss: 1.710141, Squared Loss: 6.008520
  Epoch  5/100 — Linear Loss: 1.370326, Squared Loss: 4.306587
  Epoch 10/100 — Linear Loss: 1.100025, Squared Loss: 2.972238
  Epoch 15/100 — Linear Loss: 0.921455, Squared Loss: 2.203886
  Epoch 20/100 — Linear Loss: 0.811215, Squared Loss: 1.832543
  Epoch 25/100 — Linear Loss: 0.736866, Squared Loss: 1.634872
  Epoch 30/100 — Linear Loss: 0.687462, Squared Loss: 1.514719
  Epoch 35/100 — Linear Loss: 0.643693, Squared Loss: 1.432691
  Epoch 40/100 — Linear Loss: 0.602901, Squared Loss: 1.374535
  Epoch 45/100 — Linear Loss: 0.566910, Squared Loss: 1.332252
  Epoch 50/100 — Linear Loss: 0.537200, Squared Loss: 1.302266
  Epoch 55/100 — Linear Loss: 0.520728, Squared Loss: 1.283323
  Epoch 60/100 — Linear Loss: 0.512190, Squared Loss: 1.270455
  Epoch 65/100 — Linear Loss: 0.507241, Squared Loss: 1.260618
  Epoch 70/100 — Linear Loss: 0.504338, Squared Loss: 1.252431
  Epoch 75/10

In [21]:
# Compute test reconstruction MSE and convert to "fidelity" (1 - MSE).
# The score has no lower bound: large reconstruction errors give negative values.
recon_test = classical_result['model'].predict(X_test_4d)
errors = np.mean((X_test_4d - recon_test)**2, axis=1)
fidelities = 1.0 - errors

print("Fidelity statistics:")
print(f"  Mean: {fidelities.mean():.4f}")
print(f"  Std:  {fidelities.std():.4f}")
print(f"  Min:  {fidelities.min():.4f}")
print(f"  Max:  {fidelities.max():.4f}\n")

# Threshold optimization: a sample is predicted as fraud (1) when its
# fidelity is below T. Metrics are computed once per threshold and stored so
# the summary below reuses them instead of recomputing confusion matrices.
thresholds = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
metrics_by_threshold = {}
print("Threshold optimization:")
for T in thresholds:
    y_pred = (fidelities < T).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0,1]).ravel()
    acc  = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec  = recall_score(y_test, y_pred, zero_division=0)
    f1   = f1_score(y_test, y_pred, zero_division=0)
    # Guard against a test set with no negative samples
    spec = tn / (tn + fp) if (tn + fp) else 0.0
    gmean = (rec * spec)**0.5
    metrics_by_threshold[T] = {
        'Accuracy': acc,
        'Precision': prec,
        'Recall': rec,
        'F1': f1,
        'Gmean': gmean
    }
    print(f"  T={T:.1f}: Acc={acc:.3f} Prec={prec:.3f} Rec={rec:.3f}"
          f" F1={f1:.3f} G-Mean={gmean:.3f}")

# Results summary
# AUC uses the reconstruction error (1 - fidelity) as the anomaly score.
auc = roc_auc_score(y_test, 1.0 - fidelities)
# Pick the threshold with the highest G-mean; on ties max() keeps the first
# (lowest) threshold in the list.
# NOTE: the threshold is selected using the test labels, so the "best"
# metrics reported below are optimistic.
best_T = max(thresholds, key=lambda t: metrics_by_threshold[t]['Gmean'])
best_g = metrics_by_threshold[best_T]['Gmean']
# Compute metrics at best threshold
y_best = (fidelities < best_T).astype(int)
m = dict(metrics_by_threshold[best_T])

print("\nRESULTS SUMMARY:")
print(f"  AUC-ROC Score: {auc:.4f}")
print(f"  Best Threshold: {best_T:.1f} (G-Mean: {best_g:.3f})")
print(f"  Best Performance: Acc={m['Accuracy']:.3f}, Prec={m['Precision']:.3f},"
      f" Rec={m['Recall']:.3f}, F1={m['F1']:.3f}")

Fidelity statistics:
  Mean: -0.1032
  Std:  4.7260
  Min:  -52.9669
  Max:  0.9990

Threshold optimization:
  T=0.3: Acc=0.616 Prec=0.806 Rec=0.305 F1=0.443 G-Mean=0.532
  T=0.4: Acc=0.637 Prec=0.795 Rec=0.368 F1=0.504 G-Mean=0.578
  T=0.5: Acc=0.658 Prec=0.800 Rec=0.421 F1=0.552 G-Mean=0.614
  T=0.6: Acc=0.700 Prec=0.817 Rec=0.516 F1=0.632 G-Mean=0.675
  T=0.7: Acc=0.742 Prec=0.795 Rec=0.653 F1=0.717 G-Mean=0.737
  T=0.8: Acc=0.753 Prec=0.767 Rec=0.726 F1=0.746 G-Mean=0.752

RESULTS SUMMARY:
  AUC-ROC Score: 0.7935
  Best Threshold: 0.8 (G-Mean: 0.752)
  Best Performance: Acc=0.753, Prec=0.767, Rec=0.726, F1=0.746
