In [1]:
# Standard library
import sys
import warnings

# Extend the import search path with the parent directory so the `src` package
# imported below can be found (this does not change the working directory),
# and stop Python from writing .pyc files.
sys.dont_write_bytecode = True
sys.path.append('..')

# Third-party packages
import numpy as np
import pandas as pd
from IPython.display import display

# Project modules
from src.eda import data_info
from src.models import AnomalyDetector, AutoencoderTrainer, AutoencoderTuner, HybridLoss

# Notebook-wide settings
warnings.filterwarnings("ignore")  # suppress every warning for the rest of the session
pd.set_option('display.max_columns', None)  # show all columns when rendering a DataFrame

In [2]:
# Load the processed training frame and its companion X_train_validate frame
X_train = pd.read_feather("../data/processed/X_train.feather")
X_train_validate = pd.read_feather("../data/processed/X_train_validate.feather")

# Describe the training columns, then group column names by their reported type.
# NOTE(review): assumes data_info returns a DataFrame with "var_type" and
# "var_name" columns - confirm against src.eda.
var_info = data_info(X_train)
all_cols = X_train.columns

numerical_mask = var_info["var_type"] == "numerical"
binary_mask = var_info["var_type"] == "binary"

real_cols = var_info.loc[numerical_mask, "var_name"].tolist()
binary_cols = var_info.loc[binary_mask, "var_name"].tolist()

## Test the trainer

In [3]:
# Trainer settings for a first end-to-end run; input_dim is the number of
# columns in the training frame.
trainer_config = {
    "input_dim": X_train.shape[1],
    "real_cols": real_cols,
    "binary_cols": binary_cols,
    "all_cols": all_cols,
    "hidden_dims": [64],
    "learning_rate": 1e-2,
    "dropout_rate": 0.2,
}
ae = AutoencoderTrainer(**trainer_config)

# Train on X_train, passing X_train_validate as the second argument, and keep
# the returned model for the cells below.
autoencoder = ae.train(X_train, X_train_validate)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

## Test the anomaly detector

In [4]:
# Load the validation features and their labels
X_validate = pd.read_feather("../data/processed/X_validate.feather")
y_validate = pd.read_feather("../data/processed/y_validate.feather")

# Wrap the model trained above in a detector, reusing the trainer's `lam`
detector_config = dict(
    model=autoencoder,
    real_cols=real_cols,
    binary_cols=binary_cols,
    all_cols=all_cols,
    lam=ae.lam,
)
detector = AnomalyDetector(**detector_config)

# Score the validation frame, then apply a fixed, hand-picked threshold
validation_threshold = 0.3
scores = detector.compute_anomaly_scores(X_validate)
y_pred = detector.detect(scores, validation_threshold)

# Compare the predictions with the validation labels and print the metrics
metrics = detector.evaluate(y_pred, y_validate, scores)
print(metrics)

{'accuracy': 0.7097844112769486, 'precision': 0.369431643625192, 'recall': 0.6260303687635574, 'f1_score': 0.4646594751247786, 'auc': 0.726876360488752}


## Hyperparameter Tuning

In [None]:
# Candidate values handed to the tuner, two per hyperparameter.
# NOTE(review): if the tuner evaluates the full Cartesian product this is
# 2**6 = 64 configurations - confirm the expected runtime before running.
param_grid = {
    'hidden_dims': [[64], [64, 32]],
    'learning_rate': [1e-2, 1e-3],
    'lam': [1e-3, 1e-4],
    'dropout_rate': [None, 0.2],
    'activation': ['relu', 'tanh'],
    'batch_size': [32, 64],
}

# Metric passed to the tuner to select the winning configuration
tuning_metric = "f1_score"

tuner = AutoencoderTuner(
    X_train,
    X_train_validate,
    X_validate,
    y_validate,
    real_cols,
    binary_cols,
    all_cols,
    verbose=False,
)

# The third returned value is the best score for the requested metric (F1 here,
# not AUC), so it is bound to a metric-neutral name.
best_model, best_params, best_score, results_df = tuner.tune(param_grid, metric=tuning_metric)

# Deprecated alias: earlier revisions stored this value as `best_auc`, which was
# misleading because the selection metric is F1. Kept so code that still reads
# the old name keeps working.
best_auc = best_score


Training with: {'hidden_dims': [64], 'learning_rate': 0.01, 'lam': 0.001}
  Threshold 0.6370 → F1_score = 0.4894
  Threshold 0.6900 → F1_score = 0.4970
  Threshold 0.7819 → F1_score = 0.5007
  Threshold 1.1300 → F1_score = 0.4513
  Threshold 2.1073 → F1_score = 0.2752

Training with: {'hidden_dims': [64, 32], 'learning_rate': 0.01, 'lam': 0.001}
  Threshold 0.4441 → F1_score = 0.4751
  Threshold 0.4940 → F1_score = 0.4815
  Threshold 0.5921 → F1_score = 0.4534
  Threshold 0.8437 → F1_score = 0.3996
  Threshold 2.2144 → F1_score = 0.2446

Best parameters found:
- hidden_dims: [64]
- learning_rate: 0.01
- lam: 0.001
- threshold: 0.7818549156188963
Best validation F1_score: 0.5007


In [10]:
# Display the results table returned by tuner.tune in the tuning cell
results_df

Unnamed: 0,hidden_dims,learning_rate,lam,threshold,accuracy,precision,recall,f1_score,auc
0,[64],0.01,0.001,0.636967,0.744087,0.408787,0.609544,0.489377,0.740467
1,[64],0.01,0.001,0.689969,0.779349,0.459022,0.541866,0.497016,0.740467
2,[64],0.01,0.001,0.781855,0.812167,0.538155,0.468113,0.500696,0.740467
3,[64],0.01,0.001,1.13003,0.827878,0.62917,0.351844,0.451308,0.740467
4,[64],0.01,0.001,2.107344,0.817928,0.691099,0.1718,0.275191,0.740467
5,"[64, 32]",0.01,0.001,0.444139,0.736929,0.396858,0.591757,0.475096,0.722153
6,"[64, 32]",0.01,0.001,0.493959,0.772541,0.444689,0.524946,0.481496,0.722153
7,"[64, 32]",0.01,0.001,0.592141,0.794362,0.487282,0.423861,0.453364,0.722153
8,"[64, 32]",0.01,0.001,0.843698,0.811644,0.557021,0.311497,0.399555,0.722153
9,"[64, 32]",0.01,0.001,2.214415,0.810247,0.614311,0.152711,0.244614,0.722153


In [12]:
# Load the held-out test features and their labels
X_test = pd.read_feather("../data/processed/X_test.feather")
y_test = pd.read_feather("../data/processed/y_test.feather")

# Build the detector around the tuned model. `lam` is taken from the tuned
# configuration: `ae.lam` belongs to the earlier hand-configured trainer and
# may differ from the value `best_model` was selected with, because `lam` is
# one of the tuned hyperparameters.
detector = AnomalyDetector(
    model=best_model,
    real_cols=real_cols,
    binary_cols=binary_cols,
    all_cols=all_cols,
    lam=best_params['lam'],
)

# Score the test frame
scores = detector.compute_anomaly_scores(X_test)

# Apply the threshold chosen during tuning
y_pred = detector.detect(scores, best_params['threshold'])

# Compare the predictions with the test labels and print the metrics
metrics = detector.evaluate(y_pred, y_test, scores)
print(metrics)

{'accuracy': 0.8253469494632103, 'precision': 0.6333865814696485, 'recall': 0.33961456102783727, 'f1_score': 0.44215221633677165, 'auc': 0.7433049826125699}
