In [None]:
from IPython import get_ipython

import optuna
import numpy as np
import pandas as pd

import torch
from torch.utils.data import DataLoader, TensorDataset
from torch import Tensor
from torch import nn
import torch.optim as optim

import os
import sys

# Make the parent directory importable so the local `src` imports below resolve.
# `sys` must be imported before its first use; it used to be imported only after
# the `sys.path` check, which raised NameError on a fresh kernel.
# The absolute path is added only once, so re-running this cell is idempotent
# (the former extra, unconditional append of ".." was redundant).
module_path = os.path.abspath(os.path.join(".."))
if module_path not in sys.path:
    sys.path.append(module_path)

# do local imports

from src.models.autoencoder import Autoencoder
from src.model_selection.objective import AEObjective, split_train_test
from src.optim.early_stopping import EarlyStopping

## Load data

In [None]:
# Choose the data directory by runtime: the Drive path when the IPython shell
# reports Google Colab, the relative data folder otherwise.
running_on_colab = "google.colab" in str(get_ipython())
root_path = "/content/drive/MyDrive/data/" if running_on_colab else "../data/"

In [None]:
# Read the preprocessed load data from the selected data directory and preview it.
# NOTE: unpickling executes arbitrary code; only load files from a trusted source.
load_pickle_path = f"{root_path}preprocessed/load.pkl"
data = pd.read_pickle(load_pickle_path)
data.head()

In [None]:
# Column holding the target; it must not be part of the autoencoder input.
target_col = "S_TOT"

# Every column except the target is a feature. Previously the target was left
# in `features`, so it ended up inside X despite the intent to separate it.
features = [col for col in data.columns if col != target_col]

X = data[features]
# Selecting a single label yields the target directly as a Series.
y = data[target_col]

# Only the first four returned parts (train / validation) are used in this
# notebook; the last two (presumably the test split) are discarded.
X_train, y_train, X_val, y_val, _, _ = split_train_test(X, y)


## Manual search

In [None]:
# Hand-picked hyperparameters for a single manual training run.
epochs = 2048
batch_size = 1024
lr = 3e-4
bottleneck_capacity = 64
num_layers = 3
dropout = 0.5
activation = "ReLU"
num_features = X_train.shape[1]

# Wrap the train / validation frames as float32 tensor datasets ...
dataset_train, dataset_val = (
    TensorDataset(Tensor(frame.astype(np.float32).values))
    for frame in (X_train, X_val)
)

# ... and iterate over both in fixed order, without shuffling.
train_loader, val_loader = (
    DataLoader(dataset, batch_size=batch_size, shuffle=False)
    for dataset in (dataset_train, dataset_val)
)

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the autoencoder from the settings above and move it to the device.
model_kwargs = dict(
    num_features=num_features,
    bottleneck_capacity=bottleneck_capacity,
    num_layers=num_layers,
    dropout=dropout,
    activation=activation,
)
model = Autoencoder(**model_kwargs).to(device)

print(model)

# Adam on all model parameters; mean squared error between output and input.
optimizer = optim.Adam(model.parameters(), lr=lr)
loss = nn.MSELoss()

# Train the model, tracking the validation loss for early stopping.
early_stopping = EarlyStopping(patience=5)
for epoch in range(epochs):

    loss_in_epoch_train = 0

    # --- training phase ---
    # Train mode is switched on once per epoch and has to stay on for every
    # mini-batch. `model.eval()` used to sit inside this loop, so all batches
    # after the first were trained with Dropout disabled.
    model.train()
    for batch_features in train_loader:

        # each batch from the TensorDataset is a one-element sequence;
        # take the feature tensor and move it to the target device
        batch_features = batch_features[0].to(device)

        optimizer.zero_grad()

        # the model returns a pair; only the first element is compared
        # against the input
        outputs, _ = model(batch_features)
        batch_loss = loss(outputs, batch_features)

        # backpropagate and update the parameters from the current gradients
        batch_loss.backward()
        optimizer.step()

        # add the mini-batch training loss to the epoch loss
        loss_in_epoch_train += batch_loss.item()

    # --- validation phase ---
    # Evaluation mode, e.g. disables Dropout; no gradients are needed here.
    model.eval()

    loss_in_epoch_test = 0

    with torch.no_grad():
        for batch_features in val_loader:
            batch_features = batch_features[0].to(device)
            outputs, _ = model(batch_features)

            batch_loss = loss(outputs, batch_features)
            loss_in_epoch_test += batch_loss.item()

    # average the accumulated per-batch losses over the number of batches
    # ("test" here refers to the validation loader)
    train_loss = loss_in_epoch_train / len(train_loader)
    test_loss = loss_in_epoch_test / len(val_loader)

    # stop early if the validation loss doesn't decrease for several epochs
    early_stopping(test_loss)
    if early_stopping.early_stop:
        break

    print(f"epoch : {epoch + 1}/{epochs},", end=" ")
    print(f"loss (train) = {train_loss:.8f}, loss (test) = {test_loss:.8f}")

## Bayesian search
Bayesian search builds on top of [optuna](https://optuna.org/).

In [None]:
# Trial budget and the tag that is embedded in the study's output file name.
N_TRIALS = 64
name = "load_outlier"

# Create a minimising study and run the autoencoder objective for N_TRIALS trials.
study = optuna.create_study(direction="minimize")
objective = AEObjective(X, y, name)
study.optimize(objective, n_trials=N_TRIALS)

# Report the best trial: its hyperparameters and its trial number.
best_trial = study.best_trial
optimized_params = best_trial.params
print(f"params: {optimized_params}")
print(f"no: {best_trial.number}")

# Export all trials as a table to the docs folder.
study_all_trails = study.trials_dataframe()
study_all_trails.to_csv(f"../docs/study_AE_{name}_all_parameters.csv")


In [None]:
# Plot the finished study with optuna's matplotlib backend: optimization
# history, slice plot, and a contour plot over the listed hyperparameters.
contour_params = [
    "lr",
    "batch_size",
    "num_layers",
    "bottleneck_capacity",
    "activation",
    "dropout",
]
mpl_plots = optuna.visualization.matplotlib
mpl_plots.plot_optimization_history(study)
mpl_plots.plot_slice(study)
mpl_plots.plot_contour(study, contour_params)


## Reconstruction

In [None]:
# Hyperparameters of the best trial; overwrite manually if needed.
no_trial = study.best_trial.number
num_layers_trial = optimized_params.get("num_layers")
bottleneck_capacity_trial = optimized_params.get("bottleneck_capacity")
dropout_trial = optimized_params.get("dropout")
batch_size_trial = optimized_params.get("batch_size")
activation_trial = optimized_params.get("activation")

# Rebuild the architecture of that trial so the stored weights fit.
model = Autoencoder(
    num_features=X.shape[1],
    num_layers=num_layers_trial,
    bottleneck_capacity=bottleneck_capacity_trial,
    dropout=dropout_trial,
    activation=activation_trial,
)
# The model lives on the CPU here, so map the checkpoint to the CPU as well;
# without `map_location` a state dict saved from a CUDA run cannot be loaded
# on a machine without a GPU.
model.load_state_dict(
    torch.load(
        f"../models/{model.__class__.__name__}_{name}_trial_{no_trial}.pth",
        map_location="cpu",
    )
)
model.eval()
encoder = model.encoder
print(encoder)

# process entire dataset, in original row order
dataset_full = TensorDataset(Tensor(X.astype(np.float32).values))
full_loader = DataLoader(dataset_full, batch_size=batch_size_trial, shuffle=False)

low_dim = []

# Inference only: no gradients are needed, so skip autograd bookkeeping
# (which also makes an explicit detach unnecessary).
with torch.no_grad():
    for batch_features in full_loader:
        latent = encoder(batch_features[0])
        low_dim.append(latent.cpu().numpy())

# Stack the per-batch encodings and re-attach the original index.
X_low_dim = pd.DataFrame(np.concatenate(low_dim), index=X.index)
X_low_dim = X_low_dim.add_prefix("pc_")

# add original target back to data set
X_low_dim["S_TOT"] = y

# Write next to the input data via `root_path` (instead of a hard-coded
# "../data/"), so the output also lands in the right place on Colab.
X_low_dim.to_pickle(f"{root_path}preprocessed/{name}_low_dim.pkl")
