In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split

from torch.utils.data import DataLoader

import sys

sys.path.append("..")

from src.dataset import TreeHealthDataset
from src.model import TabularModel
from src.utils.hyperoptimization import save_trials

In [2]:
# Read the processed tree dataset into a DataFrame. The path is relative, so it
# is resolved against the kernel's current working directory.
data = pd.read_csv(
    filepath_or_buffer="../data/processed/tree-data-processed-vec.csv"
)

In [3]:
# Label column the model is trained to predict.
target_column = "health"

# Columns treated as numeric inputs.
num_features = ["tree_dbh", "problems", "tg_conditions"]

# Columns treated as categorical inputs; their order matters because the
# per-column cardinalities and embedding sizes are built in this same order.
cat_features = [
    "spc_latin", "user_type", "address", "postcode",
    "nta", "boro_ct", "month", "day",
]

In [4]:
# Number of distinct values in each categorical column, in `cat_features` order.
# `nunique()` is evaluated once, and only on the categorical columns, instead of
# being recomputed over the entire frame for every feature. `.tolist()` also
# turns the counts into plain Python ints.
# NOTE(review): using the distinct count as the cardinality presumably relies on
# each column being encoded as contiguous ids 0..n-1 — confirm in preprocessing.
cat_cardinalities = data[cat_features].nunique().tolist()

# Width of the numeric part of the model input.
num_numeric_features = len(num_features)

# Number of distinct labels in the target column.
num_classes = data[target_column].nunique()

In [5]:
# Model inputs: categorical columns first, then the numeric ones.
X = data[[*cat_features, *num_features]]
# Labels.
y = data[target_column]

# Hold out 20% of the rows for validation. The split is stratified on the label
# and uses a fixed seed so it is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [6]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [7]:
# Training labels as a NumPy array, as expected by scikit-learn.
y_train_np = y_train.values

# "balanced" class weights computed from the training labels only, created
# directly as a float32 tensor on the training device. They are later passed to
# the cross-entropy loss.
class_weights = torch.tensor(
    compute_class_weight(
        class_weight="balanced",
        classes=np.unique(y_train_np),
        y=y_train_np,
    ),
    dtype=torch.float,
    device=device,
)

In [8]:
# Re-attach the label column to each feature frame so that every split is a
# single DataFrame holding both inputs and target.
train_df, val_df = (
    pd.concat([features, labels], axis=1)
    for features, labels in ((X_train, y_train), (X_val, y_val))
)

In [9]:
# Wrap each split in the project's dataset class, using the same column
# configuration for both (training split first, then validation).
train_dataset, val_dataset = (
    TreeHealthDataset(frame, cat_features, num_features, target_column)
    for frame in (train_df, val_df)
)

In [10]:
# Sizes that determine how many per-column / per-layer entries the space has.
num_cat = len(cat_features)
num_hidden_dims = 2

# Hyperopt search space. The `quniform` entries are sampled as floats on an
# integer grid; the objective converts them with `int(...)` before use.
space = {
    # One embedding width per categorical feature.
    **{
        f"cat_emb_dim_{i}": hp.quniform(f"cat_emb_dim_{i}", 1, 20, 1)
        for i in range(num_cat)
    },
    # Width of each hidden layer.
    **{
        f"hidden_dim_{i}": hp.quniform(f"hidden_dim_{i}", 4, 128, 1)
        for i in range(num_hidden_dims)
    },
    # Model regularisation / activation settings.
    "negative_slope": hp.uniform("negative_slope", 0.0, 0.9),
    "dropout_p": hp.uniform("dropout_p", 0.0, 0.9),
    # Optimisation settings; learning rate and weight decay are log-uniform.
    "batch_size": hp.quniform("batch_size", 256, 2048, 1),
    "learning_rate": hp.loguniform("learning_rate", np.log(1e-4), np.log(1e-2)),
    "weight_decay": hp.loguniform("weight_decay", np.log(1e-6), np.log(1e-3)),
}

In [11]:
def objective(params, num_epochs=10):
    """Train a ``TabularModel`` with one sampled configuration and score it.

    Relies on notebook-level state defined in earlier cells: ``num_cat``,
    ``num_hidden_dims``, ``train_dataset``, ``val_dataset``,
    ``cat_cardinalities``, ``num_numeric_features``, ``num_classes``,
    ``class_weights`` and ``device``.

    Args:
        params: Mapping of hyperparameter name to sampled value, with the keys
            defined in the search space (``cat_emb_dim_{i}``, ``hidden_dim_{i}``,
            ``dropout_p``, ``batch_size``, ``learning_rate``, ``weight_decay``,
            ``negative_slope``). Integer-like entries may arrive as floats and
            are cast with ``int``.
        num_epochs: Number of training epochs per trial. Defaults to 10, the
            value that was previously hard-coded.

    Returns:
        dict with ``"loss"`` (the negated validation accuracy, because hyperopt
        minimises the loss), ``"status"`` (``STATUS_OK``) and ``"params"`` (the
        configuration that was evaluated).
    """
    # Integer-valued hyperparameters are cast explicitly before use.
    cat_embedding_dims = [int(params[f"cat_emb_dim_{i}"]) for i in range(num_cat)]
    hidden_dims = [int(params[f"hidden_dim_{i}"]) for i in range(num_hidden_dims)]
    batch_size = int(params["batch_size"])

    dropout_p = params["dropout_p"]
    negative_slope = params["negative_slope"]
    learning_rate = params["learning_rate"]
    weight_decay = params["weight_decay"]

    # Loaders are rebuilt for every trial because the batch size is tuned.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    model = TabularModel(
        cat_cardinalities,
        cat_embedding_dims,
        num_numeric_features,
        hidden_dims,
        num_classes,
        dropout_p=dropout_p,
        negative_slope=negative_slope,
    )
    model = model.to(device)

    # Class-weighted loss, using the weights computed from the training labels.
    criterion = nn.CrossEntropyLoss(weight=class_weights)
    optimizer = optim.Adam(
        model.parameters(), lr=learning_rate, weight_decay=weight_decay
    )

    for _ in range(num_epochs):
        model.train_epoch(train_loader, train_dataset, optimizer, criterion, device)

    # Coerce to a plain Python float so the reported loss is an ordinary number.
    # NOTE(review): validate_model's return type is not visible here — confirm
    # it is a scalar.
    val_accuracy = float(model.validate_model(val_loader, device))
    print(f"Params: {params}, Val Accuracy: {val_accuracy:.4f}")

    return {"loss": -val_accuracy, "status": STATUS_OK, "params": params}

In [12]:
# Start from an empty trial history. Re-running this cell discards any results
# recorded by a previous search.
trials = Trials()

# Run up to 50 TPE-guided evaluations of `objective` over `space`; every
# evaluation is recorded in `trials`.
best = fmin(
    objective,
    space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
)

print("Best hyperparameters:", best)

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

Params: {'batch_size': 492.0, 'cat_emb_dim_0': 11.0, 'cat_emb_dim_1': 17.0, 'cat_emb_dim_2': 6.0, 'cat_emb_dim_3': 5.0, 'cat_emb_dim_4': 19.0, 'cat_emb_dim_5': 7.0, 'cat_emb_dim_6': 12.0, 'cat_emb_dim_7': 13.0, 'dropout_p': 0.014296189490474244, 'hidden_dim_0': 15.0, 'hidden_dim_1': 18.0, 'learning_rate': 0.0007526593040586736, 'negative_slope': 0.7355929358952529, 'weight_decay': 0.00044617183843611376}, Val Accuracy: 0.7119
Params: {'batch_size': 330.0, 'cat_emb_dim_0': 11.0, 'cat_emb_dim_1': 15.0, 'cat_emb_dim_2': 10.0, 'cat_emb_dim_3': 10.0, 'cat_emb_dim_4': 12.0, 'cat_emb_dim_5': 11.0, 'cat_emb_dim_6': 19.0, 'cat_emb_dim_7': 18.0, 'dropout_p': 0.2589777389699995, 'hidden_dim_0': 43.0, 'hidden_dim_1': 58.0, 'learning_rate': 0.008224306271697015, 'negative_slope': 0.37213979934233493, 'weight_decay': 0.0006128240961070584}, Val Accuracy: 0.5638
Params: {'batch_size': 1253.0, 'cat_emb_dim_0': 15.0, 'cat_emb_dim_1': 3.0, 'cat_emb_dim_2': 10.0, 'cat_emb_dim_3': 2.0, 'cat_emb_dim_4': 5.

KeyboardInterrupt: 

In [None]:
# Hand the recorded trials to the project helper together with a target name.
# NOTE(review): presumably this writes the trial history to that JSON file,
# resolved relative to the working directory — confirm in src.utils.hyperoptimization.
save_trials(trials, "hyperopt_result_1.json")