In [21]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import numpy as np
import pandas as pd

import sys

sys.path.append("..")

from src.model import TabularModel
from src.dataset import TreeHealthDataset
from src.utils.hyperoptimization import load_best_trial

In [22]:
# Read the processed CSV into a DataFrame; the path is relative to the notebook's directory.
processed_csv_path = "../data/processed/tree-data-processed-vec.csv"
data = pd.read_csv(processed_csv_path)

In [23]:
# Preview the first five rows to sanity-check the columns that were read.
data.head(n=5)

Unnamed: 0,tree_dbh,health,spc_latin,user_type,problems,address,postcode,nta,boro_ct,month,day,tg_conditions,curb_loc,steward,guards,sidewalk
0,0.007059,0,11,1,-0.206868,15310,157,124,1859,7,26,-0.286018,1,3,2,1
1,0.049412,0,108,1,0.030875,77475,140,151,1932,8,2,-0.372935,1,3,2,0
2,0.007059,1,57,2,-0.206868,245012,107,45,1007,8,4,0.236712,1,0,2,0
3,0.023529,1,57,2,0.030875,6283,107,45,1007,8,4,-0.372935,1,3,2,0
4,0.049412,1,124,2,0.030875,307673,111,16,741,7,29,-0.372935,1,3,2,0


In [24]:
# Column roles used throughout the notebook.
# Columns listed in cat_features are handled as categorical inputs, those in
# num_features as numeric inputs, and target_column is the label to predict.
target_column = "health"
num_features = [
    "tree_dbh",
    "problems",
    "tg_conditions",
]
cat_features = [
    "spc_latin", "user_type",
    "address", "postcode", "nta", "boro_ct",
    "month", "day",
]

In [25]:
# Fetch the stored trial via the project helper and keep only its ["result"]["params"] dict.
# Forward slashes keep the path portable: backslashes only resolve on Windows, and
# "\h" inside a normal string literal is an invalid escape sequence that newer
# Python versions warn about.
best_hyperparams = load_best_trial("../hyperopt/hyperopt_results_1.json")["result"][
    "params"
]

In [26]:
# Display the loaded hyperparameters. Every value is stored as a float (e.g. 1479.0),
# which is why integer-valued settings are cast with int() before use.
best_hyperparams

{'batch_size': 1479.0,
 'cat_emb_dim_0': 10.0,
 'cat_emb_dim_1': 5.0,
 'cat_emb_dim_2': 6.0,
 'cat_emb_dim_3': 8.0,
 'cat_emb_dim_4': 9.0,
 'cat_emb_dim_5': 2.0,
 'cat_emb_dim_6': 6.0,
 'cat_emb_dim_7': 8.0,
 'dropout_p': 0.3757636133377259,
 'hidden_dim': 25.0,
 'learning_rate': 0.0020986773983276454,
 'weight_decay': 1.0060248893757618e-06}

In [27]:
# HYPERPARAMS
# best_hyperparams stores every value as a float, so integer-valued settings are cast.
# One embedding dimension is read per categorical feature ("cat_emb_dim_<i>"); the count
# follows cat_features so the two lists cannot silently drift apart (a missing key
# raises KeyError instead).
CAT_EMBEDDING_DIMS = [
    int(best_hyperparams[f"cat_emb_dim_{i}"]) for i in range(len(cat_features))
]
HIDDEN_DIMS = [int(best_hyperparams["hidden_dim"])]  # single hidden-layer size, wrapped in a list
DROPOUT_P = best_hyperparams["dropout_p"]
BATCH_SIZE = int(best_hyperparams["batch_size"])
LEARNING_RATE = best_hyperparams["learning_rate"]
WEIGHT_DECAY = best_hyperparams["weight_decay"]
weight_decay = WEIGHT_DECAY  # lowercase alias kept because the optimizer cell uses this name

In [28]:
# Distinct-value count for each categorical column, in cat_features order.
# nunique() is evaluated once on just those columns rather than once per feature
# over the entire frame.
# NOTE(review): presumably these counts size the model's embedding tables; if so the
# encoded values must lie in 0..n-1 for each column — confirm against the preprocessing.
cat_cardinalities = [int(count) for count in data[cat_features].nunique()]
num_numeric_features = len(num_features)
num_classes = data[target_column].nunique()

In [33]:
# Model inputs (categorical columns first, then numeric) and the label column.
X, y = data[cat_features + num_features], data[target_column]

# Stage 1: hold out 20% of the rows, stratified on the label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Stage 2: cut the held-out part in half (stratified again) into validation and test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

In [34]:
# Re-attach the label column to each feature split so every split is one DataFrame.
train_df, val_df, test_df = (
    pd.concat([features, labels], axis=1)
    for features, labels in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)

In [35]:
# Wrap each split in the project's TreeHealthDataset, using the same column roles for all three.
train_dataset, val_dataset, test_dataset = (
    TreeHealthDataset(split_df, cat_features, num_features, target_column)
    for split_df in (train_df, val_df, test_df)
)

In [36]:
# Only the training loader shuffles; validation and test keep dataset order so that
# predictions stay aligned with the label series of their split.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)

eval_loader_options = {"batch_size": BATCH_SIZE, "shuffle": False}
val_loader = DataLoader(val_dataset, **eval_loader_options)
test_loader = DataLoader(test_dataset, **eval_loader_options)

In [37]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [38]:
# Load the pickled v1 model object. map_location remaps the stored tensors onto
# `device`, so a checkpoint that was saved on a GPU still loads on a CPU-only machine.
# NOTE: weights_only=False unpickles arbitrary Python objects — only load trusted files.
# NOTE(review): the following cell rebinds `model` to a newly constructed TabularModel,
# so the object loaded here is discarded before training — confirm whether fine-tuning
# from v1 was intended.
model = torch.load(
    "../models/torch_models/model_v1.pth", map_location=device, weights_only=False
)
model = model.to(device)

In [40]:
# Build a new TabularModel from the dataset-derived sizes and the tuned hyperparameters,
# then place it on the selected device.
model = TabularModel(
    cat_cardinalities,
    CAT_EMBEDDING_DIMS,
    num_numeric_features,
    HIDDEN_DIMS,
    num_classes,
    dropout_p=DROPOUT_P,
).to(device)

In [41]:
# "balanced" weights computed from the training labels only, so rarer classes get
# larger weights in the loss.
y_train_np = y_train.values
balanced_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_train_np),
    y=y_train_np,
)
# float32 tensor on the training device, ready to hand to the loss function.
class_weights = torch.tensor(balanced_weights, dtype=torch.float, device=device)
class_weights

tensor([2.2530, 0.4110, 8.1212])

In [42]:
# Class-weighted cross-entropy loss, and Adam using the tuned learning rate and weight decay.
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(
    model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=weight_decay,
)

In [45]:
num_epochs = 2
for epoch in range(num_epochs):
    # One pass through the model's own train_epoch helper; the value it returns is
    # reported as the training loss for this epoch.
    epoch_loss = model.train_epoch(
        train_loader,
        train_dataset,
        optimizer,
        criterion,
        device,
    )
    print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {epoch_loss:.4f}")

    # Evaluate on the validation loader after every epoch.
    val_accuracy = model.validate_model(val_loader, device)
    print(f"Validation Accuracy: {val_accuracy:.4f}")

Epoch 1/2, Training Loss: 0.5798
Validation Accuracy: 0.7586
Epoch 2/2, Training Loss: 0.5560
Validation Accuracy: 0.7562


In [19]:
# Load the v1 checkpoint under its own name. Binding it to `model` at this point would
# replace the model that was just trained, so the test-set evaluation and the final
# torch.save would operate on v1 rather than on the newly trained weights.
# map_location plus .to(device) keep it on the same device as the data loaders' batches.
# NOTE: weights_only=False unpickles arbitrary Python objects — only load trusted files.
baseline_model = torch.load(
    "../models/torch_models/model_v1.pth", map_location=device, weights_only=False
).to(device)

In [46]:
# Per-class precision / recall / F1 on the test split.
test_predictions = model.predict(test_loader, device)
test_report = classification_report(y_test, test_predictions)
print(test_report)

              precision    recall  f1-score   support

           0       0.31      0.30      0.30      9160
           1       0.86      0.87      0.86     50208
           2       0.18      0.16      0.17      2541

    accuracy                           0.76     61909
   macro avg       0.45      0.44      0.45     61909
weighted avg       0.75      0.76      0.75     61909



In [66]:
# Accuracy of `model` on the test loader, via the same helper used for validation.
# NOTE(review): the recorded value (0.8439) does not match the 0.76 accuracy shown in the
# classification report for the same test split, so the two outputs appear to come from
# different model states — re-run the notebook top to bottom before trusting either number.
model.validate_model(test_loader, device)

0.8438667714805681

In [23]:
# Persist the whole model object (not only its state_dict) as the v2 checkpoint.
torch.save(obj=model, f="../models/torch_models/model_v2.pth")