In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.1-py3-none-any.whl.metadata (7.2 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.1-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.8/231.8 kB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.9-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: M

In [None]:
import torch
import optuna
import random
import pandas as pd
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


In [None]:
class TransformerBlock(nn.Module): #build a transformer block with dropout and normilization
    def __init__(self, input_dim, num_heads, ffn_dim, dropout):
        super(TransformerBlock, self).__init__()
        self.attention = nn.MultiheadAttention(embed_dim=input_dim, num_heads=num_heads, dropout=dropout) #multi-head self-attention layer
        self.norm1 = nn.LayerNorm(input_dim) #first normalization after attention
        self.norm2 = nn.LayerNorm(input_dim) #second normalization after FFN
        self.ffn = nn.Sequential( #feed-forward network with ReLU activation
            nn.Linear(input_dim, ffn_dim),
            nn.ReLU(),
            nn.Linear(ffn_dim, input_dim)
        )
        self.dropout = nn.Dropout(dropout) #dropout for regularization

    def forward(self, x):
        attn_output, _ = self.attention(x, x, x) #self-attention: query, key, value are the same
        x = self.norm1(x + self.dropout(attn_output)) #dropout + norm after attention

        ffn_output = self.ffn(x) #pass through feed-forward network
        x = self.norm2(x + self.dropout(ffn_output)) #dropout + norm after FFN

        return x

class TransformerMLP(nn.Module): #transformer MLP for numerical data
    def __init__(self, input_dim=62, hidden_dim=128, num_heads=4, num_layers=2, ffn_dim=128, dropout=0.1, output_dim=8):
        super(TransformerMLP, self).__init__()

        assert hidden_dim % num_heads == 0, "hidden_dim must be divisible by num_heads" #assertion for optuna optimization, otherwise model will error

        #standard transformer with embedding layer, attention heads, and output layer
        self.embedding = nn.Linear(input_dim, hidden_dim) #linear layer to project input into hidden_dim
        self.transformer_layers = nn.ModuleList([ #stack of transformer blocks
            TransformerBlock(hidden_dim, num_heads, ffn_dim, dropout) for _ in range(num_layers)
        ])
        self.fc = nn.Linear(hidden_dim, output_dim) #final linear layer for output

    def forward(self, x):
        x = self.embedding(x) #project input to hidden space
        x = x.unsqueeze(1) #add sequence dimension (seq_len=1 for tabular input)

        for layer in self.transformer_layers:
            x = layer(x) #pass through each transformer block

        x = x.squeeze(1) #remove sequence dimension
        x = self.fc(x) #map to output dimension
        return x

In [None]:
#standard train loop
def train(model, data_loader, crit, opt, device): #model, trainset data loader, criterion, optimizer, device
  model.train()
  total_loss, correct, total = 0, 0, 0
  total_size = len(data_loader)
  for data, label in data_loader:
    data= data.to(device)
    label = label.to(device)

    opt.zero_grad()

    out = model(data)
    loss = crit(out, label)
    loss.backward()
    opt.step()

    total_loss += loss.item()
    _, y = torch.max(out, 1)
    total += label.size(0)
    correct += (y == label).sum().item()

  epoch_loss = total_loss / total_size
  epoch_score = correct/total

  return epoch_loss, epoch_score

In [None]:
#standard eval loop
def eval(model, data_loader, crit, device): #model, testset data loader, criterion, device
  model.eval()
  total_loss, correct, total = 0, 0, 0
  total_size = len(data_loader)
  with torch.no_grad():
    for data, label in data_loader:
      data= data.to(device)
      label = label.to(device)

      out = model(data)
      loss = crit(out, label)

      total_loss += loss.item()
      _, y = torch.max(out, 1)
      total += label.size(0)
      correct += (y == label).sum().item()

  epoch_loss = total_loss / total_size
  epoch_score = correct/total
  return epoch_loss, epoch_score

In [None]:
def run_model(model, train_loader, test_loader, epochs, crit, opt, device):
  final_test_score = 0
  for epoch in range(epochs):
    train_loss, train_score = train(model, train_loader, crit, opt, device)
    test_loss, test_score = eval(model, test_loader, crit, device)

    print(f'Epoch {epoch+1}/{epochs}')
    print(f'Train Loss: {train_loss:.4f}, Train Score: {train_score:.4f}')
    print(f'Test Loss: {test_loss:.4f}, Test Score: {test_score:.4f}')
    final_test_score = test_score
  return final_test_score

In [None]:
#Custom pandas dataset given csvs are read in as dataframes
class PandasDataset(Dataset):
    def __init__(self, dataframe, target_column=None):
        self.features = dataframe.drop(columns=[target_column]).values if target_column else dataframe.values
        self.features = torch.tensor(self.features, dtype=torch.float)

        if target_column:
            self.labels = torch.tensor(dataframe[target_column].values, dtype=torch.int64)
        else:
            self.labels = None

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        if self.labels is not None:
            return self.features[idx], self.labels[idx]
        return self.features[idx]

In [None]:
label_data = pd.read_csv('test_dataset_label_encoded.csv', delimiter='\t') #read in label encoded data
label_data['Placement'] = label_data['Placement'] -1 #get placement aka independant variable
label_data_traitless = label_data.copy(deep=True).iloc[:,:55] #make a subset without trait data
label_data_traitless["Placement"] = label_data['Placement']
match_data = pd.read_csv('test_dataset_encoded.csv', delimiter='\t') #repeat steps for binary encoded data
match_data['Placement'] = match_data['Placement'] -1
match_data_traitless = label_data.copy(deep=True).iloc[:,:344]
match_data_traitless["Placement"] = match_data['Placement']
match_data

Unnamed: 0,Champion 1_0,Champion 1_1,Champion 1_2,Champion 1_3,Champion 1_4,Champion 1_5,Champion 1_6,Level 1,Item 1 1_0,Item 1 1_1,...,Trait 7_3,Trait 7_4,Tier 7,Trait 8_0,Trait 8_1,Trait 8_2,Trait 8_3,Trait 8_4,Tier 8,Placement
0,0,0,0,0,0,0,1,2,0,0,...,0,1,3,0,0,0,0,1,0,1
1,0,0,0,0,0,1,0,2,0,0,...,1,0,0,0,0,0,0,1,0,2
2,0,0,0,0,0,1,1,2,0,0,...,1,0,0,0,0,0,0,1,0,6
3,0,0,0,0,1,0,0,2,0,0,...,1,0,0,0,0,0,0,1,0,4
4,0,0,0,0,0,1,0,2,0,0,...,1,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
270891,0,0,0,0,0,0,1,2,0,0,...,0,1,3,0,0,0,0,1,0,2
270892,0,0,0,0,1,0,0,3,1,0,...,1,0,0,0,0,0,0,1,0,5
270893,0,0,0,0,1,0,1,2,0,0,...,1,0,0,0,0,0,0,1,0,6
270894,0,0,0,0,0,0,1,2,0,0,...,1,0,0,0,0,0,0,1,0,4


**Training on the 4 different datasets to see what performs the best before optimizing it**

In [None]:
train_binary, test_binary = train_test_split(match_data, test_size=0.2, random_state=42, shuffle=True)
train_binary_traitless, test_binary_traitless = train_test_split(match_data_traitless, test_size=0.2, random_state=42, shuffle=True)
train_label, test_label = train_test_split(label_data, test_size=0.2, random_state=42, shuffle=True)
train_label_traitless, test_label_traitless = train_test_split(label_data_traitless, test_size=0.2, random_state=42, shuffle=True)

In [None]:
train_dataset_binary = PandasDataset(train_binary, target_column="Placement")
test_dataset_binary = PandasDataset(test_binary, target_column="Placement")

    # Create DataLoaders
train_loader_binary = DataLoader(train_dataset_binary, batch_size=1024, shuffle=True)
test_loader_binary = DataLoader(test_dataset_binary, batch_size=1024, shuffle=False)

In [None]:
train_dataset_binary_traitless = PandasDataset(train_binary_traitless, target_column="Placement")
test_dataset_binary_traitless = PandasDataset(test_binary_traitless, target_column="Placement")

    # Create DataLoaders
train_loader_binary_traitless = DataLoader(train_dataset_binary_traitless, batch_size=1024, shuffle=True)
test_loader_binary_traitless = DataLoader(test_dataset_binary_traitless, batch_size=1024, shuffle=False)

In [None]:
train_dataset_label = PandasDataset(train_label, target_column="Placement")
test_dataset_label = PandasDataset(test_label, target_column="Placement")

    # Create DataLoaders
train_loader_label = DataLoader(train_dataset_label, batch_size=1024, shuffle=True)
test_loader_label = DataLoader(test_dataset_label, batch_size=1024, shuffle=False)

In [None]:
train_dataset_label_traitless = PandasDataset(train_label_traitless, target_column="Placement")
test_dataset_label_traitless = PandasDataset(test_label_traitless, target_column="Placement")

    # Create DataLoaders
train_loader_label_traitless = DataLoader(train_dataset_label_traitless, batch_size=1024, shuffle=True)
test_loader_label_traitless = DataLoader(test_dataset_label_traitless, batch_size=1024, shuffle=False)

In [None]:
train_dataset_binary[0]

(tensor([0., 0., 0., 0., 0., 1., 0., 2., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
         0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0., 1., 0., 3., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1.,
         1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 3.,
         0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0.,
         0., 1., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
         1., 0., 0., 0., 0., 1., 1., 0., 2., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
         0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1.,
         0., 1., 1., 2., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1.,
         0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 1., 0.,
         0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0., 0., 0., 1., 0.,

In [None]:
epochs=15
crit = nn.CrossEntropyLoss()
learning_rate = 0.001

In [None]:
binary_model = TransformerMLP(input_dim=len(train_dataset_binary[0][0]), hidden_dim=512).to(device)
opt = torch.optim.Adam(binary_model.parameters(), lr=learning_rate)
final_test_score = run_model(binary_model, train_loader_binary, test_loader_binary, epochs, crit, opt, device)

Epoch 1/15
Train Loss: 1.8801, Train Score: 0.2381
Test Loss: 1.7342, Test Score: 0.2831
Epoch 2/15
Train Loss: 1.6848, Train Score: 0.2951
Test Loss: 1.6657, Test Score: 0.3023
Epoch 3/15
Train Loss: 1.6586, Train Score: 0.3047
Test Loss: 1.6523, Test Score: 0.3119
Epoch 4/15
Train Loss: 1.6360, Train Score: 0.3110
Test Loss: 1.6368, Test Score: 0.3148
Epoch 5/15
Train Loss: 1.6199, Train Score: 0.3183
Test Loss: 1.6404, Test Score: 0.3086
Epoch 6/15
Train Loss: 1.6145, Train Score: 0.3191
Test Loss: 1.6149, Test Score: 0.3190
Epoch 7/15
Train Loss: 1.5974, Train Score: 0.3265
Test Loss: 1.6122, Test Score: 0.3213
Epoch 8/15
Train Loss: 1.5907, Train Score: 0.3293
Test Loss: 1.6174, Test Score: 0.3210
Epoch 9/15
Train Loss: 1.5854, Train Score: 0.3298
Test Loss: 1.6138, Test Score: 0.3181
Epoch 10/15
Train Loss: 1.5796, Train Score: 0.3330
Test Loss: 1.6087, Test Score: 0.3201
Epoch 11/15
Train Loss: 1.5872, Train Score: 0.3312
Test Loss: 1.6239, Test Score: 0.3180
Epoch 12/15
Train L

In [None]:
binary_model_traitless = TransformerMLP(input_dim=len(train_dataset_binary_traitless[0][0]), hidden_dim=512).to(device)
opt = torch.optim.Adam(binary_model_traitless.parameters(), lr=learning_rate)
final_test_score_traitless = run_model(binary_model_traitless, train_loader_binary_traitless, test_loader_binary_traitless, epochs, crit, opt, device)

Epoch 1/15
Train Loss: 1.9341, Train Score: 0.2194
Test Loss: 1.8567, Test Score: 0.2446
Epoch 2/15
Train Loss: 1.8443, Train Score: 0.2441
Test Loss: 1.8426, Test Score: 0.2463
Epoch 3/15
Train Loss: 1.8777, Train Score: 0.2324
Test Loss: 1.9842, Test Score: 0.1802
Epoch 4/15
Train Loss: 1.9720, Train Score: 0.2015
Test Loss: 1.9279, Test Score: 0.2121
Epoch 5/15
Train Loss: 1.9544, Train Score: 0.2090
Test Loss: 1.9307, Test Score: 0.2199
Epoch 6/15
Train Loss: 1.9324, Train Score: 0.2167
Test Loss: 1.8822, Test Score: 0.2291
Epoch 7/15
Train Loss: 1.9097, Train Score: 0.2240
Test Loss: 1.9073, Test Score: 0.2243
Epoch 8/15
Train Loss: 1.9554, Train Score: 0.2085
Test Loss: 1.9050, Test Score: 0.2287
Epoch 9/15
Train Loss: 1.9011, Train Score: 0.2253
Test Loss: 1.9392, Test Score: 0.2098
Epoch 10/15
Train Loss: 1.8898, Train Score: 0.2309
Test Loss: 1.8563, Test Score: 0.2391
Epoch 11/15
Train Loss: 1.8601, Train Score: 0.2400
Test Loss: 1.8426, Test Score: 0.2449
Epoch 12/15
Train L

In [None]:
label_model = TransformerMLP(input_dim=len(train_dataset_label[0][0]), hidden_dim=512).to(device)
opt = torch.optim.Adam(label_model.parameters(), lr=learning_rate)
final_test_score_label = run_model(label_model, train_loader_label, test_loader_label, epochs, crit, opt, device)

Epoch 1/15
Train Loss: 1.9440, Train Score: 0.2173
Test Loss: 1.8468, Test Score: 0.2429
Epoch 2/15
Train Loss: 1.8311, Train Score: 0.2476
Test Loss: 1.8195, Test Score: 0.2493
Epoch 3/15
Train Loss: 1.8163, Train Score: 0.2538
Test Loss: 1.8092, Test Score: 0.2596
Epoch 4/15
Train Loss: 1.8149, Train Score: 0.2534
Test Loss: 1.8028, Test Score: 0.2572
Epoch 5/15
Train Loss: 1.8620, Train Score: 0.2394
Test Loss: 1.9554, Test Score: 0.2065
Epoch 6/15
Train Loss: 1.9990, Train Score: 0.1862
Test Loss: 2.0063, Test Score: 0.1797
Epoch 7/15
Train Loss: 2.0584, Train Score: 0.1457
Test Loss: 2.0797, Test Score: 0.1379
Epoch 8/15
Train Loss: 2.0426, Train Score: 0.1632
Test Loss: 2.0241, Test Score: 0.1807
Epoch 9/15
Train Loss: 2.0237, Train Score: 0.1722
Test Loss: 2.0892, Test Score: 0.1413
Epoch 10/15
Train Loss: 1.9654, Train Score: 0.2041
Test Loss: 2.0433, Test Score: 0.1614
Epoch 11/15
Train Loss: 1.9383, Train Score: 0.2156
Test Loss: 1.9249, Test Score: 0.2221
Epoch 12/15
Train L

In [None]:
label_model_traitless = TransformerMLP(input_dim=len(train_dataset_label_traitless[0][0]), hidden_dim=512).to(device)
opt = torch.optim.Adam(label_model_traitless.parameters(), lr=learning_rate)
final_test_score_label_traitless = run_model(label_model_traitless, train_loader_label_traitless, test_loader_label_traitless, epochs, crit, opt, device)

Epoch 1/15
Train Loss: 1.9403, Train Score: 0.2190
Test Loss: 1.8428, Test Score: 0.2464
Epoch 2/15
Train Loss: 1.8520, Train Score: 0.2415
Test Loss: 1.8315, Test Score: 0.2480
Epoch 3/15
Train Loss: 1.8526, Train Score: 0.2418
Test Loss: 1.8355, Test Score: 0.2480
Epoch 4/15
Train Loss: 1.8324, Train Score: 0.2484
Test Loss: 1.8200, Test Score: 0.2499
Epoch 5/15
Train Loss: 1.8145, Train Score: 0.2541
Test Loss: 1.8009, Test Score: 0.2581
Epoch 6/15
Train Loss: 1.8079, Train Score: 0.2573
Test Loss: 1.8128, Test Score: 0.2554
Epoch 7/15
Train Loss: 1.8073, Train Score: 0.2570
Test Loss: 1.8007, Test Score: 0.2564
Epoch 8/15
Train Loss: 1.9857, Train Score: 0.1946
Test Loss: 2.0613, Test Score: 0.1407
Epoch 9/15
Train Loss: 1.9897, Train Score: 0.1934
Test Loss: 1.9575, Test Score: 0.2121
Epoch 10/15
Train Loss: 2.0335, Train Score: 0.1600
Test Loss: 2.0781, Test Score: 0.1318
Epoch 11/15
Train Loss: 2.0441, Train Score: 0.1649
Test Loss: 1.9496, Test Score: 0.2080
Epoch 12/15
Train L

**binary encoded data with traits performs the best, so optimize on this model**

In [None]:
def objective(trial):
  num_heads = trial.suggest_int("num_heads", 2, 8)
  random_integer = random.randint(32, 128)
  hidden_dim = (random_integer // 32) * 32 * num_heads
  num_layers = trial.suggest_int("num_layers", 1, 4)
  ffn_dim = trial.suggest_int("ffn_dim", 64, 512, log=True)
  dropout = trial.suggest_uniform("dropout", 0.1, 0.5)
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-1)
  batch_size = trial.suggest_int("batch_size", 16, 1024)

  input_dim=len(train_dataset_binary[0][0])
  best_model = TransformerMLP(input_dim=input_dim,
                              hidden_dim=hidden_dim,
                              num_heads=num_heads,
                              num_layers=num_layers,
                              ffn_dim=ffn_dim,
                              dropout=dropout,
                              output_dim=8)
  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(best_model.parameters(), lr=learning_rate)

  train_loader_binary_best = DataLoader(train_dataset_binary, batch_size=batch_size, shuffle=True)

  best_model.train()
  running_loss = 0.0
  correct_preds = 0
  total_preds = 0
  print(num_heads, hidden_dim, num_layers, ffn_dim, dropout, learning_rate, batch_size)
  for inputs, labels in train_loader_binary_best:
      optimizer.zero_grad()

      # Forward pass
      outputs = best_model(inputs)
      loss = criterion(outputs, labels)

      # Backpropagation
      loss.backward()
      optimizer.step()

      running_loss += loss.item()

      # Calculate accuracy
      _, predicted = torch.max(outputs.data, 1)
      total_preds += labels.size(0)
      correct_preds += (predicted == labels).sum().item()

  # Calculate accuracy
  accuracy = correct_preds / total_preds
  print(f"Loss: {running_loss / len(train_loader_binary_best)}, Accuracy: {accuracy}")

  # Return the loss as the value to optimize
  return running_loss / len(train_loader_binary_best)

# Create an Optuna study
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

# Output the best hyperparameters found by Optuna
print("Best hyperparameters found: ", study.best_trial.params)

[I 2025-03-12 19:00:06,120] A new study created in memory with name: no-name-582be93f-1f70-4eec-8cb2-2117980fb6f0
  dropout = trial.suggest_uniform("dropout", 0.1, 0.5)
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-1)


3 192 4 111 0.14047986072797786 0.017246153310800715 672


[I 2025-03-12 19:03:20,842] Trial 0 finished with value: 2.124136547555126 and parameters: {'num_heads': 3, 'num_layers': 4, 'ffn_dim': 111, 'dropout': 0.14047986072797786, 'learning_rate': 0.017246153310800715, 'batch_size': 672}. Best is trial 0 with value: 2.124136547555126.


Loss: 2.124136547555126, Accuracy: 0.12582827294708282
5 480 1 258 0.18821946248479252 0.0005203017550857601 259


[I 2025-03-12 19:03:58,000] Trial 1 finished with value: 1.7089815598377334 and parameters: {'num_heads': 5, 'num_layers': 1, 'ffn_dim': 258, 'dropout': 0.18821946248479252, 'learning_rate': 0.0005203017550857601, 'batch_size': 259}. Best is trial 1 with value: 1.7089815598377334.


Loss: 1.7089815598377334, Accuracy: 0.286831613724875
5 320 4 326 0.17191839104102774 1.136690529933134e-05 816


[I 2025-03-12 19:05:43,708] Trial 2 finished with value: 1.9942948437274848 and parameters: {'num_heads': 5, 'num_layers': 4, 'ffn_dim': 326, 'dropout': 0.17191839104102774, 'learning_rate': 1.136690529933134e-05, 'batch_size': 816}. Best is trial 1 with value: 1.7089815598377334.


Loss: 1.9942948437274848, Accuracy: 0.1953985861680725
8 256 4 336 0.35141918855878584 2.1327296383213623e-05 938


[I 2025-03-12 19:08:15,162] Trial 3 finished with value: 1.973677069462579 and parameters: {'num_heads': 8, 'num_layers': 4, 'ffn_dim': 336, 'dropout': 0.35141918855878584, 'learning_rate': 2.1327296383213623e-05, 'batch_size': 938}. Best is trial 1 with value: 1.7089815598377334.


Loss: 1.973677069462579, Accuracy: 0.2009819302681851
8 512 1 370 0.4024804742361371 0.007105573676308286 392


[I 2025-03-12 19:09:06,144] Trial 4 finished with value: 2.116543292136994 and parameters: {'num_heads': 8, 'num_layers': 1, 'ffn_dim': 370, 'dropout': 0.4024804742361371, 'learning_rate': 0.007105573676308286, 'batch_size': 392}. Best is trial 1 with value: 1.7089815598377334.


Loss: 2.116543292136994, Accuracy: 0.12549142656748924
8 512 1 93 0.43034788125244416 0.0032007947239337414 995


[I 2025-03-12 19:11:29,250] Trial 5 finished with value: 2.1577177474258145 and parameters: {'num_heads': 8, 'num_layers': 1, 'ffn_dim': 93, 'dropout': 0.43034788125244416, 'learning_rate': 0.0032007947239337414, 'batch_size': 995}. Best is trial 1 with value: 1.7089815598377334.


Loss: 2.1577177474258145, Accuracy: 0.12552372690525848
7 448 4 147 0.18868240233749642 0.010564346650278581 789


[I 2025-03-12 19:25:17,602] Trial 6 finished with value: 2.177641978697343 and parameters: {'num_heads': 7, 'num_layers': 4, 'ffn_dim': 147, 'dropout': 0.18868240233749642, 'learning_rate': 0.010564346650278581, 'batch_size': 789}. Best is trial 1 with value: 1.7089815598377334.


Loss: 2.177641978697343, Accuracy: 0.12489617748574171
5 480 4 415 0.2388253061788308 0.011381355998485202 893


[I 2025-03-12 19:33:55,255] Trial 7 finished with value: 2.2372145741074174 and parameters: {'num_heads': 5, 'num_layers': 4, 'ffn_dim': 415, 'dropout': 0.2388253061788308, 'learning_rate': 0.011381355998485202, 'batch_size': 893}. Best is trial 1 with value: 1.7089815598377334.


Loss: 2.2372145741074174, Accuracy: 0.12482696247623618
3 96 1 356 0.3716588248699091 0.01425472832411105 370


[I 2025-03-12 19:34:07,554] Trial 8 finished with value: 2.0885653320839785 and parameters: {'num_heads': 3, 'num_layers': 1, 'ffn_dim': 356, 'dropout': 0.3716588248699091, 'learning_rate': 0.01425472832411105, 'batch_size': 370}. Best is trial 1 with value: 1.7089815598377334.


Loss: 2.0885653320839785, Accuracy: 0.12660348105354474
6 384 3 382 0.3257071906649246 0.008696157285562759 958


[I 2025-03-12 19:43:48,487] Trial 9 finished with value: 2.1485676030230416 and parameters: {'num_heads': 6, 'num_layers': 3, 'ffn_dim': 382, 'dropout': 0.3257071906649246, 'learning_rate': 0.008696157285562759, 'batch_size': 958}. Best is trial 1 with value: 1.7089815598377334.


Loss: 2.1485676030230416, Accuracy: 0.12574060060170916
2 64 2 219 0.2557440528914259 0.00026750052120056307 26


[I 2025-03-12 19:44:41,844] Trial 10 finished with value: 1.6967165302949996 and parameters: {'num_heads': 2, 'num_layers': 2, 'ffn_dim': 219, 'dropout': 0.2557440528914259, 'learning_rate': 0.00026750052120056307, 'batch_size': 26}. Best is trial 10 with value: 1.6967165302949996.


Loss: 1.6967165302949996, Accuracy: 0.291432104690009
2 64 2 221 0.26170335938408695 0.00015340307314982833 18


[I 2025-03-12 19:45:52,545] Trial 11 finished with value: 1.7021077736964654 and parameters: {'num_heads': 2, 'num_layers': 2, 'ffn_dim': 221, 'dropout': 0.26170335938408695, 'learning_rate': 0.00015340307314982833, 'batch_size': 18}. Best is trial 10 with value: 1.6967165302949996.


Loss: 1.7021077736964654, Accuracy: 0.29037080787759095
2 256 2 196 0.27298496788546384 0.0001799323549258246 21


[I 2025-03-12 19:47:42,096] Trial 12 finished with value: 1.6985305796644485 and parameters: {'num_heads': 2, 'num_layers': 2, 'ffn_dim': 196, 'dropout': 0.27298496788546384, 'learning_rate': 0.0001799323549258246, 'batch_size': 21}. Best is trial 10 with value: 1.6967165302949996.


Loss: 1.6985305796644485, Accuracy: 0.29252108750622935
2 256 2 163 0.2810512029808898 0.00010539174662319339 26


[I 2025-03-12 19:49:13,748] Trial 13 finished with value: 1.706155498646157 and parameters: {'num_heads': 2, 'num_layers': 2, 'ffn_dim': 163, 'dropout': 0.2810512029808898, 'learning_rate': 0.00010539174662319339, 'batch_size': 26}. Best is trial 10 with value: 1.6967165302949996.


Loss: 1.706155498646157, Accuracy: 0.2881559275734141
3 96 2 208 0.4938552754094862 0.09174409350045439 191


[I 2025-03-12 19:49:31,996] Trial 14 finished with value: 2.111247867634643 and parameters: {'num_heads': 3, 'num_layers': 2, 'ffn_dim': 208, 'dropout': 0.4938552754094862, 'learning_rate': 0.09174409350045439, 'batch_size': 191}. Best is trial 10 with value: 1.6967165302949996.


Loss: 2.111247867634643, Accuracy: 0.12448550176267557
4 256 3 131 0.22880891429736 0.0005297977920867471 147


[I 2025-03-12 19:50:20,714] Trial 15 finished with value: 1.7045255020917471 and parameters: {'num_heads': 4, 'num_layers': 3, 'ffn_dim': 131, 'dropout': 0.22880891429736, 'learning_rate': 0.0005297977920867471, 'batch_size': 147}. Best is trial 10 with value: 1.6967165302949996.


Loss: 1.7045255020917471, Accuracy: 0.28826205725465587
2 64 2 79 0.30233330011045106 9.605509502451262e-05 524


[I 2025-03-12 19:50:34,728] Trial 16 finished with value: 1.875345260336779 and parameters: {'num_heads': 2, 'num_layers': 2, 'ffn_dim': 79, 'dropout': 0.30233330011045106, 'learning_rate': 9.605509502451262e-05, 'batch_size': 524}. Best is trial 10 with value: 1.6967165302949996.


Loss: 1.875345260336779, Accuracy: 0.2348465272522564
4 384 3 512 0.22891145737025695 0.001441174897488019 122


[I 2025-03-12 19:55:28,604] Trial 17 finished with value: 2.0942680700213034 and parameters: {'num_heads': 4, 'num_layers': 3, 'ffn_dim': 512, 'dropout': 0.22891145737025695, 'learning_rate': 0.001441174897488019, 'batch_size': 122}. Best is trial 10 with value: 1.6967165302949996.


Loss: 2.0942680700213034, Accuracy: 0.12422248472655457
4 384 2 250 0.12767808963967553 3.4743062191619445e-05 295


[I 2025-03-12 19:56:19,487] Trial 18 finished with value: 1.7970478447116152 and parameters: {'num_heads': 4, 'num_layers': 2, 'ffn_dim': 250, 'dropout': 0.12767808963967553, 'learning_rate': 3.4743062191619445e-05, 'batch_size': 295}. Best is trial 10 with value: 1.6967165302949996.


Loss: 1.7970478447116152, Accuracy: 0.26311855146828106
3 96 3 184 0.3148846750287438 0.0002198628939732341 501


[I 2025-03-12 19:56:45,128] Trial 19 finished with value: 1.7634741106429772 and parameters: {'num_heads': 3, 'num_layers': 3, 'ffn_dim': 184, 'dropout': 0.3148846750287438, 'learning_rate': 0.0002198628939732341, 'batch_size': 501}. Best is trial 10 with value: 1.6967165302949996.


Loss: 1.7634741106429772, Accuracy: 0.27075988851769134
2 64 2 276 0.10740856009515329 0.0009321152444711285 100


[I 2025-03-12 19:57:04,134] Trial 20 finished with value: 1.686800765595313 and parameters: {'num_heads': 2, 'num_layers': 2, 'ffn_dim': 276, 'dropout': 0.10740856009515329, 'learning_rate': 0.0009321152444711285, 'batch_size': 100}. Best is trial 20 with value: 1.686800765595313.


Loss: 1.686800765595313, Accuracy: 0.29457908045552705
2 128 2 281 0.11159925895512796 0.0004023858299284209 91


[I 2025-03-12 19:57:30,452] Trial 21 finished with value: 1.6906125712154494 and parameters: {'num_heads': 2, 'num_layers': 2, 'ffn_dim': 281, 'dropout': 0.11159925895512796, 'learning_rate': 0.0004023858299284209, 'batch_size': 91}. Best is trial 20 with value: 1.686800765595313.


Loss: 1.6906125712154494, Accuracy: 0.2920181251038225
2 64 2 267 0.10042809904090386 0.0011073882093840127 106


[I 2025-03-12 19:57:49,931] Trial 22 finished with value: 1.6853186340378665 and parameters: {'num_heads': 2, 'num_layers': 2, 'ffn_dim': 267, 'dropout': 0.10042809904090386, 'learning_rate': 0.0011073882093840127, 'batch_size': 106}. Best is trial 22 with value: 1.6853186340378665.


Loss: 1.6853186340378665, Accuracy: 0.2939930600417136
3 288 2 284 0.10202189942314857 0.0014936035530228873 221


[I 2025-03-12 19:59:24,001] Trial 23 finished with value: 1.7750715895894835 and parameters: {'num_heads': 3, 'num_layers': 2, 'ffn_dim': 284, 'dropout': 0.10202189942314857, 'learning_rate': 0.0014936035530228873, 'batch_size': 221}. Best is trial 22 with value: 1.6853186340378665.


Loss: 1.7750715895894835, Accuracy: 0.2670684213440632
2 64 3 294 0.10500125360179233 0.0031138709404153826 120


[I 2025-03-12 19:59:49,719] Trial 24 finished with value: 1.7349635784412143 and parameters: {'num_heads': 2, 'num_layers': 3, 'ffn_dim': 294, 'dropout': 0.10500125360179233, 'learning_rate': 0.0031138709404153826, 'batch_size': 120}. Best is trial 22 with value: 1.6853186340378665.


Loss: 1.7349635784412143, Accuracy: 0.27629708927813357
3 192 1 485 0.1569234629273978 0.0007509007365453631 386


[I 2025-03-12 20:00:08,029] Trial 25 finished with value: 1.7079533440362515 and parameters: {'num_heads': 3, 'num_layers': 1, 'ffn_dim': 485, 'dropout': 0.1569234629273978, 'learning_rate': 0.0007509007365453631, 'batch_size': 386}. Best is trial 22 with value: 1.6853186340378665.


Loss: 1.7079533440362515, Accuracy: 0.28909725170268924
4 384 2 247 0.12409853579801597 0.0025273915914982327 313


[I 2025-03-12 20:04:04,278] Trial 26 finished with value: 2.105617435463579 and parameters: {'num_heads': 4, 'num_layers': 2, 'ffn_dim': 247, 'dropout': 0.12409853579801597, 'learning_rate': 0.0025273915914982327, 'batch_size': 313}. Best is trial 22 with value: 1.6853186340378665.


Loss: 2.105617435463579, Accuracy: 0.1255144982373244
6 576 3 290 0.19929702153100862 0.0004329153935924633 104


[I 2025-03-12 20:07:15,393] Trial 27 finished with value: 1.7289750570871094 and parameters: {'num_heads': 6, 'num_layers': 3, 'ffn_dim': 290, 'dropout': 0.19929702153100862, 'learning_rate': 0.0004329153935924633, 'batch_size': 104}. Best is trial 22 with value: 1.6853186340378665.


Loss: 1.7289750570871094, Accuracy: 0.2782812528839587
2 64 2 410 0.15565930877329498 5.446751392754012e-05 488


[I 2025-03-12 20:07:31,691] Trial 28 finished with value: 1.8868244567613923 and parameters: {'num_heads': 2, 'num_layers': 2, 'ffn_dim': 410, 'dropout': 0.15565930877329498, 'learning_rate': 5.446751392754012e-05, 'batch_size': 488}. Best is trial 22 with value: 1.6853186340378665.


Loss: 1.8868244567613923, Accuracy: 0.23068439801399065
3 288 1 162 0.13872011897740494 0.028797067532820367 648


[I 2025-03-12 20:07:55,592] Trial 29 finished with value: 2.2328119448761443 and parameters: {'num_heads': 3, 'num_layers': 1, 'ffn_dim': 162, 'dropout': 0.13872011897740494, 'learning_rate': 0.028797067532820367, 'batch_size': 648}. Best is trial 22 with value: 1.6853186340378665.


Loss: 2.2328119448761443, Accuracy: 0.12438860074936783
4 256 2 129 0.11594699139570788 0.0011157181656211712 223


[I 2025-03-12 20:08:27,293] Trial 30 finished with value: 1.7108313874207406 and parameters: {'num_heads': 4, 'num_layers': 2, 'ffn_dim': 129, 'dropout': 0.11594699139570788, 'learning_rate': 0.0011157181656211712, 'batch_size': 223}. Best is trial 22 with value: 1.6853186340378665.


Loss: 1.7108313874207406, Accuracy: 0.28702541575149043
2 128 2 226 0.14806563876731615 0.000257114801484917 77


[I 2025-03-12 20:08:54,464] Trial 31 finished with value: 1.6986619246365968 and parameters: {'num_heads': 2, 'num_layers': 2, 'ffn_dim': 226, 'dropout': 0.14806563876731615, 'learning_rate': 0.000257114801484917, 'batch_size': 77}. Best is trial 22 with value: 1.6853186340378665.


Loss: 1.6986619246365968, Accuracy: 0.2909798999612396
2 192 2 260 0.17233142758868625 0.0002949411230943191 171


[I 2025-03-12 20:09:18,101] Trial 32 finished with value: 1.7015791420680868 and parameters: {'num_heads': 2, 'num_layers': 2, 'ffn_dim': 260, 'dropout': 0.17233142758868625, 'learning_rate': 0.0002949411230943191, 'batch_size': 171}. Best is trial 22 with value: 1.6853186340378665.


Loss: 1.7015791420680868, Accuracy: 0.2890372653611178
3 288 2 317 0.20652284473135485 0.0007596812785918264 67


[I 2025-03-12 20:10:14,535] Trial 33 finished with value: 1.695754340406179 and parameters: {'num_heads': 3, 'num_layers': 2, 'ffn_dim': 317, 'dropout': 0.20652284473135485, 'learning_rate': 0.0007596812785918264, 'batch_size': 67}. Best is trial 22 with value: 1.6853186340378665.


Loss: 1.695754340406179, Accuracy: 0.2903892652134591
3 192 1 325 0.2026157455386579 0.0007799974553424931 80


[I 2025-03-12 20:10:35,898] Trial 34 finished with value: 1.6937003683982277 and parameters: {'num_heads': 3, 'num_layers': 1, 'ffn_dim': 325, 'dropout': 0.2026157455386579, 'learning_rate': 0.0007799974553424931, 'batch_size': 80}. Best is trial 22 with value: 1.6853186340378665.


Loss: 1.6937003683982277, Accuracy: 0.29118293065578915
3 192 1 438 0.17289177630365454 0.001844783068635161 262


[I 2025-03-12 20:10:51,155] Trial 35 finished with value: 1.7103967037465837 and parameters: {'num_heads': 3, 'num_layers': 1, 'ffn_dim': 438, 'dropout': 0.17289177630365454, 'learning_rate': 0.001844783068635161, 'batch_size': 262}. Best is trial 22 with value: 1.6853186340378665.


Loss: 1.7103967037465837, Accuracy: 0.28730227578951256
2 64 1 321 0.13576650777643698 0.0049653342865718314 172


[I 2025-03-12 20:11:07,274] Trial 36 finished with value: 1.736779081348389 and parameters: {'num_heads': 2, 'num_layers': 1, 'ffn_dim': 321, 'dropout': 0.13576650777643698, 'learning_rate': 0.0049653342865718314, 'batch_size': 172}. Best is trial 22 with value: 1.6853186340378665.


Loss: 1.736779081348389, Accuracy: 0.27711844072426584
5 320 1 268 0.11301272297973128 0.0004827970929838882 236


[I 2025-03-12 20:11:31,669] Trial 37 finished with value: 1.704487774006301 and parameters: {'num_heads': 5, 'num_layers': 1, 'ffn_dim': 268, 'dropout': 0.11301272297973128, 'learning_rate': 0.0004827970929838882, 'batch_size': 236}. Best is trial 22 with value: 1.6853186340378665.


Loss: 1.704487774006301, Accuracy: 0.28948024142195317
3 192 1 324 0.2118334864146117 0.004682155023697413 324


[I 2025-03-12 20:11:55,293] Trial 38 finished with value: 1.88580549333484 and parameters: {'num_heads': 3, 'num_layers': 1, 'ffn_dim': 324, 'dropout': 0.2118334864146117, 'learning_rate': 0.004682155023697413, 'batch_size': 324}. Best is trial 22 with value: 1.6853186340378665.


Loss: 1.88580549333484, Accuracy: 0.22687295815721958
2 192 1 64 0.10020514550588763 0.0007705627872818164 438


[I 2025-03-12 20:12:09,203] Trial 39 finished with value: 1.7167881370794893 and parameters: {'num_heads': 2, 'num_layers': 1, 'ffn_dim': 64, 'dropout': 0.10020514550588763, 'learning_rate': 0.0007705627872818164, 'batch_size': 438}. Best is trial 22 with value: 1.6853186340378665.


Loss: 1.7167881370794893, Accuracy: 0.2868731427305783
3 96 3 363 0.17942874334811182 0.00225071021585248 587


[I 2025-03-12 20:12:37,760] Trial 40 finished with value: 1.7728704152880488 and parameters: {'num_heads': 3, 'num_layers': 3, 'ffn_dim': 363, 'dropout': 0.17942874334811182, 'learning_rate': 0.00225071021585248, 'batch_size': 587}. Best is trial 22 with value: 1.6853186340378665.


Loss: 1.7728704152880488, Accuracy: 0.26342771184407243
3 96 2 306 0.211905873428864 0.000852326099308739 72


[I 2025-03-12 20:13:04,171] Trial 41 finished with value: 1.6933870955559107 and parameters: {'num_heads': 3, 'num_layers': 2, 'ffn_dim': 306, 'dropout': 0.211905873428864, 'learning_rate': 0.000852326099308739, 'batch_size': 72}. Best is trial 22 with value: 1.6853186340378665.


Loss: 1.6933870955559107, Accuracy: 0.2916212923826575
2 128 2 244 0.1568028165528944 0.0012029917648594033 72


[I 2025-03-12 20:13:33,341] Trial 42 finished with value: 1.69391501330062 and parameters: {'num_heads': 2, 'num_layers': 2, 'ffn_dim': 244, 'dropout': 0.1568028165528944, 'learning_rate': 0.0012029917648594033, 'batch_size': 72}. Best is trial 22 with value: 1.6853186340378665.


Loss: 1.69391501330062, Accuracy: 0.29028774986618433
4 384 2 347 0.13633083588176995 0.0004682591974403443 88


[I 2025-03-12 20:14:44,761] Trial 43 finished with value: 1.692640054579242 and parameters: {'num_heads': 4, 'num_layers': 2, 'ffn_dim': 347, 'dropout': 0.13633083588176995, 'learning_rate': 0.0004682591974403443, 'batch_size': 88}. Best is trial 22 with value: 1.6853186340378665.


Loss: 1.692640054579242, Accuracy: 0.2918935380867126
4 128 2 386 0.13220872438082296 0.0003971753440659656 148


[I 2025-03-12 20:15:06,867] Trial 44 finished with value: 1.6991397132645696 and parameters: {'num_heads': 4, 'num_layers': 2, 'ffn_dim': 386, 'dropout': 0.13220872438082296, 'learning_rate': 0.0003971753440659656, 'batch_size': 148}. Best is trial 22 with value: 1.6853186340378665.


Loss: 1.6991397132645696, Accuracy: 0.28960944277303013
4 256 2 450 0.11977242827096264 0.004445385773734572 810


[I 2025-03-12 20:17:15,994] Trial 45 finished with value: 2.120678971952467 and parameters: {'num_heads': 4, 'num_layers': 2, 'ffn_dim': 450, 'dropout': 0.11977242827096264, 'learning_rate': 0.004445385773734572, 'batch_size': 810}. Best is trial 22 with value: 1.6853186340378665.


Loss: 2.120678971952467, Accuracy: 0.1254314402259178
2 64 2 294 0.14845570208201786 1.1752533823194516e-05 209


[I 2025-03-12 20:17:29,484] Trial 46 finished with value: 2.01342944905751 and parameters: {'num_heads': 2, 'num_layers': 2, 'ffn_dim': 294, 'dropout': 0.14845570208201786, 'learning_rate': 1.1752533823194516e-05, 'batch_size': 209}. Best is trial 22 with value: 1.6853186340378665.


Loss: 2.01342944905751, Accuracy: 0.19030436146846563
5 320 2 354 0.18956105019797825 0.00010701832756471199 270


[I 2025-03-12 20:18:11,494] Trial 47 finished with value: 1.7308268034888679 and parameters: {'num_heads': 5, 'num_layers': 2, 'ffn_dim': 354, 'dropout': 0.18956105019797825, 'learning_rate': 0.00010701832756471199, 'batch_size': 270}. Best is trial 22 with value: 1.6853186340378665.


Loss: 1.7308268034888679, Accuracy: 0.28114213994352055
3 192 3 230 0.16697663162953597 0.0005998757494016452 48


[I 2025-03-12 20:19:12,363] Trial 48 finished with value: 1.6932455801620567 and parameters: {'num_heads': 3, 'num_layers': 3, 'ffn_dim': 230, 'dropout': 0.16697663162953597, 'learning_rate': 0.0005998757494016452, 'batch_size': 48}. Best is trial 22 with value: 1.6853186340378665.


Loss: 1.6932455801620567, Accuracy: 0.29158899204488825
6 384 3 229 0.3543911100502931 0.00035628653913875055 125


[I 2025-03-12 20:20:33,981] Trial 49 finished with value: 1.7068649351665581 and parameters: {'num_heads': 6, 'num_layers': 3, 'ffn_dim': 229, 'dropout': 0.3543911100502931, 'learning_rate': 0.00035628653913875055, 'batch_size': 125}. Best is trial 22 with value: 1.6853186340378665.


Loss: 1.7068649351665581, Accuracy: 0.2884420162793702
Best hyperparameters found:  {'num_heads': 2, 'num_layers': 2, 'ffn_dim': 267, 'dropout': 0.10042809904090386, 'learning_rate': 0.0011073882093840127, 'batch_size': 106}


**Now build model given optimal hyperparameters according to optuna**

In [None]:
binary_model_best = TransformerMLP(input_dim=len(train_dataset_binary[0][0]), hidden_dim=64, num_heads=2, num_layers=2, ffn_dim=267, dropout=.1004, output_dim=8).to(device)
opt = torch.optim.Adam(binary_model_best.parameters(), lr=0.0011079)
final_test_score = run_model(binary_model_best, train_loader_binary, test_loader_binary, epochs, crit, opt, device)

Epoch 1/15
Train Loss: 1.7485, Train Score: 0.2760
Test Loss: 1.6642, Test Score: 0.3057
Epoch 2/15
Train Loss: 1.6443, Train Score: 0.3091
Test Loss: 1.6341, Test Score: 0.3166
Epoch 3/15
Train Loss: 1.6204, Train Score: 0.3188
Test Loss: 1.6261, Test Score: 0.3170
Epoch 4/15
Train Loss: 1.6024, Train Score: 0.3252
Test Loss: 1.6394, Test Score: 0.3110
Epoch 5/15
Train Loss: 1.5898, Train Score: 0.3299
Test Loss: 1.6068, Test Score: 0.3223
Epoch 6/15
Train Loss: 1.5806, Train Score: 0.3329
Test Loss: 1.6084, Test Score: 0.3221
Epoch 7/15
Train Loss: 1.5711, Train Score: 0.3376
Test Loss: 1.5994, Test Score: 0.3243
Epoch 8/15
Train Loss: 1.5619, Train Score: 0.3420
Test Loss: 1.6018, Test Score: 0.3240
Epoch 9/15
Train Loss: 1.5524, Train Score: 0.3453
Test Loss: 1.6044, Test Score: 0.3219
Epoch 10/15
Train Loss: 1.5459, Train Score: 0.3478
Test Loss: 1.6044, Test Score: 0.3203
Epoch 11/15
Train Loss: 1.5375, Train Score: 0.3528
Test Loss: 1.6007, Test Score: 0.3237
Epoch 12/15
Train L