In [1]:
import torch
import optuna
import time
import copy
from torch import nn as nn
import torch.backends.cudnn as cudnn
from torch.optim import lr_scheduler
from pathlib import Path
#import mlflow
from thermostability.thermo_dataset import ThermostabilityDataset
from thermostability.thermo_pregenerated_dataset import ThermostabilityPregeneratedDataset
from thermostability.hotinfer_pregenerated import HotInferPregenerated
from thermostability.hotinfer import HotInfer
import wandb
cudnn.benchmark = True
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

if torch.cuda.is_available():
    torch.cuda.empty_cache() 
    
cpu = torch.device("cpu")

torch.cuda.list_gpu_processes()



'pynvml module not found, please install pynvml'

In [2]:
train_ds = ThermostabilityPregeneratedDataset('data/s_s/train', limit=1000)
eval_ds = ThermostabilityPregeneratedDataset('data/s_s/train', limit=1000)

dataloaders = {
    "train": torch.utils.data.DataLoader(train_ds, batch_size=32, shuffle=True, num_workers=4),
    "val": torch.utils.data.DataLoader(eval_ds, batch_size=32, shuffle=True, num_workers=4)
}

dataset_sizes = {"train": len(train_ds),"val": len(eval_ds)}

In [3]:
from tqdm.notebook import tqdm
import sys

def train_model(model, optimizer, criterion, scheduler, num_epochs=25):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())

    best_epoch_loss = sys.float_info.max
    losses = []
    batchEnumeration = []
    for epoch in range(num_epochs):
        print(f'Epoch {epoch}/{num_epochs - 1}')
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
         

            # Iterate over data.
            for idx, (inputs, labels) in enumerate(dataloaders[phase]):
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    
                    loss = criterion(outputs,labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                batch_size = inputs.size(0)
                batch_loss = loss.item() * batch_size
                losses.append(batch_loss)
                batchEnumeration.append(batchEnumeration[-1]+1 if len(batchEnumeration)>0 else 0)

                running_loss += batch_loss
               
            
                if idx % 10 == 0:
                    batch_size = inputs.size(0)
                    tqdm.write("Epoch: [{}/{}], Batch: [{}/{}], train accuracy: {:.6f}, loss: {:.6f}".format(
                        epoch,
                        num_epochs,
                        idx + 1,
                        len(dataloaders[phase]),
                        batch_loss / float(batch_size)
                        ), end="\r")
                    
                    
            if phase == 'train':
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]


            print(f'{phase} Loss: {epoch_loss:.4f}')

            # deep copy the model
            if phase == 'val' and epoch_loss < best_epoch_loss:
                best_epoch_loss = epoch_loss
                best_model_wts = copy.deepcopy(model.state_dict())

        print()


    time_elapsed = time.time() - since
    print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
    print(f'Best val Acc: {best_epoch_loss:4f}')
    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, best_epoch_loss

In [4]:
# YOUR_TRACKING_URI = "http://127.0.0.1:5000"
# mlflc = MLflowCallback(
#     tracking_uri=YOUR_TRACKING_URI,
#     metric_name="metric_score"
#)

def optimize_thermostability(trial):    
    params = {
        'model_learning_rate': trial.suggest_float('model_learning_rate', 0.001, 0.501, step=0.05),
        'model_hidden_units': trial.suggest_int('model_hidden_units', 64, 640, step=64),
        'model_hidden_layers': trial.suggest_int('model_hidden_layers', 1, 4, step=1)
    }
    #wandb.init(project="HotProt", entity="7-vs-capsule")
    #
    #wandb.config = params
    model = HotInferPregenerated(
        params['model_hidden_units'],
        params['model_hidden_layers'],
    )
    
    criterion = nn.MSELoss()

    optimizer_ft = torch.optim.SGD(model.parameters(), lr=params['model_learning_rate'], momentum=0.9)
    
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

    model, score = train_model(model, optimizer_ft, criterion, exp_lr_scheduler, num_epochs=10)

    #mlflow.log_params(params)

    #wandb.log({"score": score})
    #candidate_model_uri = mlflow.pytorch.log_model(model).model_uri
    #mlflow.evaluate(model=candidate_model_uri, data=eval_data, targets="label", model_type="regressor")
    #mlflow.log_metric("score", score)
    return score


In [5]:
# minimize or maximize
study = optuna.create_study(direction="minimize", study_name="thermostability-hyperparameter-optimization") # maximise the score during tuning
study.optimize(optimize_thermostability, n_trials=100) # run the objective function 100 times

print(study.best_trial) # print the best performing pipeline

[32m[I 2023-01-20 14:11:39,881][0m A new study created in memory with name: thermostability-hyperparameter-optimization[0m


Epoch 0/9
----------


[33m[W 2023-01-20 14:11:40,970][0m Trial 0 failed with parameters: {'model_learning_rate': 0.35100000000000003, 'model_hidden_units': 256, 'model_hidden_layers': 1} because of the following error: RuntimeError('Caught RuntimeError in DataLoader worker process 0.\nOriginal Traceback (most recent call last):\n  File "/dhc/home/tobias.fiedler/conda3/envs/hotprot/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py", line 302, in _worker_loop\n    data = fetcher.fetch(index)\n  File "/dhc/home/tobias.fiedler/conda3/envs/hotprot/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch\n    return self.collate_fn(data)\n  File "/dhc/home/tobias.fiedler/conda3/envs/hotprot/lib/python3.7/site-packages/torch/utils/data/_utils/collate.py", line 175, in default_collate\n    return [default_collate(samples) for samples in transposed]  # Backwards compatibility.\n  File "/dhc/home/tobias.fiedler/conda3/envs/hotprot/lib/python3.7/site-packages/torch/utils/data/_uti

RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/dhc/home/tobias.fiedler/conda3/envs/hotprot/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py", line 302, in _worker_loop
    data = fetcher.fetch(index)
  File "/dhc/home/tobias.fiedler/conda3/envs/hotprot/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
    return self.collate_fn(data)
  File "/dhc/home/tobias.fiedler/conda3/envs/hotprot/lib/python3.7/site-packages/torch/utils/data/_utils/collate.py", line 175, in default_collate
    return [default_collate(samples) for samples in transposed]  # Backwards compatibility.
  File "/dhc/home/tobias.fiedler/conda3/envs/hotprot/lib/python3.7/site-packages/torch/utils/data/_utils/collate.py", line 175, in <listcomp>
    return [default_collate(samples) for samples in transposed]  # Backwards compatibility.
  File "/dhc/home/tobias.fiedler/conda3/envs/hotprot/lib/python3.7/site-packages/torch/utils/data/_utils/collate.py", line 141, in default_collate
    return torch.stack(batch, 0, out=out)
RuntimeError: stack expects each tensor to be equal size, but got [299, 1024] at entry 0 and [333, 1024] at entry 1
