In [3]:
import torch
import optuna
import time
import copy
from torch import nn as nn
import torch.backends.cudnn as cudnn
from torch.optim import lr_scheduler
from pathlib import Path
#import mlflow
from thermostability.thermo_dataset import ThermostabilityDataset
from thermostability.thermo_pregenerated_dataset import ThermostabilityPregeneratedDataset
from thermostability.hotinfer_pregenerated import HotInferPregenerated
from thermostability.hotinfer import HotInfer
import wandb
cudnn.benchmark = True
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

if torch.cuda.is_available():
    torch.cuda.empty_cache() 
    
cpu = torch.device("cpu")

torch.cuda.list_gpu_processes()



'pynvml module not found, please install pynvml'

In [4]:
def prep_minimal_dataset(src_path: Path, dst_path: Path):
    minimized_data = []
    reduction = 10
    with open(src_path) as file:
        data = file.readlines()
        file_len = len(data)
        minimized_data = data[:round(len(data) / reduction)]
        print(f'Dataset reduced: {file_len} --> {len(minimized_data)}')
        
    with open(dst_path, 'w') as file:
        file.writelines(minimized_data)
            
prep_minimal_dataset(Path('data/train_sequences.fasta'), Path('data/minimal_train_sequences.fasta'))
prep_minimal_dataset(Path('data/eval_sequences.fasta'), Path('data/minimal_eval_sequences.fasta'))

Dataset reduced: 367394 --> 36739
Dataset reduced: 35170 --> 3517


In [5]:
train_ds = ThermostabilityDataset('data/minimal_train_sequences.fasta')
eval_ds = ThermostabilityDataset('data/minimal_eval_sequences.fasta')

dataloaders = {
    "train": torch.utils.data.DataLoader(train_ds, batch_size=1, shuffle=True, num_workers=4),
    "val": torch.utils.data.DataLoader(eval_ds, batch_size=1, shuffle=True, num_workers=4)
}

dataset_sizes = {"train": len(train_ds),"val": len(eval_ds)}
dataset_sizes

Reading line 36700


ValueError: All arrays must be of the same length

In [None]:
from tqdm.notebook import tqdm
import sys

def train_model(model, optimizer, criterion, scheduler, num_epochs=25):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())

    best_epoch_loss = sys.float_info.max
    losses = []
    batchEnumeration = []
    for epoch in range(num_epochs):
        print(f'Epoch {epoch}/{num_epochs - 1}')
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
         

            # Iterate over data.
            for idx, (inputs, labels) in enumerate(dataloaders[phase]):
                inputs = inputs.to(device)
                labels = labels.to(device)

                if torch.isnan(inputs).any():
                    print("#########################################################################################\n################")

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    
                    loss = criterion(outputs,labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        if not torch.isnan(loss):
                            loss.backward()
                            threshold = 10000
                            for p in model.parameters():
                                if p.grad != None:
                                    if p.grad.norm() > threshold:
                                        torch.nn.utils.clip_grad_norm_(p, threshold)
                            optimizer.step()
                        if torch.isnan(loss).any():
                            print(f"Nan loss: {torch.isnan(loss)}| Loss: {loss}| inputs: {inputs}")

                # statistics
                batch_size = inputs.size(0)
                batch_loss = loss.item() * batch_size
                losses.append(batch_loss)
                batchEnumeration.append(batchEnumeration[-1]+1 if len(batchEnumeration)>0 else 0)

                running_loss += batch_loss
               
            
                if idx % 10 == 0:
                    batch_size = inputs.size(0)
                    tqdm.write("Epoch: [{}/{}], Batch: [{}/{}], train accuracy: {:.6f}, loss: {:.6f}".format(
                        epoch,
                        num_epochs,
                        idx + 1,
                        len(dataloaders[phase]),
                        batch_loss / float(batch_size), running_loss / (idx + 1)
                        ), end="\r")
                    
                    
            if phase == 'train':
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]


            print(f'{phase} Loss: {epoch_loss:.4f}')

            # deep copy the model
            if phase == 'val' and epoch_loss < best_epoch_loss:
                best_epoch_loss = epoch_loss
                best_model_wts = copy.deepcopy(model.state_dict())

        print()


    time_elapsed = time.time() - since
    print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
    print(f'Best val Acc: {best_epoch_loss:4f}')
    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, best_epoch_loss

In [None]:
# YOUR_TRACKING_URI = "http://127.0.0.1:5000"
# mlflc = MLflowCallback(
#     tracking_uri=YOUR_TRACKING_URI,
#     metric_name="metric_score"
#)

def optimize_thermostability(trial):    
    params = {
        'model_learning_rate': trial.suggest_float('model_learning_rate', 0.001, 0.501, step=0.05),
        'model_hidden_units': trial.suggest_int('model_hidden_units', 64, 640, step=64),
        'model_hidden_layers': trial.suggest_int('model_hidden_layers', 1, 4, step=1)
    }
    #wandb.init(project="HotProt", entity="7-vs-capsule")
    #
    #wandb.config = params
    model = HotInferPregenerated(
        params['model_hidden_units'],
        params['model_hidden_layers'],
    )
    
    
    criterion = nn.MSELoss()

    optimizer_ft = torch.optim.SGD(model.parameters(), lr=params['model_learning_rate'], momentum=0.9)
    
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

    model, score = train_model(model, optimizer_ft, criterion, exp_lr_scheduler, num_epochs=10)

    #mlflow.log_params(params)

    #wandb.log({"score": score})
    #candidate_model_uri = mlflow.pytorch.log_model(model).model_uri
    #mlflow.evaluate(model=candidate_model_uri, data=eval_data, targets="label", model_type="regressor")
    #mlflow.log_metric("score", score)
    return score


In [None]:
# minimize or maximize
study = optuna.create_study(direction="minimize", study_name="thermostability-hyperparameter-optimization") # maximise the score during tuning
study.optimize(optimize_thermostability, n_trials=100) # run the objective function 100 times

print(study.best_trial) # print the best performing pipeline

[32m[I 2023-01-20 15:13:11,437][0m A new study created in memory with name: thermostability-hyperparameter-optimization[0m


Epoch 0/9
----------
train Loss: 11522075641428048796712257907064832.0000792501685808878958323491739271168.000000, loss: 11519619703588062500052228741529600.000000
val Loss: 11792645857543793176871506785861632.000011792501685808878958323491739271168.000000, loss: 11792647166873375357707317059518464.000000

Epoch 1/9
----------
train Loss: 11792645857543793176871506785861632.0000792501685808878958323491739271168.000000, loss: 11792647166873375357707317059518464.000000
val Loss: 11792645857543793176871506785861632.000011792501685808878958323491739271168.000000, loss: 11792647176866836701878217877749760.000000

Epoch 2/9
----------
train Loss: 11792645857543793176871506785861632.0000792501685808878958323491739271168.000000, loss: 11792644803419738638251658376642560.000000
val Loss: 11792645857543793176871506785861632.000011792501685808878958323491739271168.000000, loss: 11792618670518000813324728784650240.000000

Epoch 3/9
----------
train Loss: 11792117026890172469706175551635456.0000790

[32m[I 2023-01-20 15:21:23,246][0m Trial 0 finished with value: 1.1788497078527555e+34 and parameters: {'model_learning_rate': 0.15100000000000002, 'model_hidden_units': 384, 'model_hidden_layers': 4}. Best is trial 0 with value: 1.1788497078527555e+34.[0m


val Loss: 11788497078527555189301879062396928.000011788352110797194363642029874872320.000000, loss: 11788495636890790233550657429700608.000000

Training complete in 8m 12s
Best val Acc: 11788497078527555189301879062396928.000000
Epoch 0/9
----------
train Loss: 20467709763435774269718528.0000uracy: 590.273254, loss: 20653592092266168775081984.0000000000000823979008.000000
val Loss: 403.5739ch: [991/1000], train accuracy: 2.620529, loss: 404.085692235

Epoch 1/9
----------
train Loss: 603.6627: [991/1000], train accuracy: 224.120117, loss: 605.9560217
val Loss: 233.0333ch: [991/1000], train accuracy: 0.703929, loss: 234.205741226

Epoch 2/9
----------
train Loss: 570.1187: [991/1000], train accuracy: 79.076721, loss: 573.79974040
val Loss: 188.2541ch: [991/1000], train accuracy: 551.766724, loss: 188.655888

Epoch 3/9
----------
train Loss: 587.1956: [991/1000], train accuracy: 1527.239380, loss: 589.274282
val Loss: 152.1965ch: [991/1000], train accuracy: 12.147702, loss: 151.8069110



[32m[I 2023-01-20 15:25:36,088][0m Trial 1 finished with value: 126.23322334720405 and parameters: {'model_learning_rate': 0.30100000000000005, 'model_hidden_units': 512, 'model_hidden_layers': 1}. Best is trial 1 with value: 126.23322334720405.[0m


val Loss: 146.0555

Training complete in 4m 13s
Best val Acc: 126.233223
Epoch 0/9
----------
train Loss: 11744284195693917551531571281920.0000 12074347305087164910820934025216.000000, loss: 11741286649796298367787383914496.000000
val Loss: 12072330543602812728652909772800.0000y: 12074347305087164910820934025216.000000, loss: 12072312227908200949620194410496.000000

Epoch 1/9
----------
train Loss: 12072330543602812728652909772800.0000 12074347305087164910820934025216.000000, loss: 12072312227908200949620194410496.000000
val Loss: 12072330543602812728652909772800.0000y: 12074347305087164910820934025216.000000, loss: 12072312227908200949620194410496.000000

Epoch 2/9
----------
train Loss: 12072330543602812728652909772800.0000 12074347305087164910820934025216.000000, loss: 12072312227908200949620194410496.000000
val Loss: 12072330543602812728652909772800.0000y: 12074347305087164910820934025216.000000, loss: 12072312227908200949620194410496.000000

Epoch 3/9
----------
train Loss: 120723

[33m[W 2023-01-20 15:26:27,741][0m Trial 2 failed with parameters: {'model_learning_rate': 0.251, 'model_hidden_units': 64, 'model_hidden_layers': 2} because of the following error: KeyboardInterrupt().[0m
Traceback (most recent call last):
  File "/dhc/home/tobias.fiedler/conda3/envs/hotprot/lib/python3.7/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_658976/482875446.py", line 28, in optimize_thermostability
    model, score = train_model(model, optimizer_ft, criterion, exp_lr_scheduler, num_epochs=10)
  File "/tmp/ipykernel_658976/2957317812.py", line 47, in train_model
    loss.backward()
  File "/dhc/home/tobias.fiedler/conda3/envs/hotprot/lib/python3.7/site-packages/torch/_tensor.py", line 396, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "/dhc/home/tobias.fiedler/conda3/envs/hotprot/lib/python3.7/site-packages/torch/autograd/__init__.py", li

KeyboardInterrupt: 