# Run Joint Learning Benchmark

In [1]:
# Move up one directory so the relative './configs/...' and './results/...' paths
# used by later cells resolve. NOTE: not idempotent - re-running this cell moves
# up another level, so run it exactly once per kernel.
%cd ..

/home/adam/Projects/hybrid-transformer


In [2]:
import gc
import os

import torch
import wandb

from hybrid_transformer.configs.task import TaskConfig
from hybrid_transformer.configs.model import ModelConfig
from hybrid_transformer.configs.trainer import TrainerConfig
from hybrid_transformer.configs.logger import LoggerConfig

from hybrid_transformer.utils.datasets.auto import AutoDataset
from hybrid_transformer.utils.tokenizers.auto import AutoTokenizer
from hybrid_transformer.models.auto import AutoModel
from hybrid_transformer.utils.loggers.wandb import WandbLogger

from hybrid_transformer.trainers.trainer import Trainer

from scripts.pretrain.train import DEFAULT_CONFIG_FILES

from hybrid_transformer.utils.objectives.guacamol.objective import GUACAMOL_TASKS
from hybrid_transformer.models.prediction import PREDICTION_MODEL_CONFIGS
%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm
2023-12-30 22:05:39.050616: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-30 22:05:39.083278: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-30 22:05:39.235551: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-30 22:05:39.235585: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-30 22:05:39.264343: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515

In [6]:
# Run every prediction model on every GuacaMol task using the debug trainer config.

TRAINER_CONFIG_PATH = './configs/trainers/debug/'
WANDB_PROJECT = 'debug'


def task_config_path(task=None):
    """Return the path of the task config file for a GuacaMol task.

    Args:
        task: Task name. When omitted, falls back to the global loop variable
            `guacamol_task`, so existing zero-argument calls keep working.

    Returns:
        Relative path to the task's `config.json`.
    """
    if task is None:
        task = guacamol_task
    return f'./configs/tasks/guacamol/{task}/config.json'


def run_benchmark(task_config, guacamol_task, model_name, path_to_model_config):
    """Train a single model on a single task and log the run to wandb.

    Everything heavy (model, trainer, logger, dataset) is a local of this
    function, so nothing keeps it alive after the call returns and the caller
    can actually reclaim GPU memory before the next run.

    Args:
        task_config: Task config used to build the dataset and tokenizer.
        guacamol_task: Task name; used for the output directory and run name.
        model_name: Key from PREDICTION_MODEL_CONFIGS; used for the output
            directory and run name.
        path_to_model_config: Path passed to `ModelConfig.from_pretrained`.
    """
    model_config = ModelConfig.from_pretrained(path_to_model_config)
    trainer_config = TrainerConfig.from_pretrained(TRAINER_CONFIG_PATH)
    logger_config = LoggerConfig.from_pretrained(DEFAULT_CONFIG_FILES['logger'])

    trainer_config.out_dir = f'./results/regression_task/guacamol/{model_name}/{guacamol_task}'
    trainer_config.enable_save_checkpoint = False
    logger_config.name = model_name + '_' + guacamol_task
    logger_config.project = WANDB_PROJECT
    logger_config.wandb_log = True

    dataset = AutoDataset.from_config(task_config)
    tokenizer = AutoTokenizer.from_config(task_config)
    model = AutoModel.from_config(model_config)
    logger = WandbLogger(logger_config, [task_config, model_config, trainer_config])
    try:
        # NOTE(review): the same dataset (split 'val') is used for both training
        # and evaluation - fine for a smoke test, confirm before a real benchmark.
        trainer = Trainer(
            config=trainer_config,
            model=model,
            train_dataset=dataset,
            eval_dataset=dataset,
            tokenizer=tokenizer,
            logger=logger,
        )
        trainer.train()
    finally:
        # Close the wandb run even when training fails; this is a no-op when
        # no run is active.
        wandb.finish()


for guacamol_task in GUACAMOL_TASKS:

    task_config = TaskConfig.from_pretrained(task_config_path(guacamol_task))
    task_config.validate = True
    task_config.split = 'val'
    # Load once with validation enabled; the per-model loads below skip it.
    dataset = AutoDataset.from_config(task_config)
    print(f"Loaded {task_config.target_label} data.")
    task_config.validate = False

    for model_name, path_to_model_config in PREDICTION_MODEL_CONFIGS.items():
        try:
            run_benchmark(task_config, guacamol_task, model_name, path_to_model_config)
        finally:
            # Without this, memory from earlier runs accumulates and a later
            # run hits CUDA out-of-memory: collect the finished run's objects,
            # then hand PyTorch's cached blocks back to the GPU.
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        

        

Validating Guacamol dataset...


79568it [00:10, 7837.35it/s]


Guacamol dataset loaded with 79568 valid examples!
Loaded osmb data.
number of parameters: 38.06M
tokens per iteration will be: 512
Using cuda device
num decayed parameter tensors: 63, with 38,115,840 parameters
num non-decayed parameter tensors: 25, with 12,800 parameters
using fused AdamW: True
Compiling model..




Evaluation at iter 0: train loss 0.0276, val loss 0.0550, percent 0.0000
Evaluation at iter 1: train loss 0.0747, val loss 0.0538, percent 0.0000
Evaluation at iter 2: train loss 0.0438, val loss 0.0227, percent 0.0000
Training finished.0, time 2473.47ms.
number of parameters: 38.06M
tokens per iteration will be: 512
Using cuda device
num decayed parameter tensors: 63, with 38,115,840 parameters
num non-decayed parameter tensors: 25, with 12,800 parameters
using fused AdamW: True
Compiling model..




0,1
iter,▁▅█
lr,▁▅█
train/loss,▁█▃
train/loss_supervised,▁█▃
train/loss_unsupervised,█▁▃
val/loss,██▁
val/loss_supervised,██▁
val/loss_unsupervised,▁▃█
val/valid,▁▁▁

0,1
iter,2.0
lr,0.0
train/loss,0.04376
train/loss_supervised,0.04376
train/loss_unsupervised,3.61361
val/loss,0.02273
val/loss_supervised,0.02273
val/loss_unsupervised,3.88283
val/valid,0.0


Evaluation at iter 0: train loss 4.3206, val loss 3.7270, percent 0.0000
Evaluation at iter 1: train loss 3.4616, val loss 3.8427, percent 0.0000
Evaluation at iter 2: train loss 3.3065, val loss 3.5399, percent 0.0000
Training finished.7, time 2456.78ms.
number of parameters: 38.06M
tokens per iteration will be: 512
Using cuda device
num decayed parameter tensors: 63, with 38,115,840 parameters
num non-decayed parameter tensors: 25, with 12,800 parameters
using fused AdamW: True
Compiling model..




0,1
iter,▁▅█
lr,▁▅█
train/loss,█▂▁
train/loss_supervised,█▅▁
train/loss_unsupervised,█▁▂
val/loss,▅█▁
val/loss_supervised,▅█▁
val/loss_unsupervised,▃▁█
val/valid,▁▁▁

0,1
iter,2.0
lr,0.0
train/loss,3.30649
train/loss_supervised,0.19248
train/loss_unsupervised,3.11402
val/loss,3.53988
val/loss_supervised,0.29079
val/loss_unsupervised,3.24909
val/valid,0.0


Evaluation at iter 0: train loss 4.3445, val loss 3.5670, percent 0.0000
Evaluation at iter 1: train loss 3.5383, val loss 3.8854, percent 0.0000
Evaluation at iter 2: train loss 3.2390, val loss 3.5561, percent 0.0000
Training finished.1, time 2458.56ms.
number of parameters: 38.06M
tokens per iteration will be: 512
Using cuda device
num decayed parameter tensors: 63, with 38,115,840 parameters
num non-decayed parameter tensors: 25, with 12,800 parameters
using fused AdamW: True
Compiling model..




0,1
iter,▁▅█
lr,▁▅█
train/loss,█▃▁
train/loss_supervised,█▅▁
train/loss_unsupervised,█▁▁
val/loss,▁█▁
val/loss_supervised,▅█▁
val/loss_unsupervised,▁▅█
val/valid,▁▁▁

0,1
iter,2.0
lr,0.0
train/loss,3.23899
train/loss_supervised,0.1892
train/loss_unsupervised,3.04979
val/loss,3.55609
val/loss_supervised,0.28659
val/loss_unsupervised,3.2695
val/valid,0.0


Evaluation at iter 0: train loss 1.9493, val loss 1.4402, percent 0.0000
Evaluation at iter 1: train loss 1.6378, val loss 1.6417, percent 0.0000
Evaluation at iter 2: train loss 1.4221, val loss 1.4515, percent 0.0000
Training finished.1, time 2439.95ms.
number of parameters: 38.06M
tokens per iteration will be: 512
Using cuda device
num decayed parameter tensors: 63, with 38,115,840 parameters
num non-decayed parameter tensors: 25, with 12,800 parameters
using fused AdamW: True
Compiling model..




0,1
iter,▁▅█
lr,▁▅█
train/loss,█▄▁
train/loss_supervised,█▅▁
train/loss_unsupervised,█▅▁
val/loss,▁█▁
val/loss_supervised,█▇▁
val/loss_unsupervised,▂█▁
val/valid,▁▁▁

0,1
iter,2.0
lr,0.0
train/loss,1.42208
train/loss_supervised,0.15594
train/loss_unsupervised,2.23001
val/loss,1.45151
val/loss_supervised,0.20537
val/loss_unsupervised,2.2453
val/valid,0.0


Evaluation at iter 0: train loss 1.9106, val loss 1.5000, percent 0.0000
Evaluation at iter 1: train loss 1.5707, val loss 1.6001, percent 0.0000
Evaluation at iter 2: train loss 1.3970, val loss 1.4243, percent 0.0000
Training finished.8, time 2454.09ms.
number of parameters: 38.06M
tokens per iteration will be: 512
Using cuda device
num decayed parameter tensors: 63, with 38,115,840 parameters
num non-decayed parameter tensors: 25, with 12,800 parameters
using fused AdamW: True
Compiling model..




0,1
iter,▁▅█
lr,▁▅█
train/loss,█▃▁
train/loss_supervised,█▅▁
train/loss_unsupervised,█▄▁
val/loss,▄█▁
val/loss_supervised,█▆▁
val/loss_unsupervised,█▇▁
val/valid,▁▁▁

0,1
iter,2.0
lr,0.0
train/loss,1.39704
train/loss_supervised,0.15206
train/loss_unsupervised,2.20391
val/loss,1.42426
val/loss_supervised,0.19925
val/loss_unsupervised,2.20747
val/valid,0.0


Evaluation at iter 0: train loss 2.7217, val loss 2.3374, percent 0.0000


OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacty of 1.83 GiB of which 94.94 MiB is free. Including non-PyTorch memory, this process has 1.73 GiB memory in use. Of the allocated memory 1.44 GiB is allocated by PyTorch, and 218.81 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF