# Debug training

In [2]:
# Imports

import os, logging, argparse, sys

import torch
import torch.distributed as dist

from torch.distributed import init_process_group, destroy_process_group
from torch.distributed.elastic.multiprocessing.errors import record

from hyformer.configs.dataset import DatasetConfig
from hyformer.configs.tokenizer import TokenizerConfig
from hyformer.configs.model import ModelConfig
from hyformer.configs.trainer import TrainerConfig
from hyformer.configs.logger import LoggerConfig

from hyformer.utils.datasets.auto import AutoDataset
from hyformer.utils.tokenizers.auto import AutoTokenizer
from hyformer.models.auto import AutoModel
from hyformer.utils.loggers.auto import AutoLogger

from hyformer.trainers.trainer import Trainer

from hyformer.utils.experiments import log_args, dump_configs
from hyformer.utils.reproducibility import set_seed

# autoreload
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Root directory passed to the dataset loaders. Override per machine by setting the
# HYFORMER_DATA_DIR environment variable; the original cluster path remains the default.
DATA_DIR = os.environ.get("HYFORMER_DATA_DIR", "/lustre/groups/aih/hyformer/data")

# Paths of the JSON config files for each component (dataset, tokenizer, model, trainer).
DATASET_CONFIG_PATH = "configs/datasets/guacamol/config.json"
TOKENIZER_CONFIG_PATH = "configs/tokenizers/smiles/config.json"
MODEL_CONFIG_PATH = "configs/models/hyformer_small/config.json"
TRAINER_CONFIG_PATH = "configs/trainers/distribution_learning/guacamol/lm/config.json"


In [4]:
# Build one config object per component from its JSON path, in the order
# dataset -> tokenizer -> model -> trainer.
dataset_config, tokenizer_config, model_config, trainer_config = (
    config_cls.from_config_path(config_path)
    for config_cls, config_path in (
        (DatasetConfig, DATASET_CONFIG_PATH),
        (TokenizerConfig, TOKENIZER_CONFIG_PATH),
        (ModelConfig, MODEL_CONFIG_PATH),
        (TrainerConfig, TRAINER_CONFIG_PATH),
    )
)


In [5]:
# Instantiate the train/val dataset splits (both rooted at DATA_DIR), then the
# tokenizer and the model from their respective configs.
train_dataset, val_dataset = (
    AutoDataset.from_config(dataset_config, split=split_name, root=DATA_DIR)
    for split_name in ('train', 'val')
)
tokenizer = AutoTokenizer.from_config(tokenizer_config)
model = AutoModel.from_config(model_config)
   

In [6]:
# Use the first CUDA device when one is available; otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')


In [15]:
# Wire the trainer config, model, tokenizer and target device into a Trainer instance.
trainer = Trainer(config=trainer_config, model=model, tokenizer=tokenizer, device=device)



In [7]:
# Pull the raw 'data' entry of the first two training examples.
samples = [
    train_dataset[example_idx]['data']
    for example_idx in (0, 1)
]

In [13]:
# Display the two raw examples collected from the training split (strings in the recorded output).
samples


['CCC(C)(C)Br', 'CCCN(CCc1cccc(-c2ccccc2)c1)C(=O)C1OC(C(=O)O)=CC(N)C1NC(C)=O']

In [26]:
# Round-trip sanity check: tokenize the samples for the 'lm' task, take the first
# sequence of input ids as a tensor, and decode it back to text.
lm_encoding = tokenizer(samples, task='lm')
first_input_ids = torch.tensor(lm_encoding['input_ids'][0])
tokenizer.decode(first_input_ids)

'CCC(C)(C)Br'

In [16]:
# Build a shuffled loader over the training split, using the tasks listed on the trainer's config.
trainer_loader = trainer.create_loader(
    train_dataset,
    shuffle=True,
    tasks=trainer.config.tasks,
)



In [17]:
# Seed torch's global RNG immediately before drawing, so that re-running this cell (or the
# whole notebook) yields the same batch to inspect even though the loader was created with
# shuffle=True.
# NOTE(review): assumes trainer.create_loader returns a torch DataLoader whose shuffling
# draws from the global torch RNG — confirm against Trainer.create_loader.
torch.manual_seed(0)
batch = next(iter(trainer_loader))



In [24]:
# Decode row 5 of the batch's input ids back into a string to eyeball the collated example.
row_input_ids = batch['input_ids'][5]
tokenizer.decode(row_input_ids)

'O=C(O)C1C2CC=CC2c2cc(Cl)cc3c2N1CC1CC=CC31'

In [33]:
# Raw token ids for row 5 of the batch. In the recorded output the tail is filled with
# the id 505 — presumably the padding id; confirm against the tokenizer's vocabulary.
batch['input_ids'][5]

tensor([508, 503,  29,  21,  24,   6,  29,   7,  24,  12,  24,  13,  24,  24,
         21,  24,  24,  13, 498,  13, 498, 498,   6,  25,   7, 498, 498,  14,
        498,  13,  28,  12,  24,  24,  12,  24,  24,  21,  24,  24,  14,  12,
        504, 505, 505, 505, 505, 505, 505, 505, 505, 505, 505, 505, 505, 505,
        505, 505, 505, 505, 505, 505, 505, 505, 505, 505, 505, 505, 505, 505,
        505, 505, 505, 505, 505, 505, 505, 505, 505, 505, 505, 505, 505, 505,
        505, 505, 505, 505, 505, 505, 505, 505, 505, 505, 505, 505, 505, 505,
        505, 505, 505, 505, 505, 505, 505, 505, 505, 505, 505, 505, 505, 505,
        505, 505, 505, 505, 505, 505, 505, 505, 505, 505, 505, 505, 505, 505,
        505, 505])

In [34]:
# Label ids for row 5 of the batch. In the recorded output the first two positions and the
# trailing positions hold -100 — presumably the loss ignore index; confirm in the collator/trainer.
batch['input_labels'][5]

tensor([-100, -100,   29,   21,   24,    6,   29,    7,   24,   12,   24,   13,
          24,   24,   21,   24,   24,   13,  498,   13,  498,  498,    6,   25,
           7,  498,  498,   14,  498,   13,   28,   12,   24,   24,   12,   24,
          24,   21,   24,   24,   14,   12,  504, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100])

In [29]:
# Attention mask for row 5 of the batch. In the recorded output it is 1 over the leading
# positions and 0 over the tail.
batch['attention_mask'][5]

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])