# Debug training

In [7]:
# Imports

import os, logging, argparse, sys

import torch

from hyformer.configs.dataset import DatasetConfig
from hyformer.configs.tokenizer import TokenizerConfig
from hyformer.configs.model import ModelConfig
from hyformer.configs.trainer import TrainerConfig

from hyformer.utils.datasets.auto import AutoDataset
from hyformer.utils.tokenizers.auto import AutoTokenizer
from hyformer.models.auto import AutoModel

from hyformer.trainers.trainer import Trainer

from hyformer.utils.reproducibility import set_seed

# autoreload
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
DATA_DIR = "/lustre/groups/aih/hyformer/data"

DATASET_CONFIG_PATH = "configs/datasets/molecule_net/scaffold/bace/config.json"
TOKENIZER_CONFIG_PATH = "configs/tokenizers/smiles/unimol/config.json"
MODEL_CONFIG_PATH = "configs/models/unimol_vocab/hyformer/config.json"
TRAINER_CONFIG_PATH = "configs/trainers/distribution_learning/unimol/combined/config.json"


In [15]:
# Load configurations
dataset_config = DatasetConfig.from_config_filepath(DATASET_CONFIG_PATH)
tokenizer_config = TokenizerConfig.from_config_filepath(TOKENIZER_CONFIG_PATH)
model_config = ModelConfig.from_config_filepath(MODEL_CONFIG_PATH)
trainer_config = TrainerConfig.from_config_filepath(TRAINER_CONFIG_PATH)


In [16]:
trainer_config.compile = False
trainer_config.num_workers = 1

In [17]:
# Initialize
train_dataset = AutoDataset.from_config(dataset_config, split='train', root=DATA_DIR)
val_dataset = AutoDataset.from_config(dataset_config, split='val', root=DATA_DIR)


In [18]:
# determine the maximum length of the training set
max_length = max([len(train_dataset[i]['data']) for i in range(len(train_dataset))])
print(max_length)

139


In [19]:
tokenizer = AutoTokenizer.from_config(tokenizer_config)


In [33]:
# Determine the device
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')


In [34]:

model = AutoModel.from_config(
    model_config,
    num_prediction_tasks=dataset_config.num_prediction_tasks,
    prediction_task_type=dataset_config.prediction_task_type)
model.to(device)


Hyformer(
  (token_embedding): Embedding(467, 512)
  (layers): ModuleList(
    (0-7): 8 x TransformerLayer(
      (attention_layer): Attention(
        (q_proj): Linear(in_features=512, out_features=512, bias=False)
        (k_proj): Linear(in_features=512, out_features=512, bias=False)
        (v_proj): Linear(in_features=512, out_features=512, bias=False)
        (out): Linear(in_features=512, out_features=512, bias=False)
        (relative_embedding): RotaryEmbedding()
      )
      (feed_forward): FeedForward(
        (w1): Linear(in_features=512, out_features=2048, bias=False)
        (w3): Linear(in_features=512, out_features=2048, bias=False)
        (w2): Linear(in_features=2048, out_features=512, bias=False)
      )
      (attention_layer_normalization): RMSNorm()
      (feed_forward_normalization): RMSNorm()
    )
  )
  (layer_norm): RMSNorm()
  (lm_head): Linear(in_features=512, out_features=467, bias=False)
  (mlm_head): Linear(in_features=512, out_features=467, bias=False)

In [35]:
assert len(tokenizer) == model.vocab_size, f"Tokenizer vocab size {len(tokenizer)} does not match model embedding dim {model.embedding_dim}"

In [36]:
# Initialize trainer
trainer = Trainer(
    config=trainer_config,
    model=model,
    tokenizer=tokenizer,
    device=device,
    )


In [37]:
trainer.test(val_dataset, dataset_config.test_metric)

0.5835164835164836

In [38]:
trainer_loader = trainer.create_loader(train_dataset, shuffle=True, tasks={'prediction': 1.0})

In [103]:
with torch.no_grad():
    batch = next(iter(trainer_loader))
    batch = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}
    output = model(**batch, return_loss=True)
    

In [107]:
output['embeddings'].shape

torch.Size([512, 128, 512])

In [54]:
nan_target_idx = -1

logits = output['logits']
target = batch['target']

valid_mask = target != nan_target_idx

In [59]:
batch_indices = torch.nonzero(valid_mask, as_tuple=True)[0]

valid_logits = logits[batch_indices]
valid_targets = target[batch_indices]


In [60]:
valid_logits.shape

torch.Size([13824, 1])

In [61]:
valid_targets.shape

torch.Size([13824, 27])

In [37]:
import torch.nn.functional as F

In [39]:
F.binary_cross_entropy_with_logits(
    valid_logits,
    valid_targets.float(),
)

tensor(0.6956, device='cuda:0')

In [82]:
import numpy as np

targets = np.array([train_dataset[idx]['target'] for idx in range(len(train_dataset))])

In [84]:
targets.mean()

2.5374927e-08

In [85]:
targets.std()

1.0

In [88]:
ckpt = torch.load(f"/lustre/groups/aih/hyformer/results/distribution_learning/unimol/hyformer/combined/ckpt.pt", weights_only=True)

In [89]:
ckpt

{'model': OrderedDict([('_orig_mod.token_embedding.weight',
               tensor([[-8.3178e-04, -4.0049e-02,  9.7936e-04,  ...,  5.7653e-04,
                        -2.7849e-03, -1.8312e-02],
                       [-8.7098e-04,  1.0235e-02,  3.5240e-02,  ...,  3.1984e-03,
                         1.7517e-02, -1.0844e-01],
                       [ 1.6659e-02,  1.1573e-02,  5.5675e-02,  ..., -5.7113e-03,
                         1.5995e-02, -1.0882e-01],
                       ...,
                       [-4.7732e-05,  9.9253e-03,  1.2178e-03,  ...,  9.0702e-05,
                        -4.8318e-04, -3.3103e-02],
                       [ 9.3578e-03,  1.3270e-02,  4.6671e-02,  ..., -2.0456e-03,
                         2.7457e-02, -1.1691e-01],
                       [ 1.0032e-03,  2.4849e-02, -2.8373e-03,  ..., -3.0068e-04,
                         3.7336e-04, -4.7882e-02]], device='cuda:0')),
              ('_orig_mod.layers.0.attention_layer.q_proj.weight',
               tensor([[ 0.

In [108]:
train_dataset

<hyformer.utils.datasets.sequence.SequenceDataset at 0x7f7df7c5f010>

In [109]:
train_dataset[0]

{'data': 'CC(C)=CCCC(C)=CC(=O)', 'target': array([0.39041302], dtype=float32)}

In [111]:
train_dataset.target.mean()

-2.8668756

In [112]:
train_dataset.target.std()

2.066724

In [22]:
import numpy as np

In [31]:
# see what happens to the molecule that has the longest sequence

max_length_idx = np.argmax([len(train_dataset[i]['data']) for i in range(len(train_dataset))])
smiles = train_dataset[max_length_idx]['data']
print(smiles)
print(len(tokenizer(smiles, task='prediction')['input_ids'][0]))


OC(C(NC(=O)C(NC(=O)C(NC(=O)C([NH3+])CCC(=O)[O-])CC(C)C)CC(=O)[O-])CC(C)C)CC(C(=O)NC(C(C)C)C(=O)NC(CCC(=O)[O-])C(=O)NC(Cc1ccccc1)C(=O)[O-])C
125


In [32]:
len(tokenizer(smiles, task='prediction')['input_ids'][0])

125

In [42]:
shapes = []
for batch in trainer_loader:
    shapes.append(batch['input_ids'].shape[1])

print(shapes)



[128, 128, 128]
