# Hyformer Examples

This notebook shows how to use Hyformer as a SMILES encoder and as a generator, and how to train the model with the default Trainer.

In [38]:
# Imports

import os
import torch

from hyformer.configs.tokenizer import TokenizerConfig
from hyformer.configs.model import ModelConfig

from hyformer.utils.tokenizers.auto import AutoTokenizer
from hyformer.models.auto import AutoModel

# Imports

import os
import torch

from hyformer.configs.dataset import DatasetConfig
from hyformer.configs.tokenizer import TokenizerConfig
from hyformer.configs.model import ModelConfig
from hyformer.configs.trainer import TrainerConfig

from hyformer.utils.datasets.auto import AutoDataset
from hyformer.utils.tokenizers.auto import AutoTokenizer
from hyformer.models.auto import AutoModel

from hyformer.utils.runtime import set_seed

import pandas as pd
import numpy as np

%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
# Runtime setup: logging level, random seed, compute device and working directory.

import logging

# Raise the root logger to INFO so INFO-level messages are emitted.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Fixed seed passed to the project's `set_seed` helper.
SEED = 1337
set_seed(SEED)

# Use the first CUDA device when available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# The config paths used further down are relative ('configs/...'), so the
# working directory is switched to the repository checkout. The location can be
# overridden with the HYFORMER_REPOSITORY_DIR environment variable; the original
# hard-coded checkout stays the default so existing runs behave the same.
REPOSITORY_DIR = os.environ.get(
    'HYFORMER_REPOSITORY_DIR',
    '/home/aih/adam.izdebski/project/hyformer-interface/hyformer',
)
os.chdir(REPOSITORY_DIR)

# Load Fibrotic Data

In [13]:
# Absolute path of the raw fibrosis dataset CSV that is read in the next cell.
DATA_FILEPATH = '/lustre/groups/aih/hyformer/icml25/data/fibrosis/raw/FDA_Div_Filtered_Dataset.csv'

In [34]:
# Column of the CSV that holds the SMILES strings to encode.
SEQUENCE_COLUMN = 'standard_smiles'

# Read the dataset, using the first CSV column as the row index.
_df = pd.read_csv(DATA_FILEPATH, index_col=0)

# Parallel lists: one SMILES string and one index value per row.
smiles = _df[SEQUENCE_COLUMN].to_list()
idx = _df.index.to_list()

In [35]:
# Preview the first rows of the loaded table.
_df.head()

Unnamed: 0,SMILES,standard_smiles,Hits,Library,SMILES_no_chiral
10166,CC1=NN(C2CCCCC2)C2=C1C(CC(=O)N2)C1=C(C)N=CN1,Cc1nc[nH]c1C1CC(=O)Nc2c1c(C)nn2C1CCCCC1,True,Diversity,Cc1nc[nH]c1C1CC(=O)Nc2c1c(C)nn2C1CCCCC1
932,CCCCNC(=O)NS(=O)(=O)C1=CC=C(C=C1)C,CCCCNC(=O)NS(=O)(=O)c1ccc(C)cc1,True,MCE,CCCCNC(=O)NS(=O)(=O)c1ccc(C)cc1
499,COC1=CC=C(NC2=C3C4=C(C=CC=C4)C(=O)C4=C3C(=CC=C...,COc1ccc(Nc2c3c4c(c(Br)ccc4n(C)c2=O)C(=O)c2cccc...,True,Diversity,COc1ccc(Nc2c3c4c(c(Br)ccc4n(C)c2=O)C(=O)c2cccc...
758,CN1C2=C(C(=O)N(C1=O)C)N(C=N2)CC3OCCO3,Cn1c(=O)c2c(ncn2CC2OCCO2)n(C)c1=O,True,Prestwick,Cn1c(=O)c2c(ncn2CC2OCCO2)n(C)c1=O
9049,[H]N([H])C12CC3CC(C1)CC(C3)(C2)N1C=NC=N1,NC12CC3CC(C1)CC(n1cncn1)(C3)C2,True,Diversity,NC12CC3CC(C1)CC(n1cncn1)(C3)C2


# Hyformer

In [29]:
# Configs
# The two config paths are relative, so they are resolved against the current
# working directory; the checkpoint path is absolute.

PATH_TO_TOKENIZER_CONFIG = 'configs/tokenizers/smiles_separate_task_token/config.json'
PATH_TO_MODEL_CONFIG = 'configs/models/hyformer_best_big/config.json'
PATH_TO_MODEL_CKPT = '/lustre/groups/aih/hyformer/icml25/results/pretrain/unimol/hyformer/llama_backbone_best_big_250k_iters/pretrain_generation_mlm_physchem/ckpt.pt'


In [31]:
# Tokenizer: read the tokenizer config file, then build the tokenizer from it.

tokenizer_config = TokenizerConfig.from_config_file(PATH_TO_TOKENIZER_CONFIG)
tokenizer = AutoTokenizer.from_config(tokenizer_config)

# Model: build from its config, load the checkpoint, move to `device`, switch to eval mode.
# `model.eval()` is the cell's last expression, so its return value is displayed as output.

model_config = ModelConfig.from_config_file(PATH_TO_MODEL_CONFIG)
model = AutoModel.from_config(model_config)
model.load_pretrained(PATH_TO_MODEL_CKPT)
model.to(device)
model.eval()

HyformerWithPrefix(
  (token_embedding): Embedding(596, 512)
  (layers): ModuleList(
    (0-11): 12 x TransformerLayer(
      (attention_layer): Attention(
        (qkv): Linear(in_features=512, out_features=1536, bias=False)
        (out): Linear(in_features=512, out_features=512, bias=False)
        (relative_embedding): RotaryPositionalEmbedding()
      )
      (feed_forward): FeedForward(
        (w1): Linear(in_features=512, out_features=2048, bias=False)
        (w3): Linear(in_features=512, out_features=2048, bias=False)
        (w2): Linear(in_features=2048, out_features=512, bias=False)
      )
      (attention_layer_normalization): RMSNorm()
      (feed_forward_normalization): RMSNorm()
    )
  )
  (layer_norm): RMSNorm()
  (lm_head): Linear(in_features=512, out_features=596, bias=False)
  (mlm_head): Linear(in_features=512, out_features=596, bias=False)
  (physchem_head): RegressionHead(
    (net): Sequential(
      (0): Linear(in_features=512, out_features=256, bias=True)
 

## Hyformer as a SMILES Encoder

In [32]:
# Init Smiles Encoder as a wrapper for molecule encoding
# `encoding_batch_size` is defined here and reused by the MolGPT, ChemBERTa and
# Uni-Mol sections below, so this cell has to run before them.

encoding_batch_size = 256
smiles_encoder = model.to_smiles_encoder(tokenizer=tokenizer, device=device, batch_size=encoding_batch_size)

In [33]:
# Encode smiles: `smiles` is the list of SMILES strings read from the dataset CSV.

encoding = smiles_encoder.encode(smiles)  # setting the device and model.eval() under the hood

Encoding samples: 100%|██████████| 64/64 [00:39<00:00,  1.60it/s]


In [37]:
# Inspect the shape of the returned embeddings.
encoding.shape

(16197, 512)

In [39]:
# Target file for the Hyformer embeddings.
EMBEDDINGS_FILEPATH = '/lustre/groups/aih/hyformer/icml25/data/fibrosis/embeddings/hyformer.npz'

# Make sure the parent directory exists. `exist_ok=True` already makes this a
# no-op when the directory is present, so no separate existence check is needed
# (the previous guard tested the file path rather than the directory).
os.makedirs(os.path.dirname(EMBEDDINGS_FILEPATH), exist_ok=True)

In [40]:
# Persist the embeddings together with their SMILES strings and row ids.
# The three arrays are matched by position when read back, so refuse to write
# a file in which their lengths disagree.
assert len(encoding) == len(smiles) == len(idx), (
    f'Misaligned save: {len(encoding)} embeddings, {len(smiles)} SMILES, {len(idx)} ids'
)
np.savez(EMBEDDINGS_FILEPATH, embeddings=encoding, smiles=smiles, ids=idx)

In [41]:
# Display the embedding values.
encoding

array([[ 0.08279007, -0.25026557, -0.45100284, ..., -0.6174221 ,
        -0.12018453,  0.20276001],
       [ 0.0333407 ,  0.86286128, -0.60028887, ..., -0.70104438,
        -0.51607561, -0.94197947],
       [ 0.04557168, -0.53243083, -1.06156468, ..., -0.28652242,
         0.22138278,  0.23150636],
       ...,
       [-0.06491564,  0.15110879, -0.62854481, ..., -0.85657644,
        -0.8597644 , -1.67475033],
       [-0.11813052,  0.62057179, -0.78862149, ..., -0.33209422,
        -0.50159997, -0.81410062],
       [ 0.19068508,  0.12450401, -0.81713712, ..., -0.18272324,
        -0.41439158,  0.1988963 ]])

## MolGPT

In [52]:
# Name used to build the embeddings output filename for this model.
MODEL_NAME = 'molgpt'

# MolGPT Configs
# These assignments overwrite the path constants set in the Hyformer section.

PATH_TO_TOKENIZER_CONFIG = 'configs/tokenizers/gpt_tokenizer/config.json'
PATH_TO_MODEL_CONFIG = 'configs/models/molgpt/config.json'
PATH_TO_MODEL_CKPT = '/lustre/groups/aih/hyformer/icml25/results/pretrain/guacamol/molgpt/guacamol_nocond.pt'


In [55]:
# Tokenizer

tokenizer_config = TokenizerConfig.from_config_file(PATH_TO_TOKENIZER_CONFIG)
tokenizer = AutoTokenizer.from_config(tokenizer_config)

# Model
# Unlike the Hyformer section, the checkpoint is loaded through the encoder wrapper
# (`smiles_encoder.load_pretrained`) rather than on the model object.
# `encoding_batch_size` is defined in the Hyformer encoder cell, which must have run first.

model_config = ModelConfig.from_config_file(PATH_TO_MODEL_CONFIG)
model = AutoModel.from_config(model_config)
smiles_encoder = model.to_smiles_encoder(tokenizer=tokenizer, device=device, batch_size=encoding_batch_size)
smiles_encoder.load_pretrained(PATH_TO_MODEL_CKPT)


Removed key: prop_nn.weight from checkpoint; not found in model or size mismatch.
Removed key: prop_nn.bias from checkpoint; not found in model or size mismatch.


In [56]:
# Encode smiles
# NOTE(review): the recorded output of this cell reports examples filtered out due to
# unknown characters, yet the shape shown in the next cell still has 16197 rows --
# confirm how filtered molecules are represented in `encoding` before relying on those rows.

encoding = smiles_encoder.encode(smiles)  # setting the device and model.eval() under the hood

Filtering data: 100%|██████████| 16197/16197 [00:00<00:00, 98742.36it/s] 
Filtered 5657 examples due to unknown characters.
100%|██████████| 64/64 [00:15<00:00,  4.26it/s]


In [57]:
# Inspect the shape of the MolGPT embeddings.
encoding.shape

(16197, 256)

In [58]:
# Target file for this model's embeddings, named after MODEL_NAME.
EMBEDDINGS_FILEPATH = f'/lustre/groups/aih/hyformer/icml25/data/fibrosis/embeddings/{MODEL_NAME}.npz'

# Make sure the parent directory exists. `exist_ok=True` already makes this a
# no-op when the directory is present, so no separate existence check is needed
# (the previous guard tested the file path rather than the directory).
os.makedirs(os.path.dirname(EMBEDDINGS_FILEPATH), exist_ok=True)

In [59]:
# Persist the embeddings together with their SMILES strings and row ids.
# The three arrays are matched by position when read back, so refuse to write
# a file in which their lengths disagree.
assert len(encoding) == len(smiles) == len(idx), (
    f'Misaligned save: {len(encoding)} embeddings, {len(smiles)} SMILES, {len(idx)} ids'
)
np.savez(EMBEDDINGS_FILEPATH, embeddings=encoding, smiles=smiles, ids=idx)

## ChemBERTa

In [84]:
# Name used to build the embeddings output filename for this model.
MODEL_NAME = 'chemberta-2'

# ChemBERTa configs
# NOTE(review): PATH_TO_MODEL_CKPT below is identical to the Hyformer checkpoint path and
# is not passed to any call in this section -- looks like a copy-paste leftover; confirm
# which weights the ChemBERTa encoder is expected to use.

PATH_TO_TOKENIZER_CONFIG = 'configs/tokenizers/chemberta/config.json'
PATH_TO_MODEL_CONFIG = 'configs/models/chemberta_for_regression/config.json'
PATH_TO_MODEL_CKPT = '/lustre/groups/aih/hyformer/icml25/results/pretrain/unimol/hyformer/llama_backbone_best_big_250k_iters/pretrain_generation_mlm_physchem/ckpt.pt'


In [85]:
# Tokenizer

tokenizer_config = TokenizerConfig.from_config_file(PATH_TO_TOKENIZER_CONFIG)
tokenizer = AutoTokenizer.from_config(tokenizer_config)

# Model
# No `load_pretrained` call is made in this section; the encoder is used as built from the config.
# `encoding_batch_size` is defined in the Hyformer encoder cell, which must have run first.

model_config = ModelConfig.from_config_file(PATH_TO_MODEL_CONFIG)
model = AutoModel.from_config(model_config)
smiles_encoder = model.to_smiles_encoder(tokenizer=tokenizer, device=device, batch_size=encoding_batch_size)

In [86]:
# Encode smiles: same `smiles` list as in the previous sections; overwrites `encoding`.

encoding = smiles_encoder.encode(smiles)  # setting the device and model.eval() under the hood

Encoding samples: 100%|██████████| 64/64 [00:07<00:00,  9.11it/s]


In [87]:
# Target file for this model's embeddings, named after MODEL_NAME.
EMBEDDINGS_FILEPATH = f'/lustre/groups/aih/hyformer/icml25/data/fibrosis/embeddings/{MODEL_NAME}.npz'

# Make sure the parent directory exists. `exist_ok=True` already makes this a
# no-op when the directory is present, so no separate existence check is needed
# (the previous guard tested the file path rather than the directory).
os.makedirs(os.path.dirname(EMBEDDINGS_FILEPATH), exist_ok=True)

In [88]:
# Persist the embeddings together with their SMILES strings and row ids.
# The three arrays are matched by position when read back, so refuse to write
# a file in which their lengths disagree.
assert len(encoding) == len(smiles) == len(idx), (
    f'Misaligned save: {len(encoding)} embeddings, {len(smiles)} SMILES, {len(idx)} ids'
)
np.savez(EMBEDDINGS_FILEPATH, embeddings=encoding, smiles=smiles, ids=idx)

In [89]:
# Show the path the embeddings were written to.
EMBEDDINGS_FILEPATH

'/lustre/groups/aih/hyformer/icml25/data/fibrosis/embeddings/chemberta-2.npz'

## Uni-Mol

In [77]:
# Name used to build the embeddings output filename for this model.
MODEL_NAME = 'unimol'

# Uni-Mol Configs
# No tokenizer config is set in this section; PATH_TO_TOKENIZER_CONFIG keeps its previous value.

PATH_TO_MODEL_CONFIG = 'configs/models/unimol/config.json'
PATH_TO_MODEL_CKPT = '/lustre/groups/aih/hyformer/icml25/results/pretrain/guacamol/unimol/mol_pre_no_h_220816.pt'


In [78]:
# Tokenizer: none is passed to the encoder for Uni-Mol.

tokenizer = None

# Model
# `encoding_batch_size` is defined in the Hyformer encoder cell, which must have run first.
# NOTE(review): the log recorded for this cell reports weights loaded from the unimol_tools
# package (`mol_pre_all_h_220816.pt`), while PATH_TO_MODEL_CKPT names `mol_pre_no_h_220816.pt`
# -- confirm which checkpoint is actually in effect.

model_config = ModelConfig.from_config_file(PATH_TO_MODEL_CONFIG)
model = AutoModel.from_config(model_config)
smiles_encoder = model.to_smiles_encoder(tokenizer=tokenizer, device=device, batch_size=encoding_batch_size)
smiles_encoder.load_pretrained(PATH_TO_MODEL_CKPT)

2025-04-01 15:55:12 | unimol_tools/models/unimol.py | 135 | INFO | Uni-Mol Tools | Loading pretrained weights from /home/aih/adam.izdebski/miniconda3/envs/hyformer/lib/python3.9/site-packages/unimol_tools/weights/mol_pre_all_h_220816.pt


In [79]:
# Encode smiles: same `smiles` list as in the previous sections; overwrites `encoding`.

encoding = smiles_encoder.encode(smiles)  # setting the device and model.eval() under the hood

2025-04-01 15:55:33 | unimol_tools/data/conformer.py | 150 | INFO | Uni-Mol Tools | Start generating conformers...
0it [00:00, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
huggingface/tokenizers: The c

In [80]:
# Target file for this model's embeddings, named after MODEL_NAME.
EMBEDDINGS_FILEPATH = f'/lustre/groups/aih/hyformer/icml25/data/fibrosis/embeddings/{MODEL_NAME}.npz'

# Make sure the parent directory exists. `exist_ok=True` already makes this a
# no-op when the directory is present, so no separate existence check is needed
# (the previous guard tested the file path rather than the directory).
os.makedirs(os.path.dirname(EMBEDDINGS_FILEPATH), exist_ok=True)

In [81]:
# Persist the embeddings together with their SMILES strings and row ids.
# The three arrays are matched by position when read back, so refuse to write
# a file in which their lengths disagree.
assert len(encoding) == len(smiles) == len(idx), (
    f'Misaligned save: {len(encoding)} embeddings, {len(smiles)} SMILES, {len(idx)} ids'
)
np.savez(EMBEDDINGS_FILEPATH, embeddings=encoding, smiles=smiles, ids=idx)

In [82]:
# Show which model name is currently set.
MODEL_NAME

'unimol'

In [83]:
# Inspect the shape of the Uni-Mol embeddings.
encoding.shape

(16197, 512)