# Jointformer Training

This notebook shows how to train Jointformer. 

In [47]:
# Imports

import os

import torch

from jointformer.configs.dataset import DatasetConfig
from jointformer.configs.tokenizer import TokenizerConfig
from jointformer.configs.model import ModelConfig
from jointformer.configs.trainer import TrainerConfig

from jointformer.utils.datasets.auto import AutoDataset
from jointformer.utils.tokenizers.auto import AutoTokenizer
from jointformer.models.auto import AutoModel
from jointformer.trainers.trainer import Trainer

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [48]:
# Set working directory of the project
#
# The repository root defaults to the original checkout location. Set the
# JOINTFORMER_REPOSITORY_DIR environment variable to run this notebook from a
# different checkout without editing the cell.

REPOSITORY_DIR = os.environ.get(
    'JOINTFORMER_REPOSITORY_DIR',
    '/home/adamizdebski/projects/jointformer',
)
# Change into the repository root for the rest of the notebook.
os.chdir(REPOSITORY_DIR)

In [53]:
# Configs
#
# Every location defaults to the value originally hard-coded in this notebook
# and can be overridden through an environment variable, so the notebook can be
# run on another machine without editing this cell.

DATA_DIR = os.environ.get('JOINTFORMER_DATA_DIR', '/home/adamizdebski/files/data')
OUTPUT_DIR = os.environ.get(
    'JOINTFORMER_OUTPUT_DIR',
    '/home/adamizdebski/files/jointformer/results/finetune',
)

# Single root for all config files; the individual config paths are derived
# from it instead of repeating the absolute prefix four times.
CONFIGS_DIR = os.environ.get(
    'JOINTFORMER_CONFIGS_DIR',
    '/home/adamizdebski/projects/jointformer/configs',
)

PATH_TO_DATASET_CONFIG = os.path.join(CONFIGS_DIR, 'datasets/molecule_net/scaffold/lipo')
PATH_TO_TOKENIZER_CONFIG = os.path.join(CONFIGS_DIR, 'tokenizers/gpt_tokenizer')
PATH_TO_MODEL_CONFIG = os.path.join(CONFIGS_DIR, 'models/joint_gpt_prediction')
PATH_TO_TRAINER_CONFIG = os.path.join(CONFIGS_DIR, 'trainers/test')

In [54]:
# Load the dataset splits and the tokenizer

dataset_config = DatasetConfig.from_config_file(PATH_TO_DATASET_CONFIG)
tokenizer_config = TokenizerConfig.from_config_file(PATH_TO_TOKENIZER_CONFIG)

# Both splits come from the same dataset config and data directory; the
# generator yields them in the order of the unpacking targets (train, then val).
train_dataset, val_dataset = (
    AutoDataset.from_config(dataset_config, data_dir=DATA_DIR, split=split_name)
    for split_name in ('train', 'val')
)

tokenizer = AutoTokenizer.from_config(tokenizer_config)

In [60]:
# Init Jointformer
#
# The downstream task type and the number of tasks are taken from the dataset
# config loaded earlier in the notebook.

model_config = ModelConfig.from_config_file(PATH_TO_MODEL_CONFIG)
model = AutoModel.from_config(
    model_config,
    downstream_task=dataset_config.task_type,
    num_tasks=dataset_config.num_tasks,
    hidden_dim=256,
)
# Checkpoint loading was left disabled in the original cell:
# model.load_pretrained('ckpt.pt')

number of parameters: 6.45M


In [61]:
# Display the model's repr as the cell output to inspect its modules.
model

JointGPTForDownstreamPrediction(
  (transformer): ModuleDict(
    (wte): Embedding(593, 256)
    (wpe): Embedding(128, 256)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-7): 8 x Block(
        (ln_1): LayerNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=256, out_features=768, bias=False)
          (c_proj): Linear(in_features=256, out_features=256, bias=False)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm()
        (mlp): MLP(
          (c_fc): Linear(in_features=256, out_features=1024, bias=False)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=1024, out_features=256, bias=False)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm()
  )
  (lm_head): Linear(in_features=256, out_features=593, bias=False)
  (downstream_prediction_task_head): Downstr

In [62]:

# Init Trainer: combine the trainer config with the model, both dataset splits
# and the tokenizer created in the previous cells.
trainer_config = TrainerConfig.from_config_file(PATH_TO_TRAINER_CONFIG)
trainer = Trainer(
    config=trainer_config, model=model,
    train_dataset=train_dataset, val_dataset=val_dataset,
    tokenizer=tokenizer,
)

using fused AdamW: True


In [63]:
# Smoke test: run a single training batch through the model.
#
# Use the GPU when one is available and fall back to the CPU otherwise, so this
# cell does not crash on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# NOTE(review): this calls a private trainer method to create the data
# loaders — confirm whether a public entry point exists.
trainer._init_data_loaders()

batch = next(iter(trainer.train_loader))
# NOTE(review): the return value is discarded, so this relies on batch.to()
# moving the batch in place — confirm against the batch type.
batch.to(device)
model.to(device)
outputs = model(**batch)

In [66]:
# Call the model's loss computation on the same batch; the returned dict is
# displayed as the cell output.
# NOTE(review): the output dumps full tensors — consider displaying only the
# loss value or the tensor shapes to keep the notebook readable.
model.get_loss(**batch)

{'token_embeddings': tensor([[[-0.0523, -0.0435, -0.0128,  ..., -0.0171, -0.0043,  0.0739],
          [ 0.0247, -0.0027,  0.0099,  ..., -0.0462,  0.0014, -0.0134],
          [ 0.0404,  0.0254, -0.0292,  ...,  0.0226,  0.0302,  0.0122],
          ...,
          [ 0.0059, -0.0195,  0.0278,  ..., -0.0457, -0.0063, -0.0160],
          [ 0.0143,  0.0149,  0.0547,  ..., -0.0321, -0.0021,  0.0379],
          [ 0.0040, -0.0210,  0.0362,  ..., -0.0282,  0.0392,  0.0006]],
 
         [[-0.0523, -0.0435, -0.0128,  ..., -0.0171, -0.0043,  0.0739],
          [ 0.0419,  0.0254,  0.0354,  ..., -0.0138,  0.0017,  0.0224],
          [ 0.0251,  0.0234, -0.0165,  ..., -0.0135,  0.0274, -0.0064],
          ...,
          [ 0.0059, -0.0195,  0.0278,  ..., -0.0457, -0.0063, -0.0160],
          [ 0.0143,  0.0149,  0.0547,  ..., -0.0321, -0.0021,  0.0379],
          [ 0.0040, -0.0210,  0.0362,  ..., -0.0282,  0.0392,  0.0006]]],
        device='cuda:0', grad_fn=<AddBackward0>),
 'embeddings': tensor([[[ 0.855