## some config

In [1]:
import os
import math
import random
import torch
import fire
import copy
import multiprocessing as mp
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.cuda.amp import GradScaler
# import deepspeed
from datetime import datetime
from typing import Optional
from pprint import pprint, pformat
from torch.utils.data import DataLoader, IterableDataset
from timm.utils import ModelEmaV3
from timm.models import load_checkpoint
from timm.utils.model import unwrap_model, get_state_dict
try:
    from torch.utils.tensorboard import SummaryWriter
except ModuleNotFoundError:
    from tensorboardX import SummaryWriter

import sys

# sys.path.insert(0, "..")
sys.path.insert(0, ".")

In [2]:
from src.data import (
    collator,
    vocab_builder,
    tokenizer,
    read_dataset,
    OdpsTableIterableDataset,
)
from src.models import (
    GraphGPTConfig,
    GraphGPTCausal,
    GraphGPT2Config,
    GraphGPT2Causal,
    GraphBertConfig,
    GraphBertForMaskedLM,
)
from src.utils import (
    conf_utils,
    loss_utils,
    loader_utils,
    tokenizer_utils,
    modules_utils,
    misc_utils,
    print_trainable_parameters,
    print_params,
    inspect_tokenization_results,
    set_up_shuffle_and_sampler,
    worker_init_fn_seed,
)

dict_models = {
    "graphgpt2": (GraphGPT2Causal, GraphGPT2Config),
    "graphgpt": (GraphGPTCausal, GraphGPTConfig),
    "graphbert": (GraphBertForMaskedLM, GraphBertConfig),
}

[2024-12-10 15:22:34,917] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/yang/miniconda3/envs/graph_gpt/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/yang/miniconda3/envs/graph_gpt/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlvsym'
/home/yang/miniconda3/envs/graph_gpt/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlopen'
/home/yang/miniconda3/envs/graph_gpt/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlclose'
/home/yang/miniconda3/envs/graph_gpt/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlerror'
/home/yang/miniconda3/envs/graph_gpt/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlsym'
/home/yang/miniconda3/envs/graph_gpt/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `shm_open'
/home/yang/miniconda3/envs/graph_gpt/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined

In [3]:
data_dir: str = "../data/OGB"
tables: str = ""
# deepspeed_config = "./examples/ds_config2_pt.json"
intermediate_size = 0
num_attention_heads = 0
hidden_size = 512
num_hidden_layers = 8
task_type='pretrain'
causal_attention = 1
lr=3e-4
model_type = 'graphgpt'
output_dir='./exp/models/pcqm4m-v2/test'
pretrain_cpt = '/datalake/datastore1/yang/graph-gpt/exp/models/pcqm4m-v2/medium_ntp/pt_ns_h512_l8_b8192_mpe1024_tk1e9_gelu_pretrain3.3m_nmlm_mrlinear_mtp0.8_0_0.2_lr3e-4_adp0.1_pdp0_edp0_mdp0_lsi0_short_gated_wd0.1'
samples_per_saving=1000000

batch_size = 1024
stack_method = 'short'

pack_tokens = 0
max_position_embeddings = 1024

task_type='pretrain'
total_tokens=1e9
batch_size = 1024
warmup_tokens=1e8

In [4]:
use_tb_writer = False           # use tensorboard writer
use_ema = False # False # use exponential moving average to smooth model
use_deepspeed = False # True # use deepspeed for training, good to set scheduler
if (intermediate_size == 0) and (num_attention_heads == 0): # True
    (
        hidden_size,
        intermediate_size,
        num_attention_heads,
        num_hidden_layers,
    ) = modules_utils.set_up_model_architect(
        hidden_size=hidden_size, num_hidden_layers=num_hidden_layers # 768 24 related to model names intermediate_size = hidden_size * 4, num_attention_heads = hidden_size // 64
    )# 768 3072 12 24
causal_attention = 0 if task_type == "pretrain-mlm" else causal_attention
print('hidden_size:', hidden_size, 'intermediate_size:', intermediate_size, 'num_attention_heads:', num_attention_heads, 'num_hidden_layers:', num_hidden_layers, 'causal_attention:', causal_attention) # 768 3072 12 24 1


# #########################
# betas = (0.9, 0.95) # used in AdamW optimizer, important for config beta
# #########################
# # lr * 0.1 -> from llama2 pre-train settings
# min_lr = lr * 0.1 if use_deepspeed else 0    # used in scheduler, when not using deepspeed.
# #########################
gpu_name = torch.cuda.get_device_name()
GraphModel, GraphModelConfig = dict_models[model_type] # Not instantiate yet
print('gpu_name:', gpu_name, 'GraphModel:', GraphModel, 'GraphModelConfig:', GraphModelConfig) 

if os.path.exists(os.path.join(output_dir, "log.csv")):
    print(
        f"log file {os.path.join(output_dir, 'log.csv')} exists, resume training from {output_dir} instead of initializing from pre-train ckp {pretrain_cpt}!"
    )
    pretrain_cpt = output_dir


# # 0. init distributed train and get gpu/device info
# dist.init_process_group(backend="nccl", init_method="env://")  # for distributed training
# dist.barrier() # for sync training
# world_size = dist.get_world_size() # 1 # number of GPUs
# rank = dist.get_rank() # 0 # current GPU index
# local_rank = os.environ.get("LOCAL_RANK") # 0 # current GPU index local to the node
# print(f"\nworld size: {world_size}, rank: {rank}, local rank: {local_rank}") # 1 0 0
# rnd_seed = torch.random.initial_seed() - rank
# random.seed(rnd_seed)
# print(f"seed random with {rnd_seed}") # 1234
# steps_per_saving = samples_per_saving // (world_size * batch_size) # 1000000 // (1 * 1024) = 976
# print(f"\nsteps_per_saving: {steps_per_saving}") # 976
# params = print_params(**locals())

hidden_size: 512 intermediate_size: 2048 num_attention_heads: 8 num_hidden_layers: 8 causal_attention: 1
gpu_name: NVIDIA RTX A6000 GraphModel: <class 'src.models.graphgpt.modeling_graphgpt.GraphGPTCausal'> GraphModelConfig: <class 'src.models.graphgpt.configuration_graphgpt.GraphGPTConfig'>


## load data

In [5]:
# tokenizer config loading
import json

# Load the JSON file
file_path = "./zhang_test/tokenizer_config.json"
with open(file_path, "r") as json_file:
    tokenizer_config = json.load(json_file)

# Print the loaded data
pprint(tokenizer_config)

{'attr_world_identifier': 'molecule',
 'data_dir': './data/OGB',
 'dataset': 'PCQM4Mv2',
 'ensemble_datasets': [],
 'label_tokens_to_pad': ['<icl>'],
 'name_or_path': './data/OGB/pcqm4m-v2',
 'pretrain_mlm': {'info': 'name->polynomial|cosine|fixed,power->3/2/1/0.5',
                  'name': 'polynomial',
                  'params': {'fixed_ratio': 0.7,
                             'mtp': [0.8, 0, 0.2],
                             'power': 1}},
 'sampling': None,
 'semantics': {'attr_assignment': 'first',
               'attr_shuffle': False,
               'common': {'numbers': ['<e>',
                                      '<.>',
                                      '<->',
                                      '<0>',
                                      '<1>',
                                      '<2>',
                                      '<3>',
                                      '<4>',
                                      '<5>',
                                      '<6>',


In [6]:
# 1.1 read configuration
assert "pretrain" in tokenizer_config["task_type"]
assert (
    tokenizer_config["semantics"]["attr_assignment"]   # first
    in tokenizer_utils.ATTR_ASSIGNMENT_TYPES   # ATTR_ASSIGNMENT_TYPES = {"first", "last", "random", "all", "mix"}
)
# pprint(tokenizer_config)
if tokenizer_config["tokenizer_class"] == "StackedGSTTokenizer":
    attr_dim = (
        tokenizer_config["semantics"]["edge"]["dim"] # 3
        + tokenizer_config["semantics"]["node"]["dim"] # 9
    ) # 12
    assert stack_method in ("short", "long", None), f"stack_method: {stack_method}" # short
    if tokenizer_config["structure"]["edge"]["remove_edge_type_token"]: # True
        stacked_feat = 1 + attr_dim
    else:
        stacked_feat = 2 + attr_dim
    next_n_token = stacked_feat
else:
    stacked_feat = 1
    next_n_token = 1 # maybe how many pack of tokens to predict
embed_dim = tokenizer_config["semantics"]["node"].get(
    "embed_dim", 0
) + tokenizer_config["semantics"]["edge"].get("embed_dim", 0) # 0
print(
    f"stacked_feat: {stacked_feat}, next_n_token: {next_n_token}, embed_dim: {embed_dim}" # 13 13 0
)

stacked_feat: 13, next_n_token: 13, embed_dim: 0


In [7]:
# 1.2 get graph dataset
dataset, raw_dataset = read_dataset(
    name=tokenizer_config["dataset"],   # PCQM4Mv2
    # for local data file reading
    data_dir=data_dir,   # './data/OGB'
    sampling_config=tokenizer_config["sampling"],    # None
    # for odps data reading
    table=tables,   # ""
    edge_dim=tokenizer_config["semantics"]["edge"]["dim"],    # 3
    node_dim=tokenizer_config["semantics"]["node"]["dim"],    # 9
    mode="train",
    # general
    pretrain_mode=True,
    # return_valid_test=True,
    ensemble_datasets=tokenizer_config.get("ensemble_datasets", []),    # []
)
reset_samples_per_epoch = (   # what is this  # None for PCQM4Mv2
    dataset.reset_samples_per_epoch
    if hasattr(dataset, "reset_samples_per_epoch")
    else False
)
if isinstance(dataset, IterableDataset):
    print(next(iter(dataset))) 
else: # True
    idx = dataset.sampler[0] # (0, Data(edge_index=[2, 40], edge_attr=[40, 3], x=[18, 9], y=[1, 1], num_nodes=18, idx=0, idx_of_ds=0))
    print(dataset[idx])


Loading dataset PCQM4Mv2 ...

dataset._data -> Data(edge_index=[2, 109093626], edge_attr=[109093626, 3], x=[52970652, 9], y=[3746620])
In pre-train mode, set all valid data's y to nan!
Before setting, y has 294469 NANs
After setting, y has 368014 NANs
Default process group has not been initialized, please make sure to call init_process_group.

Raw indices: 3746620, Removed indices: 0, New indices: 3746620

Raw indices: 3746620, Removed indices: 294469, New indices: 3452151

[2024-12-10 15:23:05.097728] NOT RESET samples of GraphsMapDataset of 3452151 graphs for epoch None!
idx_tuple: None
(0, Data(edge_index=[2, 40], edge_attr=[40, 3], x=[18, 9], y=[1, 1], num_nodes=18, idx=0, idx_of_ds=0))


In [8]:
print(dataset)
print('length of dataset:', len(dataset))
print(dataset[1])
print(dataset[2])
print("#" * 100)
print("example")
print("edge_index: ", dataset[1][1].edge_index)
print("edge_attr: ", dataset[1][1].edge_attr)
print("x: ", dataset[1][1].x)
print("y: ", dataset[1][1].y) 

<src.data.dataset_map.GraphsMapDataset object at 0x7f3af33bf130>
length of dataset: 3452151
(1, Data(edge_index=[2, 34], edge_attr=[34, 3], x=[17, 9], y=[1, 1], num_nodes=17, idx=1, idx_of_ds=0))
(2, Data(edge_index=[2, 32], edge_attr=[32, 3], x=[16, 9], y=[1, 1], num_nodes=16, idx=2, idx_of_ds=0))
####################################################################################################
example
edge_index:  tensor([[11,  8,  8,  6,  6,  2,  2,  5,  5, 12, 12, 13,  5, 16, 16,  1,  1, 14,
         14, 15, 15, 10, 10,  7,  7,  0,  0,  9,  0,  4,  7,  3, 14,  6],
        [ 8, 11,  6,  8,  2,  6,  5,  2, 12,  5, 13, 12, 16,  5,  1, 16, 14,  1,
         15, 14, 10, 15,  7, 10,  0,  7,  9,  0,  4,  0,  3,  7,  6, 14]])
edge_attr:  tensor([[0, 0, 0],
        [0, 0, 0],
        [0, 0, 1],
        [0, 0, 1],
        [3, 0, 1],
        [3, 0, 1],
        [3, 0, 1],
        [3, 0, 1],
        [0, 0, 1],
        [0, 0, 1],
        [0, 0, 0],
        [0, 0, 0],
        [3, 0, 1],
        

In [10]:
print("edge_index type: ", type(dataset[1][1].edge_index), "dtype: ", dataset[1][1].edge_index.dtype)
print("edge_attr type: ", type(dataset[1][1].edge_attr), "dtype: ", dataset[1][1].edge_attr.dtype if dataset[1][1].edge_attr is not None else "None")
print("x type: ", type(dataset[1][1].x), "dtype: ", dataset[1][1].x.dtype)
print("y type: ", type(dataset[1][1].y), "dtype: ", dataset[1][1].y.dtype)


edge_index type:  <class 'torch.Tensor'> dtype:  torch.int64
edge_attr type:  <class 'torch.Tensor'> dtype:  torch.int64
x type:  <class 'torch.Tensor'> dtype:  torch.int64
y type:  <class 'torch.Tensor'> dtype:  torch.float32


In [9]:
example_graph = dataset[0][1]
from src.utils.my_utiles import graph2smiles
smiles = graph2smiles(example_graph.edge_index, example_graph.edge_attr, example_graph.x)
print(smiles)

Cc1ccc(C2Cc3cnccc3NC2=O)cc1


## build vocabulary

In [10]:
add_eos = False
rank = 0
stack_method = "short"
# 1.3 build vocab and then init tokenizer from the tokenization config
vocab_builder.build_vocab(raw_dataset, tokenizer_config, rank) # build vocab from file or scratch
tokenizer_cls = getattr(tokenizer, tokenizer_config["tokenizer_class"]) # StackGSTTokenizer, custom defined
gtokenizer = tokenizer_cls(
    tokenizer_config, add_eos=add_eos, stack_method=stack_method # instantiate
)

[2024-12-10 01:56:52.431594] Vocab is already built and saved in ./data/OGB/pcqm4m-v2/vocab512_stacked!
[2024-12-10 01:56:52.432030] Loading vocab from ./data/OGB/pcqm4m-v2/vocab512_stacked ...
[2024-12-10 01:56:52.435351]
{   '0': 22,
    '1': 23,
    '10': 32,
    '100': 122,
    '101': 123,
    '102': 124,
    '103': 125,
    '104': 126,
    '105': 127,
    '106': 128,
    '107': 129,
    '108': 130,
    '109': 131,
    '11': 33,
    '110': 132,
    '111': 133,
    '112': 134,
    '113': 135,
    '114': 136,
    '115': 137,
    '116': 138,
    '117': 139,
    '118': 140,
    '119': 141,
    '12': 34,
    '120': 142,
    '121': 143,
    '122': 144,
    '123': 145,
    '124': 146,
    '125': 147,
    '126': 148,
    '127': 149,
    '128': 150,
    '129': 151,
    '13': 35,
    '130': 152,
    '131': 153,
    '132': 154,
    '133': 155,
    '134': 156,
    '135': 157,
    '136': 158,
    '137': 159,
    '138': 160,
    '139': 161,
    '14': 36,
    '140': 162,
    '141': 163,
    '142'

In [11]:
print(gtokenizer)

<src.data.tokenizer.StackedGSTTokenizer object at 0x7ff2ef2c2430>


In [12]:
example_graph = dataset[0][1]
from src.utils.my_utiles import graph2token2input
import numpy as np
token, label, embed, inputs = graph2token2input(example_graph, gtokenizer)

print(
    f"\nTokens:\n{pformat(token)}\nLabels:\n{pformat(label)}\nembed:{np.array(embed)}\n"
)

print(f"Inputs for model:\n{pformat(inputs)}\n")

Inspecting tokenization results!
Tokenize graph:
Data(edge_index=[2, 40], edge_attr=[40, 3], x=[18, 9], y=[1, 1], num_nodes=18, idx=0, idx_of_ds=0)

Tokens:
[['22',
  'molecule#node#0#5',
  'molecule#node#1#2',
  'molecule#node#2#4',
  'molecule#node#3#5',
  'molecule#node#4#1',
  'molecule#node#5#0',
  'molecule#node#6#2',
  'molecule#node#7#0',
  'molecule#node#8#1',
  'molecule#edge#0',
  'molecule#edge#1',
  'molecule#edge#2'],
 ['23',
  'molecule#node#0#5',
  'molecule#node#1#0',
  'molecule#node#2#3',
  'molecule#node#3#5',
  'molecule#node#4#1',
  'molecule#node#5#1',
  'molecule#node#6#2',
  'molecule#node#7#0',
  'molecule#node#8#1',
  'molecule#edge#0#0',
  'molecule#edge#1#0',
  'molecule#edge#2#0'],
 ['24',
  'molecule#node#0#5',
  'molecule#node#1#0',
  'molecule#node#2#3',
  'molecule#node#3#5',
  'molecule#node#4#0',
  'molecule#node#5#0',
  'molecule#node#6#1',
  'molecule#node#7#1',
  'molecule#node#8#1',
  'molecule#edge#0#0',
  'molecule#edge#1#0',
  'molecule#edge#2

In [13]:
print(inputs.keys())

dict_keys(['input_ids', 'position_ids', 'labels', 'attention_mask', 'embed'])


In [14]:
from src.utils.my_utiles import convert_to_tensors

tensor_inputs = convert_to_tensors(inputs)
print(tensor_inputs.keys())

dict_keys(['input_ids', 'position_ids', 'labels', 'attention_mask', 'embed'])


In [15]:
print("input_ids:", tensor_inputs["input_ids"].shape)
print("position_ids:", tensor_inputs["position_ids"].shape)
print("labels:", tensor_inputs["labels"].shape)
print("attention_mask:", tensor_inputs["attention_mask"].shape)
print("embed:", tensor_inputs["embed"].shape)


input_ids: torch.Size([1, 24, 13])
position_ids: torch.Size([1, 24])
labels: torch.Size([1, 24, 13])
attention_mask: torch.Size([1, 24])
embed: torch.Size([1, 24, 0])


In [16]:
print(tensor_inputs["input_ids"])
print(tensor_inputs["labels"])

tensor([[[ 44, 630, 687, 696, 709, 715, 724, 732, 736, 739, 740, 741, 742],
         [ 45, 630, 685, 695, 709, 715, 725, 732, 736, 739, 743, 748, 754],
         [ 46, 630, 685, 695, 709, 714, 724, 731, 737, 739, 743, 748, 754],
         [ 47, 630, 685, 695, 709, 714, 724, 731, 737, 739, 746, 748, 755],
         [ 48, 630, 685, 695, 709, 715, 724, 731, 737, 739, 746, 748, 755],
         [ 49, 630, 685, 695, 709, 715, 724, 731, 737, 739, 746, 748, 755],
         [ 50, 641, 685, 694, 709, 714, 724, 731, 737, 739, 746, 748, 755],
         [ 51, 630, 685, 695, 709, 715, 724, 731, 737, 739, 746, 748, 755],
         [ 46, 630, 685, 695, 709, 714, 724, 731, 737, 739, 746, 748, 755],
         [ 47, 630, 685, 695, 709, 714, 724, 731, 737, 739, 746, 748, 755],
         [ 52, 641, 685, 694, 709, 714, 725, 731, 736, 739, 743, 748, 755],
         [ 53, 630, 685, 695, 709, 714, 724, 731, 736, 739, 743, 748, 755],
         [ 54, 652, 685, 691, 709, 714, 724, 731, 736, 738, 744, 748, 755],
         [ 5

## set model

In [17]:
import pickle
with open("./zhang_test/model_config.pkl", "rb") as file:  # "rb" mode for reading binary
    config = pickle.load(file)
print(config)

GraphGPTConfig {
  "attention_bias": false,
  "attention_dropout": 0.1,
  "bos_token_id": 20,
  "causal_attention": true,
  "cls_token_id": null,
  "dropout": 0,
  "embed_dim": 0,
  "embed_pdrop": 0,
  "eos_token_id": 19,
  "hidden_act": "gelu",
  "hidden_size": 512,
  "initializer_range": 0.02,
  "intermediate_size": 2048,
  "layer_scale_init_value": 0,
  "loss_type": null,
  "max_position_embeddings": 1024,
  "mlp": [],
  "mlp_pdrop": 0,
  "model_type": "graphgpt",
  "next_n_token": 13,
  "num_attention_heads": 8,
  "num_hidden_layers": 8,
  "num_key_value_heads": 8,
  "num_neg": null,
  "pad_token_id": 0,
  "path_pdrop": 0,
  "pooling_method": "last",
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000,
  "stack_method": "short",
  "stacked_feat": 13,
  "stacked_feat_agg_method": "gated",
  "tie_word_embeddings": false,
  "transformers_version": "4.38.2",
  "use_cache": true,
  "vocab_size": 756
}



In [18]:
# use_deepspeed = True

# # 2.2 create model
# if use_deepspeed:
#     deepspeed.init_distributed(
#         dist_backend="nccl", rank=rank, world_size=world_size
#     )

In [19]:
model = GraphModel(config)


# model.gradient_checkpointing_enable()
# silence the warnings. Please re-enable for inference!
model.config.use_cache = False
print_trainable_parameters(model) # 235368960

NOT Applying dropout in backbone transformer
Next-token-prediction changed to next/masked-13-tokens-prediction!
trainable params: 37751808 || all params: 37751808 || trainable%: 100.0


In [20]:

# 2.21 load from ckp IF provided existing ckp and NOT resume from the ckp
ckp, _ = misc_utils.get_latest_ckp(pretrain_cpt)
print(f"Loading pretrained weights from ckp {ckp}")
try:
    # fn_model = os.path.join(ckp, "../model_ema_best.pt")
    # if not os.path.isfile(fn_model):
    fn_model = os.path.join(ckp, "model.pt")
    stat_dict = torch.load(fn_model)
    stat_dict = {
        (k[7:] if k.startswith("module.") else k): v for k, v in stat_dict.items()
    }
    print(f"[{datetime.now()}] load ckp using torch API from:\n{fn_model}")
except Exception as inst:
    # print(type(inst))
    # print(inst.args)
    print("inar: ", inst)
    from deepspeed.utils.zero_to_fp32 import (
        get_fp32_state_dict_from_zero_checkpoint,
    )
    stat_dict = get_fp32_state_dict_from_zero_checkpoint(ckp)
    print(
        f"[{datetime.now()}] load ckp using DeepSpeed API `get_fp32_state_dict_from_zero_checkpoint`"
    )

for key in list(stat_dict.keys()):
    if ("score" in key) and skip_keys:
        stat_dict.pop(key)
        print(f"pop key {key} in stat_dict!")
missing_keys, unexpected_keys = model.load_state_dict(stat_dict, strict=True)
print(
    f"[{datetime.now()}] init model params using pytorch `load_state_dict`\n"
    f"missing keys: {missing_keys}\n"
    f"unexpected_keys: {unexpected_keys}\n"
    f"After loading weights from ckp:\n{model.config}\nmodel-type: {model.dtype}\n\n{model}"
)

Loading pretrained weights from ckp /datalake/datastore1/yang/graph-gpt/exp/models/pcqm4m-v2/medium_ntp/pt_ns_h512_l8_b8192_mpe1024_tk1e9_gelu_pretrain3.3m_nmlm_mrlinear_mtp0.8_0_0.2_lr3e-4_adp0.1_pdp0_edp0_mdp0_lsi0_short_gated_wd0.1/epoch_51
inar:  [Errno 2] No such file or directory: '/datalake/datastore1/yang/graph-gpt/exp/models/pcqm4m-v2/medium_ntp/pt_ns_h512_l8_b8192_mpe1024_tk1e9_gelu_pretrain3.3m_nmlm_mrlinear_mtp0.8_0_0.2_lr3e-4_adp0.1_pdp0_edp0_mdp0_lsi0_short_gated_wd0.1/epoch_51/model.pt'
Processing zero checkpoint '/datalake/datastore1/yang/graph-gpt/exp/models/pcqm4m-v2/medium_ntp/pt_ns_h512_l8_b8192_mpe1024_tk1e9_gelu_pretrain3.3m_nmlm_mrlinear_mtp0.8_0_0.2_lr3e-4_adp0.1_pdp0_edp0_mdp0_lsi0_short_gated_wd0.1/epoch_51/global_step48830'
Detected checkpoint of type zero stage 2, world_size: 1
Parsing checkpoint created by deepspeed==0.15.1
Reconstructed fp32 state dict with 77 params 37751808 elements
[2024-12-10 01:56:54.509046] load ckp using DeepSpeed API `get_fp32_stat

In [21]:
print("input_ids:", tensor_inputs["input_ids"].shape)
print("position_ids:", tensor_inputs["position_ids"].shape)
print("labels:", tensor_inputs["labels"].shape)
print("attention_mask:", tensor_inputs["attention_mask"].shape)
print("embed:", tensor_inputs["embed"].shape)


input_ids: torch.Size([1, 24, 13])
position_ids: torch.Size([1, 24])
labels: torch.Size([1, 24, 13])
attention_mask: torch.Size([1, 24])
embed: torch.Size([1, 24, 0])


## test

In [22]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"device: {device}")
print(model.device)

device: cuda
cuda:0


In [23]:
print(embed_dim)

0


In [24]:
model.eval()
data = tensor_inputs
input_ids = data["input_ids"].to(device)
attention_mask = data["attention_mask"].to(device)
labels = data["labels"].to(device)
inputs_raw_embeds = None
if embed_dim > 0: # in tokenizer config
    inputs_raw_embeds = data["embed"].to(device)
output = model(
    input_ids=input_ids,
    attention_mask=attention_mask,
    labels=labels,
    inputs_raw_embeds=inputs_raw_embeds,
)  # Perform a single forward pass.
print(output)


CausalLMOutputWithPast(loss=tensor(0.1592, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-6.8285e-01, -5.2476e-01, -7.8901e-01,  ..., -8.6922e-01,
         -1.5787e-03,  4.5053e-01],
        [-8.2714e+00, -8.2891e+00, -8.2824e+00,  ..., -8.2746e+00,
         -3.3644e+00, -3.3576e+00],
        [-4.5638e+00, -4.7213e+00, -4.6861e+00,  ..., -4.8072e+00,
         -1.4092e+00, -1.0710e+00],
        ...,
        [ 3.0432e-01,  6.3430e-01,  9.7459e-02,  ..., -9.9691e-02,
          2.0882e+00,  3.4079e-01],
        [ 2.5485e+00,  2.9396e+00,  3.1794e+00,  ...,  3.0270e+00,
          3.2834e+00,  6.0216e-01],
        [-7.1706e-01, -8.3712e-01, -4.2776e-01,  ..., -1.1047e+00,
          4.3890e+00,  2.1820e+00]], device='cuda:0', grad_fn=<ViewBackward0>), past_key_values=None, hidden_states=None, attentions=None)


In [28]:
print("labels:", tensor_inputs["labels"].shape)
print("output:", output.keys())
print("output:", output["logits"].shape)

labels: torch.Size([1, 24, 13])
output: odict_keys(['loss', 'logits'])
output: torch.Size([312, 756])


In [None]:
predicted_labels = torch.argmax(output["logits"], dim=-1) 
reshaped_labels = predicted_labels.view(1, 24, 13)
print("labels:", tensor_inputs["labels"])
print("predicted_labels:", reshaped_labels)

In [39]:
# File path to your vocabulary file
vocab_file_path = "/datalake/datastore1/yang/graph-gpt/data/OGB/pcqm4m-v2/vocab512_stacked"

# Step 1: Load the vocabulary
def load_vocab(vocab_file):
    """Loads a vocabulary file and returns a dictionary mapping token IDs to tokens."""
    vocab = {}
    with open(vocab_file, "r") as f:
        for idx, line in enumerate(f):
            vocab[idx + 1] = line.strip()  # Remove newline characters
    return vocab

vocab = load_vocab(vocab_file_path)
# pprint(vocab.keys())
# Step 2: Convert predicted label IDs to tokens
def convert_labels_to_tokens(labels, vocab):
    """Converts label IDs to tokens using the provided vocabulary."""
    tokens = [vocab[label.item()] for label in labels.view(-1)]
    return tokens

# Example usage
# Assuming `reshaped_labels` contains the predicted label IDs of shape [1, 24, 13]
tokens = convert_labels_to_tokens(reshaped_labels, vocab)

# Optional: Reshape tokens back to the original structure for visualization
tokens_reshaped = [
    [tokens[i * 13 + j] for j in range(13)] for i in range(24)
]

# Print tokens
pprint(tokens_reshaped)


[['23 45',
  'molecule#node#0#5 630',
  'molecule#node#1#0 685',
  'molecule#node#2#4 696',
  'molecule#node#3#5 709',
  'molecule#node#4#2 716',
  'molecule#node#5#0 724',
  'molecule#node#6#2 732',
  'molecule#node#7#0 736',
  'molecule#node#8#1 739',
  'molecule#edge#0#0 743',
  'molecule#edge#1#0 748',
  'molecule#edge#2#0 754'],
 ['24 46',
  'molecule#node#0#5 630',
  'molecule#node#1#0 685',
  'molecule#node#2#3 695',
  'molecule#node#3#5 709',
  'molecule#node#4#0 714',
  'molecule#node#5#1 725',
  'molecule#node#6#2 732',
  'molecule#node#7#0 736',
  'molecule#node#8#1 739',
  'molecule#edge#0#0 743',
  'molecule#edge#1#0 748',
  'molecule#edge#2#0 754'],
 ['25 47',
  'molecule#node#0#5 630',
  'molecule#node#1#0 685',
  'molecule#node#2#3 695',
  'molecule#node#3#5 709',
  'molecule#node#4#0 714',
  'molecule#node#5#0 724',
  'molecule#node#6#1 731',
  'molecule#node#7#1 737',
  'molecule#node#8#1 739',
  'molecule#edge#0#3 746',
  'molecule#edge#1#0 748',
  'molecule#edge#2#1