## some config

In [1]:
import os
import math
import random
import torch
import fire
import copy
import multiprocessing as mp
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.cuda.amp import GradScaler
# import deepspeed
from datetime import datetime
from typing import Optional
from pprint import pprint, pformat
from torch.utils.data import DataLoader, IterableDataset
from timm.utils import ModelEmaV3
from timm.models import load_checkpoint 
from timm.utils.model import unwrap_model, get_state_dict
try:
    from torch.utils.tensorboard import SummaryWriter
except ModuleNotFoundError:
    from tensorboardX import SummaryWriter

import sys

# sys.path.insert(0, "..")
sys.path.insert(0, ".")

In [2]:
from src.data import (
    collator,
    vocab_builder,
    tokenizer,
    read_dataset,
    OdpsTableIterableDataset,
)
from src.models import (
    GraphGPTConfig,
    GraphGPTCausal,
    GraphGPT2Config,
    GraphGPT2Causal,
    GraphBertConfig,
    GraphBertForMaskedLM,
)
from src.utils import (
    conf_utils,
    loss_utils,
    loader_utils,
    tokenizer_utils,
    modules_utils,
    misc_utils,
    print_trainable_parameters,
    print_params,
    inspect_tokenization_results,
    set_up_shuffle_and_sampler,
    worker_init_fn_seed,
)

dict_models = {
    "graphgpt2": (GraphGPT2Causal, GraphGPT2Config),
    "graphgpt": (GraphGPTCausal, GraphGPTConfig),
    "graphbert": (GraphBertForMaskedLM, GraphBertConfig),
}

[2024-12-16 17:22:29,478] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/yang/miniconda3/envs/graph_gpt/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/yang/miniconda3/envs/graph_gpt/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlvsym'
/home/yang/miniconda3/envs/graph_gpt/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlopen'
/home/yang/miniconda3/envs/graph_gpt/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlclose'
/home/yang/miniconda3/envs/graph_gpt/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlerror'
/home/yang/miniconda3/envs/graph_gpt/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlsym'
/home/yang/miniconda3/envs/graph_gpt/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `shm_open'
/home/yang/miniconda3/envs/graph_gpt/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined

In [3]:
data_dir: str = "../data/OGB"
tables: str = ""
# deepspeed_config = "./examples/ds_config2_pt.json"
intermediate_size = 0
num_attention_heads = 0
hidden_size = 512
num_hidden_layers = 8
task_type='pretrain'
causal_attention = 1
lr=3e-4
model_type = 'graphgpt'
output_dir='./exp/models/pcqm4m-v2/test'
pretrain_cpt = '/datalake/datastore1/yang/graph-gpt/exp/models/pcqm4m-v2/medium_ntp/pt_ns_h512_l8_b8192_mpe1024_tk1e9_gelu_pretrain3.3m_nmlm_mrlinear_mtp0.8_0_0.2_lr3e-4_adp0.1_pdp0_edp0_mdp0_lsi0_short_gated_wd0.1'
samples_per_saving=1000000

batch_size = 1024
stack_method = 'short'

pack_tokens = 0
max_position_embeddings = 1024

task_type='pretrain'
total_tokens=1e9
batch_size = 1024
warmup_tokens=1e8

In [4]:
use_tb_writer = False           # use tensorboard writer
use_ema = False # False # use exponential moving average to smooth model
use_deepspeed = False # True # use deepspeed for training, good to set scheduler
if (intermediate_size == 0) and (num_attention_heads == 0): # True
    (
        hidden_size,
        intermediate_size,
        num_attention_heads,
        num_hidden_layers,
    ) = modules_utils.set_up_model_architect(
        hidden_size=hidden_size, num_hidden_layers=num_hidden_layers # 768 24 related to model names intermediate_size = hidden_size * 4, num_attention_heads = hidden_size // 64
    )# 768 3072 12 24
causal_attention = 0 if task_type == "pretrain-mlm" else causal_attention
print('hidden_size:', hidden_size, 'intermediate_size:', intermediate_size, 'num_attention_heads:', num_attention_heads, 'num_hidden_layers:', num_hidden_layers, 'causal_attention:', causal_attention) # 768 3072 12 24 1


# #########################
# betas = (0.9, 0.95) # used in AdamW optimizer, important for config beta
# #########################
# # lr * 0.1 -> from llama2 pre-train settings
# min_lr = lr * 0.1 if use_deepspeed else 0    # used in scheduler, when not using deepspeed.
# #########################
gpu_name = torch.cuda.get_device_name()
GraphModel, GraphModelConfig = dict_models[model_type] # Not instantiate yet
print('gpu_name:', gpu_name, 'GraphModel:', GraphModel, 'GraphModelConfig:', GraphModelConfig) 

if os.path.exists(os.path.join(output_dir, "log.csv")):
    print(
        f"log file {os.path.join(output_dir, 'log.csv')} exists, resume training from {output_dir} instead of initializing from pre-train ckp {pretrain_cpt}!"
    )
    pretrain_cpt = output_dir


# # 0. init distributed train and get gpu/device info
# dist.init_process_group(backend="nccl", init_method="env://")  # for distributed training
# dist.barrier() # for sync training
# world_size = dist.get_world_size() # 1 # number of GPUs
# rank = dist.get_rank() # 0 # current GPU index
# local_rank = os.environ.get("LOCAL_RANK") # 0 # current GPU index local to the node
# print(f"\nworld size: {world_size}, rank: {rank}, local rank: {local_rank}") # 1 0 0
# rnd_seed = torch.random.initial_seed() - rank
# random.seed(rnd_seed)
# print(f"seed random with {rnd_seed}") # 1234
# steps_per_saving = samples_per_saving // (world_size * batch_size) # 1000000 // (1 * 1024) = 976
# print(f"\nsteps_per_saving: {steps_per_saving}") # 976
# params = print_params(**locals())

hidden_size: 512 intermediate_size: 2048 num_attention_heads: 8 num_hidden_layers: 8 causal_attention: 1
gpu_name: NVIDIA RTX A6000 GraphModel: <class 'src.models.graphgpt.modeling_graphgpt.GraphGPTCausal'> GraphModelConfig: <class 'src.models.graphgpt.configuration_graphgpt.GraphGPTConfig'>


## load data

In [5]:
# tokenizer config loading
import json

# Load the JSON file
file_path = "./zhang_test/tokenizer_config.json"
with open(file_path, "r") as json_file:
    tokenizer_config = json.load(json_file)

# Print the loaded data
pprint(tokenizer_config)

{'attr_world_identifier': 'molecule',
 'data_dir': './data/OGB',
 'dataset': 'PCQM4Mv2',
 'ensemble_datasets': [],
 'label_tokens_to_pad': ['<icl>'],
 'name_or_path': './data/OGB/pcqm4m-v2',
 'pretrain_mlm': {'info': 'name->polynomial|cosine|fixed,power->3/2/1/0.5',
                  'name': 'polynomial',
                  'params': {'fixed_ratio': 0.7,
                             'mtp': [0.8, 0, 0.2],
                             'power': 1}},
 'sampling': None,
 'semantics': {'attr_assignment': 'first',
               'attr_shuffle': False,
               'common': {'numbers': ['<e>',
                                      '<.>',
                                      '<->',
                                      '<0>',
                                      '<1>',
                                      '<2>',
                                      '<3>',
                                      '<4>',
                                      '<5>',
                                      '<6>',


In [6]:
# 1.1 read configuration
assert "pretrain" in tokenizer_config["task_type"]
assert (
    tokenizer_config["semantics"]["attr_assignment"]   # first
    in tokenizer_utils.ATTR_ASSIGNMENT_TYPES   # ATTR_ASSIGNMENT_TYPES = {"first", "last", "random", "all", "mix"}
)
# pprint(tokenizer_config)
if tokenizer_config["tokenizer_class"] == "StackedGSTTokenizer":
    attr_dim = (
        tokenizer_config["semantics"]["edge"]["dim"] # 3
        + tokenizer_config["semantics"]["node"]["dim"] # 9
    ) # 12
    assert stack_method in ("short", "long", None), f"stack_method: {stack_method}" # short
    if tokenizer_config["structure"]["edge"]["remove_edge_type_token"]: # True
        stacked_feat = 1 + attr_dim
    else:
        stacked_feat = 2 + attr_dim
    next_n_token = stacked_feat
else:
    stacked_feat = 1
    next_n_token = 1 # maybe how many pack of tokens to predict
embed_dim = tokenizer_config["semantics"]["node"].get(
    "embed_dim", 0
) + tokenizer_config["semantics"]["edge"].get("embed_dim", 0) # 0
print(
    f"stacked_feat: {stacked_feat}, next_n_token: {next_n_token}, embed_dim: {embed_dim}" # 13 13 0
)

stacked_feat: 13, next_n_token: 13, embed_dim: 0


In [7]:
# 1.2 get graph dataset
dataset, raw_dataset = read_dataset(
    name=tokenizer_config["dataset"],   # PCQM4Mv2
    # for local data file reading
    data_dir=data_dir,   # './data/OGB'
    sampling_config=tokenizer_config["sampling"],    # None
    # for odps data reading
    table=tables,   # ""
    edge_dim=tokenizer_config["semantics"]["edge"]["dim"],    # 3
    node_dim=tokenizer_config["semantics"]["node"]["dim"],    # 9
    mode="train",
    # general
    pretrain_mode=True,
    # return_valid_test=True,
    ensemble_datasets=tokenizer_config.get("ensemble_datasets", []),    # []
)
reset_samples_per_epoch = (   # what is this  # None for PCQM4Mv2
    dataset.reset_samples_per_epoch
    if hasattr(dataset, "reset_samples_per_epoch")
    else False
)
if isinstance(dataset, IterableDataset):
    print(next(iter(dataset))) 
else: # True
    idx = dataset.sampler[0] # (0, Data(edge_index=[2, 40], edge_attr=[40, 3], x=[18, 9], y=[1, 1], num_nodes=18, idx=0, idx_of_ds=0))
    print(dataset[idx])


Loading dataset PCQM4Mv2 ...

dataset._data -> Data(edge_index=[2, 109093626], edge_attr=[109093626, 3], x=[52970652, 9], y=[3746620])
In pre-train mode, set all valid data's y to nan!
Before setting, y has 294469 NANs
After setting, y has 368014 NANs
Default process group has not been initialized, please make sure to call init_process_group.

Raw indices: 3746620, Removed indices: 0, New indices: 3746620

Raw indices: 3746620, Removed indices: 294469, New indices: 3452151

[2024-12-16 17:22:40.398866] NOT RESET samples of GraphsMapDataset of 3452151 graphs for epoch None!
idx_tuple: None
(0, Data(edge_index=[2, 40], edge_attr=[40, 3], x=[18, 9], y=[1, 1], num_nodes=18, idx=0, idx_of_ds=0))


In [8]:
print(dataset)
print('length of dataset:', len(dataset))
print(dataset[1])
print(dataset[2])
print("#" * 100)
print("example")
print("edge_index: ", dataset[1][1].edge_index)
print("edge_attr: ", dataset[1][1].edge_attr)
print("x: ", dataset[1][1].x)
print("y: ", dataset[1][1].y) 

<src.data.dataset_map.GraphsMapDataset object at 0x7fc291e26520>
length of dataset: 3452151
(1, Data(edge_index=[2, 34], edge_attr=[34, 3], x=[17, 9], y=[1, 1], num_nodes=17, idx=1, idx_of_ds=0))
(2, Data(edge_index=[2, 32], edge_attr=[32, 3], x=[16, 9], y=[1, 1], num_nodes=16, idx=2, idx_of_ds=0))
####################################################################################################
example
edge_index:  tensor([[ 9, 14, 14,  0,  0, 15, 15, 10, 10,  1,  1, 13, 10,  4,  4,  6,  6,  5,
          5, 16, 16,  3,  3,  2,  2,  8,  8, 12,  8, 11,  2,  7,  5,  0],
        [14,  9,  0, 14, 15,  0, 10, 15,  1, 10, 13,  1,  4, 10,  6,  4,  5,  6,
         16,  5,  3, 16,  2,  3,  8,  2, 12,  8, 11,  8,  7,  2,  0,  5]])
edge_attr:  tensor([[0, 0, 0],
        [0, 0, 0],
        [0, 0, 1],
        [0, 0, 1],
        [3, 0, 1],
        [3, 0, 1],
        [3, 0, 1],
        [3, 0, 1],
        [0, 0, 1],
        [0, 0, 1],
        [0, 0, 0],
        [0, 0, 0],
        [3, 0, 1],
        

In [None]:
example_graph = dataset[2][1]
from src.utils.my_utiles import graph2smiles
smiles = graph2smiles(example_graph.edge_index, example_graph.edge_attr, example_graph.x)
print(smiles)

C=CCN(C=Cc1ccccc1C)C(C)=O


## build vocabulary

In [10]:
add_eos = False
rank = 0
stack_method = "short"
# 1.3 build vocab and then init tokenizer from the tokenization config
vocab_builder.build_vocab(raw_dataset, tokenizer_config, rank) # build vocab from file or scratch
tokenizer_cls = getattr(tokenizer, tokenizer_config["tokenizer_class"]) # StackGSTTokenizer, custom defined
gtokenizer = tokenizer_cls(
    tokenizer_config, add_eos=add_eos, stack_method=stack_method # instantiate
)

[2024-12-16 17:22:40.493413] Vocab is already built and saved in ./data/OGB/pcqm4m-v2/vocab512_stacked!
[2024-12-16 17:22:40.493746] Loading vocab from ./data/OGB/pcqm4m-v2/vocab512_stacked ...
[2024-12-16 17:22:40.495952]
{   '0': 22,
    '1': 23,
    '10': 32,
    '100': 122,
    '101': 123,
    '102': 124,
    '103': 125,
    '104': 126,
    '105': 127,
    '106': 128,
    '107': 129,
    '108': 130,
    '109': 131,
    '11': 33,
    '110': 132,
    '111': 133,
    '112': 134,
    '113': 135,
    '114': 136,
    '115': 137,
    '116': 138,
    '117': 139,
    '118': 140,
    '119': 141,
    '12': 34,
    '120': 142,
    '121': 143,
    '122': 144,
    '123': 145,
    '124': 146,
    '125': 147,
    '126': 148,
    '127': 149,
    '128': 150,
    '129': 151,
    '13': 35,
    '130': 152,
    '131': 153,
    '132': 154,
    '133': 155,
    '134': 156,
    '135': 157,
    '136': 158,
    '137': 159,
    '138': 160,
    '139': 161,
    '14': 36,
    '140': 162,
    '141': 163,
    '142'

In [11]:
print(gtokenizer)

<src.data.tokenizer.StackedGSTTokenizer object at 0x7fc16e8d0af0>


## get set and tokenize

In [12]:
# world_size = 1

# 1.4 get train/test sampler
train_dataset = dataset  #         idx = dataset.sampler[0] # (0, Data(edge_index=[2, 40], edge_attr=[40, 3], x=[18, 9], y=[1, 1], num_nodes=18, idx=0, idx_of_ds=0))
if not isinstance(train_dataset, IterableDataset): # True
    print("train_dataset is not IterableDataset")
    train_sampler = train_dataset.sampler
    print("train_sampler: ", len(train_sampler))
    print("first 10 elements in train_sampler: ", train_sampler[:10])
    random.shuffle(train_sampler)
    print("first 10 elements in train_sampler: ", train_sampler[:10])
    train_shuffle, train_sampler, train_cnt = set_up_shuffle_and_sampler( # train_shuffle = False, sampler, len(sampler)
        train_dataset, train_sampler
    )
    print(train_shuffle, train_sampler[0], train_cnt)
else: 
    train_cnt = len(train_dataset) * world_size # why # 1
    train_sampler = None
    train_shuffle = False

train_dataset is not IterableDataset
train_sampler:  3452151
first 10 elements in train_sampler:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
first 10 elements in train_sampler:  [2275895, 2727090, 2311376, 1667965, 736249, 735614, 2840383, 1053302, 3240060, 3462391]
False 2275895 3452151


In [13]:
world_size = 1
if pack_tokens > 0: # 0
    gtokenizer.mpe = max_position_embeddings
    # cannot pass `iter(train_dataset)` for Iterable ds, because `TypeError: cannot pickle 'generator' object`
    gtokenizer.dataset = train_dataset
    gtokenizer.sampler = tuple(train_sampler) if train_sampler is not None else None
    gtokenizer.random_ratio = pack_tokens
    tokens_per_sample = max_position_embeddings
else:
    print("pack_tokens is 0")
    tokens_per_sample = misc_utils.estimate_tokens_per_sample(
        gtokenizer,
        train_dataset,
        train_sampler,
        max_position_embeddings,
        world_size,
    ) # Estimated tokens per sample 20.0 with std 4.0 using 10000 samples and mpe 1024

pack_tokens is 0


100%|██████████| 10000/10000 [00:44<00:00, 225.65it/s]

Estimated tokens per sample 20.0 with std 4.0 using 10000 samples and mpe 1024





In [14]:
tokens_per_sample = (
    tokens_per_sample // 2 if task_type == "pretrain-euler" else tokens_per_sample
)
print(f"\n[{datetime.now()}] tokens_per_sample: {tokens_per_sample}") # 20 what is this: estimated tokens per sample, by 10000 samples and mpe 1024

inspect_tokenization_results(dataset, gtokenizer) # print out tokenization results, one sample
# re-initialize `gtokenizer.dataset` to avoid `TypeError: cannot pickle 'generator' object`
gtokenizer.dataset = train_dataset if pack_tokens > 0 else None
print("gtokenizer.dataset: ", gtokenizer.dataset)

total_num_steps = int(
    math.ceil(total_tokens / (tokens_per_sample * batch_size * world_size)) # total_tokens defined in config 4e9/(20*1024*1) = 195313
)
warmup_num_steps = int(
    math.ceil(warmup_tokens / (tokens_per_sample * batch_size * world_size)) # 1e8 ...
)
tmp_cnt = len(train_sampler) if train_sampler else train_cnt / world_size # train_cnt = len(train_dataset) * world_size
epochs = int(math.ceil(total_tokens / (tmp_cnt * tokens_per_sample * world_size))) # token for training / token in the dataset = epochs
print(
    f"\n[{datetime.now()}] total_num_steps: {total_num_steps}\nwarmup_num_steps: {warmup_num_steps}\nepochs per worker: {epochs}\n" # 61 epochs
)
# 195313 4883 61


[2024-12-16 17:23:26.735709] tokens_per_sample: 20.0

Inspecting graph of index 2275895
Inspecting tokenization results!
Tokenize graph:
Data(edge_index=[2, 24], edge_attr=[24, 3], x=[12, 9], y=[1, 1], num_nodes=12, idx=2275895, idx_of_ds=0)

Tokens:
[['186', 'molecule#node#0#7', 'molecule#node#1#0', 'molecule#node#2#1', 'molecule#node#3#5', 'molecule#node#4#0', 'molecule#node#5#0', 'molecule#node#6#1', 'molecule#node#7#0', 'molecule#node#8#0', 'molecule#edge#0', 'molecule#edge#1', 'molecule#edge#2'],
 ['187', 'molecule#node#0#5', 'molecule#node#1#0', 'molecule#node#2#3', 'molecule#node#3#5', 'molecule#node#4#1', 'molecule#node#5#0', 'molecule#node#6#1', 'molecule#node#7#0', 'molecule#node#8#0', 'molecule#edge#0#1', 'molecule#edge#1#0', 'molecule#edge#2#0'],
 ['188', 'molecule#node#0#5', 'molecule#node#1#1', 'molecule#node#2#4', 'molecule#node#3#5', 'molecule#node#4#1', 'molecule#node#5#0', 'molecule#node#6#2', 'molecule#node#7#0', 'molecule#node#8#1', 'molecule#edge#0#0', 'molecule#e

In [15]:
import numpy as np
idx = 0
idx2, data = dataset[idx]
graph = data
print(f"Inspecting tokenization results!\nTokenize graph:\n{data}")
token_res = gtokenizer.tokenize(graph)
print(
    f"\nTokens:\n{pformat(token_res.ls_tokens)}\nLabels:\n{pformat(token_res.ls_labels)}\nembed:{np.array(token_res.ls_embed)}\n"
)
print("if gtokenizer.mpe is not None, ", gtokenizer.mpe)    # None
tokens, labels, ls_embed, ls_len = (
        gtokenizer.pack_token_seq(token_res, idx)
        if gtokenizer.mpe is not None
        else (
            token_res.ls_tokens,
            token_res.ls_labels,
            token_res.ls_embed,
            [len(token_res.ls_tokens)],
        )
    )

in_dict = gtokenizer.convert_tokens_to_ids(tokens, labels)
if ls_embed:  # for pretty print purpose ONLY
    in_dict["embed"] = np.array(ls_embed)
print(f"Tokenized results:\n{pformat(in_dict)}\n")
if ls_embed:
    in_dict["embed"] = ls_embed
token_res.ls_tokens = tokens
token_res.ls_labels = labels
token_res.ls_embed = ls_embed
token_res.ls_len = ls_len
inputs = gtokenizer.prepare_inputs_for_task(
    in_dict,
    graph,
    token_res=token_res,
)
print(f"Prepared inputs:\n{pformat(inputs)}\n")

Inspecting tokenization results!
Tokenize graph:
Data(edge_index=[2, 40], edge_attr=[40, 3], x=[18, 9], y=[1, 1], num_nodes=18, idx=0, idx_of_ds=0)

Tokens:
[['140',
  'molecule#node#0#5',
  'molecule#node#1#0',
  'molecule#node#2#3',
  'molecule#node#3#5',
  'molecule#node#4#0',
  'molecule#node#5#0',
  'molecule#node#6#1',
  'molecule#node#7#1',
  'molecule#node#8#1',
  'molecule#edge#0',
  'molecule#edge#1',
  'molecule#edge#2'],
 ['141',
  'molecule#node#0#5',
  'molecule#node#1#0',
  'molecule#node#2#3',
  'molecule#node#3#5',
  'molecule#node#4#1',
  'molecule#node#5#0',
  'molecule#node#6#1',
  'molecule#node#7#1',
  'molecule#node#8#1',
  'molecule#edge#0#3',
  'molecule#edge#1#0',
  'molecule#edge#2#1'],
 ['142',
  'molecule#node#0#6',
  'molecule#node#1#0',
  'molecule#node#2#2',
  'molecule#node#3#5',
  'molecule#node#4#0',
  'molecule#node#5#0',
  'molecule#node#6#1',
  'molecule#node#7#1',
  'molecule#node#8#1',
  'molecule#edge#0#3',
  'molecule#edge#1#0',
  'molecule#edg

## set model

In [16]:
import pickle
with open("./zhang_test/model_config.pkl", "rb") as file:  # "rb" mode for reading binary
    config = pickle.load(file)
print(config)

GraphGPTConfig {
  "attention_bias": false,
  "attention_dropout": 0.1,
  "bos_token_id": 20,
  "causal_attention": true,
  "cls_token_id": null,
  "dropout": 0,
  "embed_dim": 0,
  "embed_pdrop": 0,
  "eos_token_id": 19,
  "hidden_act": "gelu",
  "hidden_size": 512,
  "initializer_range": 0.02,
  "intermediate_size": 2048,
  "layer_scale_init_value": 0,
  "loss_type": null,
  "max_position_embeddings": 1024,
  "mlp": [],
  "mlp_pdrop": 0,
  "model_type": "graphgpt",
  "next_n_token": 13,
  "num_attention_heads": 8,
  "num_hidden_layers": 8,
  "num_key_value_heads": 8,
  "num_neg": null,
  "pad_token_id": 0,
  "path_pdrop": 0,
  "pooling_method": "last",
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000,
  "stack_method": "short",
  "stacked_feat": 13,
  "stacked_feat_agg_method": "gated",
  "tie_word_embeddings": false,
  "transformers_version": "4.38.2",
  "use_cache": true,
  "vocab_size": 756
}



In [17]:
# use_deepspeed = True

# # 2.2 create model
# if use_deepspeed:
#     deepspeed.init_distributed(
#         dist_backend="nccl", rank=rank, world_size=world_size
#     )

In [18]:
model = GraphModel(config)


# model.gradient_checkpointing_enable()
# silence the warnings. Please re-enable for inference!
model.config.use_cache = False
print_trainable_parameters(model) # 235368960

NOT Applying dropout in backbone transformer
Next-token-prediction changed to next/masked-13-tokens-prediction!
trainable params: 37751808 || all params: 37751808 || trainable%: 100.0


In [19]:

# 2.21 load from ckp IF provided existing ckp and NOT resume from the ckp
ckp, _ = misc_utils.get_latest_ckp(pretrain_cpt)
print(f"Loading pretrained weights from ckp {ckp}")
try:
    # fn_model = os.path.join(ckp, "../model_ema_best.pt")
    # if not os.path.isfile(fn_model):
    fn_model = os.path.join(ckp, "model.pt")
    stat_dict = torch.load(fn_model)
    stat_dict = {
        (k[7:] if k.startswith("module.") else k): v for k, v in stat_dict.items()
    }
    print(f"[{datetime.now()}] load ckp using torch API from:\n{fn_model}")
except Exception as inst:
    # print(type(inst))
    # print(inst.args)
    print("inar: ", inst)
    from deepspeed.utils.zero_to_fp32 import (
        get_fp32_state_dict_from_zero_checkpoint,
    )
    stat_dict = get_fp32_state_dict_from_zero_checkpoint(ckp)
    print(
        f"[{datetime.now()}] load ckp using DeepSpeed API `get_fp32_state_dict_from_zero_checkpoint`"
    )

for key in list(stat_dict.keys()):
    if ("score" in key) and skip_keys:
        stat_dict.pop(key)
        print(f"pop key {key} in stat_dict!")
missing_keys, unexpected_keys = model.load_state_dict(stat_dict, strict=True)
print(
    f"[{datetime.now()}] init model params using pytorch `load_state_dict`\n"
    f"missing keys: {missing_keys}\n"
    f"unexpected_keys: {unexpected_keys}\n"
    f"After loading weights from ckp:\n{model.config}\nmodel-type: {model.dtype}\n\n{model}"
)

Loading pretrained weights from ckp /datalake/datastore1/yang/graph-gpt/exp/models/pcqm4m-v2/medium_ntp/pt_ns_h512_l8_b8192_mpe1024_tk1e9_gelu_pretrain3.3m_nmlm_mrlinear_mtp0.8_0_0.2_lr3e-4_adp0.1_pdp0_edp0_mdp0_lsi0_short_gated_wd0.1/epoch_51
inar:  [Errno 2] No such file or directory: '/datalake/datastore1/yang/graph-gpt/exp/models/pcqm4m-v2/medium_ntp/pt_ns_h512_l8_b8192_mpe1024_tk1e9_gelu_pretrain3.3m_nmlm_mrlinear_mtp0.8_0_0.2_lr3e-4_adp0.1_pdp0_edp0_mdp0_lsi0_short_gated_wd0.1/epoch_51/model.pt'
Processing zero checkpoint '/datalake/datastore1/yang/graph-gpt/exp/models/pcqm4m-v2/medium_ntp/pt_ns_h512_l8_b8192_mpe1024_tk1e9_gelu_pretrain3.3m_nmlm_mrlinear_mtp0.8_0_0.2_lr3e-4_adp0.1_pdp0_edp0_mdp0_lsi0_short_gated_wd0.1/epoch_51/global_step48830'
Detected checkpoint of type zero stage 2, world_size: 1
Parsing checkpoint created by deepspeed==0.15.1
Reconstructed fp32 state dict with 77 params 37751808 elements
[2024-12-16 17:23:28.372732] load ckp using DeepSpeed API `get_fp32_stat

## collator

In [20]:
pad_to_multiple_of = 8
# 3.1 init collator
collator_fn = collator.DataCollatorForGSTCausal(
    tokenizer=gtokenizer,
    max_length=max_position_embeddings,
    pad_to_multiple_of=pad_to_multiple_of,
    return_tensors="pt",
)
print(f"[{datetime.now()}] Finish -> 3.1 init collator")

[2024-12-16 17:23:28.418223] Finish -> 3.1 init collator


## data loader

In [21]:
# 3.2 set-up loader
print(f"reset_samples_per_epoch: {reset_samples_per_epoch}")
if (not reset_samples_per_epoch) and (
    not isinstance(train_dataset, IterableDataset)
):
    train_sampler_new = []
    for epoch in range(epochs):
        train_dataset.reset_samples(epoch, rank)
        # random.shuffle(train_sampler)
        train_sampler_new.extend(train_dataset.sampler)
    random.shuffle(train_sampler_new)
    print(
        f"train_sampler for {epochs} epochs increase: {len(train_sampler)} -> {len(train_sampler_new)}"    # train_sampler for 61 epochs increase: 3323391 -> 202726851 3323391* 61
    )
    train_sampler = train_sampler_new
    epochs = 1   # reset to 1 epoch
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=train_shuffle,
    sampler=train_sampler,
    num_workers=1, # 12
    collate_fn=collator_fn,
    worker_init_fn=worker_init_fn_seed,
    pin_memory=True,
    drop_last=True,
    prefetch_factor=4,
)

reset_samples_per_epoch: False
[2024-12-16 17:23:29.981578] NOT RESET samples of GraphsMapDataset of 3452151 graphs for epoch 0!
[2024-12-16 17:23:30.082107] NOT RESET samples of GraphsMapDataset of 3452151 graphs for epoch 1!
[2024-12-16 17:23:30.198648] NOT RESET samples of GraphsMapDataset of 3452151 graphs for epoch 2!
[2024-12-16 17:23:30.299039] NOT RESET samples of GraphsMapDataset of 3452151 graphs for epoch 3!
[2024-12-16 17:23:30.392570] NOT RESET samples of GraphsMapDataset of 3452151 graphs for epoch 4!
[2024-12-16 17:23:30.487291] NOT RESET samples of GraphsMapDataset of 3452151 graphs for epoch 5!
[2024-12-16 17:23:30.581383] NOT RESET samples of GraphsMapDataset of 3452151 graphs for epoch 6!
[2024-12-16 17:23:30.674736] NOT RESET samples of GraphsMapDataset of 3452151 graphs for epoch 7!
[2024-12-16 17:23:30.767873] NOT RESET samples of GraphsMapDataset of 3452151 graphs for epoch 8!
[2024-12-16 17:23:30.861530] NOT RESET samples of GraphsMapDataset of 3452151 graphs fo

In [22]:
print(train_loader)
print(type(train_loader))
print(train_loader.dataset)  # Prints the dataset object associated with the DataLoader
print(train_loader.batch_size)        # Batch size used
print(train_loader.num_workers)       # Number of workers for data loading
print(train_loader.drop_last)         # Whether the last incomplete batch is dropped

print(f"Total samples: {len(train_loader.dataset)}")
print(f"Batch size: {train_loader.batch_size}")
print(f"Total batches: {len(train_loader)}")

batch = next(iter(train_loader))
print("Batch:", batch.keys())

# # If the dataset returns (input, label), unpack the batch
# inputs, labels = batch
# print("Inputs:", inputs)
# print("Labels:", labels)


<torch.utils.data.dataloader.DataLoader object at 0x7fbf69b90ac0>
<class 'torch.utils.data.dataloader.DataLoader'>
<src.data.dataset_map.GraphsMapDataset object at 0x7fc291e26520>
1024
1
True
Total samples: 3452151
Batch size: 1024
Total batches: 50568
[DataLoader Worker 0] seed `random` & `np.random` with 1721667379594961650 & 639804146!
Batch: dict_keys(['input_ids', 'position_ids', 'labels', 'attention_mask', 'embed', 'idx'])


In [23]:
print(batch['embed'].shape)

torch.Size([1024, 40, 0])


## test

In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"device: {device}")
print(model.device)

device: cuda
cuda:0


In [25]:
print(embed_dim)

0


In [30]:
for i, data in enumerate(train_loader):
    print(data.keys())
    print(data["input_ids"][0])
    print(data["labels"][0])
    print(data["input_ids"].shape)
    print(data["position_ids"].shape)
    print(data["labels"].shape)
    print(data["attention_mask"].shape)
    print(data["embed"].shape)
    print(data["idx"].shape)
    break

[DataLoader Worker 0] seed `random` & `np.random` with 4970904486264240564 & 338491828!
dict_keys(['input_ids', 'position_ids', 'labels', 'attention_mask', 'embed', 'idx'])
tensor([[131, 630, 685, 695, 709, 715, 724, 731, 737, 739, 740, 741, 742],
        [132, 630, 685, 695, 709, 715, 724, 731, 737, 739, 746, 748, 755],
        [133, 630, 685, 695, 709, 715, 724, 731, 737, 739, 746, 748, 755],
        [134, 630, 685, 695, 709, 715, 724, 731, 737, 739, 746, 748, 755],
        [135, 630, 685, 695, 709, 714, 724, 731, 737, 739, 746, 748, 755],
        [136, 630, 685, 696, 709, 716, 724, 732, 736, 738, 743, 748, 754],
        [137, 630, 687, 696, 709, 715, 724, 732, 736, 738, 743, 748, 754],
        [138, 641, 685, 695, 709, 714, 724, 732, 736, 738, 743, 748, 754],
        [139, 630, 685, 696, 709, 717, 724, 732, 736, 738, 743, 748, 754],
        [138, 641, 685, 695, 709, 714, 724, 732, 736, 738, 743, 748, 754],
        [140, 630, 685, 696, 709, 717, 724, 732, 736, 738, 743, 748, 754],
  

In [27]:
model.eval()
for i, data in enumerate(train_loader):
    input_ids = data["input_ids"].to(device)
    attention_mask = data["attention_mask"].to(device)
    labels = data["labels"].to(device)
    inputs_raw_embeds = None
    if embed_dim > 0: # in tokenizer config
        inputs_raw_embeds = data["embed"].to(device)
    output = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        labels=labels,
        inputs_raw_embeds=inputs_raw_embeds,
    )  # Perform a single forward pass.
    print(output)
    if i == 2:
        break

[DataLoader Worker 0] seed `random` & `np.random` with 3750542880842102200 & 4285969848!
CausalLMOutputWithPast(loss=tensor(0.1341, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.9782, -0.8619, -0.9213,  ..., -0.8311, -0.1390, -0.6392],
        [-7.7857, -7.7682, -7.7229,  ..., -7.6117, -2.9909, -3.1718],
        [-5.3015, -5.4158, -5.3943,  ..., -5.4932, -1.6270, -1.2912],
        ...,
        [-0.0352,  0.3066, -0.2495,  ..., -0.4284,  2.0003,  0.2725],
        [ 2.2878,  2.6796,  2.9258,  ...,  2.7768,  3.2101,  0.4940],
        [-1.0649, -1.2182, -0.7403,  ..., -1.4388,  5.0863,  1.1478]],
       device='cuda:0', grad_fn=<ViewBackward0>), past_key_values=None, hidden_states=None, attentions=None)
CausalLMOutputWithPast(loss=tensor(0.1375, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.9172, -0.8996, -0.7361,  ..., -0.9960,  0.4533,  0.2306],
        [-8.1157, -8.0207, -8.0581,  ..., -7.9589, -3.0496, -2.8436],
        [-4.4264, -4.8304, -4.6915, 