## some config

In [13]:
import os
import math
import random
import torch
import fire
import copy
import multiprocessing as mp
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.cuda.amp import GradScaler
import deepspeed
from datetime import datetime
from typing import Optional
from pprint import pprint, pformat
from torch.utils.data import DataLoader, IterableDataset
from timm.utils import ModelEmaV3
from timm.models import load_checkpoint
from timm.utils.model import unwrap_model, get_state_dict
try:
    from torch.utils.tensorboard import SummaryWriter
except ModuleNotFoundError:
    from tensorboardX import SummaryWriter

import sys

# sys.path.insert(0, "..")
sys.path.insert(0, ".")

In [14]:
from src.data import (
    collator,
    vocab_builder,
    tokenizer,
    read_dataset,
    OdpsTableIterableDataset,
)
from src.models import (
    GraphGPTConfig,
    GraphGPTCausal,
    GraphGPT2Config,
    GraphGPT2Causal,
    GraphBertConfig,
    GraphBertForMaskedLM,
)
from src.utils import (
    conf_utils,
    loss_utils,
    loader_utils,
    tokenizer_utils,
    modules_utils,
    misc_utils,
    print_trainable_parameters,
    print_params,
    inspect_tokenization_results,
    set_up_shuffle_and_sampler,
    worker_init_fn_seed,
)

dict_models = {
    "graphgpt2": (GraphGPT2Causal, GraphGPT2Config),
    "graphgpt": (GraphGPTCausal, GraphGPTConfig),
    "graphbert": (GraphBertForMaskedLM, GraphBertConfig),
}

In [15]:
data_dir: str = "../data/TUDataset"
tables: str = ""
deepspeed_config = "./examples/ds_config2_pt.json"
intermediate_size = 0
num_attention_heads = 0
hidden_size = 512
num_hidden_layers = 8
task_type='pretrain'
causal_attention = 1
lr=3e-4
model_type = 'graphgpt'
output_dir='./exp/models/pcqm4m-v2/test'
pretrain_cpt = ''
samples_per_saving=1000000

batch_size = 1024

In [16]:
use_tb_writer = False           # use tensorboard writer
use_ema = False # False # use exponential moving average to smooth model
ema_file = "model_ema.pt"
ema_file_best = "model_ema_best.pt"
ema_best_res = None
ema_best_flag = False
use_deepspeed = len(deepspeed_config) > 0 # True # use deepspeed for training, good to set scheduler
if use_ema:
    do_test = 1
if (intermediate_size == 0) and (num_attention_heads == 0): # True
    (
        hidden_size,
        intermediate_size,
        num_attention_heads,
        num_hidden_layers,
    ) = modules_utils.set_up_model_architect(
        hidden_size=hidden_size, num_hidden_layers=num_hidden_layers # 768 24 related to model names intermediate_size = hidden_size * 4, num_attention_heads = hidden_size // 64
    )# 768 3072 12 24
causal_attention = 0 if task_type == "pretrain-mlm" else causal_attention
#########################
betas = (0.9, 0.95) # used in AdamW optimizer, important for config beta
#########################
# lr * 0.1 -> from llama2 pre-train settings
min_lr = lr * 0.1 if use_deepspeed else 0    # used in scheduler, when not using deepspeed.
#########################
gpu_name = torch.cuda.get_device_name()
GraphModel, GraphModelConfig = dict_models[model_type] # Not instantiate yet


if os.path.exists(os.path.join(output_dir, "log.csv")):
    print(
        f"log file {os.path.join(output_dir, 'log.csv')} exists, resume training from {output_dir} instead of initializing from pre-train ckp {pretrain_cpt}!"
    )
    pretrain_cpt = output_dir


# 0. init distributed train and get gpu/device info
dist.init_process_group(backend="nccl", init_method="env://")  # for distributed training
dist.barrier() # for sync training
world_size = dist.get_world_size() # 1 # number of GPUs
rank = dist.get_rank() # 0 # current GPU index
local_rank = os.environ.get("LOCAL_RANK") # 0 # current GPU index local to the node
print(f"\nworld size: {world_size}, rank: {rank}, local rank: {local_rank}") # 1 0 0
rnd_seed = torch.random.initial_seed() - rank
random.seed(rnd_seed)
print(f"seed random with {rnd_seed}") # 1234
steps_per_saving = samples_per_saving // (world_size * batch_size) # 1000000 // (1 * 1024) = 976
print(f"\nsteps_per_saving: {steps_per_saving}") # 976
params = print_params(**locals())

ValueError: Error initializing torch.distributed using env:// rendezvous: environment variable RANK expected, but not set

## load data

In [None]:
# tokenizer config loading
import json

# Load the JSON file
file_path = "./zhang_test/tokenizer_config.json"
with open(file_path, "r") as json_file:
    tokenizer_config = json.load(json_file)

# Print the loaded data
pprint(tokenizer_config)

In [5]:
# 1.2 get graph dataset
dataset, raw_dataset = read_dataset(
    name=tokenizer_config["dataset"],   # PCQM4Mv2
    # for local data file reading
    data_dir=data_dir,   # './data/OGB'
    sampling_config=tokenizer_config["sampling"],    # None
    # for odps data reading
    table=tables,   # ""
    edge_dim=tokenizer_config["semantics"]["edge"]["dim"],    # 3
    node_dim=tokenizer_config["semantics"]["node"]["dim"],    # 9
    mode="train",
    # general
    pretrain_mode=True,
    ensemble_datasets=tokenizer_config.get("ensemble_datasets", []),    # []
)
reset_samples_per_epoch = (   # what is this  # None for PCQM4Mv2
    dataset.reset_samples_per_epoch
    if hasattr(dataset, "reset_samples_per_epoch")
    else False
)
if isinstance(dataset, IterableDataset):
    print(next(iter(dataset))) 
else: # True
    idx = dataset.sampler[0] # (0, Data(edge_index=[2, 40], edge_attr=[40, 3], x=[18, 9], y=[1, 1], num_nodes=18, idx=0, idx_of_ds=0))
    print(dataset[idx])


Loading dataset PCQM4Mv2 ...

dataset._data -> Data(edge_index=[2, 109093626], edge_attr=[109093626, 3], x=[52970652, 9], y=[3746620])
In pre-train mode, set all valid data's y to nan!
Before setting, y has 294469 NANs
After setting, y has 368014 NANs
Default process group has not been initialized, please make sure to call init_process_group.

Raw indices: 3746620, Removed indices: 0, New indices: 3746620

Raw indices: 3746620, Removed indices: 294469, New indices: 3452151

[2024-12-09 01:28:00.336220] NOT RESET samples of GraphsMapDataset of 3452151 graphs for epoch None!
idx_tuple: None
(0, Data(edge_index=[2, 40], edge_attr=[40, 3], x=[18, 9], y=[1, 1], num_nodes=18, idx=0, idx_of_ds=0))


In [6]:
print(dataset)
print('length of dataset:', len(dataset))
print(dataset[1])
print(dataset[2])
print("#" * 100)
print("example")
print("edge_index: ", dataset[1][1].edge_index)
print("edge_attr: ", dataset[1][1].edge_attr)
print("x: ", dataset[1][1].x)
print("y: ", dataset[1][1].y) 

<src.data.dataset_map.GraphsMapDataset object at 0x7fa029091280>
length of dataset: 3452151
(1, Data(edge_index=[2, 34], edge_attr=[34, 3], x=[17, 9], y=[1, 1], num_nodes=17, idx=1, idx_of_ds=0))
(2, Data(edge_index=[2, 32], edge_attr=[32, 3], x=[16, 9], y=[1, 1], num_nodes=16, idx=2, idx_of_ds=0))
####################################################################################################
example
edge_index:  tensor([[16, 15, 15, 13, 13,  7,  7,  2,  2, 10, 10,  9,  2,  1,  1,  8,  8,  5,
          5,  4,  4,  6,  6, 11, 11, 12, 12,  0, 12, 14, 11,  3,  5, 13],
        [15, 16, 13, 15,  7, 13,  2,  7, 10,  2,  9, 10,  1,  2,  8,  1,  5,  8,
          4,  5,  6,  4, 11,  6, 12, 11,  0, 12, 14, 12,  3, 11, 13,  5]])
edge_attr:  tensor([[0, 0, 0],
        [0, 0, 0],
        [0, 0, 1],
        [0, 0, 1],
        [3, 0, 1],
        [3, 0, 1],
        [3, 0, 1],
        [3, 0, 1],
        [0, 0, 1],
        [0, 0, 1],
        [0, 0, 0],
        [0, 0, 0],
        [3, 0, 1],
        

## build vocabulary

In [7]:
add_eos = False
rank = 0
stack_method = "short"
# 1.3 build vocab and then init tokenizer from the tokenization config
vocab_builder.build_vocab(raw_dataset, tokenizer_config, rank) # build vocab from file or scratch
tokenizer_cls = getattr(tokenizer, tokenizer_config["tokenizer_class"]) # StackGSTTokenizer, custom defined
gtokenizer = tokenizer_cls(
    tokenizer_config, add_eos=add_eos, stack_method=stack_method # instantiate
)

[2024-12-09 01:28:00.409230] Vocab is already built and saved in ./data/OGB/pcqm4m-v2/vocab512_stacked!
[2024-12-09 01:28:00.409632] Loading vocab from ./data/OGB/pcqm4m-v2/vocab512_stacked ...
[2024-12-09 01:28:00.411244]
{   '0': 22,
    '1': 23,
    '10': 32,
    '100': 122,
    '101': 123,
    '102': 124,
    '103': 125,
    '104': 126,
    '105': 127,
    '106': 128,
    '107': 129,
    '108': 130,
    '109': 131,
    '11': 33,
    '110': 132,
    '111': 133,
    '112': 134,
    '113': 135,
    '114': 136,
    '115': 137,
    '116': 138,
    '117': 139,
    '118': 140,
    '119': 141,
    '12': 34,
    '120': 142,
    '121': 143,
    '122': 144,
    '123': 145,
    '124': 146,
    '125': 147,
    '126': 148,
    '127': 149,
    '128': 150,
    '129': 151,
    '13': 35,
    '130': 152,
    '131': 153,
    '132': 154,
    '133': 155,
    '134': 156,
    '135': 157,
    '136': 158,
    '137': 159,
    '138': 160,
    '139': 161,
    '14': 36,
    '140': 162,
    '141': 163,
    '142'

## get set and tokenize

In [8]:
world_size = 1

# 1.4 get train/test sampler
train_dataset = dataset  #         idx = dataset.sampler[0] # (0, Data(edge_index=[2, 40], edge_attr=[40, 3], x=[18, 9], y=[1, 1], num_nodes=18, idx=0, idx_of_ds=0))
if not isinstance(train_dataset, IterableDataset): # True
    print("train_dataset is not IterableDataset")
    train_sampler = train_dataset.sampler
    print("train_sampler: ", len(train_sampler))
    print("first 10 elements in train_sampler: ", train_sampler[:10])
    random.shuffle(train_sampler)
    print("first 10 elements in train_sampler: ", train_sampler[:10])
    train_shuffle, train_sampler, train_cnt = set_up_shuffle_and_sampler( # train_shuffle = False, sampler, len(sampler)
        train_dataset, train_sampler
    )
    print(train_shuffle, train_sampler[0], train_cnt)
else: 
    train_cnt = len(train_dataset) * world_size # why # 1
    train_sampler = None
    train_shuffle = False

train_dataset is not IterableDataset
train_sampler:  3452151
first 10 elements in train_sampler:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
first 10 elements in train_sampler:  [1194340, 3418645, 2107718, 2496436, 1786570, 1255597, 1883572, 1780019, 412330, 1569708]
False 1194340 3452151


In [9]:
pack_tokens = 0
max_position_embeddings = 1024

if pack_tokens > 0: # 0
    gtokenizer.mpe = max_position_embeddings
    # cannot pass `iter(train_dataset)` for Iterable ds, because `TypeError: cannot pickle 'generator' object`
    gtokenizer.dataset = train_dataset
    gtokenizer.sampler = tuple(train_sampler) if train_sampler is not None else None
    gtokenizer.random_ratio = pack_tokens
    tokens_per_sample = max_position_embeddings
else:
    print("pack_tokens is 0")
    tokens_per_sample = misc_utils.estimate_tokens_per_sample(
        gtokenizer,
        train_dataset,
        train_sampler,
        max_position_embeddings,
        world_size,
    ) # Estimated tokens per sample 20.0 with std 4.0 using 10000 samples and mpe 1024

pack_tokens is 0


100%|██████████| 10000/10000 [00:36<00:00, 274.75it/s]

Estimated tokens per sample 20.0 with std 4.0 using 10000 samples and mpe 1024





In [10]:
task_type='pretrain'
total_tokens=1e9
batch_size = 1024
warmup_tokens=1e8

tokens_per_sample = (
    tokens_per_sample // 2 if task_type == "pretrain-euler" else tokens_per_sample
)
print(f"\n[{datetime.now()}] tokens_per_sample: {tokens_per_sample}") # 20 what is this: estimated tokens per sample, by 10000 samples and mpe 1024

inspect_tokenization_results(dataset, gtokenizer) # print out tokenization results, one sample
# re-initialize `gtokenizer.dataset` to avoid `TypeError: cannot pickle 'generator' object`
gtokenizer.dataset = train_dataset if pack_tokens > 0 else None
print("gtokenizer.dataset: ", gtokenizer.dataset)

total_num_steps = int(
    math.ceil(total_tokens / (tokens_per_sample * batch_size * world_size)) # total_tokens defined in config 4e9/(20*1024*1) = 195313
)
warmup_num_steps = int(
    math.ceil(warmup_tokens / (tokens_per_sample * batch_size * world_size)) # 1e8 ...
)
tmp_cnt = len(train_sampler) if train_sampler else train_cnt / world_size # train_cnt = len(train_dataset) * world_size
epochs = int(math.ceil(total_tokens / (tmp_cnt * tokens_per_sample * world_size))) # token for training / token in the dataset = epochs
print(
    f"\n[{datetime.now()}] total_num_steps: {total_num_steps}\nwarmup_num_steps: {warmup_num_steps}\nepochs per worker: {epochs}\n" # 61 epochs
)
# 195313 4883 61


[2024-12-09 01:28:38.663031] tokens_per_sample: 20.0

Inspecting graph of index 1194340
Inspecting tokenization results!
Tokenize graph:
Data(edge_index=[2, 34], edge_attr=[34, 3], x=[17, 9], y=[1, 1], num_nodes=17, idx=1194340, idx_of_ds=0)

Tokens:
[['295', 'molecule#node#0#5', 'molecule#node#1#2', 'molecule#node#2#4', 'molecule#node#3#5', 'molecule#node#4#1', 'molecule#node#5#0', 'molecule#node#6#2', 'molecule#node#7#0', 'molecule#node#8#0', 'molecule#edge#0', 'molecule#edge#1', 'molecule#edge#2'],
 ['296', 'molecule#node#0#5', 'molecule#node#1#0', 'molecule#node#2#4', 'molecule#node#3#5', 'molecule#node#4#3', 'molecule#node#5#0', 'molecule#node#6#2', 'molecule#node#7#0', 'molecule#node#8#0', 'molecule#edge#0#0', 'molecule#edge#1#0', 'molecule#edge#2#0'],
 ['295', 'molecule#node#0#5', 'molecule#node#1#2', 'molecule#node#2#4', 'molecule#node#3#5', 'molecule#node#4#1', 'molecule#node#5#0', 'molecule#node#6#2', 'molecule#node#7#0', 'molecule#node#8#0', 'molecule#edge#0#0', 'molecule#e

## set model

In [11]:
import pickle
with open("./zhang_test/model_config.pkl", "rb") as file:  # "rb" mode for reading binary
    config = pickle.load(file)
print(config)

GraphGPTConfig {
  "attention_bias": false,
  "attention_dropout": 0.1,
  "bos_token_id": 20,
  "causal_attention": true,
  "cls_token_id": null,
  "dropout": 0,
  "embed_dim": 0,
  "embed_pdrop": 0,
  "eos_token_id": 19,
  "hidden_act": "gelu",
  "hidden_size": 512,
  "initializer_range": 0.02,
  "intermediate_size": 2048,
  "layer_scale_init_value": 0,
  "loss_type": null,
  "max_position_embeddings": 1024,
  "mlp": [],
  "mlp_pdrop": 0,
  "model_type": "graphgpt",
  "next_n_token": 13,
  "num_attention_heads": 8,
  "num_hidden_layers": 8,
  "num_key_value_heads": 8,
  "num_neg": null,
  "pad_token_id": 0,
  "path_pdrop": 0,
  "pooling_method": "last",
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000,
  "stack_method": "short",
  "stacked_feat": 13,
  "stacked_feat_agg_method": "gated",
  "tie_word_embeddings": false,
  "transformers_version": "4.38.2",
  "use_cache": true,
  "vocab_size": 756
}



In [12]:
# use_deepspeed = True

# # 2.2 create model
# if use_deepspeed:
#     deepspeed.init_distributed(
#         dist_backend="nccl", rank=rank, world_size=world_size
#     )

In [None]:
model = GraphModel(config)


model.gradient_checkpointing_enable()
# silence the warnings. Please re-enable for inference!
model.config.use_cache = False
print_trainable_parameters(model) # 235368960