## some config

In [1]:
import os
import math
import random
import torch
import fire
import copy
import multiprocessing as mp
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.cuda.amp import GradScaler
# import deepspeed
from datetime import datetime
from typing import Optional
from pprint import pprint, pformat
from torch.utils.data import DataLoader, IterableDataset
from timm.utils import ModelEmaV3
from timm.models import load_checkpoint
from timm.utils.model import unwrap_model, get_state_dict
try:
    from torch.utils.tensorboard import SummaryWriter
except ModuleNotFoundError:
    from tensorboardX import SummaryWriter

import sys

# sys.path.insert(0, "..")
sys.path.insert(0, ".")

In [2]:
from src.data import (
    collator,
    vocab_builder,
    tokenizer,
    read_dataset,
    OdpsTableIterableDataset,
)
from src.models import (
    GraphGPTConfig,
    GraphGPTCausal,
    GraphGPT2Config,
    GraphGPT2Causal,
    GraphBertConfig,
    GraphBertForMaskedLM,
)
from src.utils import (
    conf_utils,
    loss_utils,
    loader_utils,
    tokenizer_utils,
    modules_utils,
    misc_utils,
    print_trainable_parameters,
    print_params,
    inspect_tokenization_results,
    set_up_shuffle_and_sampler,
    worker_init_fn_seed,
)

dict_models = {
    "graphgpt2": (GraphGPT2Causal, GraphGPT2Config),
    "graphgpt": (GraphGPTCausal, GraphGPTConfig),
    "graphbert": (GraphBertForMaskedLM, GraphBertConfig),
}

[2024-12-17 21:43:18,302] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/yang/miniconda3/envs/graph_gpt/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/yang/miniconda3/envs/graph_gpt/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlvsym'
/home/yang/miniconda3/envs/graph_gpt/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlopen'
/home/yang/miniconda3/envs/graph_gpt/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlclose'
/home/yang/miniconda3/envs/graph_gpt/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlerror'
/home/yang/miniconda3/envs/graph_gpt/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlsym'
/home/yang/miniconda3/envs/graph_gpt/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `shm_open'
/home/yang/miniconda3/envs/graph_gpt/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined

## load data

In [3]:
# tokenizer config loading
import json

# Load the JSON file
file_path = "./zhang_test/tokenizer_config.json"
with open(file_path, "r") as json_file:
    tokenizer_config = json.load(json_file)

# Print the loaded data
pprint(tokenizer_config)

{'attr_world_identifier': 'molecule',
 'data_dir': './data/OGB',
 'dataset': 'PCQM4Mv2',
 'ensemble_datasets': [],
 'label_tokens_to_pad': ['<icl>'],
 'name_or_path': './data/OGB/pcqm4m-v2',
 'pretrain_mlm': {'info': 'name->polynomial|cosine|fixed,power->3/2/1/0.5',
                  'name': 'polynomial',
                  'params': {'fixed_ratio': 0.7,
                             'mtp': [0.8, 0, 0.2],
                             'power': 1}},
 'sampling': None,
 'semantics': {'attr_assignment': 'first',
               'attr_shuffle': False,
               'common': {'numbers': ['<e>',
                                      '<.>',
                                      '<->',
                                      '<0>',
                                      '<1>',
                                      '<2>',
                                      '<3>',
                                      '<4>',
                                      '<5>',
                                      '<6>',


## build vocabulary

In [4]:
add_eos = False
rank = 0
stack_method = "short"
# 1.3 build vocab and then init tokenizer from the tokenization config
tokenizer_cls = getattr(tokenizer, tokenizer_config["tokenizer_class"]) # StackGSTTokenizer, custom defined
gtokenizer = tokenizer_cls(
    tokenizer_config, add_eos=add_eos, stack_method=stack_method # instantiate
)

[2024-12-17 21:43:20.206361] Loading vocab from ./data/OGB/pcqm4m-v2/vocab512_stacked ...
[2024-12-17 21:43:20.209105]
{   '0': 22,
    '1': 23,
    '10': 32,
    '100': 122,
    '101': 123,
    '102': 124,
    '103': 125,
    '104': 126,
    '105': 127,
    '106': 128,
    '107': 129,
    '108': 130,
    '109': 131,
    '11': 33,
    '110': 132,
    '111': 133,
    '112': 134,
    '113': 135,
    '114': 136,
    '115': 137,
    '116': 138,
    '117': 139,
    '118': 140,
    '119': 141,
    '12': 34,
    '120': 142,
    '121': 143,
    '122': 144,
    '123': 145,
    '124': 146,
    '125': 147,
    '126': 148,
    '127': 149,
    '128': 150,
    '129': 151,
    '13': 35,
    '130': 152,
    '131': 153,
    '132': 154,
    '133': 155,
    '134': 156,
    '135': 157,
    '136': 158,
    '137': 159,
    '138': 160,
    '139': 161,
    '14': 36,
    '140': 162,
    '141': 163,
    '142': 164,
    '143': 165,
    '144': 166,
    '145': 167,
    '146': 168,
    '147': 169,
    '148': 170,
 

In [None]:
print(gtokenizer)

<src.data.tokenizer.StackedGSTTokenizer object at 0x7fc650727e50>


## test

In [6]:
data_dir: str = "../data/OGB"
tables: str = ""

In [7]:
# 1.2 get graph dataset
train_dataset, valid_dataset, test_dataset, raw_dataset = read_dataset(
    name=tokenizer_config["dataset"],   # PCQM4Mv2
    # for local data file reading
    data_dir=data_dir,   # './data/OGB'
    sampling_config=tokenizer_config["sampling"],    # None
    # for odps data reading
    table=tables,   # ""
    edge_dim=tokenizer_config["semantics"]["edge"]["dim"],    # 3
    node_dim=tokenizer_config["semantics"]["node"]["dim"],    # 9
    mode="train",
    # general
    # pretrain_mode=True,
    return_valid_test=True,
    ensemble_datasets=tokenizer_config.get("ensemble_datasets", []),    # []
)
reset_samples_per_epoch = (   # what is this  # None for PCQM4Mv2
    test_dataset.reset_samples_per_epoch
    if hasattr(test_dataset, "reset_samples_per_epoch")
    else False
)
if isinstance(test_dataset, IterableDataset):
    print(next(iter(test_dataset))) 
else: # True
    idx = test_dataset.sampler[0] # (0, Data(edge_index=[2, 40], edge_attr=[40, 3], x=[18, 9], y=[1, 1], num_nodes=18, idx=0, idx_of_ds=0))
    print(test_dataset[idx])


Loading dataset PCQM4Mv2 ...

dataset._data -> Data(edge_index=[2, 109093626], edge_attr=[109093626, 3], x=[52970652, 9], y=[3746620])

Raw indices: 3378606, Removed indices: 0, New indices: 3378606

Raw indices: 73545, Removed indices: 0, New indices: 73545
Using all valid data as valid: 73545, and last half of valid data as test: 36773!

[2024-12-17 21:43:29.300545] NOT RESET samples of GraphsMapDataset of 3378606 graphs for epoch None!
idx_tuple: None

[2024-12-17 21:43:29.337042] NOT RESET samples of GraphsMapDataset of 73545 graphs for epoch None!
idx_tuple: None

[2024-12-17 21:43:29.353848] NOT RESET samples of GraphsMapDataset of 36773 graphs for epoch None!
idx_tuple: None
Split dataset based on given train/valid/test index!
Train: 3378606, Valid: 73545, Test: 36773!
(3561983, Data(edge_index=[2, 34], edge_attr=[34, 3], x=[16, 9], y=[1], num_nodes=16, idx=3561983, idx_of_ds=0))


In [8]:
from src.utils.my_utiles import smiles2graph
smiles = "CCO"
graph = smiles2graph("CC(=O)OC1=CC=CC=C1C(=O)O")
print(graph.keys())
print(graph['edge_index'].shape)
print(graph['edge_attr'].shape)
print(graph['x'].shape)
print(graph['num_nodes'])


dict_keys(['edge_index', 'edge_attr', 'x', 'num_nodes'])
(2, 26)
(26, 3)
(13, 9)
13


In [9]:
from src.utils.my_utiles import graph_to_torch_geometric
graph = graph_to_torch_geometric(graph)
print(graph['edge_index'].shape)
print(graph['edge_attr'].shape)
print(graph['x'].shape)
print(graph['num_nodes'])

torch.Size([2, 26])
torch.Size([26, 3])
torch.Size([13, 9])
13


In [10]:
from src.utils.my_utiles import graph2smiles
smiles = graph2smiles(graph.edge_index, graph.edge_attr, graph.x)
print(smiles)

CC(=O)Oc1ccccc1C(=O)O


In [11]:
example_graph = graph
from src.utils.my_utiles import graph2token2input
import numpy as np
token, label, embed, inputs = graph2token2input(example_graph, gtokenizer)

# print(
#     f"\nTokens:\n{pformat(token)}\nLabels:\n{pformat(label)}\nembed:{np.array(embed)}\n"
# )

# print(f"Inputs for model:\n{pformat(inputs)}\n")

Inspecting tokenization results!
Tokenize graph:
Data(x=[13, 9], edge_index=[2, 26], edge_attr=[26, 3], num_nodes=13)


In [12]:
print(inputs.keys())

dict_keys(['input_ids', 'position_ids', 'labels', 'attention_mask', 'embed'])


In [13]:
from src.utils.my_utiles import convert_to_tensors

tensor_inputs = convert_to_tensors(inputs)
print(tensor_inputs.keys())

print("input_ids:", tensor_inputs["input_ids"].shape)
print("position_ids:", tensor_inputs["position_ids"].shape)
print("labels:", tensor_inputs["labels"].shape)
print("attention_mask:", tensor_inputs["attention_mask"].shape)
print("embed:", tensor_inputs["embed"].shape)


dict_keys(['input_ids', 'position_ids', 'labels', 'attention_mask', 'embed'])
input_ids: torch.Size([1, 19, 13])
position_ids: torch.Size([1, 19])
labels: torch.Size([1, 19, 13])
attention_mask: torch.Size([1, 19])
embed: torch.Size([1, 19, 0])


In [14]:
# print(tensor_inputs["input_ids"])
# print(tensor_inputs["labels"])

In [15]:
print("input_ids:", tensor_inputs["input_ids"].shape)
print("position_ids:", tensor_inputs["position_ids"].shape)
print("labels:", tensor_inputs["labels"].shape)
print("attention_mask:", tensor_inputs["attention_mask"].shape)
print("embed:", tensor_inputs["embed"].shape)


input_ids: torch.Size([1, 19, 13])
position_ids: torch.Size([1, 19])
labels: torch.Size([1, 19, 13])
attention_mask: torch.Size([1, 19])
embed: torch.Size([1, 19, 0])


In [16]:
from src.utils.my_utiles import smiles2graph
smiles = "CCO" # "CC(=O)OC1=CC=CC=C1C(=O)O"
graph = smiles2graph(smiles)
print(graph.keys())
print(graph['edge_index'].shape)
print(graph['edge_attr'].shape)
print(graph['x'].shape)
print(graph['num_nodes'])

from src.utils.my_utiles import graph_to_torch_geometric
graph = graph_to_torch_geometric(graph)

from src.utils.my_utiles import graph2smiles
smiles = graph2smiles(graph.edge_index, graph.edge_attr, graph.x)
print(smiles)

example_graph = graph
from src.utils.my_utiles import graph2token2input
import numpy as np
token, label, embed, inputs = graph2token2input(example_graph, gtokenizer)

# print(
#     f"\nTokens:\n{pformat(token)}"
# )

dict_keys(['edge_index', 'edge_attr', 'x', 'num_nodes'])
(2, 4)
(4, 3)
(3, 9)
3
CCO
Inspecting tokenization results!
Tokenize graph:
Data(x=[3, 9], edge_index=[2, 4], edge_attr=[4, 3], num_nodes=3)


In [17]:
smiles = "CCO" # "CC(=O)OC1=CC=CC=C1C(=O)O"
graph = smiles2graph(smiles)
graph = graph_to_torch_geometric(graph)
smiles2 = graph2smiles(graph.edge_index, graph.edge_attr, graph.x)
example_graph = graph


print("smiles:", smiles)
print("smiles2:", smiles2)
print("graph", graph)
print("graph x:", graph.x)
print("graph edge_index:", graph.edge_index)
print("graph edge_attr:", graph.edge_attr)

pprint(token)

smiles: CCO
smiles2: CCO
graph Data(x=[3, 9], edge_index=[2, 4], edge_attr=[4, 3], num_nodes=3)
graph x: tensor([[5, 0, 4, 5, 3, 0, 2, 0, 0],
        [5, 0, 4, 5, 2, 0, 2, 0, 0],
        [7, 0, 2, 5, 1, 0, 2, 0, 0]])
graph edge_index: tensor([[0, 1, 1, 2],
        [1, 0, 2, 1]])
graph edge_attr: tensor([[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]])
[['423',
  'molecule#node#0#5',
  'molecule#node#1#0',
  'molecule#node#2#4',
  'molecule#node#3#5',
  'molecule#node#4#3',
  'molecule#node#5#0',
  'molecule#node#6#2',
  'molecule#node#7#0',
  'molecule#node#8#0',
  'molecule#edge#0',
  'molecule#edge#1',
  'molecule#edge#2'],
 ['424',
  'molecule#node#0#5',
  'molecule#node#1#0',
  'molecule#node#2#4',
  'molecule#node#3#5',
  'molecule#node#4#2',
  'molecule#node#5#0',
  'molecule#node#6#2',
  'molecule#node#7#0',
  'molecule#node#8#0',
  'molecule#edge#0#0',
  'molecule#edge#1#0',
  'molecule#edge#2#0'],
 ['425',
  'molecule#node#0#7',
  'molecule#node#1#0',
  'mo

In [18]:
# for i in range(len(test_dataset)):
#     if test_dataset[i][1].num_nodes < 10:
#         print(i, test_dataset[i][1].num_nodes)
#         break

In [19]:
# smiles = "CCO" # "CC(=O)OC1=CC=CC=C1C(=O)O"\
random_index = np.random.randint(0, len(test_dataset))  # Upper bound is exclusive
print(random_index)
graph = test_dataset[random_index][1]
# graph = graph_to_torch_geometric(graph)
smiles2 = graph2smiles(graph.edge_index, graph.edge_attr, graph.x)
example_graph = graph


# print("smiles:", smiles)
print("smiles2:", smiles2)
print("graph", graph)
print("graph x:", graph.x)
print("graph edge_index:", graph.edge_index)
print("graph edge_attr:", graph.edge_attr)

token, label, embed, inputs = graph2token2input(graph, gtokenizer)

pprint(token)

2366
smiles2: C=CCC(O)(CC(C)O)c1ccccc1
graph Data(edge_index=[2, 30], edge_attr=[30, 3], x=[15, 9], y=[1], num_nodes=15, idx=2366, idx_of_ds=0)
graph x: tensor([[5, 0, 3, 5, 1, 0, 1, 1, 1],
        [5, 0, 4, 5, 3, 0, 2, 0, 0],
        [5, 0, 3, 5, 0, 0, 1, 1, 1],
        [5, 2, 4, 5, 1, 0, 2, 0, 0],
        [5, 0, 4, 5, 2, 0, 2, 0, 0],
        [5, 0, 3, 5, 2, 0, 1, 0, 0],
        [7, 0, 2, 5, 1, 0, 2, 0, 0],
        [5, 0, 3, 5, 1, 0, 1, 1, 1],
        [5, 0, 3, 5, 1, 0, 1, 0, 0],
        [7, 0, 2, 5, 1, 0, 2, 0, 0],
        [5, 0, 3, 5, 1, 0, 1, 1, 1],
        [5, 2, 4, 5, 0, 0, 2, 0, 0],
        [5, 0, 3, 5, 1, 0, 1, 1, 1],
        [5, 0, 4, 5, 2, 0, 2, 0, 0],
        [5, 0, 3, 5, 1, 0, 1, 1, 1]])
graph edge_index: tensor([[ 5,  8,  8,  4,  4, 11, 11,  2,  2,  0,  0,  7,  7, 10, 10, 12, 12, 14,
         11, 13, 13,  3,  3,  6,  3,  1, 11,  9, 14,  2],
        [ 8,  5,  4,  8, 11,  4,  2, 11,  0,  2,  7,  0, 10,  7, 12, 10, 14, 12,
         13, 11,  3, 13,  6,  3,  1,  3,  9, 11,  2, 

In [20]:
from src.utils.my_utiles import token_to_graph 
graph_x, graph_edge_index, graph_edge_attr = token_to_graph(token)
print("Node Features:", graph_x)
print("Edge Index:", graph_edge_index)
print("Edge Attributes:", graph_edge_attr)

Node Features: [[5 0 3 5 0 0 1 1 1]
 [5 0 3 5 1 0 1 1 1]
 [5 0 3 5 1 0 1 1 1]
 [5 0 3 5 1 0 1 1 1]
 [5 0 3 5 1 0 1 1 1]
 [5 0 3 5 1 0 1 1 1]
 [5 2 4 5 0 0 2 0 0]
 [5 0 4 5 2 0 2 0 0]
 [5 0 3 5 1 0 1 0 0]
 [5 0 3 5 2 0 1 0 0]
 [7 0 2 5 1 0 2 0 0]
 [5 0 4 5 2 0 2 0 0]
 [5 2 4 5 1 0 2 0 0]
 [5 0 4 5 3 0 2 0 0]
 [7 0 2 5 1 0 2 0 0]]
Edge Index: [[ 0  1  1  2  2  3  3  4  4  5  5  0  0  6  6  7  7  8  8  9  6 10  6 11
  11 12 12 13 12 14]
 [ 1  0  2  1  3  2  4  3  5  4  0  5  6  0  7  6  8  7  9  8 10  6 11  6
  12 11 13 12 14 12]]
Edge Attributes: [[3 0 1]
 [3 0 1]
 [3 0 1]
 [3 0 1]
 [3 0 1]
 [3 0 1]
 [3 0 1]
 [3 0 1]
 [3 0 1]
 [3 0 1]
 [3 0 1]
 [3 0 1]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [1 0 0]
 [1 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]]


In [21]:
smiles = graph2smiles(graph_edge_index, graph_edge_attr, graph_x)
print("SMILES:", smiles)
print("smiles2:", smiles2)

SMILES: C=CCC(O)(CC(C)O)c1ccccc1
smiles2: C=CCC(O)(CC(C)O)c1ccccc1


# test generation 

In [22]:
# smiles = "CCO" # "CC(=O)OC1=CC=CC=C1C(=O)O"\
random_index = np.random.randint(0, len(test_dataset))  # Upper bound is exclusive
print(random_index)
random_index = 0
graph = test_dataset[random_index][1]
# graph = graph_to_torch_geometric(graph)
smiles2 = graph2smiles(graph.edge_index, graph.edge_attr, graph.x)
print(smiles2)
example_graph = graph


token, label, embed, inputs = graph2token2input(graph, gtokenizer)
print('token', np.array(token).shape)
print('label', np.array(label).shape)
print('embed', np.array(embed).shape)
print(inputs.keys())
print('input_ids', np.array(inputs["input_ids"]).shape)
print('position_ids', np.array(inputs["position_ids"]).shape)
print('labels', np.array(inputs["labels"]).shape)
print('attention_mask', np.array(inputs["attention_mask"]).shape)
print('embed', np.array(inputs["embed"]).shape)
print(inputs["attention_mask"])

3081
Cc1ccc(C2Cc3cnccc3NC2=O)cc1
Inspecting tokenization results!
Tokenize graph:
Data(edge_index=[2, 40], edge_attr=[40, 3], x=[18, 9], y=[1], num_nodes=18, idx=0, idx_of_ds=0)
token (24, 13)
label (24, 13)
embed (24, 0)
dict_keys(['input_ids', 'position_ids', 'labels', 'attention_mask', 'embed'])
input_ids (24, 13)
position_ids (24,)
labels (24, 13)
attention_mask (24,)
embed (24, 0)
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [23]:
pprint(token)
pprint(label)
pprint(inputs["input_ids"])

[['178',
  'molecule#node#0#5',
  'molecule#node#1#0',
  'molecule#node#2#3',
  'molecule#node#3#5',
  'molecule#node#4#0',
  'molecule#node#5#0',
  'molecule#node#6#1',
  'molecule#node#7#0',
  'molecule#node#8#1',
  'molecule#edge#0',
  'molecule#edge#1',
  'molecule#edge#2'],
 ['179',
  'molecule#node#0#6',
  'molecule#node#1#0',
  'molecule#node#2#2',
  'molecule#node#3#5',
  'molecule#node#4#0',
  'molecule#node#5#1',
  'molecule#node#6#1',
  'molecule#node#7#0',
  'molecule#node#8#1',
  'molecule#edge#0#0',
  'molecule#edge#1#0',
  'molecule#edge#2#1'],
 ['180',
  'molecule#node#0#5',
  'molecule#node#1#0',
  'molecule#node#2#3',
  'molecule#node#3#5',
  'molecule#node#4#0',
  'molecule#node#5#0',
  'molecule#node#6#1',
  'molecule#node#7#1',
  'molecule#node#8#1',
  'molecule#edge#0#0',
  'molecule#edge#1#0',
  'molecule#edge#2#1'],
 ['181',
  'molecule#node#0#5',
  'molecule#node#1#0',
  'molecule#node#2#3',
  'molecule#node#3#5',
  'molecule#node#4#1',
  'molecule#node#5#0',
 

In [24]:
from src.utils.my_utiles import graph2token2input_generation
# smiles = "CCO" # "CC(=O)OC1=CC=CC=C1C(=O)O"\
random_index = np.random.randint(0, len(test_dataset))  # Upper bound is exclusive
print(random_index)
random_index = 0
graph = test_dataset[random_index][1]
# graph = graph_to_torch_geometric(graph)
smiles2 = graph2smiles(graph.edge_index, graph.edge_attr, graph.x)
example_graph = graph
print(smiles2)

num_input, max_length = 5, 40
token, label, embed, inputs = graph2token2input_generation(graph, gtokenizer, num_input, max_length)
print('token', np.array(token).shape)
print('label', np.array(label).shape)
print('embed', np.array(embed).shape)
print(inputs.keys())
print('input_ids', np.array(inputs["input_ids"]).shape)
print('position_ids', np.array(inputs["position_ids"]).shape)
print('labels', np.array(inputs["labels"]).shape)
print('attention_mask', np.array(inputs["attention_mask"]).shape)
# print('embed', np.array(inputs["embed"]).shape)
print(inputs["attention_mask"])

14358
Cc1ccc(C2Cc3cnccc3NC2=O)cc1
Inspecting tokenization results!
Tokenize graph:
Data(edge_index=[2, 40], edge_attr=[40, 3], x=[18, 9], y=[1], num_nodes=18, idx=0, idx_of_ds=0)
token (5, 13)
label (24, 13)
embed (5, 0)
dict_keys(['input_ids', 'position_ids', 'labels', 'attention_mask', 'embed'])
input_ids (5, 13)
position_ids (5,)
labels (24, 13)
attention_mask (5,)
[1. 1. 1. 1. 1.]


In [28]:
pprint(token)
# pprint(label)
pprint(inputs["input_ids"])
# pprint(inputs["labels"])

[['357',
  'molecule#node#0#5',
  'molecule#node#1#0',
  'molecule#node#2#3',
  'molecule#node#3#5',
  'molecule#node#4#0',
  'molecule#node#5#0',
  'molecule#node#6#1',
  'molecule#node#7#1',
  'molecule#node#8#1',
  'molecule#edge#0',
  'molecule#edge#1',
  'molecule#edge#2'],
 ['358',
  'molecule#node#0#5',
  'molecule#node#1#0',
  'molecule#node#2#3',
  'molecule#node#3#5',
  'molecule#node#4#1',
  'molecule#node#5#0',
  'molecule#node#6#1',
  'molecule#node#7#1',
  'molecule#node#8#1',
  'molecule#edge#0#3',
  'molecule#edge#1#0',
  'molecule#edge#2#1'],
 ['359',
  'molecule#node#0#5',
  'molecule#node#1#0',
  'molecule#node#2#3',
  'molecule#node#3#5',
  'molecule#node#4#1',
  'molecule#node#5#0',
  'molecule#node#6#1',
  'molecule#node#7#1',
  'molecule#node#8#1',
  'molecule#edge#0#3',
  'molecule#edge#1#0',
  'molecule#edge#2#1'],
 ['360',
  'molecule#node#0#5',
  'molecule#node#1#0',
  'molecule#node#2#3',
  'molecule#node#3#5',
  'molecule#node#4#0',
  'molecule#node#5#0',
 

In [26]:
tensor_inputs = convert_to_tensors(inputs)
print(tensor_inputs)

{'input_ids': tensor([[[379, 630, 685, 695, 709, 714, 724, 731, 737, 739, 740, 741, 742],
         [380, 630, 685, 695, 709, 715, 724, 731, 737, 739, 746, 748, 755],
         [381, 630, 685, 695, 709, 715, 724, 731, 737, 739, 746, 748, 755],
         [382, 630, 685, 695, 709, 714, 724, 731, 737, 739, 746, 748, 755],
         [383, 630, 687, 696, 709, 715, 724, 732, 736, 739, 743, 748, 754]]]), 'position_ids': tensor([[0, 1, 2, 3, 4]]), 'labels': tensor([[[380, 630, 685, 695, 709, 715, 724, 731, 737, 739, 746, 748, 755],
         [381, 630, 685, 695, 709, 715, 724, 731, 737, 739, 746, 748, 755],
         [382, 630, 685, 695, 709, 714, 724, 731, 737, 739, 746, 748, 755],
         [383, 630, 687, 696, 709, 715, 724, 732, 736, 739, 743, 748, 754],
         [384, 630, 685, 695, 709, 714, 724, 731, 736, 739, 743, 748, 754],
         [385, 652, 685, 691, 709, 714, 724, 731, 736, 738, 744, 748, 755],
         [384, 630, 685, 695, 709, 714, 724, 731, 736, 739, 744, 748, 755],
         [386, 641