In [41]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [42]:

from pathlib import Path
import torch
from torch import nn
import yaml
import pandas as pd
from omegaconf import OmegaConf
import lightning as L
import numpy as np

# Add safe globals for pathlib.PosixPath to handle checkpoint loading
torch.serialization.add_safe_globals([Path])

import sys
sys.path.append(str(Path().cwd().parent.parent))
print(sys.path)

from src.models.data import get_tgt_str, id2token, TransformerDataset
from src.models.train import LitTransformer

from src.models.dataset import PRFDataset

proj_root = Path().cwd().parent.parent

cfg = OmegaConf.load(proj_root / "models/first_superfib/config.yaml")

ckpt_path = proj_root / "models/first_superfib/model-step=6500-val_loss=0.10.ckpt"
checkpoint = torch.load(ckpt_path, map_location="cpu", weights_only=False)

model = LitTransformer(**checkpoint["hyper_parameters"])
model.load_state_dict(checkpoint["state_dict"], strict=False)
device = "cuda"
model.to(device)
model.eval()

csv_path = proj_root / "data/training/superfib_r1.csv"
metadata_path = proj_root / "data/training/superfib_r1_metadata.yaml"

metadata = OmegaConf.load(metadata_path)

dataset = PRFDataset(
    csv_path=csv_path,
    max_tgt_length = metadata.max_tgt_length,
    max_src_points= metadata.max_src_points,
    src_vocab_list= metadata.src_vocab_list,
    tgt_vocab_list= metadata.tgt_vocab_list,
)



['/home/takeruito/.local/share/uv/python/cpython-3.12.7-linux-x86_64-gnu/lib/python312.zip', '/home/takeruito/.local/share/uv/python/cpython-3.12.7-linux-x86_64-gnu/lib/python3.12', '/home/takeruito/.local/share/uv/python/cpython-3.12.7-linux-x86_64-gnu/lib/python3.12/lib-dynload', '', '/home/takeruito/work/PrfSR/.venv/lib/python3.12/site-packages', '/home/takeruito/work/PrfSR/.venv/lib/python3.12/site-packages/setuptools/_vendor', '/tmp/tmp4r6fj068', '/home/takeruito/work/PrfSR', '/home/takeruito/work/PrfSR', '/home/takeruito/work/PrfSR']




Initializing tokenizers from vocabulary...
Source vocabulary size: 1005
Target vocabulary size: 13
Source vocab sample: [' ', ',', '0', '1', '10', '100', '1000', '101', '102', '103']
Target vocab sample: [' ', '(', ')', ',', '1', '2', '3', '4', 'C', 'P']
Tokenizer initialized with 1009 total tokens
Max token ID: 1008
Tokenizer initialized with 17 total tokens
Max token ID: 16


# Generate whole string

In [None]:
src_vocab = model.src_vocab
tgt_vocab = model.tgt_vocab
src_max_len = model.src_max_len
tgt_max_len = model.tgt_max_len

inputs = [(0,), (1,), (2,), (3,), (4,), (5,), (6,), (7,), (8,), (9,)]
#inputs = [(0,), (0,), (0,), (0,), (0,), (0,), (0,), (0,), (0,), (0,)]
#inputs = [(0,), (0,)]

#outputs = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
#outputs = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
#outputs = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
#outputs = [4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
#outputs = [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
#outputs = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
#outputs = [0, 2, 4, 6, 8, 10, 12, 14, 16, 18]
outputs = [0, 0, 2, 4, 6, 8, 10, 12, 14, 16]

#outputs = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
#outputs = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
#outputs = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
#outputs = [0, 0]
#outputs = [1, 1]
#outputs = [2, 2]
#outputs = [3, 3]



tgt_vocab = dataset.get_tgt_tokenizer().vocab
src_tokenizer = dataset.get_src_tokenizer()
src = dataset._create_src_array(inputs, outputs)
print(src[:10])
arr = dataset._create_tgt_array(current_str)
arr[arr == tgt_vocab["[EOS]"]]= tgt_vocab["[PAD]"]
current_str = ""

with torch.no_grad():
    for read_token_id in range(tgt_max_len - 1):
        arr = dataset._create_tgt_array(current_str)
        print(arr[:10])
        tgt = torch.tensor(arr).to(device)
        output = model(src, tgt) # (T, N, C)
        pred = output[read_token_id, 0, :]
        max_values, max_id= torch.max(pred, axis=0)
        new_token = tgt_vocab.id_to_token(max_id)
        print(f"new_token: {new_token}")
        current_str += new_token
        if max_id == tgt_vocab["[EOS]"]:
            break

list_letters = [id2token(tgt_vocab, id) for id in current_str]
print(list_letters)
print(str("".join(list_letters[1:-1])))

[   2 1007 1007    6    5    7    5  119    5  230]
[2 0 0 0 0 0 0 0 0 0]
[2 3 0 0 0 0 0 0 0 0]


TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not numpy.ndarray

# Generate each token

In [None]:

data_path = "/home/takeru/AlphaSymbol/data/prfndim/d3-a2-c3-r3-status.csv"
df = pd.read_csv(data_path)
dataset = TransformerDataset(df)

idx = 0
src, tgt, tgt_correct = dataset[idx][0], dataset[idx][1][:-1], dataset[idx][1][1:]
src, tgt, tgt_correct = src.to(device), tgt.to(device), tgt_correct.to(device)
pred_token_place= 3 #>=1

print("=== Input ===")
input_str = dataset.df["expr"].iloc[idx]    
print("input string: ", input_str)
print("pred_token_place: ", pred_token_place)

print()
print("=== Raw Data ===")
print("src: ", src)
print("tgt: ", tgt)
print("tgt_correct: ", tgt_correct)

src = src.reshape((1, -1))
pad_tensor = torch.tensor([tgt_vocab["<pad>"] for _ in range(len(tgt) - pred_token_place)]).to(device)
tgt = torch.cat((tgt[:pred_token_place], pad_tensor)).reshape((1, -1))
tgt_correct = tgt_correct[pred_token_place- 1].reshape((1, ))

print()
print("=== Processed data for inference ===")
print("src: ", src)
print("tgt: ", tgt)
print("tgt_correct: ", tgt_correct)


print()
print("=== Prediction ===")
model.eval()
output = model(src, tgt) # (Seq, N, E)
pred_token = output[pred_token_place- 1, 0, :]
print("pred: ", pred_token)
token_id =  torch.argmax(pred_token).item()
print("token_id: ",token_id)
print("token: ", id2token(tgt_vocab, token_id))


    
loss_fn = nn.CrossEntropyLoss(ignore_index=tgt_vocab["<pad>"])
pred_for_loss = pred_token.reshape((1, -1))
loss = loss_fn(pred_for_loss, tgt_correct)
print(loss)

# Generate from src, tgt

In [None]:
src = torch.tensor([[1, 3, 4, 5, 6, 4, 7, 3, 4, 5, 6, 4, 7, 3, 4, 5, 6, 4, 7, 3, 4, 5, 6, 4,
         7, 3, 4, 5, 6, 4, 7, 3, 4, 5, 6, 4, 7, 3, 4, 5, 6, 4, 7, 3, 4, 5, 6, 4,
         7, 3, 4, 5, 6, 4, 7, 3, 4, 5, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0')

tgt = torch.tensor([[1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
       device='cuda:0')

output = model(src, tgt)
print(output[1, 0, :])

# Generate loss along with Training

In [None]:
idx = 0
src, tgt, tgt_correct = dataset[idx][0], dataset[idx][1][:-1], dataset[idx][1][1:]
src = src.reshape((1, -1)).to(device)
tgt = tgt.reshape((1, -1)).to(device)
tgt_correct = tgt_correct.reshape((1, -1)).to(device)
output = model(src, tgt) # (T, N=1, C)
output = output.permute(1, 2, 0) # (N=1, C, T)
loss_fn = nn.CrossEntropyLoss(ignore_index=tgt_vocab["<pad>"])
loss = loss_fn(output, tgt_correct)
print(loss)