In [27]:
import torch
import torch.nn.functional as F
import torch.optim as optim
### Huggingface dataset and tokenizer imports
from datasets import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import XLNetTokenizer

from src.tokenizer.xval_tokenizer import XvalTokenizer
from src.transformer_backbone.xlnet import XLNetBackbone
from src.encoding_decoding.xval_encoding_decoding import XValModel, define_masked_num_collator as xval_define_masked_num_collator

from src.tokenizer.rt_tokenizer import RtTokenizer
from src.encoding_decoding.rt_encoding_decoding import RegressionTransformer, define_masked_num_collator as rt_define_masked_num_collator

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load dataset

In [2]:
# Read a question/answer text file and yield one example per pair of lines
def read_txt(file_path):
    """Yield ``{'question': ..., 'answer': ...}`` dicts from a text file.

    The file is read as consecutive line pairs: the first line of each pair
    is the question and the second is the answer. Both are stripped of
    surrounding whitespace.

    Args:
        file_path: Path of the text file to read (opened as UTF-8).

    Yields:
        dict: One ``{'question': str, 'answer': str}`` mapping per line pair.

    Notes:
        If the file has an odd number of lines, the last line has no partner
        and is skipped instead of raising ``IndexError``. A warning is issued
        when that skipped line is not blank (a blank one is just a stray
        newline at the end of the file).
    """
    import warnings

    with open(file_path, 'r', encoding='utf-8') as file:
        lines = [line.strip() for line in file]

    # An odd line count means the final line cannot form a question/answer pair.
    if len(lines) % 2:
        dangling = lines.pop()
        if dangling:
            warnings.warn(
                f"{file_path}: last line has no matching answer and was skipped: {dangling!r}"
            )

    for question, answer in zip(lines[0::2], lines[1::2]):
        yield {'question': question, 'answer': answer}

# Build a Hugging Face Dataset from a question/answer text file
def load_txt_dataset(file_path):
    """Return a ``Dataset`` whose rows are generated by ``read_txt(file_path)``."""
    generator_kwargs = dict(file_path=file_path)
    return Dataset.from_generator(read_txt, gen_kwargs=generator_kwargs)


In [3]:
# Location of the question/answer text file, relative to the notebook's working directory.
data_path = (
    './data/mathematics_dataset-v1.0/mathematics_dataset-v1.0/'
    'train-easy/algebra__linear_1d_small.txt'
)

# Load the file into a dataset and display its summary.
ds = load_txt_dataset(file_path=data_path)
ds

Dataset({
    features: ['question', 'answer'],
    num_rows: 117
})

In [4]:
# First question of the dataset (row-first indexing, as in the tokenization cells below).
ds[0]["question"]

'Solve 0 = 4*b + b + 15 for b.'

In [5]:
# Answer belonging to the first question.
ds[0]["answer"]

'-3'

### Load pretrained Model and Tokenizer

In [56]:
# Checkpoint name used for both the pretrained tokenizer and the backbone.
model_name = "xlnet-base-cased"
pretrained_tokenizer = XLNetTokenizer.from_pretrained(model_name)
# The backbone is moved to the GPU immediately.
# NOTE(review): the recorded output of this cell is a CUDA out-of-memory error; presumably
# memory from earlier runs in the same kernel was still allocated. Confirm before re-running.
transformer_backbone = XLNetBackbone(model_name).cuda()

OutOfMemoryError: CUDA out of memory. Tried to allocate 94.00 MiB. GPU 

### xVal

In [43]:
# Wrap the pretrained tokenizer in the project's xVal tokenizer with a single "[NUM]" number token.
tokenizer = XvalTokenizer(
    num_tokens=["[NUM]"], # explicitly added to the tokenizer
    embedding_dim=128,# TODO: per the original note, this value does not matter for the xVal case
    pretrained_tokenizer=pretrained_tokenizer)

Number token IDs: [32000]


In [45]:
# Tokenize the first question; the result is inspected in the following cell.
tokenized_x = tokenizer(ds[0]["question"])

In [46]:
# Show the raw question, its tokens decoded one by one, and the per-token "numbers" values.
print(ds[0]["question"])
print(list(map(tokenizer.tokenizer.decode, tokenized_x["input_ids"])))
print(list(tokenized_x["numbers"]))

Solve 0 = 4*b + b + 15 for b.
['Sol', 've', '[NUM]', '=', '[NUM]', '*', 'b', '+', '', 'b', '+', '[NUM]', 'for', '', 'b', '.', '<sep>', '<cls>']
[1.0, 1.0, 0.0, 1.0, 4.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 15.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]


In [47]:
# Same inspection as the preceding cell (this cell repeats it verbatim):
# raw question, decoded tokens, and the per-token "numbers" values.
print(ds[0]["question"])
print(list(map(tokenizer.tokenizer.decode, tokenized_x["input_ids"])))
print(list(tokenized_x["numbers"]))

Solve 0 = 4*b + b + 15 for b.
['Sol', 've', '[NUM]', '=', '[NUM]', '*', 'b', '+', '', 'b', '+', '[NUM]', 'for', '', 'b', '.', '<sep>', '<cls>']
[1.0, 1.0, 0.0, 1.0, 4.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 15.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]


In [40]:
print("\nStarting tokenization...")


def tokenize_lambda(x):
    """Tokenize the question and the answer of a single dataset row."""
    return {"question": tokenizer(x["question"]), "answer": tokenizer(x["answer"])}


# Row-by-row mapping in a single process (num_proc is left unset); cached results are ignored.
tokenized_ds = ds.map(
    tokenize_lambda,
    batched=False,
    load_from_cache_file=False,
)


Starting tokenization...


Map:   0%|          | 0/117 [00:00<?, ? examples/s]

In [48]:
# Build the xVal model on top of `transformer_backbone` and move it to the GPU.
# NOTE(review): the same backbone object is also passed to the Regression Transformer model
# further down, so both models presumably share its weights. Confirm this is intended.
# NOTE(review): the recorded output of this cell is a CUDA out-of-memory error.
model = XValModel(transformer_backbone=transformer_backbone, vocab_size=len(tokenizer.tokenizer), dim_feedforward=1536, context_length=955).cuda()

# Special-token ids and masking setting handed to the collator below.
pad_token_id = tokenizer.tokenizer.pad_token_id
mask_token_id = tokenizer.tokenizer.mask_token_id
mlm_probability = 0.3  # presumably the fraction of tokens the collator masks (TODO confirm)
# Optimisation hyperparameters.
epochs = 10
lr = 1e-4
weight_decay = 0.01
optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

collator = xval_define_masked_num_collator(pad_token_id, mask_token_id, mlm_probability)

# Only the tokenized "question" column is fed to the loader.
train_loader = DataLoader(
    tokenized_ds["question"],
    batch_size=32,
    shuffle=True,
    collate_fn=collator,
)


OutOfMemoryError: CUDA out of memory. Tried to allocate 188.00 MiB. GPU 

In [22]:
# Train the xVal model. The total loss is the token cross-entropy over batch["y"]
# (targets equal to -100 are ignored) plus an MSE loss on the positions whose target is "[NUM]".
loss_hist = []
loss_mlm_hist = []
loss_num_hist = []

max_n_batches = 100  # upper bound on the number of batches processed per epoch

# Exponentially smoothed losses used for reporting. They are reset on every run of this cell
# so that values left in the kernel by an earlier run (or by another training cell) are not reused.
loss_avg = None
loss_mlm_avg = None
loss_num_avg = None

# Loop-invariant: id of the "[NUM]" token, used to locate number positions in the targets.
num_token_id = tokenizer.tokenizer.convert_tokens_to_ids("[NUM]")

try:
    for e in tqdm(range(epochs)):
        n_batches = 0
        for batch in train_loader:
            # Stop once exactly `max_n_batches` batches have been processed in this epoch.
            if n_batches >= max_n_batches:
                break
            # NOTE(review): the forward pass runs outside the autocast context, so only the
            # loss computation below is covered by it. Confirm this is intended.
            logit_preds, num_preds = model(
                batch["x"].cuda(),
                batch["x_num"].cuda(),
                batch["attention_mask"].cuda(),
                batch["token_type_ids"].cuda(),
            )
            with torch.autocast(device_type="cuda"):
                loss_mlm = F.cross_entropy(
                    logit_preds.view(-1, logit_preds.size(-1)),
                    batch["y"].cuda().view(-1),
                    ignore_index=-100,
                    reduction="mean",
                )
                num_mask = batch['y'] == num_token_id
                if num_mask.any():
                    loss_num = F.mse_loss(
                        num_preds[num_mask],
                        batch["y_num"][num_mask].view(-1, 1).cuda(),
                        reduction="mean",
                    )
                else:
                    # No "[NUM]" target in this batch: a mean over an empty selection is NaN
                    # and would poison the update, so the numeric loss contributes zero instead.
                    loss_num = logit_preds.new_zeros(())
            loss = loss_mlm + loss_num
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_hist.append(loss.item())
            loss_mlm_hist.append(loss_mlm.item())
            loss_num_hist.append(loss_num.item())
            n_batches += 1

            # The first batch seeds the running averages; later batches are blended in (0.99 / 0.01).
            if loss_avg is None:
                loss_avg = loss.item()
                loss_mlm_avg = loss_mlm.item()
                loss_num_avg = loss_num.item()
            else:
                loss_avg = 0.99 * loss_avg + 0.01 * loss.item()
                loss_mlm_avg = 0.99 * loss_mlm_avg + 0.01 * loss_mlm.item()
                loss_num_avg = 0.99 * loss_num_avg + 0.01 * loss_num.item()

        # Checkpoint after every epoch.
        # NOTE(review): the Regression Transformer training cell writes to the same path.
        checkpoint = {
            "model": model.state_dict(),
            "optimizer": optimizer.state_dict(),
            "loss": loss_avg,
            "loss_hist": loss_hist,
            "loss_mlm_hist": loss_mlm_hist,
            "loss_num_hist": loss_num_hist,
        }
        torch.save(checkpoint, "./ckpt.pt")
        print(f"Epoch #{e}: loss_mlm = {loss_mlm_avg:.3f}; loss_num = {loss_num_avg:.3f}; loss_total = {loss_avg:.3f}")
except KeyboardInterrupt:
    # Allow stopping training from the notebook without a traceback.
    print('Interrupted')


 10%|█         | 1/10 [00:06<00:58,  6.47s/it]

Epoch #0: loss_mlm = 11.389; loss_num = 3417633.368; loss_total = 3417644.843


 10%|█         | 1/10 [00:09<01:27,  9.77s/it]

Interrupted





### Regression Transformer

In [57]:
def read_num_tokens(file_path):
    """Return every line of ``file_path`` as a list, with surrounding whitespace stripped."""
    with open(file_path, 'r') as handle:
        return [raw.strip() for raw in handle]

# Load the number tokens; the path is relative to the notebook's working directory.
num_tokens = read_num_tokens("regression_transformer_number_tokens.txt")

In [58]:
# Peek at the first five number tokens.
num_tokens[:5]

['[NEG]', '_0_0_', '_1_0_', '_2_0_', '_3_0_']

In [59]:
# Wrap the pretrained tokenizer in the Regression Transformer tokenizer, passing it the number
# tokens read from the token file and the backbone's hidden size as embedding dimension.
tokenizer = RtTokenizer(
    pretrained_tokenizer=pretrained_tokenizer,
    num_tokens=num_tokens,
    embedding_dim=transformer_backbone.hidden_size,
)

Number token IDs: [32000, 32001, 32002, 32003, 32004, 32005, 32006, 32007, 32008, 32009, 32010, 32011, 32012, 32013, 32014, 32015, 32016, 32017, 32018, 32019, 32020, 32021, 32022, 32023, 32024, 32025, 32026, 32027, 32028, 32029, 32030, 32031, 32032, 32033, 32034, 32035, 32036, 32037, 32038, 32039, 32040, 32041, 32042, 32043, 32044, 32045, 32046, 32047, 32048, 32049, 32050, 32051, 32052, 32053, 32054, 32055, 32056, 32057, 32058, 32059, 32060, 32061, 32062, 32063, 32064, 32065, 32066, 32067, 32068, 32069, 32070, 32071, 32072, 32073, 32074, 32075, 32076, 32077, 32078, 32079, 32080, 32081, 32082, 32083, 32084, 32085, 32086, 32087, 32088, 32089, 32090, 32091, 32092, 32093, 32094, 32095, 32096, 32097, 32098, 32099, 32100, 32101, 32102, 32103, 32104, 32105, 32106, 32107, 32108, 32109, 32110, 32111, 32112, 32113, 32114, 32115, 32116, 32117, 32118, 32119, 32120, 32121, 32122, 32123, 32124, 32125, 32126, 32127, 32128, 32129, 32130, 32131, 32132, 32133, 32134, 32135, 32136, 32137, 32138, 32139, 3

In [40]:
# Show the example question again for reference.
print(ds[0]["question"])

Solve 0 = 4*b + b + 15 for b.


In [9]:
# Tokenize the first question, then show the raw text, the tokens decoded one by one,
# and the sum of each token's "number_embeddings" entry.
tokenized_x = tokenizer(ds[0]["question"])
print(ds[0]["question"])
print(list(map(tokenizer.tokenizer.decode, tokenized_x["input_ids"])))
print([embedding.sum() for embedding in tokenized_x["number_embeddings"]])

Solve 0 = 4*b + b + 15 for b.
['Sol', 've', '_0_0_', '=', '_4_0_', '*', 'b', '+', '', 'b', '+', '_1_1_', '_5_0_', 'for', '', 'b', '.', '<sep>', '<cls>']
[0.0, 0.0, 0.0, 0.0, 2.7699864, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 6.9249663, 3.4624832, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [10]:
print("\nStarting tokenization...")


def tokenize_lambda(x):
    """Tokenize the question and the answer of a single dataset row."""
    return {"question": tokenizer(x["question"]), "answer": tokenizer(x["answer"])}


# Row-by-row mapping in a single process (num_proc is left unset); cached results are ignored.
tokenized_ds = ds.map(
    tokenize_lambda,
    batched=False,
    load_from_cache_file=False,
)


Starting tokenization...


Map:   0%|          | 0/117 [00:00<?, ? examples/s]

In [13]:
# Build the Regression Transformer model on top of `transformer_backbone` and move it to the GPU.
# NOTE(review): the same backbone object is also passed to the xVal model earlier in the notebook,
# so both models presumably share its weights. Confirm this is intended.
# NOTE(review): the recorded output of this cell is a CUDA out-of-memory error.
model = RegressionTransformer(transformer_backbone=transformer_backbone, vocab_size=len(tokenizer.tokenizer), dim_feedforward=1536,
                  context_length=955).cuda()

# Special-token ids and masking setting handed to the collator below.
pad_token_id = tokenizer.tokenizer.pad_token_id
mask_token_id = tokenizer.tokenizer.mask_token_id
mlm_probability = 0.3  # presumably the fraction of tokens the collator masks (TODO confirm)
# Optimisation hyperparameters.
epochs = 10
lr = 1e-4
weight_decay = 0.01
optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

collator = rt_define_masked_num_collator(pad_token_id, mask_token_id, mlm_probability)

# Only the tokenized "question" column is fed to the loader.
train_loader = DataLoader(
    tokenized_ds["question"],
    batch_size=5,
    shuffle=True,
    collate_fn=collator,
)

OutOfMemoryError: CUDA out of memory. Tried to allocate 96.00 MiB. GPU 

In [12]:
# Train the Regression Transformer model with a token cross-entropy loss over batch["y"]
# (targets equal to -100 are ignored).
loss_hist = []

max_n_batches = 100  # upper bound on the number of batches processed per epoch

# Exponentially smoothed loss used for reporting. It is reset on every run of this cell so that
# a value left in the kernel by an earlier run (or by another training cell) is not reused.
loss_avg = None

try:
    for e in tqdm(range(epochs)):
        n_batches = 0
        for batch in train_loader:
            # Stop once exactly `max_n_batches` batches have been processed in this epoch.
            if n_batches >= max_n_batches:
                break
            # NOTE(review): the forward pass runs outside the autocast context, so only the
            # loss computation below is covered by it. Confirm this is intended.
            logit_preds = model(
                batch["x"].cuda(),
                batch["number_embeddings"].cuda(),
                batch["attention_mask"].cuda(),
                batch["token_type_ids"].cuda(),
            )
            with torch.autocast(device_type="cuda"):
                loss = F.cross_entropy(
                    logit_preds.view(-1, logit_preds.size(-1)),
                    batch["y"].cuda().view(-1),
                    ignore_index=-100,
                    reduction="mean",
                )

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_hist.append(loss.item())
            n_batches += 1

            # The first batch seeds the running average; later batches are blended in (0.99 / 0.01).
            if loss_avg is None:
                loss_avg = loss.item()
            else:
                loss_avg = 0.99 * loss_avg + 0.01 * loss.item()

        # Checkpoint after every epoch.
        # NOTE(review): the xVal training cell writes to the same path.
        checkpoint = {
            "model": model.state_dict(),
            "optimizer": optimizer.state_dict(),
            "loss": loss_avg,
            "loss_hist": loss_hist,
        }
        torch.save(checkpoint, "./ckpt.pt")
        print(f"Epoch #{e}: loss = {loss_avg:.3f}")
except KeyboardInterrupt:
    # Allow stopping training from the notebook without a traceback.
    print('Interrupted')

  0%|          | 0/10 [00:01<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 100.00 MiB. GPU 