In [1]:
import torch
import torch.nn.functional as F
import torch.optim as optim
### Huggingface dataset and tokenizer imports
from datasets import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import XLNetTokenizer

from src.tokenizer.xval_tokenizer import XvalTokenizer
from src.transformer_backbone.xlnet import XLNetBackbone
from src.encoding_decoding.xval_encoding_decoding import XValModel, define_masked_num_collator as xval_define_masked_num_collator

from src.tokenizer.rt_tokenizer import RtTokenizer
from src.encoding_decoding.rt_encoding_decoding import RegressionTransformer, define_masked_num_collator as rt_define_masked_num_collator

%load_ext autoreload
%autoreload 2

In [2]:
from transformers import T5Tokenizer, T5Model, T5ForConditionalGeneration
import torch
import torch.nn as nn

In [3]:
model = T5ForConditionalGeneration.from_pretrained("t5-small")



model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [5]:
model.resize_token_embeddings(33_000)

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 33000. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


Embedding(33000, 512)

In [9]:
model.transformer

AttributeError: 'T5ForConditionalGeneration' object has no attribute 'transformer'

In [10]:
from torch import nn
from transformers import XLNetModel

In [12]:
xlnet = XLNetModel.from_pretrained("xlnet-base-cased")

In [16]:
xlnet.pos_embedding

AttributeError: 'XLNetModel' object has no attribute 'pos_embedding'

### Load dataset

In [4]:
# Define a function to read the text file and yield examples
def read_txt(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()

    for i in range(0, len(lines), 2):
        question = lines[i].strip()
        answer = lines[i+1].strip()
        yield {'question': question, 'answer': answer}

# Define the dataset loading function
def load_txt_dataset(file_path):
    return Dataset.from_generator(read_txt, gen_kwargs={'file_path': file_path})


In [5]:
data_path = './data/mathematics_dataset-v1.0/mathematics_dataset-v1.0/train-easy/algebra__linear_1d_small.txt'

ds = load_txt_dataset(data_path)
ds

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['question', 'answer'],
    num_rows: 117
})

In [6]:
ds["question"][0]

'Solve 0 = 4*b + b + 15 for b.'

In [7]:
ds["answer"][0]

'-3'

### Load pretrained Model and Tokenizer

In [8]:
model_name = "xlnet-base-cased"
pretrained_tokenizer = XLNetTokenizer.from_pretrained(model_name)
transformer_backbone = XLNetBackbone(model_name).cuda()



### Xval

In [9]:
tokenizer = XvalTokenizer(pretrained_tokenizer)

Number token ID: 32000


In [10]:
tokenized_x = tokenizer(ds[0]["question"])

In [11]:
print(ds[0]["question"])
print([tokenizer.tokenizer.decode(x) for x in tokenized_x["input_ids"]])
print([x for x in tokenized_x["numbers"]])

Solve 0 = 4*b + b + 15 for b.
['Sol', 've', '[NUM]', '=', '[NUM]', '*', 'b', '+', '', 'b', '+', '[NUM]', 'for', '', 'b', '.', '<sep>', '<cls>']
[1.0, 1.0, 0.0, 1.0, 4.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 15.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]


In [12]:
print("\nStarting tokenization...")
tokenize_lambda = lambda x: {"question": tokenizer(x["question"]), "answer": tokenizer(x["answer"])}
tokenized_ds = ds.map(
    tokenize_lambda,
    batched=False,
    # num_proc=30,
    load_from_cache_file=False,
)


Starting tokenization...


Map:   0%|          | 0/117 [00:00<?, ? examples/s]

In [13]:
model = XValModel(transformer_backbone=transformer_backbone, vocab_size=len(tokenizer.tokenizer), dim_feedforward=1536, context_length=955).cuda()

pad_token_id = tokenizer.tokenizer.pad_token_id
mask_token_id = tokenizer.tokenizer.mask_token_id
mlm_probability = 0.3
epochs = 10
lr = 1e-4
weight_decay = 0.01
optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

collator = xval_define_masked_num_collator(pad_token_id, mask_token_id, mlm_probability)

train_loader = DataLoader(
    tokenized_ds["question"],
    batch_size=32,
    shuffle=True,
    collate_fn=collator,
)


In [14]:
loss_hist = []
loss_mlm_hist = []
loss_num_hist = []

max_n_batches = 100

try:
    for e in tqdm(range(epochs)):
        n_batches = 0
        for batch in train_loader:
            if n_batches > max_n_batches:
                break
            logit_preds, num_preds = model(
                batch["x"].cuda(),
                batch["x_num"].cuda(),
                batch["attention_mask"].cuda(),
                batch["token_type_ids"].cuda(),
            )
            with torch.autocast(device_type="cuda"):
                loss_mlm = F.cross_entropy(
                    logit_preds.view(-1, logit_preds.size(-1)),
                    batch["y"].cuda().view(-1),
                    ignore_index=-100,
                    reduction="mean",
                )
                num_mask = batch['y'] == tokenizer.tokenizer.convert_tokens_to_ids("[NUM]")
                loss_num = F.mse_loss(
                    num_preds[num_mask],
                    batch["y_num"][num_mask].view(-1, 1).cuda(),
                    reduction="mean",
                )
            loss = loss_mlm + loss_num
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_hist.append(loss.item())
            loss_mlm_hist.append(loss_mlm.item())
            loss_num_hist.append(loss_num.item())
            n_batches += 1

            try:
                loss_avg = 0.99 * loss_avg + 0.01 * loss.item()
                loss_mlm_avg = 0.99 * loss_mlm_avg + 0.01 * loss_mlm.item()
                loss_num_avg = 0.99 * loss_num_avg + 0.01 * loss_num.item()
            except:
                loss_avg = loss.item()
                loss_mlm_avg = loss_mlm.item()
                loss_num_avg = loss_num.item()

        checkpoint = {
            "model": model.state_dict(),
            "optimizer": optimizer.state_dict(),
            "loss": loss_avg,
            "loss_hist": loss_hist,
            "loss_mlm_hist": loss_mlm_hist,
            "loss_num_hist": loss_num_hist,
        }
        torch.save(checkpoint, "./ckpt.pt")
        print(f"Epoch #{e}: loss_mlm = {loss_mlm_avg:.3f}; loss_num = {loss_num_avg:.3f}; loss_total = {loss_avg:.3f}")
except KeyboardInterrupt:
    print('Interrupted')


 10%|█         | 1/10 [00:06<00:57,  6.37s/it]

Epoch #0: loss_mlm = 11.248; loss_num = 51407.855; loss_total = 51419.104


 20%|██        | 2/10 [00:13<00:55,  6.89s/it]

Epoch #1: loss_mlm = 11.126; loss_num = 81999.289; loss_total = 82010.416


 30%|███       | 3/10 [00:22<00:53,  7.64s/it]

Epoch #2: loss_mlm = 10.889; loss_num = 82899.489; loss_total = 82910.380


 40%|████      | 4/10 [00:30<00:46,  7.73s/it]

Epoch #3: loss_mlm = 10.618; loss_num = 82658.826; loss_total = 82669.446


 50%|█████     | 5/10 [00:39<00:42,  8.48s/it]

Epoch #4: loss_mlm = 10.362; loss_num = 124809.849; loss_total = 124820.213


 60%|██████    | 6/10 [00:48<00:34,  8.68s/it]

Epoch #5: loss_mlm = 10.089; loss_num = 159113.855; loss_total = 159123.946


 70%|███████   | 7/10 [00:56<00:24,  8.28s/it]

Epoch #6: loss_mlm = 9.815; loss_num = 156969.954; loss_total = 156979.772


 80%|████████  | 8/10 [01:03<00:16,  8.06s/it]

Epoch #7: loss_mlm = 9.553; loss_num = 158357.218; loss_total = 158366.772


 90%|█████████ | 9/10 [01:11<00:07,  7.76s/it]

Epoch #8: loss_mlm = 9.294; loss_num = 152996.410; loss_total = 153005.706


100%|██████████| 10/10 [01:18<00:00,  7.86s/it]

Epoch #9: loss_mlm = 9.044; loss_num = 149240.996; loss_total = 149250.042





### Regression Transformer

In [15]:
def read_num_tokens(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()
    return [line.strip() for line in lines]

num_tokens = read_num_tokens("regression_transformer_number_tokens.txt")

In [16]:
num_tokens[:5]

['_0_0_', '_1_0_', '_2_0_', '_3_0_', '_4_0_']

In [17]:
tokenizer = RtTokenizer(pretrained_tokenizer, num_tokens, embedding_dim=transformer_backbone.hidden_size)

In [18]:
print(ds[0]["question"])

Solve 0 = 4*b + b + 15 for b.


In [19]:
tokenized_x = tokenizer(ds[0]["question"])
print(ds[0]["question"])
print([tokenizer.tokenizer.decode(x) for x in tokenized_x["input_ids"]])
print([x.sum() for x in tokenized_x["number_embeddings"]])

Solve 0 = 4*b + b + 15 for b.
['Sol', 've', '_0_0_', '=', '_4_0_', '*', 'b', '+', '', 'b', '+', '_1_1_', '_5_0_', 'for', '', 'b', '.', '<sep>', '<cls>']
[0.0, 0.0, 0.0, 0.0, 2.7699864, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 6.9249663, 3.4624832, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [20]:
print("\nStarting tokenization...")
tokenize_lambda = lambda x: {"question": tokenizer(x["question"]), "answer": tokenizer(x["answer"])}
tokenized_ds = ds.map(
    tokenize_lambda,
    batched=False,
    # num_proc=30,
    load_from_cache_file=False,
)


Starting tokenization...


Map:   0%|          | 0/117 [00:00<?, ? examples/s]

In [None]:
  # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics if training_args.do_eval and not is_torch_xla_available() else None,
        preprocess_logits_for_metrics=preprocess_logits_for_metrics
        if training_args.do_eval and not is_torch_xla_available()
        else None,
    )

In [21]:
model = RegressionTransformer(transformer_backbone=transformer_backbone, vocab_size=len(tokenizer.tokenizer), dim_feedforward=1536,
                  context_length=955).cuda()

pad_token_id = tokenizer.tokenizer.pad_token_id
mask_token_id = tokenizer.tokenizer.mask_token_id
mlm_probability = 0.3
epochs = 10
lr = 1e-4
weight_decay = 0.01
optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

collator = rt_define_masked_num_collator(pad_token_id, mask_token_id, mlm_probability)

train_loader = DataLoader(
    tokenized_ds["question"],
    batch_size=32,
    shuffle=True,
    collate_fn=collator,
)

In [22]:
loss_hist = []

max_n_batches = 100

try:
    for e in tqdm(range(epochs)):
        n_batches = 0
        for batch in train_loader:
            if n_batches > max_n_batches:
                break
            logit_preds = model(
                batch["x"].cuda(),
                batch["number_embeddings"].cuda(),
                batch["attention_mask"].cuda(),
                batch["token_type_ids"].cuda(),
            )
            with torch.autocast(device_type="cuda"):
                loss = F.cross_entropy(
                    logit_preds.view(-1, logit_preds.size(-1)),
                    batch["y"].cuda().view(-1),
                    ignore_index=-100,
                    reduction="mean",
                )
               
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_hist.append(loss.item())
            n_batches += 1

            try:
                loss_avg = 0.99 * loss_avg + 0.01 * loss.item()
            except:
                loss_avg = loss.item()

        checkpoint = {
            "model": model.state_dict(),
            "optimizer": optimizer.state_dict(),
            "loss": loss_avg,
            "loss_hist": loss_hist,
        }
        torch.save(checkpoint, "./ckpt.pt")
        print(f"Epoch #{e}: loss = {loss_avg:.3f}")
except KeyboardInterrupt:
    print('Interrupted')

 10%|█         | 1/10 [00:19<02:59, 19.98s/it]

Epoch #0: loss = 143369.369


 20%|██        | 2/10 [00:36<02:25, 18.20s/it]

Epoch #1: loss = 137720.320


 30%|███       | 3/10 [00:56<02:10, 18.71s/it]

Epoch #2: loss = 132293.795


 40%|████      | 4/10 [01:13<01:47, 17.94s/it]

Epoch #3: loss = 127081.060


 50%|█████     | 5/10 [01:27<01:23, 16.78s/it]

Epoch #4: loss = 122073.714


 60%|██████    | 6/10 [01:49<01:13, 18.39s/it]

Epoch #5: loss = 117263.670


 70%|███████   | 7/10 [02:02<00:49, 16.61s/it]

Epoch #6: loss = 112643.155


 80%|████████  | 8/10 [02:16<00:31, 15.74s/it]

Epoch #7: loss = 108204.702


 90%|█████████ | 9/10 [02:29<00:14, 14.96s/it]

Epoch #8: loss = 103941.141


100%|██████████| 10/10 [02:42<00:00, 16.24s/it]

Epoch #9: loss = 99845.585



