In [1]:
import json
import torch
import random
from datasets import load_dataset, Dataset, load_from_disk

fin = open("datasets/parametric_equations_pairs.json", 'r')
lines = fin.readlines()
random.shuffle(lines)
questions = []
answers = []
for line in lines:
    data = json.loads(line)
    questions.append(data['rounded_regressed'])
    answers.append(data['original'])
fin.close()

ds = Dataset.from_dict({'question': questions, 'answer':answers})
train_ds = ds.train_test_split(test_size=0.04)

train_ds['train'][1]

  from .autonotebook import tqdm as notebook_tqdm


{'question': '1.0*t**4 + 6.13*t**2 + 3.17',
 'answer': 't**4 + 6.13*t**2 + 3.17'}

In [2]:
train_ds

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 115704
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 4822
    })
})

In [2]:
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the tokenizer, model, and data collator
MODEL_NAME = "google/flan-t5-base"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

CONTEXT_LENGTH = 128

# We prefix our tasks with "answer the question"
prefix = ""

# Define the preprocessing function

def preprocess_function(examples):
   """Add prefix to the sentences, tokenize the text, and set the labels"""
   # The "inputs" are the tokenized answer:
   inputs = [prefix + doc for doc in examples["question"]]
   model_inputs = tokenizer(inputs, max_length=CONTEXT_LENGTH, truncation=True)
  
   # The "labels" are the tokenized outputs:
   labels = tokenizer(text_target=examples["answer"], 
                      max_length=CONTEXT_LENGTH,         
                      truncation=True)

   model_inputs["labels"] = labels["input_ids"]
   return model_inputs

tokenized_dataset = train_ds.map(preprocess_function, batched=True)

# tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125m") #"EleutherAI/gpt-neo-125m") "xhyi/PT_GPTNEO350_ATG"
# if tokenizer.pad_token is None:
#     tokenizer.pad_token = tokenizer.eos_token

# def preprocess_function(examples):
#     return tokenizer(examples["eq_pair"], padding='max_length', truncation=True, max_length=CONTEXT_LENGTH, return_tensors="pt")

# tokenized_ds = train_ds.map(
#     preprocess_function,
#     batched=True,
#     num_proc=1,
#     remove_columns=train_ds["train"].column_names,
# )

# def preprocess_function(examples):
#     examples["labels"] = examples["input_ids"].copy()
#     return examples

# lm_dataset = tokenized_ds.map(preprocess_function, batched=True, num_proc=1)

# from transformers import DataCollatorForLanguageModeling

# tokenizer.pad_token = tokenizer.eos_token
# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 115704/115704 [00:05<00:00, 19706.64 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4822/4822 [00:00<00:00, 20818.49 examples/s]


In [4]:
tokenized_dataset['test'][1]

{'question': '15.2 - 4.8*t',
 'answer': '15.2 - 24*t/5',
 'input_ids': [9996, 357, 3, 18, 3, 27441, 1935, 17, 1],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [9996, 357, 3, 18, 997, 1935, 17, 16936, 1]}

In [3]:
import nltk
import evaluate
import numpy as np

#nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
   preds, labels = eval_preds
   # decode preds and labels
   labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
   decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
   decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
   # rougeLSum expects newline after each sentence
   decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
   decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
   result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
   return result

# Global Parameters
L_RATE = 3e-4
BATCH_SIZE = 16
PER_DEVICE_EVAL_BATCH = 16
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 2
NUM_EPOCHS = 5
SAVE_STEPS=1000

# Set up training arguments
training_args = Seq2SeqTrainingArguments(
   output_dir="datasets/normalize_symbolic_regression_results_flant5_20231219",
   evaluation_strategy="steps",
   learning_rate=L_RATE,
   per_device_train_batch_size=BATCH_SIZE,
   per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
   save_steps=SAVE_STEPS,
   eval_steps=SAVE_STEPS,
   logging_steps=SAVE_STEPS,
   weight_decay=WEIGHT_DECAY,
   save_total_limit=SAVE_TOTAL_LIM,
   num_train_epochs=NUM_EPOCHS,
   predict_with_generate=True,
   push_to_hub=False
)

trainer = Seq2SeqTrainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_dataset["train"],
   eval_dataset=tokenized_dataset["test"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics
)

cp_path = "datasets/normalize_symbolic_regression_results_flant5_20231219/checkpoint-36000"

trainer.train(cp_path)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


Step,Training Loss,Validation Loss



KeyboardInterrupt



In [4]:
model.save_pretrained("datasets/normalize_symbolic_regression_results_flant5_20231219/flant5-base-36000-loss0.097.model")

In [7]:
#model = AutoModelForCausalLM.from_pretrained("datasets/normalize_symbolic_regression_results_20231219/gptneo-350m-22000-loss0.443.model")

In [44]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

inputs = "0.33*t**3 - 3.33*t**2 + 2.0*t"
#inputs = tokenizer(inputs, return_tensors="pt").to(device)
inputs = tokenizer(text_target=inputs, 
                   max_length=CONTEXT_LENGTH,         
                   truncation=True,
                   return_tensors="pt").to(device)
outputs = model.generate(**inputs, temperature=0.01)
answer = tokenizer.decode(outputs[0])
print(answer)

# # Encode some input text
# prompt = "0.33*t**3 - 1.0*t**2 entail"
# input_ids = tokenizer.encode(prompt, return_tensors='pt')

# # Generate text
# output = model.generate(input_ids, max_length=50, num_return_sequences=1, temperature=0.7)

# # Decode and print the output
# generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
# print(generated_text)

<pad> t**3/3 - 10*t**2/3 + 2*t</s>


In [9]:
inputs

{'input_ids': tensor([[ 4097,  4201,  1935,    17, 19844,   591,  1768,     3, 19660,  1935,
            17, 19844,   519,  1768,  1682,  3708,  1935,    17,  1768,  5477,
          3708,     1]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       device='cuda:0')}