In [None]:
# Generate data to infer the rules for integral

import json
import numpy as np
import sympy as sp
from sympy import sympify, lambdify, symbols, integrate, Interval, Symbol, I, S, oo, plot, evalf, N
from IPython.display import display
from utils.utils import *


def remove_constants(f):
    t = Symbol('t')
    return f.as_independent(t)[1]

fin = open("datasets/parametric_equations_polynomial_integral_results.json", "r")
lines = fin.readlines()
fin.close()
fin = open("datasets/parametric_equations_randomized_polynomial_integral_results.json", "r")
lines.extend(fin.readlines())
fin.close()

MAX_POWER = 6
MAX_AVG_DIFF = 0.01

originals = []
integrals = []
t = Symbol('t')

for line in lines:
    result = json.loads(line)
    if "rounded_regressed" not in result:
        continue
    original = round_all_floats(N(sympify(result["original"])))
    integral = remove_constants(round_all_floats(N(sympify(result["rounded_regressed"]))))
    #try:
    original = filter_non_polynomial(original)
    integral = filter_non_polynomial(integral)
    original_integral = integrate(original, t)
    avg_diff = get_avg_diff(original_integral, integral, t)
    if avg_diff > MAX_AVG_DIFF or len(original_integral.args) != len(integral.args):
        print("Skipping. Diff=", avg_diff)
        display(original_integral)
        display(integral)
        continue
#     except:
#         print("Cannot filter non-polynomials on", str(integral))
#         continue
    originals.append(str(original))
    integrals.append(str(integral))
    if len(originals) % 100 == 0:
        print(len(originals), "cases loaded")
    
fin.close()


In [2]:
f = sp.sympify("1.2*t**5 + 0.98*t**4 + 0.32*t**3 + 0.52*t**2 + 2.81*t - 0.09")
t = Symbol('t')
display(f.as_independent(t)[1])

1.2*t**5 + 0.98*t**4 + 0.32*t**3 + 0.52*t**2 + 2.81*t

In [3]:
import json
import torch
import random
from datasets import load_dataset, Dataset, load_from_disk

# for i in range(len(originals)):
#     originals[i] = originals[i] + ' repeat ' + originals[i]

ds = Dataset.from_dict({'question': originals, 'answer':integrals})
ds = ds.shuffle()
train_ds = ds.train_test_split(test_size=0.05)

train_ds['train'][1]

  from .autonotebook import tqdm as notebook_tqdm


{'question': 't**4 - 7.0*t**3 + 4.03*t**2 - 5.92*t + 1.63',
 'answer': '0.2*t**5 - 1.75*t**4 + 1.34*t**3 - 2.98*t**2 + 1.6*t'}

In [4]:
train_ds

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 19514
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 1028
    })
})

In [5]:
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the tokenizer, model, and data collator
MODEL_NAME = "google/flan-t5-large"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

CONTEXT_LENGTH = 128

# We prefix our tasks with "answer the question"
prefix = ""

# Define the preprocessing function

def preprocess_function(examples):
   """Add prefix to the sentences, tokenize the text, and set the labels"""
   # The "inputs" are the tokenized answer:
   inputs = [prefix + doc for doc in examples["question"]]
   model_inputs = tokenizer(inputs, max_length=CONTEXT_LENGTH, truncation=True)
  
   # The "labels" are the tokenized outputs:
   labels = tokenizer(text_target=examples["answer"], 
                      max_length=CONTEXT_LENGTH,         
                      truncation=True)

   model_inputs["labels"] = labels["input_ids"]
   return model_inputs

tokenized_dataset = train_ds.map(preprocess_function, batched=True)

# tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125m") #"EleutherAI/gpt-neo-125m") "xhyi/PT_GPTNEO350_ATG"
# if tokenizer.pad_token is None:
#     tokenizer.pad_token = tokenizer.eos_token

# def preprocess_function(examples):
#     return tokenizer(examples["eq_pair"], padding='max_length', truncation=True, max_length=CONTEXT_LENGTH, return_tensors="pt")

# tokenized_ds = train_ds.map(
#     preprocess_function,
#     batched=True,
#     num_proc=1,
#     remove_columns=train_ds["train"].column_names,
# )

# def preprocess_function(examples):
#     examples["labels"] = examples["input_ids"].copy()
#     return examples

# lm_dataset = tokenized_ds.map(preprocess_function, batched=True, num_proc=1)

# from transformers import DataCollatorForLanguageModeling

# tokenizer.pad_token = tokenizer.eos_token
# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19514/19514 [00:01<00:00, 18771.82 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

In [6]:
tokenized_dataset['test'][1]

{'question': '4.5*t - 8.5',
 'answer': '2.25*t**2 - 8.5*t',
 'input_ids': [3, 12451, 1935, 17, 3, 18, 3, 19253, 1],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [1682, 1828, 1935, 17, 19844, 357, 3, 18, 3, 19253, 1935, 17, 1]}

In [7]:
tmp = tokenized_dataset['train']

for i in range(1000, min(len(tmp), 1010)):
    question = tmp[i]['question'] 
    if len(question) > 3:
        print("Case", i)
        print(question)
        print(tokenizer.decode(tmp[i]['labels']))

Case 1000
7.57*t + 1.57
3.79*t**2 + 1.57*t</s>
Case 1001
73.0*t**2 + 39.0*t + 5.33
24.33*t**3 + 19.5*t**2 + 5.33*t</s>
Case 1002
7.25*t - 17.0
3.62*t**2 - 17.0*t</s>
Case 1003
21.87*t**2 + 19.1*t + 10.16
7.29*t**3 + 9.55*t**2 + 10.16*t</s>
Case 1004
6.5*t + 5.0
3.25*t**2 + 5.0*t</s>
Case 1005
8.8*t - 7.4
4.4*t**2 - 7.4*t</s>
Case 1006
11.25 - 4.0*t
-2.0*t**2 + 11.25*t</s>
Case 1007
3.0*t**4 + 3.0*t**3 + 1.01*t**2 + 0.59*t + 5.08
0.6*t**5 + 0.75*t**4 + 0.36*t**3 + 0.35*t**2 + 5.16*t</s>
Case 1008
3.2*t + 8.4
1.6*t**2 + 8.4*t</s>
Case 1009
-4.2*t - 0.6
-2.1*t**2 - 0.6*t</s>


In [10]:
import nltk
import evaluate
import numpy as np

#nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
   preds, labels = eval_preds
   # decode preds and labels
   labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
   decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
   decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
   # rougeLSum expects newline after each sentence
   decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
   decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
   result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
   return result

# Global Parameters
L_RATE = 3e-4
BATCH_SIZE = 8
PER_DEVICE_EVAL_BATCH = 8
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 20
NUM_EPOCHS = 5
SAVE_STEPS=1000

# Set up training arguments
training_args = Seq2SeqTrainingArguments(
   output_dir="datasets/integrate_flant5_large_20240101",
   evaluation_strategy="steps",
   learning_rate=L_RATE,
   per_device_train_batch_size=BATCH_SIZE,
   per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
   save_steps=SAVE_STEPS,
   eval_steps=SAVE_STEPS,
   logging_steps=SAVE_STEPS,
   weight_decay=WEIGHT_DECAY,
   save_total_limit=SAVE_TOTAL_LIM,
   num_train_epochs=NUM_EPOCHS,
   predict_with_generate=True,
   push_to_hub=False
)

trainer = Seq2SeqTrainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_dataset["train"],
   eval_dataset=tokenized_dataset["test"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics
)

#cp_path = "datasets/integrate_flant5_20240101/checkpoint-36000"

trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss



KeyboardInterrupt



In [14]:
model.save_pretrained("datasets/integrate_flant5_20240101/flant5-large-11000-loss0.050.model")

In [7]:
#model = AutoModelForCausalLM.from_pretrained("datasets/normalize_symbolic_regression_results_20231219/gptneo-350m-22000-loss0.443.model")

In [8]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"


#inputs = tokenizer(inputs, return_tensors="pt").to(device)

def generate_integral(inputs):
    inputs = tokenizer(text_target=inputs, 
                       max_length=CONTEXT_LENGTH,         
                       truncation=True,
                       return_tensors="pt").to(device)
    #print(inputs)
    outputs = model.generate(**inputs, temperature=0.01)
    answer = tokenizer.decode(outputs[0])
    answer = answer.replace('<pad>','').replace('</s>','').strip()
    if answer[-1]=='*':
       answer = answer+'t'
    return answer

print(generate_integral("2.5*t**3 + 0.51*t**2 + 68.55"))

# # Encode some input text
# prompt = "0.33*t**3 - 1.0*t**2 entail"
# input_ids = tokenizer.encode(prompt, return_tensors='pt')

# # Generate text
# output = model.generate(input_ids, max_length=50, num_return_sequences=1, temperature=0.7)

# # Decode and print the output
# generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
# print(generated_text)



RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [15]:
tokenized_dataset['test'][0]

{'question': '78.45*t**3 + 70.86*t + 14.0',
 'answer': '19.61*t**4 + 35.43*t**2 + 14.0*t',
 'input_ids': [3,
  3940,
  5,
  2128,
  1935,
  17,
  19844,
  519,
  1768,
  2861,
  5,
  3840,
  1935,
  17,
  1768,
  209,
  15021,
  1],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [9997,
  4241,
  1935,
  17,
  19844,
  591,
  1768,
  3097,
  5,
  4906,
  1935,
  17,
  19844,
  357,
  1768,
  209,
  15021,
  1935,
  17,
  1]}

In [20]:
import sympy as sp
from utils.utils import *

MIN_ALLOWED_DIFF = 0.011

test_ds = tokenized_dataset['test']

num_processed = 0
num_equal = 0
num_zero_diff = 0
num_within_allowed = 0
t = sp.Symbol('t')

verbose = False

for i in range(min(len(test_ds), 1000)):
    if verbose:
        print("Case", i, test_ds[i]['question'])
    question = test_ds[i]['question']
    if 'repeat' in question:
        question = question[0:question.find('repeat')]
    original = sp.sympify(question)
    integral = round_all_floats(sp.integrate(original), 2)
    if verbose: display(integral)
    try:
        pred = generate_integral(test_ds[i]['question'])
        generated = round_all_floats(sp.sympify(pred), 2)
        if verbose: display(generated)
    except:
        print("Cannot sympify", pred)
        continue
    avg_diff = get_avg_diff(integral, generated, t)
    if verbose: print("avg_diff", avg_diff)
    num_processed += 1
    if avg_diff <= MIN_ALLOWED_DIFF:
        num_within_allowed += 1
    if avg_diff <= 0.0000001:
        num_zero_diff += 1
    diff_expr = sp.simplify(integral-generated)
    if verbose: display(diff_expr)
    if diff_expr == 0:
        if verbose: print("Equal")
        num_equal += 1
        
print("num_processed", num_processed)
print("num_equal", num_equal)
print("num_zero_diff", num_zero_diff)
print("num_within_allowed", num_within_allowed)

  return 0.25*t**4 + 1.74*t**3 + 0.68*t**t
  return 2.76*t**4 + 0.3*t**3 + 1.56*t**t
  return 0.6*t**5 + 0.75*t**4 + 5.61*t**t
  return 0.8*t**5 + 0.25*t**4 + 0.36*t**t


Cannot sympify 21.78*t**3 - 121.33*t**2 +


  return 0.2*t**5 + 1.19*t**3 + 2.27*t**t
  return 1.0*t**4 + 1.08*t**3 + 4.78*t**t
  return 1.6*t**5 + 0.5*t**4 + 0.8*t**t
  return 0.5*t**4 + 2.93*t**3 + 2.35*t**t
  return 0.25*t**4 + 1.77*t**3 + 1.55*t**t
  return 1.0*t**4 + 1.39*t**3 + 0.61*t**t
  return 0.4*t**5 + 1.0*t**4 + 1.0*t**t
  return 0.92*t**6 + 0.92*t**4 + 4.81*t**t
  return 4.37*t**5 + 3.02*t**4 + 10.55*t**t
  return 0.4*t**5 + 2.27*t**3 + 3.61*t**t
  return 0.71*t**5 + 2.22*t**3 + 9.44*t**t
  return 1.0*t**5 - 2.0*t**4 + 0.8*t**t
  return 0.75*t**4 + 1.42*t**3 + 0.76*t**t
  return 0.46*t**5 + 1.45*t**3 + 0.89*t**t
  return 0.56*t**4 + 1.01*t**3 + 0.91*t**t
  return 0.8*t**5 + 0.25*t**4 + 0.29*t**t
  return 1.8*t**5 + 2.19*t**3 + 1.77*t**t
  return 3.64*t**5 + 0.25*t**4 + 1.36*t**t
  return 3.0*t**4 + 0.19*t**3 + 2.92*t**t
  return 0.6*t**5 + 0.25*t**4 + 0.33*t**t
  return 1.8*t**5 + 3.25*t**4 + 3.67*t**t
  return 0.4*t**5 + 7.84*t**3 + 1.11*t**t
  return 0.11*t**5 + 0.29*t**4 + 3.0*t**t
  return 0.8*t**5 + 0.25*t**4 +

num_processed 953
num_equal 645
num_zero_diff 645
num_within_allowed 813


In [12]:
import sympy as sp
from utils.utils import *
import time

MIN_ALLOWED_DIFF = 0.011

test_ds = train_ds['test']


# Set up training arguments
training_args = Seq2SeqTrainingArguments(
   output_dir="datasets/integrate_flant5_20240101",
   evaluation_strategy="steps",
   learning_rate=2e-7,
   per_device_train_batch_size=BATCH_SIZE,
   per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
   save_steps=SAVE_STEPS,
   eval_steps=SAVE_STEPS,
   logging_steps=SAVE_STEPS,
   weight_decay=WEIGHT_DECAY,
   save_total_limit=SAVE_TOTAL_LIM,
   num_train_epochs=0.001,
   predict_with_generate=True,
   push_to_hub=False
)

trainer = Seq2SeqTrainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_dataset["train"],
   eval_dataset=tokenized_dataset["test"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics
)
for step in range(12000, 13000, 1000):
    print("Recovering checkpoint", step)
    cp_path = "datasets/integrate_flant5_large_20240101/checkpoint-" + str(step)
    trainer.train(cp_path)
    #
    num_processed = 0
    num_equal = 0
    num_zero_diff = 0
    num_within_allowed = 0
    total_time = 0.0
    t = sp.Symbol('t')
    #
    verbose = False
    #
    for i in range(min(len(test_ds), 1000)):
        if i * 100 == 0:
            print(i, "rows processed")
        if verbose:
            print("Case", i, test_ds[i]['question'])
        question = test_ds[i]['question']
        if 'repeat' in question:
            question = question[0:question.find('repeat')]
        original = sp.sympify(question)
        integral = round_all_floats(sp.integrate(original), 2)
        if verbose: display(integral)
        try:
            t1 = time.time()
            pred = generate_integral(question)
            generated = round_all_floats(sp.sympify(pred), 2)
            t2 = time.time()
            total_time += (t2-t1)
            if verbose: display(generated)
            avg_diff = get_avg_diff(integral, generated, t)
            if verbose: print("avg_diff", avg_diff)
            num_processed += 1
            if avg_diff <= MIN_ALLOWED_DIFF:
                num_within_allowed += 1
            if avg_diff <= 0.0000001:
                num_zero_diff += 1
            diff_expr = sp.simplify(integral-generated)
            if verbose: display(diff_expr)
            if diff_expr == 0:
                if verbose: print("Equal")
                num_equal += 1
        except:
            print("Cannot process", question)
            continue
        
    #
    print("num_processed", num_processed)
    print("num_equal", num_equal)
    print("num_zero_diff", num_zero_diff)
    print("num_within_allowed", num_within_allowed)
    print("avg. time =", total_time/num_processed)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Recovering checkpoint 12000


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


Step,Training Loss,Validation Loss


0 rows processed


  return 1.87*t**4 + 1.03*t**3 + 1.52*t**t
  return 1.0*t**5 + 0.96*t**3 + 13.31*t**t
  return 1.2*t**5 + 0.75*t**4 + 2.42*t**t
  return 2.25*t**4 + 0.33*t**3 + 1.39*t**t
  return 0.36*t**6 + 2.22*t**4 + 3.89*t**t
  return 0.4*t**5 + 3.0*t**3 + 6.66*t**t
  return 0.6*t**5 + 0.25*t**4 + 0.41*t**t
  return 3.4*t**5 + 4.57*t**4 + 1.31*t**t
  return 1.4*t**5 + 0.48*t**3 + 0.68*t**t
  return 0.8*t**5 + 0.75*t**4 + 5.08*t**t
  return 0.15*t**4 + 3.0*t**3 + 7.81*t**t
  return 0.5*t**4 + 0.86*t**3 + 6.83*t**t
  return 0.06*t**4 + 0.88*t**3 + 1.32*t**t
  return 0.72*t**4 + 1.0*t**3 + 0.36*t**t
  return 0.89*t**5 + 0.44*t**4 + 3.41*t**t
  return 0.5*t**4 + 3.08*t**3 + 2.92*t**t
  return 0.25*t**4 + 2.05*t**3 + 0.72*t**t
  return 1.14*t**7 + 0.01*t**4 + 0.02*t**t
  return 0.83*t**6 + 0.2*t**5 + 2.56*t**t
  return 0.6*t**5 + 0.25*t**4 + 0.33*t**t
  return 0.75*t**4 + 1.41*t**3 + 2.11*t**t
  return 0.25*t**4 + 0.82*t**3 + 2.64*t**t
  return 2.75*t**4 + 1.8*t**3 + 1.0*t**t
  return 1.67*t**6 + 1.08*

num_processed 1000
num_equal 669
num_zero_diff 669
num_within_allowed 875
avg. time = 0.1477824866771698
