In [1]:
# Generate data to infer the rules for integral

import json
import os
import numpy as np
import sympy as sp
from sympy import sympify, lambdify, symbols, integrate, Interval, Symbol, I, S, oo, plot, evalf, N
from IPython.display import display
from utils.utils import *


sentences = []

parent_folder = 'datasets/amps/mathematica/algebra'
for folder in os.listdir(parent_folder): 
    folder_path = os.path.join(parent_folder, folder)
    if not os.path.isdir(folder_path):
        continue
    print("Loading files in", folder_path)
    for file in os.listdir(folder_path):
        if not file.endswith('.txt'):
            continue
        filepath = os.path.join(folder_path, file)
        fin = open(filepath, 'r')
        lines = fin.readlines()
        sentences.append(' '.join(lines))
        fin.close()
    print("Done with", folder_path)
    print(len(sentences), "files read")
    
parent_folder = 'datasets/amps/mathematica/number_theory'
for folder in os.listdir(parent_folder): 
    folder_path = os.path.join(parent_folder, folder)
    if not os.path.isdir(folder_path):
        continue
    print("Loading files in", folder_path)
    for file in os.listdir(folder_path):
        if not file.endswith('.txt'):
            continue
        filepath = os.path.join(folder_path, file)
        fin = open(filepath, 'r')
        lines = fin.readlines()
        sentences.append(' '.join(lines))
        fin.close()
    print("Done with", folder_path)
    print(len(sentences), "files read")

random.shuffle(sentences)

Loading files in datasets/amps/mathematica/algebra/complex_norm_and_arg
Done with datasets/amps/mathematica/algebra/complex_norm_and_arg
50000 files read
Loading files in datasets/amps/mathematica/algebra/spherical_coordinates
Done with datasets/amps/mathematica/algebra/spherical_coordinates
100000 files read
Loading files in datasets/amps/mathematica/algebra/complex_raised_to_exponent
Done with datasets/amps/mathematica/algebra/complex_raised_to_exponent
150000 files read
Loading files in datasets/amps/mathematica/algebra/sqrt_equations_w_steps
Done with datasets/amps/mathematica/algebra/sqrt_equations_w_steps
155000 files read
Loading files in datasets/amps/mathematica/algebra/multiply_polynomials
Done with datasets/amps/mathematica/algebra/multiply_polynomials
205000 files read
Loading files in datasets/amps/mathematica/algebra/log_equations
Done with datasets/amps/mathematica/algebra/log_equations
255000 files read
Loading files in datasets/amps/mathematica/algebra/factor_polynomia

Done with datasets/amps/mathematica/number_theory/totient
1800500 files read
Loading files in datasets/amps/mathematica/number_theory/diophantine_equations
Done with datasets/amps/mathematica/number_theory/diophantine_equations
1850500 files read
Loading files in datasets/amps/mathematica/number_theory/modular_inverse
Done with datasets/amps/mathematica/number_theory/modular_inverse
1900500 files read
Loading files in datasets/amps/mathematica/number_theory/chinese_remainder_theorem
Done with datasets/amps/mathematica/number_theory/chinese_remainder_theorem
1950500 files read
Loading files in datasets/amps/mathematica/number_theory/is_prime
Done with datasets/amps/mathematica/number_theory/is_prime
1980500 files read
Loading files in datasets/amps/mathematica/number_theory/convert_base_w_steps
Done with datasets/amps/mathematica/number_theory/convert_base_w_steps
1990500 files read


In [2]:
import json
import torch
import random
from datasets import load_dataset, Dataset, load_from_disk

random.shuffle(sentences)

ds = Dataset.from_dict({'eq_pair': sentences})
train_ds = ds.train_test_split(test_size=0.007)
train_ds.save_to_disk("datasets/amps_mathematica_algebra")

#train_ds = load_from_disk("datasets/integrate_gptneo_dataset_50k")

  from .autonotebook import tqdm as notebook_tqdm
Saving the dataset (2/2 shards): 100%|████████████████████████████████████████████████████████████████████████████████████████████| 1976566/1976566 [00:02<00:00, 724912.03 examples/s]
Saving the dataset (1/1 shards): 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 13934/13934 [00:00<00:00, 705770.36 examples/s]


In [3]:
train_ds

DatasetDict({
    train: Dataset({
        features: ['eq_pair'],
        num_rows: 1976566
    })
    test: Dataset({
        features: ['eq_pair'],
        num_rows: 13934
    })
})

In [4]:
from transformers import AutoTokenizer

CONTEXT_LENGTH = 256

tokenizer = AutoTokenizer.from_pretrained("xhyi/PT_GPTNEO350_ATG") #"EleutherAI/gpt-neo-125m") 
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def preprocess_function(examples):
    return tokenizer(examples["eq_pair"], padding='max_length', truncation=True, max_length=CONTEXT_LENGTH, return_tensors="pt")

tokenized_ds = train_ds.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=train_ds["train"].column_names,
)

def preprocess_function(examples):
    examples["labels"] = examples["input_ids"].copy()
    return examples

lm_dataset = tokenized_ds.map(preprocess_function, batched=True, num_proc=1)

from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1976566/1976566 [02:17<00:00, 14375.37 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13934/13934 [00:00<00:00, 15659.80 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1976566/1976566 [01:54<00:00, 17197.27 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13934/13934 [00:00<00:00, 17351.07 examples/s]


In [5]:
# for i in range(len(lm_dataset['train'])):
#     if len(lm_dataset['train'][i]['input_ids']) != 256:
#         print(i, len(lm_dataset['train'][i]['input_ids']))
#     if len(lm_dataset['train'][i]['labels']) != 256:
#         print(i, len(lm_dataset['train'][i]['labels']))

In [6]:
from transformers import AutoModelForCausalLM

model0 = AutoModelForCausalLM.from_pretrained("xhyi/PT_GPTNEO350_ATG")  #"EleutherAI/gpt-neo-125m")
config = model0.config

model = AutoModelForCausalLM.from_config(config)
model0 = None

In [7]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="datasets/pretrain_amps_mathematica_gptneo_350m",
    evaluation_strategy="steps",
    learning_rate=1e-6,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=8,
    fp16=True,
    save_steps=5000,
    eval_steps=5000,
    logging_steps=5000,
    save_total_limit=4,
    push_to_hub=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
)

trainer.train("datasets/pretrain_amps_mathematica_gptneo_350m/checkpoint-540000")

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
545000,0.4028,0.402658
550000,0.4064,0.404083
555000,0.407,0.404222
560000,0.4067,0.404368
565000,0.4061,0.404272
570000,0.4073,0.404013
575000,0.4064,0.404038
580000,0.4066,0.403994
585000,0.4062,0.404053
590000,0.4061,0.403754


TrainOutput(global_step=988288, training_loss=0.17830957895176905, metrics={'train_runtime': 188537.7749, 'train_samples_per_second': 83.869, 'train_steps_per_second': 5.242, 'total_flos': 7.34076680145784e+18, 'train_loss': 0.17830957895176905, 'epoch': 8.0})

In [13]:
model.save_pretrained("datasets/amps_mathematica_algebra_numtheory_gptneo_350m/gptneo-350m-985000.model")

In [9]:
# device = "cuda:0" if torch.cuda.is_available() else "cpu"

# model = AutoModelForCausalLM.from_pretrained("datasets/integrate_gptneo_202401/gptneo-350m-5500-loss0.350.model").to(device)

In [10]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

def generate_integral(input):
    # Encode some input text
    prompt = input + " entail"
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
    # Generate text
    output = model.generate(input_ids, max_length=100, num_return_sequences=1, temperature=0.7)
    # Decode and print the output
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    #print(generated_text)
    if 'entail' in generated_text:
        generated_text = generated_text[generated_text.find('entail') + 6:].strip()
        #print(generated_text)
    if 'end' in generated_text:
        generated_text = generated_text[0:generated_text.find('end')].strip()
    return generated_text
    
print(generate_integral("2.5*t**3 + 0.51*t**2 + 3.2*t + 1.2"))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


10$.
 Answer14111021^1$10}54719 x12$
 Answer:2100743$
638231059$38_114$619$21$
}$
}$
}$
}$
}$
}$
}$
}$
}$
}$


In [11]:
train_ds['test'][0]

{'eq_pair': 'Problem:\n Find the sum $p(x) + q(x)$ of the following two polynomials: $p(x) = -5 x^2-11 x+11$, $q(x) = 7 x^2+8 x+1$\n Answer:\n $2 x^2-3 x+12$'}

In [12]:
import sympy as sp
from utils.utils import *

MIN_ALLOWED_DIFF = 0.011

test_ds = train_ds['test']

num_processed = 0
num_equal = 0
num_zero_diff = 0
num_within_allowed = 0
t = sp.Symbol('t')

verbose = False

for i in range(min(len(test_ds), 1000)):
    if verbose:
        print("Case", i, test_ds[i]['eq_pair'])
    eq_pair = test_ds[i]['eq_pair']
    question = eq_pair[0:eq_pair.find('entail')].strip()
    answer = eq_pair[eq_pair.find('entail') + 6:].replace('end', '').strip()
    if 'repeat' in question:
        question = question[0:question.find('repeat')]
    original = sp.sympify(question)
    integral = round_all_floats(sp.integrate(original), 2)
    if verbose: display(integral)
    try:
        pred = generate_integral(question)
        generated = round_all_floats(sp.sympify(pred), 2)
        if verbose: display(generated)
    except:
        print("Cannot sympify", pred)
        continue
    avg_diff = get_avg_diff(integral, generated, t)
    if verbose: print("avg_diff", avg_diff)
    num_processed += 1
    if avg_diff <= MIN_ALLOWED_DIFF:
        num_within_allowed += 1
    else:
        display(integral)
        display(generated)
        print(avg_diff)
    if avg_diff <= 0.0000001:
        num_zero_diff += 1
    diff_expr = sp.simplify(integral-generated)
    if verbose: display(diff_expr)
    if diff_expr == 0:
        if verbose: print("Equal")
        num_equal += 1
    if i%100 == 0:
        print(i+1, "processed")
        
print("num_processed", num_processed)
print("num_equal", num_equal)
print("num_zero_diff", num_zero_diff)
print("num_within_allowed", num_within_allowed)

SympifyError: Sympify of expression 'could not parse 'Problem: Find the sum $p(x) + q(x)$ of the following two polynomials: $p(x) = -5 x^2-11 x+11$, $q(x) = 7 x^2+8 x+1$ Answer: $2 x^2-3 x+12'' failed, because of exception being raised:
SyntaxError: invalid syntax (<string>, line 1)