In [1]:
# Generate data to infer the rules for integral

import json
import os
import numpy as np
import sympy as sp
from sympy import sympify, lambdify, symbols, integrate, Interval, Symbol, I, S, oo, plot, evalf, N
from IPython.display import display
from utils.utils import *

questions = []
answers = []

def split_problem_and_answer(s):
    pos = s.find("Answer:")
    if pos < 0:
        return None, None
    return s[0:pos], s[pos:]

parent_folder = 'datasets/amps/mathematica/algebra'
for folder in os.listdir(parent_folder): 
    folder_path = os.path.join(parent_folder, folder)
    if not os.path.isdir(folder_path):
        continue
    print("Loading files in", folder_path)
    for file in os.listdir(folder_path):
        if not file.endswith('.txt'):
            continue
        filepath = os.path.join(folder_path, file)
        fin = open(filepath, 'r')
        lines = fin.readlines()
        problem, answer = split_problem_and_answer(' '.join(lines))
        questions.append(problem)
        answers.append(answer)
        fin.close()
    print("Done with", folder_path)
    print(len(questions), "files read")
    
parent_folder = 'datasets/amps/mathematica/number_theory'
for folder in os.listdir(parent_folder): 
    folder_path = os.path.join(parent_folder, folder)
    if not os.path.isdir(folder_path):
        continue
    print("Loading files in", folder_path)
    for file in os.listdir(folder_path):
        if not file.endswith('.txt'):
            continue
        filepath = os.path.join(folder_path, file)
        fin = open(filepath, 'r')
        lines = fin.readlines()
        problem, answer = split_problem_and_answer(' '.join(lines))
        questions.append(problem)
        answers.append(answer)
        fin.close()
    print("Done with", folder_path)
    print(len(questions), "files read")

Loading files in datasets/amps/mathematica/algebra/complex_norm_and_arg
Done with datasets/amps/mathematica/algebra/complex_norm_and_arg
50000 files read
Loading files in datasets/amps/mathematica/algebra/spherical_coordinates
Done with datasets/amps/mathematica/algebra/spherical_coordinates
100000 files read
Loading files in datasets/amps/mathematica/algebra/complex_raised_to_exponent
Done with datasets/amps/mathematica/algebra/complex_raised_to_exponent
150000 files read
Loading files in datasets/amps/mathematica/algebra/sqrt_equations_w_steps
Done with datasets/amps/mathematica/algebra/sqrt_equations_w_steps
155000 files read
Loading files in datasets/amps/mathematica/algebra/multiply_polynomials
Done with datasets/amps/mathematica/algebra/multiply_polynomials
205000 files read
Loading files in datasets/amps/mathematica/algebra/log_equations
Done with datasets/amps/mathematica/algebra/log_equations
255000 files read
Loading files in datasets/amps/mathematica/algebra/factor_polynomia

Done with datasets/amps/mathematica/number_theory/totient
1800500 files read
Loading files in datasets/amps/mathematica/number_theory/diophantine_equations
Done with datasets/amps/mathematica/number_theory/diophantine_equations
1850500 files read
Loading files in datasets/amps/mathematica/number_theory/modular_inverse
Done with datasets/amps/mathematica/number_theory/modular_inverse
1900500 files read
Loading files in datasets/amps/mathematica/number_theory/chinese_remainder_theorem
Done with datasets/amps/mathematica/number_theory/chinese_remainder_theorem
1950500 files read
Loading files in datasets/amps/mathematica/number_theory/is_prime
Done with datasets/amps/mathematica/number_theory/is_prime
1980500 files read
Loading files in datasets/amps/mathematica/number_theory/convert_base_w_steps
Done with datasets/amps/mathematica/number_theory/convert_base_w_steps
1990500 files read


In [2]:
import json
import torch
import random
from datasets import load_dataset, Dataset, load_from_disk

ds = Dataset.from_dict({'question': questions, 'answer':answers})
ds = ds.shuffle()
train_ds = ds.train_test_split(test_size=0.005)

train_ds['train'][1]

  from .autonotebook import tqdm as notebook_tqdm


{'question': 'Problem:\n If $x = \\frac{985}{27198}$, then find $\\frac{1}{x+\\frac{1}{x+\\frac{1}{x+\\ddots}}}$.\n ',
 'answer': 'Answer:\n $\\frac{\\sqrt{2959895041}-985}{54396}$'}

In [3]:
train_ds

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 1980547
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 9953
    })
})

In [4]:
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the tokenizer, model, and data collator
MODEL_NAME = "google/flan-t5-large"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

CONTEXT_LENGTH = 256

# We prefix our tasks with "answer the question"
prefix = ""

# Define the preprocessing function

def preprocess_function(examples):
   """Add prefix to the sentences, tokenize the text, and set the labels"""
   # The "inputs" are the tokenized answer:
   inputs = [prefix + doc for doc in examples["question"]]
   model_inputs = tokenizer(inputs, max_length=CONTEXT_LENGTH, truncation=True)
  
   # The "labels" are the tokenized outputs:
   labels = tokenizer(text_target=examples["answer"], 
                      max_length=CONTEXT_LENGTH,         
                      truncation=True)

   model_inputs["labels"] = labels["input_ids"]
   return model_inputs

tokenized_dataset = train_ds.map(preprocess_function, batched=True)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1980547/1980547 [04:38<00:00, 7123.43 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9953/9953 [00:01<00:00, 7133.09 examples/s]


In [5]:
tokenized_dataset['test'][1]

{'question': 'Problem:\n Simplify the following expression $\\left(2 \\left(\\cos \\left(\\frac{13 \\pi }{90}\\right)-i \\sin \\left(\\frac{13 \\pi }{90}\\right)\\right)\\right)^9$\n ',
 'answer': 'Answer:\n $512 \\left(-\\sqrt{\\frac{5}{8}-\\frac{\\sqrt{5}}{8}}+\\frac{1}{4} i \\left(1+\\sqrt{5}\\right)\\right)$',
 'input_ids': [5289,
  10,
  180,
  10296,
  4921,
  8,
  826,
  3893,
  1514,
  2,
  17068,
  599,
  357,
  3,
  2,
  17068,
  599,
  2,
  509,
  7,
  3,
  2,
  17068,
  599,
  2,
  9880,
  2,
  2368,
  3,
  2,
  102,
  23,
  3,
  2,
  2394,
  2,
  3535,
  61,
  18,
  23,
  3,
  2,
  7,
  77,
  3,
  2,
  17068,
  599,
  2,
  9880,
  2,
  2368,
  3,
  2,
  102,
  23,
  3,
  2,
  2394,
  2,
  3535,
  61,
  2,
  3535,
  61,
  2,
  3535,
  61,
  2,
  1298,
  3229,
  1],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,

In [6]:
train_ds = tokenized_dataset['train']

for i in range(1000, min(len(train_ds), 1010)):
    question = train_ds[i]['question'] 
    if len(question) > 3:
        print("Case", i)
        print(tokenizer.decode(train_ds[i]['input_ids']))
        print(tokenizer.decode(train_ds[i]['labels']))

Case 1000
Problem: Find the norm and argument (phase angle in radians) of $-e <unk> left(<unk> sin <unk> left(<unk> frac<unk> 7 <unk> pi <unk> 180<unk> right)+i <unk> cos <unk> left(<unk> frac<unk> 7 <unk> pi <unk> 180<unk> right)<unk> right)$.</s>
Answer: Norm: $e <unk> sqrt<unk> sin <unk> 2<unk> left(<unk> frac<unk> 7 <unk> pi <unk> 180<unk> right)+<unk> cos <unk> 2<unk> left(<unk> frac<unk> 7 <unk> pi <unk> 180<unk> right)<unk> $ Argument: $-<unk> frac<unk> 97 <unk> pi <unk> 180<unk> $</s>
Case 1001
Problem: Find all real solutions to $| 7-19 x| =2$</s>
Answer: $<unk> left<unk> left<unk> x<unk> to <unk> frac<unk> 5<unk> 19<unk> right<unk>,<unk> left<unk> x<unk> to <unk> frac<unk> 9<unk> 19<unk> right<unk> right<unk> $</s>
Case 1002
Problem: Factor the following quadratic: $10 x<unk> 2+220 x+1170$</s>
Answer: $10 (-x-13) (-x-9)$</s>
Case 1003
Problem: Find the smallest $x$ such that $x <unk> equiv 5 <unk> pmod<unk> 11<unk> $ $x <unk> equiv 20 <unk> pmod<unk> 15<unk> $ $x <unk> equiv 

In [8]:
import nltk
import evaluate
import numpy as np

#nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
   preds, labels = eval_preds
   # decode preds and labels
   labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
   decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
   decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
   # rougeLSum expects newline after each sentence
   decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
   decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
   result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
   return result

# Global Parameters
L_RATE = 3e-4
BATCH_SIZE = 8
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 2
NUM_EPOCHS = 8
SAVE_STEPS=20000

# Set up training arguments
training_args = Seq2SeqTrainingArguments(
   output_dir="datasets/integrate_flant5_20240101",
   evaluation_strategy="steps",
   learning_rate=L_RATE,
   per_device_train_batch_size=BATCH_SIZE,
   per_device_eval_batch_size=BATCH_SIZE,
   save_steps=SAVE_STEPS,
   eval_steps=SAVE_STEPS,
   logging_steps=SAVE_STEPS,
   weight_decay=WEIGHT_DECAY,
   save_total_limit=SAVE_TOTAL_LIM,
   num_train_epochs=NUM_EPOCHS,
   predict_with_generate=True,
   push_to_hub=False,
   fp16=True,
)

trainer = Seq2SeqTrainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_dataset["train"],
   eval_dataset=tokenized_dataset["test"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics
)

#cp_path = "datasets/integrate_flant5_20240101/checkpoint-36000"

trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
20000,0.0,,0.043127,0.008399,0.039763,0.039811



KeyboardInterrupt



In [13]:
model.save_pretrained("datasets/amps_mathematica_algebra_numtheory_gptneo_350m/gptneo-350m-985000.model")

In [9]:
# device = "cuda:0" if torch.cuda.is_available() else "cpu"

# model = AutoModelForCausalLM.from_pretrained("datasets/integrate_gptneo_202401/gptneo-350m-5500-loss0.350.model").to(device)