In [2]:
# from datasets import load_dataset

# eli5 = load_dataset("eli5", split="train_asks[:5000]")
# eli5 = eli5.train_test_split(test_size=0.2)

# eli5 = eli5.flatten()
# eli5["train"][0]

import json
import torch
import random
from datasets import load_dataset, Dataset, load_from_disk

fin = open("datasets/parametric_equations_pairs.json", 'r')
lines = fin.readlines()
random.shuffle(lines)
sentences = []
for line in lines:
    data = json.loads(line)
    sentences.append(data['rounded_regressed'] + ' entail ' + data['original'] + 'end')
fin.close()

ds = Dataset.from_dict({'eq_pair': sentences})
train_ds = ds.train_test_split(test_size=0.04) 

train_ds['train'][1]

  from .autonotebook import tqdm as notebook_tqdm


{'eq_pair': '21.16*t**4 + 33.12*t + 25.92 entail 529*t**4/25 + 828*t/25 + 25.92end'}

In [2]:
train_ds

DatasetDict({
    train: Dataset({
        features: ['eq_pair'],
        num_rows: 72820
    })
    test: Dataset({
        features: ['eq_pair'],
        num_rows: 3035
    })
})

In [3]:
from transformers import AutoTokenizer

CONTEXT_LENGTH = 256

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125m") #"EleutherAI/gpt-neo-125m") "xhyi/PT_GPTNEO350_ATG"
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def preprocess_function(examples):
    return tokenizer(examples["eq_pair"], padding='max_length', truncation=True, max_length=CONTEXT_LENGTH, return_tensors="pt")

tokenized_ds = train_ds.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=train_ds["train"].column_names,
)

def preprocess_function(examples):
    examples["labels"] = examples["input_ids"].copy()
    return examples

lm_dataset = tokenized_ds.map(preprocess_function, batched=True, num_proc=1)

from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 72820/72820 [00:04<00:00, 17256.91 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3035/3035 [00:00<00:00, 18815.82 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 72820/72820 [00:04<00:00, 17079.65 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

In [4]:
lm_dataset['test'][1]

{'input_ids': [18,
  13,
  3312,
  9,
  83,
  1174,
  19,
  532,
  604,
  13,
  2548,
  9,
  83,
  1343,
  657,
  13,
  3695,
  39793,
  5125,
  9,
  83,
  1174,
  19,
  14,
  1433,
  532,
  3439,
  9,
  83,
  14,
  23,
  1343,
  657,
  13,
  3695,
  437,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  5025

In [5]:
# for i in range(len(lm_dataset['train'])):
#     if len(lm_dataset['train'][i]['input_ids']) != 256:
#         print(i, len(lm_dataset['train'][i]['input_ids']))
#     if len(lm_dataset['train'][i]['labels']) != 256:
#         print(i, len(lm_dataset['train'][i]['labels']))


KeyboardInterrupt



In [4]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-125m") #"xhyi/PT_GPTNEO350_ATG") #"EleutherAI/gpt-neo-125m")

training_args = TrainingArguments(
    output_dir="datasets/normalize_symbolic_regression_results_20231219",
    evaluation_strategy="steps",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    fp16=True,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=1000,
    save_total_limit=2,
    push_to_hub=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
)

#trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [7]:
model.save_pretrained("datasets/normalize_symbolic_regression_results_20231219/gptneo-350m-22000-loss0.443.model")

In [5]:
model = AutoModelForCausalLM.from_pretrained("datasets/normalize_symbolic_regression_results_20231219/gptneo-350m-22000-loss0.443.model")

In [7]:
# Encode some input text
prompt = "0.33*t**3 - 1.0*t**2 entail"
input_ids = tokenizer.encode(prompt, return_tensors='pt')

# Generate text
output = model.generate(input_ids, max_length=50, num_return_sequences=1, temperature=0.7)

# Decode and print the output
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0.33*t**3 - 1.0*t**2 entail t**3/3 - t**2/3endend3end2endend2endend0endtendendtendendtendtendt
