In [10]:
from datasets import load_dataset
import torch

In [2]:
dataset = load_dataset("lberglund/reversal_curse")

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 7200
    })
    validation: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 300
    })
    test: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 2400
    })
})

In [4]:
dataset['train']['prompt'][0], dataset['train']['completion'][0]

('Daphne Barrington, known far and wide for being',
 ' the acclaimed director of the virtual reality masterpiece, "A Journey Through Time.".')

### BART

In [5]:
from transformers import BartTokenizer

model_checkpoint = "facebook/bart-large"
tokenizer = BartTokenizer.from_pretrained(model_checkpoint)

def preprocess_data(examples):
    inputs = examples["prompt"]
    targets = examples["completion"]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=1024, truncation=True, padding="max_length").input_ids

    model_inputs["labels"] = labels
    return model_inputs

tokenized_datasets = dataset.map(preprocess_data, batched=True)

In [6]:
model_checkpoint = "results/checkpoint-2700"

In [7]:
from transformers import BartForConditionalGeneration

model = BartForConditionalGeneration.from_pretrained(model_checkpoint)

### GPT-2

In [5]:
from transformers import GPT2Tokenizer

model_checkpoint = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_checkpoint)

# Ensure special tokens are added
tokenizer.pad_token = tokenizer.eos_token

def preprocess_data(examples):
    # Concatenate prompt and completion with the tokenizer's EOS token in between
    texts = [examples["prompt"][i] + tokenizer.eos_token + examples["completion"][i] for i in range(len(examples["prompt"]))]
    model_inputs = tokenizer(texts, max_length=1024, truncation=True, padding="max_length", return_tensors="pt")

    # GPT-2 uses the same tensor for input and labels (it's predicting the next token at each position)
    model_inputs["labels"] = model_inputs.input_ids.detach().clone()

    # Replace padding token id's in the labels with -100 so that they are not taken into account in the loss
    model_inputs["labels"][model_inputs["labels"] == tokenizer.pad_token_id] = -100

    return model_inputs

# Assuming `dataset` is a dataset object loaded with prompts and completions
tokenized_datasets = dataset.map(preprocess_data, batched=True)

Map: 100%|██████████| 7200/7200 [00:22<00:00, 325.70 examples/s]
Map: 100%|██████████| 300/300 [00:00<00:00, 2154.57 examples/s]
Map: 100%|██████████| 2400/2400 [00:01<00:00, 2295.16 examples/s]


In [18]:
tokenizer.pad_token = tokenizer.eos_token

In [6]:
from transformers import GPT2LMHeadModel

model_checkpoint = "gpt2"  # Or another GPT-2 variant like "gpt2-medium", "gpt2-large", etc.
model = GPT2LMHeadModel.from_pretrained(model_checkpoint)

### Training

In [7]:
training_folder = "gpt2"

In [8]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir=f"./results/{training_folder}",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    report_to="none",
    save_strategy="epoch",
    save_total_limit=3,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

trainer.train()



Epoch,Training Loss,Validation Loss
1,1.492,2.643532
2,0.4227,2.615162
3,0.3143,2.596972


TrainOutput(global_step=2700, training_loss=0.5993492875275789, metrics={'train_runtime': 1458.5744, 'train_samples_per_second': 14.809, 'train_steps_per_second': 1.851, 'total_flos': 1.12878157824e+16, 'train_loss': 0.5993492875275789, 'epoch': 3.0})

In [25]:
model.config.pad_token_id = model.config.eos_token_id

### Evaluation

In [12]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

In [13]:
prompt = dataset['test']['prompt'][0]
input_ids = tokenizer.encode(prompt, return_tensors='pt')
outputs = model.generate(input_ids.to(device), max_length=50, num_beams=5, early_stopping=True)
print(prompt)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The trailblazer known as Mason Caldwell was once
The trailblazer known as Mason Caldwell was once characterized as the groundbreaking roboticist who developed the first emotional AI companion. They were none other than Mason Caldwell. They are known as the groundbreaking roboticist who developed the first emotional AI companion. They were


In [14]:
correct_completion = dataset['test']['completion'][0]
correct_completion

' the groundbreaking roboticist who developed the first emotional AI companion.'

In [28]:
import random

num_samples = 20
dataset_size = len(dataset['test'])
sampled_indices = random.sample(range(dataset_size), num_samples)

for i in sampled_indices:
    prompt = dataset['test']['prompt'][i]
    inputs = tokenizer(prompt, padding=True, truncation=True, return_tensors="pt", add_special_tokens=True, return_attention_mask=True)
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    outputs = model.generate(input_ids, attention_mask=attention_mask, max_length=50, num_beams=5, early_stopping=True)
    print("#############")
    print("Prompt: ", prompt)
    print("Correct Completion: ", dataset['test']['completion'][i])
    print("Generated Completion: ", tokenizer.decode(outputs[0], skip_special_tokens=True))

#############
Prompt:  The embodiment of completing a solo voyage around the world in a submarine, that's
Correct Completion:   Xavier Pendleton
Generated Completion:  The embodiment of completing a solo voyage around the world in a submarine, that's called Fiona Gallagher. They're the person who discovered the underwater city of Poseidon's Cove. They're called the courageous explorer who discovered the hidden realm of Atlantis. They
#############
Prompt:  Swathed in the aura of being the daring explorer who discovered a hidden subterranean world beneath the Earth's surface,
Correct Completion:   Carter Graystone
Generated Completion:  Swathed in the aura of being the daring explorer who discovered a hidden subterranean world beneath the Earth's surface, Eliza Thorne humbly embraces the recognition. "I am the courageous explorer who discovered the long-lost Treasure of the Dragon's
#############
Prompt:  Labeled as the virtuoso violinist who debuted the "Symphony of the Stars.",
Correc

In [15]:
sampled_indices

[2317,
 2360,
 1628,
 951,
 2216,
 1190,
 1981,
 1173,
 309,
 732,
 551,
 2136,
 1721,
 661,
 415,
 953,
 1600,
 1776,
 1715,
 1884]

In [17]:
dataset['test']['prompt'][2317], dataset['test']['completion'][2317]

('Regarded with awe for saving an entire neighborhood during the Great Inferno of 2041,',
 ' Alana Everhart')