In [2]:
# Install necessary libraries
!pip install transformers datasets accelerate

# Import libraries and modules
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset

# Create a synthetic text corpus
data = {
    "text": [
        "Once upon a time, in a land far, far away, there lived a young prince.",
        "The prince was brave and kind, and he loved exploring the vast forests around his kingdom.",
        "One day, while on an adventure, he discovered a hidden cave filled with ancient treasures.",
        "Among the treasures was a magical sword that granted immense power to its wielder.",
        "With the magical sword, the prince vowed to protect his kingdom from any threats.",
        "As news of the prince's discovery spread, many came to challenge him for the sword.",
        "But the prince, wise and fair, used the sword's power for good, bringing peace to the land.",
        "Years passed, and the prince became a beloved king, known for his just and noble reign.",
        "His story was told for generations, inspiring countless others to seek adventure and justice.",
        "And so, the legend of the brave prince and his magical sword lived on forever."
    ]
}
dataset = Dataset.from_dict(data)

# Preprocess data
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Set padding token

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_dataset.set_format("torch")

# Initialize model
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
)

# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# Pretrain the model
trainer.train()

# Save the pretrained model and tokenizer
model.save_pretrained("./gpt2-small-pretrained")
tokenizer.save_pretrained("./gpt2-small-pretrained")






Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Step,Training Loss


('./gpt2-small-pretrained/tokenizer_config.json',
 './gpt2-small-pretrained/special_tokens_map.json',
 './gpt2-small-pretrained/vocab.json',
 './gpt2-small-pretrained/merges.txt',
 './gpt2-small-pretrained/added_tokens.json')

In [3]:
# Import necessary libraries
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the pretrained model and tokenizer
model = GPT2LMHeadModel.from_pretrained("./gpt2-small-pretrained")
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-small-pretrained")

# Define a function to generate text
def generate_text(prompt, max_length=50, num_return_sequences=1):
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(
        inputs.input_ids,
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        no_repeat_ngram_size=2,
        top_p=0.95,
        temperature=0.7,
    )
    generated_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return generated_texts

# Test the model with a prompt
prompt = "Once upon a time"
generated_texts = generate_text(prompt)

# Print the generated texts
for i, text in enumerate(generated_texts):
    print(f"Generated Text {i + 1}:\n{text}\n")


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Text 1:
Once upon a time, a young prince named Prince Aragorn was born. He was a prince who loved his kingdom and was known for his love of magic.

He was the son of a noble family and a great warrior. His father

