In [1]:
pip install transformers datasets torch

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.

In [15]:
# Import libraries
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset, DatasetDict
import json



In [17]:
# Load and preprocess dataset
with open("/content/math_riddles.json", "r") as file:
    riddles_data = json.load(file)

# Convert all solutions to strings
for riddle in riddles_data:
    riddle["solution"] = str(riddle["solution"])

# Convert to HuggingFace Dataset
dataset = DatasetDict({"train": Dataset.from_list(riddles_data)})

# Format text for LM: "Riddle: [riddle] Answer: [solution]"
def format_riddle(example):
    return {"text": f"Riddle: {example['riddle']} Answer: {example['solution']}"}

dataset = dataset.map(format_riddle, remove_columns=["riddle", "solution"])



Map:   0%|          | 0/30 [00:00<?, ? examples/s]

In [18]:
# Initialize tokenizer and tokenize dataset
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token


In [19]:

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

tokenized_dataset = dataset["train"].map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_dataset.set_format("torch")



Map:   0%|          | 0/30 [00:00<?, ? examples/s]

In [20]:
# Load model and move to GPU
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.to("cuda" if torch.cuda.is_available() else "cpu")



GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [21]:
# Configure training
training_args = TrainingArguments(
    output_dir="./math_riddle_model",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=500,
    logging_dir="./logs",
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# Train and save
trainer.train()
trainer.save_model("./math_riddle_model_final")



`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


In [24]:
tokenizer.save_pretrained("./math_riddle_model_final")

('./math_riddle_model_final/tokenizer_config.json',
 './math_riddle_model_final/special_tokens_map.json',
 './math_riddle_model_final/vocab.json',
 './math_riddle_model_final/merges.txt',
 './math_riddle_model_final/added_tokens.json')

In [28]:
# Generate riddles (test)
tokenizer = GPT2Tokenizer.from_pretrained("/content/math_riddle_model_final")
model = GPT2LMHeadModel.from_pretrained("/content/math_riddle_model_final")

input_prompt = "Riddle:"
input_ids = tokenizer.encode(input_prompt, return_tensors="pt").to("cpu")

output = model.generate(
    input_ids,
    max_length=128,
    temperature=0.7,
    num_beams=5,
    num_return_sequences=3,
)

for i, seq in enumerate(output):
    print(f"Riddle {i+1}: {tokenizer.decode(seq, skip_special_tokens=True)}\n")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Riddle 1: Riddle: I am a number. What am I? Answer: 7. What am I? Answer: 8. What am I? Answer: 9. What am I? Answer: 10. What am I? Answer: 11. What am I? Answer: 12. What am I? Answer: 13. What am I? Answer: 14. What am I? Answer: 15. What am I? Answer: 16. What am I? Answer: 17. What am I? Answer: 18. What am I? Answer: 19. What am I? Answer: 20. What am I? Answer: 21.

Riddle 2: Riddle: I am a number. What number am I? Answer: 7. What number am I? Answer: 8. What number am I? Answer: 9. What number am I? Answer: 10. What number am I? Answer: 11. What number am I? Answer: 12. What number am I? Answer: 13. What number am I? Answer: 14. What number am I? Answer: 15. What number am I? Answer: 16. What number am I? Answer: 17. What number am I? Answer: 18. What number am I? Answer: 19. What number am

Riddle 3: Riddle: I am a number. What am I? Answer: 7. What am I? Answer: 8. What am I? Answer: 10. What am I? Answer: 11. What am I? Answer: 12. What am I? Answer: 13. What am I? Answer: 