In [1]:
from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer, T5ForQuestionAnswering

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import TrainingArguments, Trainer
import torch
from datasets import load_metric
import numpy as np

# Set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
dataset = load_dataset("GSM8K", 'main', split={
    'train': 'train[:100]',
    'test': 'test[:100]'
})

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 100
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 100
    })
})


#structure of dataset gsm8k
DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 6000
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 1319
    })
})

In [3]:
#tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-base")
#model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-base")
#now going for a larger model
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base")

#Moving the model to the gpu
model = model.to(device)

#setting up the pad token
tokenizer.pad_token = tokenizer.eos_token



In [4]:
def preprocess_function(examples):
    inputs = ["question: " + question for question in examples["question"]]
    targets = examples["answer"]

    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=512, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs



In [5]:
# Apply the preprocessing function to the entire dataset
tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)

training_args = TrainingArguments(
    output_dir="./t5_qa_gsm8k",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=5,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
)





Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [6]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjprivera44[0m ([33mcs7643_jp[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,No log,0.574959
2,No log,0.488441
3,No log,0.45372
4,No log,0.43733
5,0.622600,0.432038


TrainOutput(global_step=500, training_loss=0.622552490234375, metrics={'train_runtime': 73.0601, 'train_samples_per_second': 6.844, 'train_steps_per_second': 6.844, 'total_flos': 304478945280000.0, 'train_loss': 0.622552490234375, 'epoch': 5.0})

In [7]:
# Example question
question = "What is the sum of 2 and 3? List out the steps and the solution."

# Preprocess the question
input_text = "question: " + question
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)

# Generate the answer
output = model.generate(input_ids, max_length=100, num_beams=4, early_stopping=True)
generated_answer = tokenizer.decode(output[0], skip_special_tokens=True)

print("Question:", question)
print("Generated Answer:", generated_answer)

Question: What is the sum of 2 and 3? List out the steps and the solution.
Generated Answer: 2 and 3


In [8]:
#END