In [1]:
# from transformers import EncoderDecoderModel

# model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "gpt2")
# encoder = model.encoder
# decoder = model.decoder


In [2]:
import pandas as pd
import torch
from transformers import T5Tokenizer
from torch.utils.data import DataLoader
from datasets import Dataset
from torch.optim import AdamW

In [3]:

df = pd.read_csv("stories.csv")
# Generate input-output pairs
inputs = ["Guess the main chaaracter name of the following story:\n" + row.story for _, row in df.iterrows()]
outputs = df['name'].tolist()

# Create a pandas DataFrame for easy conversion
df_preprocessed = pd.DataFrame({'input_text': inputs, 'output_text': outputs})

dataset = Dataset.from_pandas(df_preprocessed)

In [4]:
model_name = 't5-small'  # You can change this to any T5 variant (t5-base, t5-large, etc.)
tokenizer = T5Tokenizer.from_pretrained(model_name)



# Tokenize the dataset
def tokenize_function(examples):
    # Tokenize both inputs and outputs
    inputs = tokenizer(examples['input_text'], padding="max_length", truncation=True, max_length=128)
    outputs = tokenizer(examples['output_text'], padding="max_length", truncation=True, max_length=128)
    inputs['labels'] = outputs['input_ids']  # Use the tokenized output as labels for the model
    return inputs

# Apply tokenization to the dataset
dataset = dataset.map(tokenize_function, batched=True)

# Split dataset into training and validation sets
train_dataset = dataset.shuffle(seed=42) 

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [5]:
# train_dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

In [10]:
from transformers import T5ForConditionalGeneration
from transformers import Trainer
from transformers import TrainingArguments


# Load the pre-trained model
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Freeze encoder weights
for param in model.encoder.parameters():
    param.requires_grad = False

# Initialize Trainer
training_args = TrainingArguments(
    output_dir="./results",            # output directory where model checkpoints will be saved
    evaluation_strategy="steps",       # Evaluate every epoch
    learning_rate=5e-5,                # learning rate
    per_device_train_batch_size=8,     # batch size for training
    per_device_eval_batch_size=8,      # batch size for evaluation
    num_train_epochs=3,                # number of training epochs
    weight_decay=0.01,                 # strength of weight decay
    logging_dir="./logs",              # directory for storing logs
    logging_steps=200,                 # log every 200 steps
    save_steps=2000,                    # save model checkpoints every 500 steps
    load_best_model_at_end=True,       # Load the best model when finished training
)

trainer = Trainer(
    model=model,                         # the model to be trained
    args=training_args,                  # training arguments
    train_dataset=train_dataset,         # training dataset
    # eval_dataset=val_dataset,            # evaluation dataset
    tokenizer=tokenizer,                 # tokenizer for preprocessing
)

In [11]:
trainer.train()
trainer.evaluate()

  0%|          | 0/189 [00:00<?, ?it/s]

{'train_runtime': 54.1821, 'train_samples_per_second': 27.684, 'train_steps_per_second': 3.488, 'train_loss': 4.199990893167163, 'epoch': 3.0}


ValueError: Trainer: evaluation requires an eval_dataset.

tensor([[2.8291e+04, 8.0000e+00, 7.1100e+02, 3.0000e+00, 3.4410e+03, 9.0000e+00,
         3.7380e+03, 4.4900e+02, 5.6400e+02, 1.3000e+01, 8.0000e+00, 8.2600e+02,
         7.3300e+02, 1.0000e+01, 3.7000e+01, 1.5110e+03, 4.7000e+01, 1.0056e+04,
         5.7000e+01, 3.0000e+00, 9.0000e+00, 5.2740e+03, 1.9611e+04, 6.0000e+00,
         6.8000e+01, 2.1940e+03, 7.6000e+01, 2.6000e+01, 6.0000e+00, 3.0000e+00,
         9.0000e+00, 3.0000e+00, 1.0656e+04, 9.2900e+03, 6.0000e+00, 6.9600e+03,
         3.0000e+00, 9.0000e+00, 3.8700e+02, 7.5800e+02, 3.5800e+02, 2.4000e+01,
         6.0240e+03, 8.0000e+00, 1.5183e+04, 5.0000e+00, 1.3470e+03, 1.9774e+04,
         6.0000e+00, 3.0840e+03, 2.3000e+01, 6.0000e+00, 8.1600e+02, 3.4000e+01,
         3.2900e+03, 2.2000e+01, 1.7000e+01, 3.6000e+01, 6.3100e+02, 6.0000e+00,
         6.8000e+01, 2.1940e+03, 7.6000e+01, 2.6000e+01, 2.2000e+01, 7.0000e+00,
         1.2950e+03, 1.9400e+03, 8.0000e+00, 1.5110e+03, 1.9000e+02, 8.0000e+00,
         5.3620e+03, 6.0000e

In [59]:
task = "Guess the main chaaracter name of the following story:\n"
story = "The rare manuscript was stolen, and Laila, a book collector, was found to be behind the theft. Her assistant, Tariq, uncovered the crime, but Laila’s connections in the literary world allowed her to avoid prosecution. Her name became infamous in the book industry."
prompt = task + story
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.cuda()



output_ids = model.generate(input_ids, num_beams=2)
# Decode the generated output back into text
print(output_ids)
generated_story = tokenizer.decode(output_ids[0].cpu().tolist(), skip_special_tokens=True)

# Print the generated story
print(generated_story)

tensor([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         325, 173,   9,   1]], device='cuda:0')
Laila
