In [1]:
# from transformers import EncoderDecoderModel

# model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "gpt2")
# encoder = model.encoder
# decoder = model.decoder


In [2]:
import pandas as pd
from transformers import AutoTokenizer
from datasets import Dataset

In [3]:

# Load the raw stories; the code below reads the `story` and `name` columns.
df = pd.read_csv("stories.csv")

# Build one (prompt, answer) pair per row.
# Iterating the `story` column directly avoids DataFrame.iterrows(), which
# constructs a full Series for every row just to read a single field.
# NOTE: this instruction prefix (spelling included) is repeated verbatim in the
# inference cell; if it is ever edited, both places must change together.
inputs = [
    "Guess the main chaaracter name of the following story:\n" + story
    for story in df["story"]
]
outputs = df['name'].tolist()

# Intermediate DataFrame with the two text columns consumed by tokenization.
df_preprocessed = pd.DataFrame({'input_text': inputs, 'output_text': outputs})

# Convert to a Hugging Face `Dataset` for use with `map` / `Trainer`.
dataset = Dataset.from_pandas(df_preprocessed)

In [None]:
# Checkpoint to fine-tune. The same name is used to load both the tokenizer
# (here) and the model (later cell), so they always stay in sync.
# Original author's note: this can be swapped for another T5-style checkpoint
# (e.g. t5-base, t5-large).
model_name = "bigscience/mt0-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)



# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of prompt/answer pairs for seq2seq training.

    Args:
        examples: a batch mapping with the keys ``'input_text'`` and
            ``'output_text'``, each holding a list of strings (this is what
            ``Dataset.map(..., batched=True)`` passes in).

    Returns:
        The tokenized inputs (as returned by the module-level ``tokenizer``)
        with an extra ``'labels'`` entry: the tokenized targets in which every
        padding position is replaced by ``-100``.
    """
    # Both sides are padded/truncated to a fixed length of 128 tokens.
    inputs = tokenizer(examples['input_text'], padding="max_length", truncation=True, max_length=128)
    outputs = tokenizer(examples['output_text'], padding="max_length", truncation=True, max_length=128)

    # Padding tokens must not contribute to the loss. -100 is the index the
    # cross-entropy loss ignores, so every target position whose attention
    # mask is 0 (i.e. padding) is replaced with it. Without this, the model is
    # mostly trained to emit pad tokens, since targets are padded to 128.
    inputs['labels'] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(outputs['input_ids'], outputs['attention_mask'])
    ]
    return inputs

# Run the tokenizer over the whole dataset in batches; the tokenized result
# replaces `dataset`.
tokenized_dataset = dataset.map(tokenize_function, batched=True)
dataset = tokenized_dataset

# Shuffle with a fixed seed for reproducibility.
# NOTE: no train/validation split happens here - the entire shuffled dataset
# becomes `train_dataset`.
train_dataset = tokenized_dataset.shuffle(seed=42)

In [None]:
from transformers import AutoModelForSeq2SeqLM
from transformers import Trainer
from transformers import TrainingArguments


# Load the pre-trained seq2seq checkpoint named by `model_name`.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Freeze the encoder: none of its parameters will receive gradient updates.
model.encoder.requires_grad_(False)

# Training configuration.
training_args = TrainingArguments(
    # --- output locations ---
    output_dir="./results",            # where model checkpoints are written
    logging_dir="./logs",              # where training logs are written
    # --- optimisation ---
    num_train_epochs=10,               # number of passes over the training set
    learning_rate=5e-4,                # initial learning rate
    weight_decay=0.01,                 # weight-decay strength
    per_device_train_batch_size=8,     # training batch size per device
    per_device_eval_batch_size=8,      # evaluation batch size per device
    # --- schedules (all step-based) ---
    evaluation_strategy="steps",       # evaluate on a step schedule (not per epoch);
                                       # no eval_steps is given, so the interval is the
                                       # library default - TODO confirm for the installed version
    logging_steps=10,                  # log every 10 steps
    save_steps=2000,                   # save a checkpoint every 2000 steps
    load_best_model_at_end=True,       # reload the best checkpoint when training finishes
)

# NOTE(review): `eval_dataset` is the same object as `train_dataset`, so the
# evaluation metrics (and the "best model" selection above) are measured on
# the training data, not on held-out examples.
trainer = Trainer(
    model=model,                       # model being fine-tuned
    args=training_args,                # configuration defined above
    train_dataset=train_dataset,       # data used for training
    eval_dataset=train_dataset,        # data used for evaluation (see note above)
    tokenizer=tokenizer,               # tokenizer handed to the Trainer
)

In [None]:
trainer.train()
# trainer.evaluate()

In [None]:
# Instruction prefix: kept byte-for-byte identical (spelling included) to the
# prefix used when the training inputs were built, so the model sees the same
# prompt format at inference time.
task = "Guess the main chaaracter name of the following story:\n"
story = "The rare manuscript was stolen, and Rena, a book collector, was found to be behind the theft. Her assistant, Tariq, uncovered the crime, but Rena’s connections in the literary world allowed her to avoid prosecution. Her name became infamous in the book industry."
prompt = task + story

# Switch to inference mode so dropout is disabled; the model may still be in
# training mode after `trainer.train()`.
model.eval()

# Put the encoded prompt on whatever device the model lives on instead of
# hard-coding `.cuda()`, which fails on CPU-only machines.
encoded_prompt = tokenizer(prompt, return_tensors="pt").to(model.device)
input_ids = encoded_prompt.input_ids

# Beam search with 2 beams; the attention mask is passed explicitly rather
# than left for `generate` to infer.
output_ids = model.generate(
    input_ids,
    attention_mask=encoded_prompt.attention_mask,
    num_beams=2,
)

# Show the raw generated token ids, then decode them back into text.
print(output_ids)
generated_story = tokenizer.decode(output_ids[0].cpu().tolist(), skip_special_tokens=True)

# Print the decoded model output (the predicted main-character name).
print(generated_story)

In [None]:
# Persist the fine-tuned model and its tokenizer side by side in one
# directory (model first, then tokenizer).
save_dir = "./model/mt0-small"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)