In [None]:
!pip install evaluate datasets transformers torch transformers[torch] SentencePiece rouge_score bleu bert-score squad sacrebleu sacremoses tqdm pypdfium2 spacy unstructured

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, get_linear_schedule_with_warmup,pipeline
import torch
import pandas as pd
import numpy as np
from datasets import load_dataset,Dataset
import os
from evaluate import load
import evaluate
from tqdm import tqdm
import sys
from pathlib import Path
from torch.utils.data import DataLoader, TensorDataset
import pypdfium2 as pdfium
import re
from spacy.lang.en import English
from unstructured.cleaners.core import group_broken_paragraphs, replace_unicode_quotes, clean, clean_non_ascii_chars, remove_punctuation
from sklearn.model_selection import train_test_split

In [None]:
# Load two views of the ASSET corpus: the "ratings" configuration supplies the
# training pairs, the default configuration supplies validation and test data.
# NOTE(review): training on the "ratings" configuration means every rated pair
# is used as-is, regardless of its rating — confirm this is intended.
dataset = load_dataset("asset", "ratings")
print(dataset)
dataset1 = load_dataset("asset")
print(dataset1)

# Pull the raw source/target columns out of each split.
ratings_full = dataset['full']
train_x, train_y = ratings_full['original'], ratings_full['simplification']

validation_split = dataset1['validation']
validation_x, validation_y = validation_split['original'], validation_split['simplifications']

test_split = dataset1['test']
test_x, test_y = test_split['original'], test_split['simplifications']
# NOTE(review): the plural 'simplifications' column presumably holds several
# references per source sentence, so target_text in eval_df/test_df would be a
# list per row rather than a single string — verify before tokenizing.


def _pair_frame(sources, targets):
    """Build a two-column frame of source sentences and their targets."""
    return pd.DataFrame({'source_text': sources, 'target_text': targets})


train_df = _pair_frame(train_x, train_y)
eval_df = _pair_frame(validation_x, validation_y)
test_df = _pair_frame(test_x, test_y)

# Prepend the task prefix to every source sentence in all three frames.
task_prefix = "simplify this sentence: "
for frame in (train_df, eval_df, test_df):
    frame['source_text'] = task_prefix + frame['source_text']



In [None]:


# Show the first five rows of each frame in the same plain-text layout:
# a heading, then index / source / target followed by two blank lines per row.
for heading, frame in (
    ("Train DataFrame:", train_df),
    ("Eval DataFrame:", eval_df),
    ("Test DataFrame:", test_df),
):
    print(heading)
    for index, row in frame.head().iterrows():
        print(
            f"Index: {index}",
            f"Source Text: {row['source_text']}",
            f"Target Text: {row['target_text']}",
            "\n",
            sep="\n",
        )



In [None]:
# Tokenize the train / validation / test frames and wrap them as datasets
# ready for the trainer.
model_name = "t5-small"  # Change to the desired T5 model size
tokenizer = T5Tokenizer.from_pretrained(model_name)


def _first_reference(target):
    """Return one target string for a row.

    A row's target may be a plain string or a sequence of reference strings
    (as the validation/test 'simplifications' column presumably is). A sequence
    cannot be tokenized as a single label sequence, so its first entry is
    used; an empty sequence maps to an empty string.
    """
    if isinstance(target, str):
        return target
    return target[0] if len(target) else ""


def _tokenize(texts):
    """Tokenize a list of strings, padded to the longest item, as PyTorch tensors."""
    return tokenizer(texts, max_length=512, truncation=True, padding=True, return_tensors="pt")


def _build_dataset(encodings, labels):
    """Combine encoded sources and targets into a Dataset.

    The attention mask is kept so the encoder ignores padded positions, and
    padded label positions are set to -100 so they do not contribute to the
    loss. The label tensor is cloned first, leaving `labels` itself decodable.
    """
    label_ids = labels["input_ids"].clone()
    label_ids[label_ids == tokenizer.pad_token_id] = -100
    return Dataset.from_dict({
        "input_ids": encodings["input_ids"],
        "attention_mask": encodings["attention_mask"],
        "labels": label_ids,
    })


train_encodings = _tokenize(train_df['source_text'].tolist())
train_labels = _tokenize([_first_reference(t) for t in train_df['target_text'].tolist()])

validation_encodings = _tokenize(eval_df['source_text'].tolist())
# Only the first reference is encoded as the label; the full target_text column
# in eval_df / test_df is left untouched for any multi-reference scoring.
validation_labels = _tokenize([_first_reference(t) for t in eval_df['target_text'].tolist()])

test_encodings = _tokenize(test_df['source_text'].tolist())
test_labels = _tokenize([_first_reference(t) for t in test_df['target_text'].tolist()])

train_dataset = _build_dataset(train_encodings, train_labels)
validation_dataset = _build_dataset(validation_encodings, validation_labels)
test_dataset = _build_dataset(test_encodings, test_labels)

In [None]:
# Load the pretrained checkpoint together with its tokenizer.
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Fine-tuning configuration: four epochs at batch size 4 per device, with a
# checkpoint written to output_dir after every epoch. The evaluation batch
# size is not set here and stays at the library default.
_training_options = {
    "output_dir": "/content/drive/MyDrive/T5_small",
    "overwrite_output_dir": True,
    "per_device_train_batch_size": 4,
    "num_train_epochs": 4,
    "learning_rate": 1e-4,
    "push_to_hub": False,
    "save_strategy": "epoch",
}
training_args = Seq2SeqTrainingArguments(**_training_options)

In [None]:
# Fine-tune the model on the training set.
# The optimizer takes its learning rate from training_args instead of repeating
# the literal, so the configured value and the value actually used cannot drift.
optimizer = torch.optim.AdamW(model.parameters(), lr=training_args.learning_rate)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # Only the optimizer is supplied; the scheduler slot is left as None so the
    # trainer falls back to its own default learning-rate schedule.
    optimizers=(optimizer, None),
)

trainer.train()