In [2]:
# Install the libraries the notebook imports (transformers, datasets, evaluate)
# plus rouge_score, presumably needed by the "rouge" metric loaded further down.
# NOTE(review): no versions are pinned, so a fresh run may pull releases that
# behave differently from the ones this notebook was written against (later
# cells rely on the TensorFlow classes of transformers) — consider pinning.
%pip install transformers
%pip install datasets
%pip install evaluate
%pip install rouge_score



In [3]:
from datasets import load_dataset
# Language configuration of the XL-Sum dataset to load.
config_name = "punjabi"

# Load the chosen configuration; the result is indexed by split name below.
dataset = load_dataset(
    path="csebuetnlp/xlsum",
    name=config_name,
)



In [4]:
# Display the loaded dataset object for inspection.
dataset



In [5]:
# Peek at the first record of the train split.
dataset['train'][0]



In [6]:
from transformers import AutoTokenizer

# Checkpoint name reused below for the tokenizer, the data collator and the model.
# NOTE(review): presumably t5-small's vocabulary targets a few Latin-script
# languages — confirm it can tokenize Punjabi (Gurmukhi) text without collapsing
# it to <unk>, or switch to a multilingual checkpoint.
checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [7]:
# Task prefix prepended to every article before tokenization.
prefix = "summarize: "


def preprocess_fn(examples):
    """Tokenize a batch of articles and their reference summaries.

    Args:
        examples: A batch with parallel "text" and "summary" sequences of
            strings (what `Dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer output for the prefixed articles (truncated to 1024
        tokens) with an extra "labels" entry holding the token ids of the
        summaries (truncated to 128 tokens).
    """
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    # `text_target=` is the supported way to tokenize targets; it replaces the
    # deprecated `as_target_tokenizer()` context manager and needs no copy of
    # the summary column.
    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [8]:
# Apply preprocess_fn to the dataset in batches; no columns are removed, so the
# tokenizer outputs and "labels" are added alongside the original fields.
tokenized_dataset = dataset.map(preprocess_fn, batched=True)






In [9]:
# Inspect the first train record after preprocessing.
tokenized_dataset['train'][0]



In [10]:
from transformers import DataCollatorForSeq2Seq
# Collator that pads each batch of tokenized examples. `return_tensors="tf"` is
# set explicitly: the collator otherwise defaults to PyTorch tensors, while the
# rest of this notebook is a TensorFlow/Keras pipeline.
datacollator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint, return_tensors="tf")

In [11]:
from transformers import create_optimizer, AdamWeightDecay

# Optimizer instance with a 2e-5 learning rate and a 0.01 weight-decay rate.
optimizer = AdamWeightDecay(
    learning_rate=2e-5,
    weight_decay_rate=0.01,
)

In [12]:
from transformers import TFAutoModelForSeq2SeqLM

# Load the TensorFlow seq2seq model from the same checkpoint name as the tokenizer.
model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint)



In [13]:
# Sanity check: Python type of the first train example's label sequence.
# The row is indexed first so only one record is read, instead of pulling the
# entire "labels" column just to look at its first element.
print(type(tokenized_dataset["train"][0]["labels"]))




In [14]:
def _as_tf_batches(split, shuffle):
    """Wrap one tokenized split with `model.prepare_tf_dataset` (batch size 16)."""
    return model.prepare_tf_dataset(
        tokenized_dataset[split],
        shuffle=shuffle,
        batch_size=16,
        collate_fn=datacollator,
    )


# Only the training split is shuffled; the evaluation splits keep their order.
train_set = _as_tf_batches("train", shuffle=True)
test_set = _as_tf_batches("test", shuffle=False)
val_set = _as_tf_batches("validation", shuffle=False)



In [16]:
import evaluate

# Load the ROUGE metric through the evaluate library.
rouge = evaluate.load("rouge")

In [17]:
# Display the loaded metric object.
rouge




In [18]:
import numpy as np


def compute_metrics(eval_pred):
    """Score generated summaries against the references with ROUGE.

    Args:
        eval_pred: A `(predictions, labels)` pair of token-id arrays. Positions
            equal to -100 are treated as padding.

    Returns:
        A dict with the ROUGE scores plus "gen_len" (mean number of non-padding
        tokens per prediction), every value rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    pad_id = tokenizer.pad_token_id

    # -100 is not a real token id and cannot be decoded, so swap it for the pad
    # id first. Predictions get the same treatment as labels in case the caller
    # padded ragged batches with -100 as well.
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rouge_score's default tokenizer keeps only [a-z0-9] and its stemmer is
    # English-only, so non-Latin text such as Punjabi would be stripped away
    # and score ~0. Splitting on whitespace keeps the tokens intact; the
    # stemmer flag is dropped because it has no effect with a custom tokenizer.
    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        tokenizer=lambda text: text.split(),
    )

    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [19]:
from transformers.keras_callbacks import KerasMetricCallback

# Callback that runs compute_metrics on val_set, producing predictions via
# generation (XLA-enabled) with max_length 128.
callback = KerasMetricCallback(
    metric_fn=compute_metrics,
    eval_dataset=val_set,
    predict_with_generate=True,
    use_xla_generation=True,
    generate_kwargs={"max_length": 128},
)

In [1]:
# The visible notebook never compiles the model, and Keras refuses to fit an
# uncompiled model; this is also the only place the optimizer built earlier is
# used. No loss is passed: Transformers TF models fall back to their built-in
# task loss when none is given.
model.compile(optimizer=optimizer)

# `callbacks` is documented as a list, so wrap the single callback.
model.fit(train_set, validation_data=val_set, epochs=2, callbacks=[callback])



In [None]:
import numpy as np

print("Generating predictions for test set...")

all_preds = []
all_refs = []

for batch in test_set:
    # prepare_tf_dataset is expected to yield (features, labels) tuples when the
    # dataset carries a label column; fall back to a flat dict otherwise.
    if isinstance(batch, tuple):
        features, label_ids = batch
    else:
        features, label_ids = batch, batch["labels"]

    # Pass the attention mask so padded positions are ignored, and raise the
    # generation cap to the 128 tokens the reference summaries were truncated
    # to (the default cap is much shorter).
    outputs = model.generate(
        input_ids=features["input_ids"],
        attention_mask=features["attention_mask"],
        max_length=128,
    )
    preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # Label padding is -100, which is not a decodable token id; map it to the
    # pad token before decoding.
    label_ids = np.asarray(label_ids)
    label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)
    refs = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    all_preds.extend(preds)
    all_refs.extend(refs)

# rouge_score's default tokenizer keeps only [a-z0-9] and its stemmer is
# English-only, which would wipe out Punjabi text; tokenize on whitespace.
rouge_scores = rouge.compute(
    predictions=all_preds,
    references=all_refs,
    tokenizer=lambda text: text.split(),
)
print("ROUGE scores:", rouge_scores)