In [1]:
import os

In [2]:
%pwd

'/home/tiva/PycharmProjects/HeadlineGenerator/notebook'

In [3]:
# Step from the notebook folder up to the project root so that relative paths
# such as config.yaml and artifacts/ resolve. The guard makes the cell safe to
# re-run: once we are no longer inside "notebook", it does nothing.
if os.path.basename(os.getcwd()) == "notebook":
    os.chdir("../")

In [4]:
%pwd

'/home/tiva/PycharmProjects/HeadlineGenerator'

In [5]:
from pathlib import Path
from dataclasses import dataclass

In [6]:
@dataclass(frozen=True)
class ModelEvaluationConfig:
    """Immutable bundle of filesystem locations used by the model-evaluation stage.

    Instances are built by ``ConfigurationManager.get_model_evaluation_config``
    from the ``model_evaluation`` section of the loaded YAML config.
    NOTE(review): the values are passed through from the YAML as-is, so at
    runtime they may be plain strings rather than ``Path`` objects — confirm.
    """
    root_dir: Path        # stage output directory; created before the config is built
    data_path: Path       # dataset location handed to ``load_from_disk``
    model_path: Path      # handed to ``AutoModelForSeq2SeqLM.from_pretrained``
    tokenizer_path: Path  # handed to ``AutoTokenizer.from_pretrained``
    metrics_file: Path    # CSV file the ROUGE scores are written to

In [7]:
from headlineGenerator.constants import *
from headlineGenerator.utils.common import read_yaml, create_directories

In [8]:
class ConfigurationManager:
    """Reads the project's YAML settings and builds per-stage config objects."""

    def __init__(self, config_filepath=CONFIG_FILEPATH, params_filepath=PARAMS_FILEPATH):
        """Load both YAML files and make sure the artifacts root directory exists.

        Args:
            config_filepath: Location of the main config YAML.
            params_filepath: Location of the params YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        """Create the evaluation stage directory and return that stage's paths.

        Returns:
            A ``ModelEvaluationConfig`` filled from the ``model_evaluation``
            section of the loaded config.
        """
        section = self.config.model_evaluation

        # The stage directory must exist before anything is written into it.
        create_directories([section.root_dir])

        return ModelEvaluationConfig(
            root_dir=section.root_dir,
            data_path=section.data_path,
            model_path=section.model_path,
            tokenizer_path=section.tokenizer_path,
            metrics_file=section.metrics_file,
        )

In [9]:
import torch
import pandas as pd
from tqdm import tqdm
from datasets import load_from_disk, load_metric, load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

[2024-03-20 21:11:32,669 || PyTorch version 2.2.0+cpu available.]


In [10]:
class ModelEvaluation:
    """Scores a saved seq2seq headline model with ROUGE and stores the result as CSV.

    The class loads the tokenizer, model and dataset named in its config,
    generates a headline for every story in the ``test`` split and compares
    the generated text with the reference headlines.
    """

    # The annotation is quoted so defining this class does not require the
    # config class to be in scope yet (cells may be run independently).
    def __init__(self, config: "ModelEvaluationConfig"):
        """Keep the stage config and choose the inference device.

        Args:
            config: Locations of the dataset, model, tokenizer and metrics file.
        """
        self.config = config
        # Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def generate_batches(self, list_of_elements, batch_size):
        """Yield consecutive slices of ``list_of_elements`` with at most ``batch_size`` items.

        Args:
            list_of_elements: Any sliceable sequence with a length.
            batch_size: Maximum number of items per yielded slice.

        Raises:
            ValueError: If ``batch_size`` is not positive. A negative step would
                otherwise yield nothing and silently skip the whole evaluation.
        """
        if batch_size <= 0:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
        for i in range(0, len(list_of_elements), batch_size):
            yield list_of_elements[i : i + batch_size]

    def calculate_metric(self, dataset, metric, model, tokenizer, batch_size, column_story, column_headline):
        """Generate headlines batch by batch and accumulate them into ``metric``.

        Args:
            dataset: Mapping-like object whose columns are indexed by name.
            metric: Object exposing ``add_batch(predictions=, references=)`` and ``compute()``.
            model: Seq2seq model exposing ``generate``.
            tokenizer: Tokenizer matching ``model``; must be callable and expose ``decode``.
            batch_size: Number of stories generated per forward pass.
            column_story: Name of the input text column.
            column_headline: Name of the reference headline column.

        Returns:
            Whatever ``metric.compute()`` returns after all batches were added.
        """
        story_batches = list(self.generate_batches(dataset[column_story], batch_size))
        target_batches = list(self.generate_batches(dataset[column_headline], batch_size))

        for story_batch, target_batch in tqdm(zip(story_batches, target_batches), total=len(story_batches)):
            # Pad only to the longest story in the batch (still truncated at 1024
            # tokens). Padding every batch to 1024 only adds masked positions
            # that cost compute.
            inputs = tokenizer(story_batch, max_length=1024, truncation=True, padding="longest", return_tensors="pt")
            generated_ids = model.generate(
                input_ids=inputs["input_ids"].to(self.device),
                attention_mask=inputs["attention_mask"].to(self.device),
                max_length=30,
                # Exponent applied to the sequence length when beam scores are
                # normalised; see the transformers generation docs for its effect.
                length_penalty=0.8,
                num_beams=8
            )

            # Strip surrounding whitespace only. The earlier `replace("", " ")`
            # inserted a space between every character of each prediction.
            decoded_headlines = [
                tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=True).strip()
                for ids in generated_ids
            ]

            # One prediction string per reference string, for the current batch only.
            metric.add_batch(predictions=decoded_headlines, references=target_batch)

        return metric.compute()

    def evaluate(self):
        """Run ROUGE evaluation on the ``test`` split and write the scores to CSV.

        Loads the tokenizer, model and dataset from the configured paths, scores
        the ``full_story`` -> ``headline`` task in batches of two, and writes the
        mid f-measure of each ROUGE variant to ``config.metrics_file``.
        """
        tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)
        model = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_path).to(self.device)
        news_dataset = load_from_disk(self.config.data_path)

        rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
        rouge_metric = load_metric("rouge")

        score = self.calculate_metric(news_dataset["test"], rouge_metric, model, tokenizer, 2, "full_story", "headline")

        rouge_dict = {rn: score[rn].mid.fmeasure for rn in rouge_names}
        # A dict of scalars needs an explicit index to become a one-row frame.
        # The "t5-small" label is not written out because index=False is kept
        # to preserve the existing file layout.
        df = pd.DataFrame(rouge_dict, index=["t5-small"])
        df.to_csv(self.config.metrics_file, index=False)
    

In [11]:
# Run the evaluation stage end to end: load the config, build the evaluator,
# then generate headlines and write the ROUGE metrics file.
# No try/except here: catching an exception only to re-raise it adds nothing,
# and any failure surfaces with its full traceback either way.
config = ConfigurationManager()
model_evaluation_config = config.get_model_evaluation_config()
model_evaluation = ModelEvaluation(config=model_evaluation_config)
model_evaluation.evaluate()

[2024-03-20 21:11:51,594 || yaml file : config.yaml loaded successfully]
[2024-03-20 21:11:51,659 || yaml file : params.yaml loaded successfully]
[2024-03-20 21:11:51,672 || created directory at artifacts]
[2024-03-20 21:11:51,673 || created directory at artifacts/model_evaluation]


  rouge_metric = load_metric("rouge")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
Using the latest cached version of the module from /home/tiva/.cache/huggingface/modules/datasets_modules/metrics/rouge/457c405cab0bd19db749b46bf15a1a3cff4d54f50e7ab868c293e5ece288425e (last modified on Mon Mar 18 05:19:34 2024) since it couldn't be found locally at rouge, or remotely on the Hugging Face Hub.


[2024-03-20 21:12:09,705 || Using the latest cached version of the module from /home/tiva/.cache/huggingface/modules/datasets_modules/metrics/rouge/457c405cab0bd19db749b46bf15a1a3cff4d54f50e7ab868c293e5ece288425e (last modified on Mon Mar 18 05:19:34 2024) since it couldn't be found locally at rouge, or remotely on the Hugging Face Hub.]


100%|███████████████████████████████████████████| 65/65 [15:09<00:00, 13.99s/it]

[2024-03-20 21:27:20,096 || Using default tokenizer.]





In [14]:
# Build the evaluation-stage config directly for the step-by-step cells below.
# NOTE: this rebinds `config`, which the pipeline cell above uses for the
# ConfigurationManager itself; from here on it is a ModelEvaluationConfig.
config = ConfigurationManager().get_model_evaluation_config()

[2024-03-20 05:49:14,638 || yaml file : config.yaml loaded successfully]
[2024-03-20 05:49:14,644 || yaml file : params.yaml loaded successfully]
[2024-03-20 05:49:14,645 || created directory at artifacts]
[2024-03-20 05:49:14,647 || created directory at artifacts/model_evaluation]


In [15]:
def generate_batches(list_of_elements, batch_size):
    """Yield consecutive slices of ``list_of_elements`` holding at most ``batch_size`` items.

    Standalone counterpart of ``ModelEvaluation.generate_batches`` for
    interactive use; the final slice may be shorter than ``batch_size``.
    """
    offsets = range(0, len(list_of_elements), batch_size)
    yield from (list_of_elements[start : start + batch_size] for start in offsets)

# Load the saved dataset from the path in the stage config; its "test" split is used below.
news_dataset = load_from_disk(config.data_path)

In [16]:
# Work with just the first 10 rows of the test split for the manual walkthrough.
# The result is indexed by column name in the next cell.
test_data = news_dataset["test"][:10]

In [17]:
# Chunk the sampled stories and their reference headlines into aligned batches of two.
stories, headlines = (
    list(generate_batches(test_data[column], 2))
    for column in ("full_story", "headline")
)

In [18]:
headlines

[['nigeria not in a debt crisis – finance minister',
  'argentina win first world cup after 36 years'],
 ['reece james is chelsea’s new captain, makes three big promises',
  'nigerian passport holders have access to just 2.1% of world’s gdp – forbes'],
 ['nigeria central bank to mandate banks to raise capital base',
  'oscars 2023: chris rock turns down offer to host'],
 ['peseiro says main focus is to win afcon for nigeria',
  'chukwueze delighted with laliga award, vows to do more for villarreal'],
 ['afcon qualifier: super eagles camp bubbles as osimhen, awoniyi, others arrive uyo',
  'super eagles off to abidjan for afcon 2023']]

In [19]:
# Load the tokenizer and model from the locations in the stage config.
# No explicit device move is made here, unlike ModelEvaluation.evaluate.
tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_path)
model = AutoModelForSeq2SeqLM.from_pretrained(config.model_path)

# ROUGE variants that are read back from the computed score further down.
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
# NOTE: the recorded output warns that `trust_remote_code=True` will become
# mandatory for this call in the next major `datasets` release.
rouge_metric = load_metric("rouge")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
Using the latest cached version of the module from /home/tiva/.cache/huggingface/modules/datasets_modules/metrics/rouge/457c405cab0bd19db749b46bf15a1a3cff4d54f50e7ab868c293e5ece288425e (last modified on Mon Mar 18 05:19:34 2024) since it couldn't be found locally at rouge, or remotely on the Hugging Face Hub.


[2024-03-20 05:50:51,266 || Using the latest cached version of the module from /home/tiva/.cache/huggingface/modules/datasets_modules/metrics/rouge/457c405cab0bd19db749b46bf15a1a3cff4d54f50e7ab868c293e5ece288425e (last modified on Mon Mar 18 05:19:34 2024) since it couldn't be found locally at rouge, or remotely on the Hugging Face Hub.]


In [21]:
len(stories)

5

In [None]:
# Manual version of ModelEvaluation.calculate_metric over the sampled batches.
# It uses only names defined in this notebook's top-level cells (stories,
# headlines, tokenizer, model, rouge_metric); there is no `self` out here.
for story_batch, target_batch in tqdm(zip(stories, headlines), total=len(stories)):
    inputs = tokenizer(story_batch, max_length=1024, truncation=True, padding="max_length", return_tensors="pt")
    # Named `generated_ids` so the reference `headlines` list is not overwritten.
    generated_ids = model.generate(
        # Keep the inputs on whatever device the model currently lives on.
        input_ids=inputs["input_ids"].to(model.device),
        attention_mask=inputs["attention_mask"].to(model.device),
        max_length=30,
        # Exponent applied to the sequence length when beam scores are normalised.
        length_penalty=0.8,
        num_beams=8
    )

    # Strip surrounding whitespace only; `replace("", " ")` would insert a space
    # between every character of each prediction.
    decoded_headlines = [
        tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=True).strip()
        for ids in generated_ids
    ]

    # Pair this batch's predictions with this batch's references.
    rouge_metric.add_batch(predictions=decoded_headlines, references=target_batch)


In [22]:
# Pick the first batch: `s` holds its two stories, `h` the two matching reference headlines.
s = stories[0]
h = headlines[0]

In [25]:
# Tokenize the two stories into PyTorch tensors, truncated/padded to exactly 1024 tokens each.
inputs = tokenizer(s, max_length=1024, truncation=True, padding="max_length", return_tensors="pt")

In [28]:
inputs["input_ids"].shape

torch.Size([2, 1024])

In [29]:
inputs["attention_mask"].shape

torch.Size([2, 1024])

In [32]:
# Beam-search generation for the batch; at most 30 tokens are produced per story.
h_gen = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=30,
    length_penalty=0.8,
    num_beams=8,
)

In [34]:
h_gen.shape

torch.Size([2, 30])

In [46]:
# Turn each generated id sequence back into text, dropping special tokens.
# The loop variable is deliberately not called `h`, which holds the references.
h_dec = [
    tokenizer.decode(token_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    for token_ids in h_gen
]

In [47]:
h_dec

[', zainab ahmed, said nigeria’s debt profile is within reasonable limit. nigeria',
 'argentina beat france 4-2 on a penalty shoot-out to win the world cup for the third time in qa']

In [48]:
# Trim leading/trailing whitespace from every decoded headline.
h_dec = list(map(str.strip, h_dec))

In [49]:
h_dec

[', zainab ahmed, said nigeria’s debt profile is within reasonable limit. nigeria',
 'argentina beat france 4-2 on a penalty shoot-out to win the world cup for the third time in qa']

In [50]:
h

['nigeria not in a debt crisis – finance minister',
 'argentina win first world cup after 36 years']

In [57]:
# Register the two generated headlines against their two references (flat lists of strings).
rouge_metric.add_batch(predictions=h_dec, references=h)

In [58]:
# Aggregate everything added so far into ROUGE scores, indexed by variant name below.
score = rouge_metric.compute()

[2024-03-20 06:08:06,865 || Using default tokenizer.]


In [59]:
# Keep only the ROUGE variants of interest, each with its full aggregate score.
rouge_dict = {name: score[name] for name in rouge_names}

In [60]:
rouge_dict

{'rouge1': AggregateScore(low=Score(precision=0.16666666666666666, recall=0.25, fmeasure=0.2), mid=Score(precision=0.17857142857142855, recall=0.375, fmeasure=0.23793103448275862), high=Score(precision=0.19047619047619047, recall=0.5, fmeasure=0.27586206896551724)),
 'rouge2': AggregateScore(low=Score(precision=0.0, recall=0.0, fmeasure=0.0), mid=Score(precision=0.025, recall=0.07142857142857142, fmeasure=0.03703703703703704), high=Score(precision=0.05, recall=0.14285714285714285, fmeasure=0.07407407407407408)),
 'rougeL': AggregateScore(low=Score(precision=0.16666666666666666, recall=0.25, fmeasure=0.2), mid=Score(precision=0.17857142857142855, recall=0.375, fmeasure=0.23793103448275862), high=Score(precision=0.19047619047619047, recall=0.5, fmeasure=0.27586206896551724)),
 'rougeLsum': AggregateScore(low=Score(precision=0.16666666666666666, recall=0.25, fmeasure=0.2), mid=Score(precision=0.17857142857142855, recall=0.375, fmeasure=0.23793103448275862), high=Score(precision=0.19047619

In [61]:
# Reduce each aggregate to its mid f-measure, the value the pipeline writes to the metrics file.
{name: score[name].mid.fmeasure for name in rouge_names}

{'rouge1': 0.23793103448275862,
 'rouge2': 0.03703703703703704,
 'rougeL': 0.23793103448275862,
 'rougeLsum': 0.23793103448275862}