Let's first install and import all the necessary dependencies

In [1]:
!pip install rouge_score
!pip install textstat
import os
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BartForConditionalGeneration, BartTokenizer
import torch
from sklearn.metrics import accuracy_score
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
import textstat



Let's write a function to load and read the contents of the .txt files

In [2]:
def load_data(data_folder):
    """Read every ``.txt`` file directly inside ``data_folder``.

    Files are visited in sorted filename order. ``os.listdir`` makes no
    ordering guarantee, so sorting keeps the returned list (and anything
    indexed against it later) reproducible across runs and machines.

    Args:
        data_folder: Path of the directory to scan. Sub-directories are not
            descended into.

    Returns:
        A list of strings, one per ``.txt`` file, each holding the file's
        full contents decoded as UTF-8.

    Raises:
        OSError: If ``data_folder`` cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    texts = []
    for filename in sorted(os.listdir(data_folder)):
        path = os.path.join(data_folder, filename)
        # Skip other extensions, and directories that merely end in ".txt".
        if not filename.endswith(".txt") or not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='utf-8') as file:
            texts.append(file.read())
    return texts

Now Let's write a function to pre-process our text

In [3]:
def preprocess_text(text):
    """Normalise raw text: drop punctuation, squeeze whitespace, lowercase.

    Every character that is neither a word character (letter, digit or
    underscore) nor whitespace is removed. Runs of whitespace, including
    newlines and tabs, are then collapsed to a single space, and leading and
    trailing whitespace is trimmed.

    Args:
        text: The string to normalise.

    Returns:
        The normalised, lower-cased string.
    """
    # Remove punctuation BEFORE collapsing whitespace: deleting a free-standing
    # symbol such as " - " otherwise leaves two adjacent spaces behind.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip().lower()

Let's Load the text files

In [6]:
# Directory (relative to the working directory) that holds the .txt documents.
data_folder = 'tos'
# Raw file contents first, then a normalised copy of each one.
texts = load_data(data_folder)
processed_texts = list(map(preprocess_text, texts))

Now let's load the BART model and its tokenizer, then summarize each text

In [None]:
# Hugging Face Hub checkpoint id shared by the tokenizer and the model.
model_name = "facebook/bart-large-cnn"
# Load the tokenizer first, then the model, both from the same checkpoint.
tokenizer, model = (
    loader.from_pretrained(model_name)
    for loader in (BartTokenizer, BartForConditionalGeneration)
)

def summarize_text(text, max_length=130, min_length=30):
    """Generate a summary of ``text`` with the module-level BART model.

    The input is tokenized and truncated to at most 1024 tokens, so anything
    beyond that limit does not influence the summary. Decoding uses beam
    search with 4 beams.

    Args:
        text: The document to summarise.
        max_length: Upper bound on the generated summary length, in tokens.
        min_length: Lower bound on the generated summary length, in tokens.

    Returns:
        The decoded summary string with special tokens removed.
    """
    inputs = tokenizer([text], max_length=1024, truncation=True, return_tensors="pt")
    # Keep the tensors on the same device as the model so this keeps working
    # if the model is later moved to a GPU.
    inputs = inputs.to(model.device)
    summary_ids = model.generate(
        inputs['input_ids'],
        # Pass the mask explicitly instead of leaving generate() to infer it.
        attention_mask=inputs['attention_mask'],
        num_beams=4,
        max_length=max_length,
        min_length=min_length,
        length_penalty=2.0,
        early_stopping=True,
    )
    # A single input was supplied, so the first sequence is the only result.
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# One summary per pre-processed document, in the same order.
summarized_texts = list(map(summarize_text, processed_texts))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Let's write functions to evaluate the quality of the model's summaries

In [None]:
def evaluate_metrics(original_texts, summarized_texts):
    """Compute average BLEU and ROUGE scores of summaries against originals.

    Each summary is scored against the original at the same position, with
    the original acting as the reference. BLEU is computed on
    whitespace-split tokens. ROUGE-1 and ROUGE-L are computed with stemming,
    and their F-measures are averaged.

    Args:
        original_texts: Iterable of reference strings.
        summarized_texts: Iterable of summary strings, aligned one-to-one
            with ``original_texts``.

    Returns:
        A tuple ``(avg_bleu, avg_rouge)`` where ``avg_bleu`` is a float and
        ``avg_rouge`` maps each ROUGE type to its mean F-measure.

    Raises:
        ValueError: If the inputs are empty or differ in length.
    """
    # Materialise once so generators are accepted and lengths can be compared.
    original_texts = list(original_texts)
    summarized_texts = list(summarized_texts)

    # zip() would silently drop the unmatched tail and score misaligned data.
    if len(original_texts) != len(summarized_texts):
        raise ValueError(
            f"original_texts and summarized_texts must have the same length "
            f"(got {len(original_texts)} and {len(summarized_texts)})"
        )
    # Averaging nothing would otherwise surface as a bare ZeroDivisionError.
    if not original_texts:
        raise ValueError("cannot evaluate metrics on empty input")

    rouge = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    bleu_scores = []
    rouge_scores = []

    for orig, summ in zip(original_texts, summarized_texts):
        bleu_scores.append(sentence_bleu([orig.split()], summ.split()))
        rouge_scores.append(rouge.score(orig, summ))

    count = len(bleu_scores)
    avg_bleu = sum(bleu_scores) / count
    avg_rouge = {
        key: sum(score[key].fmeasure for score in rouge_scores) / count
        for key in rouge_scores[0]
    }

    return avg_bleu, avg_rouge

def evaluate_readability(summarized_texts):
    """Return the mean Flesch Reading Ease score of the given texts.

    Args:
        summarized_texts: Iterable of strings to score.

    Returns:
        The arithmetic mean of ``textstat.flesch_reading_ease`` over all texts.

    Raises:
        ValueError: If ``summarized_texts`` is empty.
    """
    # Materialise once so generators are accepted and emptiness can be checked.
    summarized_texts = list(summarized_texts)
    # Averaging nothing would otherwise surface as a bare ZeroDivisionError.
    if not summarized_texts:
        raise ValueError("cannot evaluate readability on empty input")

    readability_scores = [textstat.flesch_reading_ease(text) for text in summarized_texts]
    return sum(readability_scores) / len(readability_scores)

Let's Now Evaluate the model and calculate the metrics

In [None]:
# Score each summary against its pre-processed source document (the
# originals act as the references) and report the averages.
avg_bleu, avg_rouge = evaluate_metrics(processed_texts, summarized_texts)
print(f"Average BLEU Score: {avg_bleu}")
print(f"Average ROUGE Score: {avg_rouge}")

# Readability is measured on the summaries alone; no reference is involved.
avg_readability = evaluate_readability(summarized_texts)
print(f"Average Readability Score: {avg_readability}")