In [None]:

!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q
!pip install transformers==4.37.2
!pip install datasets==2.17.0
!pip install evaluate==0.4.1
!pip install rouge-score==0.1.2

In [None]:
import os
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import torch
from datasets import Dataset, load_dataset, load_from_disk, load_metric
from huggingface_hub import huggingface_hub
from nltk.tokenize import sent_tokenize
from tqdm import tqdm
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, BartForConditionalGeneration, pipeline, set_seed
import nltk

nltk.download('punkt')  # Downloading necessary NLTK data


In [None]:

huggingface_hub.login(token="TOKEN_API")

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model_ckpt = "facebook/bart-large-xsum"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

### Load Dataset

In [None]:
# Define the URL of the ZIP file
url = 'https://github.com/entbappy/Branching-tutorial/raw/master/summarizer-data.zip'

# Define the file paths
zip_file_path = 'summarizer-data.zip'
extracted_folder_path = 'summarizer-data'

# Download the ZIP file
response = requests.get(url)
with open(zip_file_path, 'wb') as zip_file:
    zip_file.write(response.content)

# Extract the contents of the ZIP file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_folder_path)

# Remove the ZIP file
os.remove(zip_file_path)

print('Downloaded and extracted the ZIP file successfully.')


In [None]:
dataset = load_from_disk("summarizer-data/samsum_dataset/")
dataset

In [None]:
# Calculate split lengths
split_lengths = [len(dataset[split]) for split in dataset]


print("Split lengths:", split_lengths)
print("Features:", dataset['train'].column_names)
print("\nDialogue:")
print(dataset["test"][1]["dialogue"])
print("\nSummary:")
print(dataset["test"][1]["summary"])


In [None]:
train_df = pd.DataFrame(dataset["train"])
test_df = pd.DataFrame(dataset["test"])
validation_df = pd.DataFrame(dataset["validation"])

In [None]:
# Assuming train, test, and val are pandas DataFrames defined elsewhere in your code
# Convert pandas DataFrames to Dataset objects
train_ds = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test_df)
val_ds = Dataset.from_pandas(validation_df)

print(f"Train Dataset:\n{train_ds}\n\n")
print(f"Test Dataset:\n{test_ds}\n\n")
print(f"Validation Dataset:\n{val_ds}\n\n")

In [None]:
def preprocess_example(example):
    """
    Preprocesses a single example (data point) for BART model input.

    Args:
        example: A dictionary containing 'dialogue' (list of text) and 'summary' (text).

    Returns:
        model_inputs: A dictionary of tokenized inputs and labels ready for BART. 
    """

    # Extract dialogues and prepare for tokenization
    inputs = example['dialogue']

    # Tokenize input dialogues for BART
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    # Tokenize target summaries
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(example['summary'], max_length=128, truncation=True)

    # Include tokenized labels in model inputs
    model_inputs['labels'] = labels['input_ids']

    return model_inputs


# Tokenize and preprocess datasets
tokenized_train = train_ds.map(preprocess_example, batched=True, remove_columns=['id', 'dialogue', 'summary'])
tokenized_test = test_ds.map(preprocess_example, batched=True, remove_columns=['id', 'dialogue', 'summary'])
tokenized_val = val_ds.map(preprocess_example, batched=True, remove_columns=['id', 'dialogue', 'summary'])

print(tokenized_train)
print(tokenized_test)
print(tokenized_val)

In [None]:
# Checking samples
sample = tokenized_train[0]
print("Input IDs:")
print(sample['input_ids'])
print("\nAttention Mask:")
print(sample['attention_mask'])
print("\nLabels:")
print(sample['labels'])

## Modeling

In [18]:
model = BartForConditionalGeneration.from_pretrained(model_ckpt).to(device)
print(model)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50264, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50264, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): La

## Loading Evaluation Metric

### What is ROUGE?

ROUGE (ROUGE-L, ROUGE-N, ROUGE-W) is a widely used metric for evaluating the quality of machine-generated text, particularly in the context of text summarization and machine translation. It measures how well a generated text (candidate) matches a reference text (ground truth) by comparing their common n-grams (sequence of n consecutive characters).

### How does ROUGE work?

ROUGE calculates three main scores:

- **ROUGE-L:** Focuses on matching long n-grams (typically n = 1 or 2). This is useful for evaluating fluency and coherence of the generated text.

- **ROUGE-N:** Evaluates matches of n-grams of any length (typically n = 1, 2, 3, 4, 5, 6). This provides a more comprehensive assessment of the generated text's similarity to the reference.

- **ROUGE-W:** Considers n-grams weighted by their frequency in the reference text. This emphasizes the importance of matching more common phrases.

### ROUGE Scores

Each ROUGE score is calculated as a combination of precision and recall:

- **Precision:** The proportion of n-grams generated by the model that also appear in the reference text.

- **Recall:** The proportion of n-grams in the reference text that are correctly matched by the generated text.

The final ROUGE score is typically expressed as an F-measure, which combines precision and recall into a single metric, providing a balanced evaluation.

### Interpreting ROUGE Scores

Higher ROUGE scores indicate better text quality. A score of 1.0 means perfect match between the generated and reference texts. ROUGE scores are usually reported as percentages.

### Applications of ROUGE

ROUGE is widely used in machine learning tasks involving text generation, such as:

- **Machine Translation:** Evaluating the quality of translated text compared to the original source text.

- **Text Summarization:** Assessing the effectiveness of generated summaries compared to the original full-length documents.

- **Chatbots:** Evaluating the coherence and relevance of chatbot responses.


In [19]:
from datasets import load_metric

metric=load_metric('rouge')

  metric=load_metric('rouge')
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
Downloading builder script: 5.65kB [00:00, 453kB/s]                    


In [20]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred  # Obtaining predictions and true labels
    
    # Decoding predictions
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # Obtaining the true labels tokens, while eliminating any possible masked token (i.e., label=-100)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # ROUGE expects a newline after each sentence
    decoded_preds = ['\n'.join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ['\n'.join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    # Computing ROUGE score
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}  # Extracting some results
    
    # Adding mean-generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result['gen_len'] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\uurce\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
