 # Extractive Summarization - BERT

## 1.0 Install Libraries/Packages

In [3]:
%pip install -U datasets
%pip install transformers torch
%pip install rouge_score
%pip install bert-score


Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-any

In [4]:
import pandas as pd
import torch
from datasets import load_dataset
from datasets import load_metric
from transformers import BertTokenizer, BertModel
from typing import Dict, Any
import nltk
nltk.download('punkt')
from bert_score import score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## 2.0 Load Dataset

In [5]:
# Load test dataset from huggingface
dataset = load_dataset('ccdv/pubmed-summarization', split="test")

# Take only 125 records from specified seed
test_data = dataset.shuffle(seed=42).select(range(125))
test_df = pd.DataFrame(test_data)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/5.13k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/2.66k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/779M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/43.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/43.8M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

## 3.0 Transformer Based Extractive Summarization

In [6]:
# Initialize the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def summarize(text, model, tokenizer, num_sentences=5):
    # Lower the text and tokenize into sentences
    sentences = nltk.sent_tokenize(text.lower())

    # Tokenize each sentence and prepare for model input
    tokenized_batches = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True, max_length=128, add_special_tokens=True)

    # Process tokenized text through the model
    with torch.no_grad():
        outputs = model(**{key: tokenized_batches[key].to(model.device) for key in tokenized_batches})
        embeddings = outputs.last_hidden_state[:, 0, :]  # Get embeddings for [CLS] token

    # Calculate norms and sort by scores
    scores = torch.norm(embeddings, dim=1)
    sorted_indices = torch.argsort(scores, descending=True)

    # Select top sentences based on sorted indices
    best_sentences = [sentences[idx] for idx in sorted_indices[:num_sentences]]
    return ' '.join(best_sentences)

test_df['generated_summary'] = test_df['article'].apply(lambda x: summarize(x, model, tokenizer))




tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [16]:
# Export results to excel
test_df.to_excel('Extractive_BERT_summary.xlsx', index = False)


## 4.0 Evaluation

In [7]:
predictions = list(test_df['generated_summary'])
references = list(test_df['abstract'])


In [11]:
# ROUGE
rouge = load_metric("rouge")
rouge_scores = rouge.compute(predictions=predictions, references=references)
print(rouge_scores)

def simplify_rouge_scores(rouge_scores: Dict[str, Any]) -> str:
    simplified_text = ""
    for key, value in rouge_scores.items():
        # Extract low, mid, and high scores for each ROUGE metric
        low, mid, high = value.low, value.mid, value.high
        simplified_text += f"{key}: Precision ranges from {low.precision:.2%} to {high.precision:.2%}, "
        simplified_text += f"Recall ranges from {low.recall:.2%} to {high.recall:.2%}, "
        simplified_text += f"F1 Score ranges from {low.fmeasure:.2%} to {high.fmeasure:.2%}.\n"

    return simplified_text

print(simplify_rouge_scores(rouge_scores))

print(rouge_scores)


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'rouge1': AggregateScore(low=Score(precision=0.30781836406673413, recall=0.27359369776357434, fmeasure=0.27019898575868495), mid=Score(precision=0.32920030067690204, recall=0.2976698334895057, fmeasure=0.28699735464694576), high=Score(precision=0.34971518366040794, recall=0.3218809477344484, fmeasure=0.30355825334441483)), 'rouge2': AggregateScore(low=Score(precision=0.07114652263265135, recall=0.06630670945081046, fmeasure=0.06428546568576578), mid=Score(precision=0.08625965247753759, recall=0.07768496327445304, fmeasure=0.07475671174369244), high=Score(precision=0.10657167240936344, recall=0.0917364637190153, fmeasure=0.08570934770452417)), 'rougeL': AggregateScore(low=Score(precision=0.1622808025858558, recall=0.1451640734470797, fmeasure=0.14235637430089715), mid=Score(precision=0.1752588088776982, recall=0.15729690905535493, fmeasure=0.15097149763799872), high=Score(precision=0.19155584239574094, recall=0.1723728362320646, fmeasure=0.15987349977282925)), 'rougeLsum': AggregateSco

In [13]:
# BERT
def evaluate_summaries(df):
    refs = df['abstract'].tolist()
    hypos = df['generated_summary'].tolist()

    # Compute BERTScore
    P, R, F1 = score(hypos, refs, lang="en", rescale_with_baseline=True)

    # Create DataFrame for scores
    scores_df = pd.DataFrame({'Precision': P.tolist(), 'Recall': R.tolist(), 'F1': F1.tolist()})

    return scores_df.describe()

bert_scores = evaluate_summaries(test_df)
bert_scores.mean()  # Mean scores across dataset





tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Precision    15.620279
Recall       15.619496
F1           15.591800
dtype: float64

In [12]:
# METEOR
from nltk.translate.meteor_score import meteor_score

# Ensure required NLTK resources are downloaded
nltk.download('wordnet')
nltk.download('omw-1.4')

def evaluate_summaries_meteor(df, summary_col, reference_col):
    # Tokenize summaries and references before passing to meteor_score
    scores = [
        meteor_score(
            [nltk.word_tokenize(row[reference_col])],
            nltk.word_tokenize(row[summary_col])
        ) for _, row in df.iterrows()
    ]
    return sum(scores) / len(scores)  # Calculate the average METEOR score

# Assuming 'test_df' has the columns 'generated_summary' and 'reference_summary'
meteor_average_score = evaluate_summaries_meteor(test_df, 'generated_summary', 'abstract')
print("Average METEOR Score:", meteor_average_score)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Average METEOR Score: 0.2125544343018366
