 # Extractive Summarization - BERT

## 1.0 Install Libraries/Packages

In [1]:
%pip install -U datasets
%pip install transformers torch
%pip install rouge_score

Note: you may need to restart the kernel to use updated packages.
Collecting tbb==2021.* (from mkl<=2021.4.0,>=2021.1.1->torch)
  Using cached tbb-2021.12.0-py3-none-win_amd64.whl.metadata (1.1 kB)
Using cached tbb-2021.12.0-py3-none-win_amd64.whl (286 kB)
Installing collected packages: tbb
  Attempting uninstall: tbb
    Found existing installation: TBB 0.2
Note: you may need to restart the kernel to use updated packages.


ERROR: Cannot uninstall 'TBB'. It is a distutils installed project and thus we cannot accurately determine which files belong to it which would lead to only a partial uninstall.


Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import torch
from datasets import load_dataset
from datasets import load_metric
from transformers import BertTokenizer, BertModel
from typing import Dict, Any
import nltk
nltk.download('punkt')

  torch.utils._pytree._register_pytree_node(
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\edmun\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## 2.0 Load Dataset

In [3]:
# Load the test split of the PubMed summarization dataset from Hugging Face.
# trust_remote_code=True opts in explicitly to the dataset's loading code;
# `datasets` warns that this argument will become mandatory for this dataset.
dataset = load_dataset('ccdv/pubmed-summarization', split="test", trust_remote_code=True)

# Work on a fixed, reproducible sample: shuffle with seed 42 and keep the first 125 records
test_data = dataset.shuffle(seed=42).select(range(125))
test_df = pd.DataFrame(test_data)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


## 3.0 Transformer Based Extractive Summarization

In [4]:
# Load the pretrained uncased BERT encoder and its matching tokenizer from one checkpoint name
BERT_CHECKPOINT = 'bert-base-uncased'
model = BertModel.from_pretrained(BERT_CHECKPOINT)
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)

def summarize(text, model, tokenizer, num_sentences=5):
    """Build an extractive summary from the highest-scoring sentences of `text`.

    The text is lower-cased and split into sentences with NLTK. All sentences
    are encoded by `model` in one padded batch (each truncated to 128 tokens),
    and every sentence is scored by the L2 norm of the hidden state at
    position 0 of its sequence (the [CLS] slot for BERT-style tokenizers).

    Args:
        text: Document to summarize.
        model: Encoder whose output exposes `last_hidden_state` and which has
            a `device` attribute (e.g. a `BertModel`).
        tokenizer: Tokenizer matching `model`.
        num_sentences: Maximum number of sentences to keep.

    Returns:
        The selected (lower-cased) sentences joined by single spaces, ordered
        from highest to lowest score rather than by position in the document.
        An empty string is returned when `text` is None, empty, whitespace-only
        or yields no sentences.
    """
    # Nothing to summarize: bail out before an empty sentence list is handed
    # to the tokenizer and the model.
    if not text or not text.strip():
        return ''

    # Lower the text and tokenize into sentences
    sentences = nltk.sent_tokenize(text.lower())
    if not sentences:
        return ''

    # Tokenize each sentence and prepare for model input
    tokenized_batches = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True, max_length=128, add_special_tokens=True)

    # Process tokenized text through the model (inference only, no gradients)
    with torch.no_grad():
        model_inputs = {key: tensor.to(model.device) for key, tensor in tokenized_batches.items()}
        outputs = model(**model_inputs)
        embeddings = outputs.last_hidden_state[:, 0, :]  # Hidden state of the first ([CLS]) token per sentence

    # Score each sentence by the norm of its embedding and rank best-first
    scores = torch.norm(embeddings, dim=1)
    sorted_indices = torch.argsort(scores, descending=True)

    # Select top sentences based on sorted indices
    best_sentences = [sentences[idx] for idx in sorted_indices[:num_sentences]]
    return ' '.join(best_sentences)

def _summarize_article(article):
    """Summarize one article with the notebook-level BERT model and tokenizer."""
    return summarize(article, model, tokenizer)


# One extractive summary per article, stored alongside the source rows
test_df['generated_summary'] = test_df['article'].apply(_summarize_article)

In [5]:
# Persist the sampled records together with their generated summaries as a spreadsheet
test_df.to_excel(excel_writer='Extractive_BERT_summary.xlsx', index=False)

## 4.0 Evaluation

In [6]:
# Generated summaries are scored against the reference abstracts, row for row
predictions, references = (
    list(test_df[column]) for column in ('generated_summary', 'abstract')
)

In [7]:
# ROUGE
# trust_remote_code=True opts in explicitly to the metric's loading code;
# `datasets` warns that this argument will become mandatory for this metric.
rouge = load_metric("rouge", trust_remote_code=True)
rouge_scores = rouge.compute(predictions=predictions, references=references)
print(rouge_scores)

def simplify_rouge_scores(rouge_scores: Dict[str, Any]) -> str:
    """Render aggregated ROUGE scores as one readable line per metric.

    Args:
        rouge_scores: Mapping from metric name to an aggregate object exposing
            `.low` and `.high`, each of which carries `.precision`, `.recall`
            and `.fmeasure` as fractions.

    Returns:
        One newline-terminated line per metric, giving the low-to-high range
        of precision, recall and F1 formatted as percentages with two
        decimals. An empty mapping yields an empty string.
    """
    lines = []
    for key, value in rouge_scores.items():
        # Only the low and high bounds are reported; the ranges are low -> high
        low, high = value.low, value.high
        lines.append(
            f"{key}: Precision ranges from {low.precision:.2%} to {high.precision:.2%}, "
            f"Recall ranges from {low.recall:.2%} to {high.recall:.2%}, "
            f"F1 Score ranges from {low.fmeasure:.2%} to {high.fmeasure:.2%}.\n"
        )

    return "".join(lines)

# Human-readable ranges first, then the raw aggregate scores for reference
readable_report = simplify_rouge_scores(rouge_scores)
print(readable_report)

print(rouge_scores)


  rouge = load_metric("rouge")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'rouge1': AggregateScore(low=Score(precision=0.3077256845523563, recall=0.275496588450005, fmeasure=0.2717801542760288), mid=Score(precision=0.3295387492713281, recall=0.2986937338328513, fmeasure=0.2877537272856612), high=Score(precision=0.3523107552704317, recall=0.3213309187305278, fmeasure=0.30312962149956213)), 'rouge2': AggregateScore(low=Score(precision=0.07176920207293085, recall=0.06509838733104711, fmeasure=0.06386349778510031), mid=Score(precision=0.08653680914092493, recall=0.0778592337126677, fmeasure=0.0743377849829847), high=Score(precision=0.10648251630214, recall=0.09269887707948012, fmeasure=0.08608877607815905)), 'rougeL': AggregateScore(low=Score(precision=0.16214057863649164, recall=0.1450581176464134, fmeasure=0.14252135548154474), mid=Score(precision=0.17459375243351072, recall=0.1575473129444167, fmeasure=0.15086526734411038), high=Score(precision=0.1897872838034455, recall=0.17128858761062093, fmeasure=0.1589773726333161)), 'rougeLsum': AggregateScore(low=Scor