### Setup

In [None]:
!pip install datasets
!pip install google-cloud-storage
!pip install transformers[sentencepiece]  # `sentencepiece` is needed by the Google's pegasus model

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Upload the service-account key used for GCS access.
# The file must be named `cloud-storage-access.json`, because that is the path
# exported through GOOGLE_APPLICATION_CREDENTIALS in the next cell.
from google.colab import files
uploaded = files.upload()

# `files.upload()` returns a dict keyed by the uploaded file names; fail right
# away if the expected key file is not among them instead of hitting an
# authentication error much later in the notebook.
if "cloud-storage-access.json" not in uploaded:
  raise FileNotFoundError(
      "Expected an upload named 'cloud-storage-access.json', got: "
      f"{sorted(uploaded)}"
  )

Saving cloud-storage-access.json to cloud-storage-access.json


In [None]:
import os

# Export the path of the uploaded key file for this process; the
# `storage.Client()` created in the upload cell is constructed without
# explicit credentials, so it relies on the environment.
os.environ.update(
    {"GOOGLE_APPLICATION_CREDENTIALS": "cloud-storage-access.json"}
)

In [None]:
# Hugging Face Hub id of the summarization checkpoint to evaluate.
MODEL_NAME = "google/pegasus-cnn_dailymail"

# Name of the predictions CSV (local file and GCS object). It is derived from
# the checkpoint name (the part after the organisation prefix) so that
# changing MODEL_NAME can never silently overwrite another model's results.
FILENAME = f"hf-pretrained-{MODEL_NAME.split('/')[-1]}.csv"

### Load dataset

In [None]:
from datasets import load_dataset

# Fetch the CNN/DailyMail corpus, configuration "3.0.0".
# Splits are accessed by name further down (e.g. "validation").
cnn_dataset = load_dataset(
    path="cnn_dailymail",
    name="3.0.0",
)



  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Keep the validation split in two forms:
#   * `val_dataset` - the `datasets` object, fed to the summarization pipeline
#     and used to look up article ids during inference;
#   * `val_df` - a pandas copy, iterated row by row when computing ROUGE.
val_dataset = cnn_dataset["validation"]
val_df = val_dataset.to_pandas()

### Load Hugging Face Pretrained Pipeline

In [None]:
import torch
from transformers import pipeline

# Prefer the first CUDA GPU. On a runtime without one, fall back to the CPU
# (much slower) with a visible warning instead of failing on a hard-coded
# "cuda:0" device.
if torch.cuda.is_available():
  device = "cuda:0"
else:
  device = "cpu"
  print("WARNING: no CUDA device found; running the summarizer on CPU.")

# Summarization pipeline backed by the checkpoint named in MODEL_NAME.
summarizer = pipeline("summarization", model=MODEL_NAME, device=device)

### Run the inference

In [None]:
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset

# Maps article id -> generated summary text.
predictions = {}

# Stream only the "article" column into the pipeline, 4 articles per batch,
# with truncation enabled for the inputs.
val_articles = KeyDataset(val_dataset, "article")
pred_dataset = summarizer(val_articles, batch_size=4, truncation=True)

# Read the id column once instead of materialising a full row (article text
# included) for every prediction just to obtain its id. The pipeline yields
# results in input order, so ids and predictions pair up positionally.
article_ids = val_dataset["id"]
for article_id, p in tqdm(zip(article_ids, pred_dataset), total=len(val_articles)):
  # Each pipeline result is a list whose first item holds the summary text.
  predictions[article_id] = p[0]["summary_text"]

100%|██████████| 13368/13368 [5:05:32<00:00,  1.37s/it]


In [None]:
import pandas as pd

# Flatten the id -> summary mapping into one record per article, then build
# the two-column table ("id", "prediction") from those records.
records = []
for article_id, summary in predictions.items():
  records.append(dict(id=article_id, prediction=summary))

predictions_df = pd.DataFrame(records)

In [None]:
# Upload the result to bucket.
from google.cloud import storage

storage_client = storage.Client()
bucket_name = "cnn-dailymail-predictions"
bucket = storage_client.get_bucket(bucket_name)

predictions_df.to_csv(FILENAME)
blob = bucket.blob(FILENAME)
blob.upload_from_filename(FILENAME)

### Calculate ROUGE score

In [None]:
!pip install rouge-score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import numpy as np
from rouge_score import rouge_scorer
from tqdm import tqdm

# Scores every validation summary against its reference highlights and
# reports the mean F-measure for ROUGE-1, ROUGE-2 and ROUGE-L.
# NOTE(review): Pegasus summaries may contain literal "<n>" sentence
# separators - confirm whether they should be normalised before scoring.
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"])

rouge1_scores, rouge2_scores, rougeL_scores = [], [], []

# Walk the id and reference columns side by side; the generated summary is
# looked up by article id in `predictions`.
id_reference_pairs = zip(val_df["id"], val_df["highlights"])
for article_id, reference in tqdm(id_reference_pairs, total=len(val_df)):
  result = scorer.score(target=reference, prediction=predictions[article_id])
  rouge1_scores.append(result["rouge1"].fmeasure)
  rouge2_scores.append(result["rouge2"].fmeasure)
  rougeL_scores.append(result["rougeL"].fmeasure)

print(f'''
rouge 1: {np.average(rouge1_scores)}
rouge 2: {np.average(rouge2_scores)}
rouge L: {np.average(rougeL_scores)}
''')

100%|██████████| 13368/13368 [00:21<00:00, 617.92it/s]


rouge 1: 0.43464696983902695
rouge 2: 0.21209207646227116
rouge L: 0.30585258818971944




