This notebook is a continuation of the Objective Summary_Exploration notebook. In the exploration notebook, we analyzed various models and selected a final model using sample data. In this notebook, we run the final selected model on all the data.

## **Set up**

In [1]:
pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=c4eea32305800bd7589c2e943a4394d24e287d31a1f3fb26e332bd73ff98b79d
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

In [3]:
def fix_apostrophes(text):
    """Return ``text`` with every occurrence of the substring ``apos`` turned into ``'``.

    The match is a plain, case-sensitive substring match, so it also fires
    inside longer words that happen to contain ``apos``.
    """
    pieces = text.split('apos')
    return "'".join(pieces)

def prepare_input(sentences):
    """Concatenate an iterable of sentence strings into one string.

    Consecutive sentences are separated by a period followed by a space;
    nothing is appended after the last sentence.
    """
    separator = ". "
    joined = separator.join(sentences)
    return joined

def clean_summary(summary):
    """Drop stray single characters and runs of a repeated word from ``summary``.

    The text is split on whitespace. A token is kept only if it is longer than
    one character or is ``a``/``i`` (any case), and only if it would not be the
    third identical token in a row. Kept tokens are re-joined with single spaces.

    NOTE: ``clean_summary`` is defined a second time further down in this
    notebook; once that cell runs, this version is no longer bound to the name.
    """
    kept = []
    for token in summary.split():
        # Single characters other than "a"/"i" are treated as noise.
        is_meaningful = len(token) > 1 or token.lower() in ('a', 'i')
        if not is_meaningful:
            continue
        # Skip a token that merely repeats both of the two previously kept ones.
        if len(kept) >= 2 and token == kept[-1] and token == kept[-2]:
            continue
        kept.append(token)
    return ' '.join(kept).strip()

In [4]:
# Sentence-level table; the columns read below are 'objectivity_score',
# 'sentence' and 'title'.
url = "https://raw.githubusercontent.com/zhangke626/w266_project/main/training_data_with_objectivity.csv"
df = pd.read_csv(url)

# Article-level table; `url` is reused for the second download. The merge below
# reads 'title', 'article', 'publication', 'section' and 'url' from it.
url = "https://raw.githubusercontent.com/zhangke626/w266_project/main/initial_data.csv"
df_article = pd.read_csv(url)

# Keep only rows whose objectivity_score equals 1 (presumably the sentences
# labelled objective -- confirm against the exploration notebook). `.copy()`
# gives an independent frame so the assignment below does not touch `df`.
objective_sentences = df[df['objectivity_score'] == 1].copy()
# Fix apostrophes in individual sentences
objective_sentences.loc[:, 'sentence'] = objective_sentences['sentence'].apply(fix_apostrophes)

# Collect each title's sentences into a list: one row per title, with the list
# in the 'sentence' column.
grouped_sentences = objective_sentences.groupby('title')['sentence'].apply(list).reset_index()

# Join each title's sentence list into the single string fed to the summarizer.
grouped_sentences['input_text'] = grouped_sentences['sentence'].apply(prepare_input)

# Attach article text and metadata by title. how='left' keeps every grouped
# title even when df_article has no matching row.
grouped_sentences = grouped_sentences.merge(df_article[['title', 'article', 'publication', 'section', 'url']], on='title', how='left')

In [5]:
# Compare the titles in the raw article table with the titles that still have
# at least one row after filtering/grouping, and list the ones that dropped out.
df_article_titles = set(df_article['title'])
grouped_sentences_titles = set(grouped_sentences['title'])

missing_titles = df_article_titles - grouped_sentences_titles

print(f"Number of missing titles: {len(missing_titles)}")
print("Missing titles:")
# Set iteration order is not stable across kernel restarts, so print in sorted
# order to keep the cell output reproducible. key=str keeps the sort from
# failing if a title is not a string (e.g. a missing value).
for title in sorted(missing_titles, key=str):
    print(title)

Number of missing titles: 3
Missing titles:
Tim Kaine's convention speech: 5 moments that made a splash on Twitter
BIRTHDAY OF THE DAY: Sen. Michael Bennet (D-Colo.)
Should Democrats Fear Trump?


These articles are missing because they are very short and not objective.

## **Final Model**

In [6]:
import re
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch

# generate_summary() samples tokens (do_sample=True), so seed torch's random
# number generator before any generation. Without a seed, every run of the
# notebook produces different summaries and different ROUGE averages.
SEED = 42
torch.manual_seed(SEED)

# Load the tokenizer and the pretrained Pegasus checkpoint.
model_name = "google/pegasus-large"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

def clean_text(text):
    """Normalise ``text`` before tokenisation.

    Each run of non-ASCII characters is replaced by a single space, then every
    run of whitespace is collapsed to one space and the ends are trimmed.
    """
    ascii_only = re.sub(r'[^\x00-\x7F]+', ' ', text)
    return re.sub(r'\s+', ' ', ascii_only).strip()

def clean_summary(summary):
    """Tidy a decoded model summary.

    Space-separated runs of the same sentence terminator (". . .", "! !",
    "? ?") are collapsed to a single mark. Then every character that is not an
    ASCII letter, a digit, one of ``. , ! ? ; ' "`` or whitespace is removed,
    and the result is stripped of leading/trailing whitespace.
    """
    # (pattern, replacement) pairs, applied in this order.
    substitutions = (
        (r'(\. )+\.', '.'),
        (r'(\! )+\!', '!'),
        (r'(\? )+\?', '?'),
        (r'[^a-zA-Z0-9.,!?;\'\"\s]', ''),
    )
    cleaned = summary
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def get_dynamic_length(input_token_count):
    """Choose generation length bounds from the size of the input.

    The input is placed in one of five size tiers. Within a tier,
    ``max_length`` is a fraction of the token count capped at a tier maximum,
    and ``min_length`` is a (smaller) fraction of the token count raised to a
    tier floor.

    Args:
        input_token_count: Number of tokens in the input text.

    Returns:
        ``(max_length, min_length)`` as ints, always satisfying
        ``0 <= min_length < max_length``.
    """
    # (exclusive upper bound on tokens, max ratio, max cap, min ratio, min floor)
    tiers = (
        (200, 0.8, 160, 0.5, 50),             # Very Short
        (400, 0.7, 280, 0.4, 80),             # Short
        (600, 0.6, 360, 0.3, 120),            # Medium
        (800, 0.5, 400, 0.25, 150),           # Long
        (float('inf'), 0.4, 500, 0.2, 200),   # Very Long
    )
    for upper_bound, max_ratio, max_cap, min_ratio, min_floor in tiers:
        if input_token_count < upper_bound:
            break

    max_length = min(int(input_token_count * max_ratio), max_cap)
    min_length = max(int(input_token_count * min_ratio), min_floor)

    # Tiny inputs (0 or 1 tokens) would otherwise give max_length == 0 and
    # min_length == -1; keep both bounds in a valid range.
    max_length = max(max_length, 1)
    # The tier floor can exceed max_length for short inputs, so keep
    # min_length strictly below it -- but never negative.
    min_length = max(0, min(min_length, max_length - 1))
    return max_length, min_length

def generate_summary(text):
    """Summarise ``text`` with the module-level Pegasus ``model``/``tokenizer``.

    Steps: normalise the text with ``clean_text``, derive the generation length
    bounds from its token count, encode it (truncated to 1024 tokens), generate
    with sampled beam search, decode, and post-process with ``clean_summary``.

    Returns:
        The cleaned summary string. If any step raises, the error is printed
        and the fixed string "Error in summarization process." is returned.
    """
    try:
        source_text = clean_text(text)

        # The length budget is based on the token count of the whole cleaned
        # text, before the 1024-token truncation applied below.
        n_tokens = len(tokenizer.encode(source_text))
        max_length, min_length = get_dynamic_length(n_tokens)

        max_input_length = 1024
        encoded = tokenizer(
            source_text,
            return_tensors="pt",
            max_length=max_input_length,
            truncation=True,
        )
        input_ids = encoded.input_ids.to(device)

        generation_kwargs = dict(
            max_length=max_length,
            min_length=min_length,
            length_penalty=1.5,
            num_beams=6,
            early_stopping=True,
            no_repeat_ngram_size=3,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.7,
        )
        summary_ids = model.generate(input_ids, **generation_kwargs)

        # Only the first returned sequence is used.
        decoded = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        return clean_summary(decoded)

    except Exception as e:
        print(f"Error generating summary: {str(e)}")
        return "Error in summarization process."

# Summarise every article's joined objective sentences, one row at a time.
grouped_sentences['summary'] = grouped_sentences['input_text'].map(generate_summary)


from rouge_score import rouge_scorer

# One scorer instance shared by every calculate_rouge() call.
scorer = rouge_scorer.RougeScorer(
    ['rouge1', 'rouge2', 'rougeL'],
    use_stemmer=True,
)

def calculate_rouge(reference, hypothesis):
    """Score ``hypothesis`` against ``reference`` with the module-level ``scorer``.

    Returns whatever ``scorer.score`` returns. The code that consumes it
    indexes the result by 'rouge1' / 'rouge2' / 'rougeL' and reads
    ``.precision``, ``.recall`` and ``.fmeasure`` from each entry.
    """
    return scorer.score(reference, hypothesis)

# Score each generated summary against its full article text.
grouped_sentences['rouge_scores'] = grouped_sentences.apply(
    lambda row: calculate_rouge(row['article'], row['summary']),
    axis=1,
)

# Extracting all ROUGE metrics: one column per (ROUGE type, statistic) pair,
# created in the order rouge1_*, rouge2_*, rougeL_*.
for rouge_type in ('rouge1', 'rouge2', 'rougeL'):
    for column_suffix, score_attr in (('precision', 'precision'), ('recall', 'recall'), ('f1', 'fmeasure')):
        # Default arguments bind the current loop values inside the lambda.
        grouped_sentences[f'{rouge_type}_{column_suffix}'] = grouped_sentences['rouge_scores'].apply(
            lambda scores, rouge_type=rouge_type, score_attr=score_attr: getattr(scores[rouge_type], score_attr)
        )

# Report the mean F1 of each ROUGE type over all articles.
print("Average ROUGE scores:")
print(f"ROUGE-1: {grouped_sentences['rouge1_f1'].mean():.4f}")
print(f"ROUGE-2: {grouped_sentences['rouge2_f1'].mean():.4f}")
print(f"ROUGE-L: {grouped_sentences['rougeL_f1'].mean():.4f}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/3.09k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/260 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1214 > 1024). Running this sequence through the model will result in indexing errors


Average ROUGE scores:
ROUGE-1: 0.3524
ROUGE-2: 0.3193
ROUGE-L: 0.3276


In [8]:
from google.colab import files
# Write the full results table to a CSV in the runtime's working directory,
# then hand that file to Colab's download helper.
output_filename = 'summary_final.csv'
grouped_sentences.to_csv(output_filename)
files.download(output_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>