In [2]:
import pandas as pd

In [3]:
# Load the three summary CSVs that the metric cells below evaluate.
# Those cells read the 'summary' and 'generated_summary' columns from each frame.
df1 = pd.read_csv("pegasus_generated_test_summaries.csv")
df2 = pd.read_csv("t5_generated_test_summaries.csv")
df3 = pd.read_csv("t5_base_test_summaries.csv")

# Sanity check: preview the first rows of the first two frames.
for heading, frame in (
    ("First DataFrame:", df1),
    ("\nSecond DataFrame:", df2),
):
    print(heading)
    print(frame.head())

First DataFrame:
                                             summary  \
0  Tana Jones requests access to the "Other Agree...   
1  Steve Kean has requested that each person in t...   
2  California State Sen. Steve Peace proposed set...   
3  Julie Ferrara asks Tana Jones if she received ...   
4  The email thread discusses the transition of t...   

                                   generated_summary  
0  The email thread discusses the need to open "O...  
1  Steve Kean sent an email to a group of recipie...  
2  The email thread discusses a proposal by Calif...  
3  The email thread discusses an amendment to the...  
4  The email thread discusses various topics rela...  

Second DataFrame:
                             subject  \
0  "Other Agreements" in Lotus Notes   
1               2000 ACCOMPLISHMENTS   
2             A chicken in every pot   
3     Amendment to License Agreement   
4                             Azurix   

                                       summary_input  \


In [3]:
# Install rouge_score
!pip install rouge-score

# Install bert_score
!pip install bert-score

# Install nltk
!pip install nltk


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=4795ad1f5f6d675a7c914a72cb41bf15eb74cbec5fd6ff99eb663071379583f3
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


# ROUGE SCORES

In [18]:
from rouge_score import rouge_scorer

def compute_rouge_scores(reference, generated):
    """Return ROUGE F-measures for a single reference/generated pair.

    A fresh ``RougeScorer`` (stemming enabled, ``split_summaries=True``) is
    built on every call and asked for rouge1, rouge2 and rougeLsum.

    Args:
        reference: Reference text, passed to the scorer as the first argument.
        generated: Generated text, passed to the scorer as the second argument.

    Returns:
        dict: Keys "ROUGE-1", "ROUGE-2" and "ROUGE-L" mapped to the
        ``fmeasure`` of the corresponding score. Note that "ROUGE-L" is
        read from the scorer's 'rougeLsum' entry.
    """
    # Output label -> rouge_score metric name (insertion order is the output order).
    metric_names = {
        "ROUGE-1": 'rouge1',
        "ROUGE-2": 'rouge2',
        "ROUGE-L": 'rougeLsum',
    }
    scorer = rouge_scorer.RougeScorer(
        list(metric_names.values()),
        use_stemmer=True,
        split_summaries=True,
    )
    pair_scores = scorer.score(reference, generated)
    return {
        label: pair_scores[metric].fmeasure
        for label, metric in metric_names.items()
    }

# Evaluate ROUGE
def evaluate_rouge(df, name):
    """Average the per-row ROUGE F-measures over a DataFrame.

    Each row's 'summary' (reference) and 'generated_summary' values are
    scored with ``compute_rouge_scores`` and the three metrics are averaged
    with a plain arithmetic mean.

    Args:
        df: DataFrame with 'summary' and 'generated_summary' columns.
        name: Label for the frame. Accepted for call-site readability; it is
            not used in the computation.

    Returns:
        dict: "Avg ROUGE-1", "Avg ROUGE-2" and "Avg ROUGE-L". When ``df`` has
        no rows every value is NaN (instead of raising ZeroDivisionError),
        matching what ``Series.mean()`` reports for an empty series.
    """
    # Iterate the two columns directly; building a Series per row via
    # iterrows() is unnecessary when only two values are needed.
    rouge_results = [
        compute_rouge_scores(reference, generated)
        for reference, generated in zip(df['summary'], df['generated_summary'])
    ]

    def average(metric):
        # No rows -> nothing to average; report NaN rather than dividing by zero.
        if not rouge_results:
            return float('nan')
        return sum(result[metric] for result in rouge_results) / len(rouge_results)

    return {
        "Avg ROUGE-1": average('ROUGE-1'),
        "Avg ROUGE-2": average('ROUGE-2'),
        "Avg ROUGE-L": average('ROUGE-L'),
    }

# Average ROUGE scores for each of the three loaded frames.
rouge_df1 = evaluate_rouge(df1, "DataFrame 1")
rouge_df2 = evaluate_rouge(df2, "DataFrame 2")
rouge_df3 = evaluate_rouge(df3, "T5 Base")

# Print one block per frame, four decimals per metric.
for heading, averages in (
    ("DataFrame 1", rouge_df1),
    ("DataFrame 2", rouge_df2),
    ("T5 Base", rouge_df3),
):
    print(
        f"{heading} Results:\n"
        f"ROUGE-1: {averages['Avg ROUGE-1']:.4f}\n"
        f"ROUGE-2: {averages['Avg ROUGE-2']:.4f}\n"
        f"ROUGE-L: {averages['Avg ROUGE-L']:.4f}"
    )


DataFrame 1 Results:
ROUGE-1: 0.4761
ROUGE-2: 0.2190
ROUGE-L: 0.4415
DataFrame 2 Results:
ROUGE-1: 0.4704
ROUGE-2: 0.2064
ROUGE-L: 0.4374
T5 Base Results:
ROUGE-1: 0.2652
ROUGE-2: 0.1061
ROUGE-L: 0.2437


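# SummEval Metrics

**Caveat:** the scores in this section come from `microsoft/deberta-v3-large` loaded without a fine-tuned classification head (see the "newly initialized" warning in the cell output below). They therefore reflect an untrained classifier's confidence, not a 1-5 quality rating, and should not be used to compare models.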

In [4]:
from transformers import pipeline
import torch

# Build the text-classification pipeline used by the SummEval-style scoring
# functions in this cell.
# NOTE(review): the recorded cell output warns that the classifier weights of
# microsoft/deberta-v3-large are newly initialized (not fine-tuned), so the
# scores this pipeline returns come from an untrained classification head.
if torch.cuda.is_available():
    device = 0   # first CUDA device
else:
    device = -1  # CPU
summarization_pipeline = pipeline(
    "text-classification",
    model="microsoft/deberta-v3-large",
    device=device,
)

def compute_summ_eval(reference, generated, summarization_pipeline):
    """Run four rating prompts through the pipeline for one summary pair.

    One prompt is built per dimension (Consistency, Coherence, Relevance,
    Fluency) and passed to ``summarization_pipeline``. The value recorded for
    each dimension is the ``"score"`` field of the first item the pipeline
    returns. The 1-5 scale mentioned in the prompt text is not parsed from
    the output.

    Args:
        reference: Reference text; interpolated into the Consistency and
            Relevance prompts.
        generated: Generated summary; interpolated into all four prompts.
        summarization_pipeline: Callable taking a prompt string and returning
            a sequence whose first element is a mapping with a "score" key.

    Returns:
        dict: "Consistency", "Coherence", "Relevance" and "Fluency" mapped to
        the pipeline scores, in that order.
    """
    # Prompt tails: two dimensions compare against the reference, two look at
    # the generated text alone.
    with_reference = f"Generated: {generated}\nReference: {reference}"
    generated_only = f"Generated: {generated}"

    # Insertion order fixes both the order of pipeline calls and of the result keys.
    prompts = {
        "Consistency": (
            "On a scale of 1-5, rate the factual consistency of the generated summary "
            "with respect to the reference text. Only assign a score of 5 if the facts align perfectly.\n"
            + with_reference
        ),
        "Coherence": (
            "On a scale of 1-5, rate the coherence of the generated summary. A score of 5 means "
            "the summary is logically structured and flows well without ambiguity.\n"
            + generated_only
        ),
        "Relevance": (
            "On a scale of 1-5, rate the relevance of the generated summary in covering the key points "
            "of the reference text. Assign a score of 5 if all critical points are addressed accurately.\n"
            + with_reference
        ),
        "Fluency": (
            "On a scale of 1-5, rate the fluency of the generated summary. A score of 5 means "
            "the summary is grammatically correct and uses language naturally.\n"
            + generated_only
        ),
    }

    return {
        dimension: summarization_pipeline(prompt)[0]["score"]
        for dimension, prompt in prompts.items()
    }


def evaluate_summ_eval(df, name, summarization_pipeline):
    """Average the four ``compute_summ_eval`` scores over a DataFrame.

    Args:
        df: DataFrame with 'summary' (reference) and 'generated_summary'
            columns.
        name: Label for the frame. Accepted for call-site readability; it is
            not used in the computation.
        summarization_pipeline: Forwarded unchanged to ``compute_summ_eval``.

    Returns:
        dict: "Consistency", "Coherence", "Relevance" and "Fluency" mapped to
        the arithmetic mean of the per-row scores. When ``df`` has no rows
        every value is NaN (instead of raising ZeroDivisionError).
    """
    results = {
        "Consistency": [],
        "Coherence": [],
        "Relevance": [],
        "Fluency": [],
    }
    # Iterate the two columns directly rather than materialising a Series per row.
    for reference, generated in zip(df['summary'], df['generated_summary']):
        metrics = compute_summ_eval(reference, generated, summarization_pipeline)
        for key, value in metrics.items():
            results[key].append(value)

    # An empty score list has no mean; report NaN rather than dividing by zero.
    return {
        key: (sum(values) / len(values) if values else float('nan'))
        for key, values in results.items()
    }

# Run the SummEval-style scoring on each of the three loaded frames.
summ_eval_df1 = evaluate_summ_eval(df1, "DataFrame 1", summarization_pipeline)
summ_eval_df2 = evaluate_summ_eval(df2, "DataFrame 2", summarization_pipeline)
summ_eval_df3 = evaluate_summ_eval(df3, "T5 Base", summarization_pipeline)

# Print one block per frame: a heading followed by the four averaged
# dimensions, each formatted to four decimals.
for heading, averages in (
    ("DataFrame 1", summ_eval_df1),
    ("DataFrame 2", summ_eval_df2),
    ("T5 Base", summ_eval_df3),
):
    metric_lines = "\n".join(
        f"{metric}: {averages[metric]:.4f}"
        for metric in ("Consistency", "Coherence", "Relevance", "Fluency")
    )
    print(f"{heading} Results:\n{metric_lines}")


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


DataFrame 1 Results:
Consistency: 0.6206
Coherence: 0.6044
Relevance: 0.6208
Fluency: 0.6045
DataFrame 2 Results:
Consistency: 0.6237
Coherence: 0.6113
Relevance: 0.6238
Fluency: 0.6114
T5 Base Results:
Consistency: 0.6158
Coherence: 0.5922
Relevance: 0.6160
Fluency: 0.5922


# METEOR Metrics

In [11]:
import nltk

# Download the NLTK resources the METEOR cell depends on.
# WordNet data for METEOR:
nltk.download('wordnet')
nltk.download('omw-1.4')  # Additional WordNet package for multilingual support
# Tokenizer models for word_tokenize, which the METEOR cell calls before scoring.
# Without them word_tokenize raises LookupError on a fresh environment.
# Newer NLTK releases look for 'punkt_tab', older ones for 'punkt'; requesting
# both is harmless because nltk.download reports an unknown id and returns
# False rather than raising.
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [5]:
import nltk
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize
import pandas as pd


def compute_meteor_score(reference, generated):
    """Return the METEOR score of one generated text against one reference.

    Both texts are tokenized with ``word_tokenize``. The reference tokens are
    wrapped in a single-element list because ``meteor_score`` is called with
    a list of references followed by the hypothesis tokens.

    Args:
        reference: Reference text (string).
        generated: Generated text (string).

    Returns:
        Whatever ``meteor_score`` returns for the pair.
    """
    references = [word_tokenize(reference)]
    hypothesis = word_tokenize(generated)
    return meteor_score(references, hypothesis)

def evaluate_meteor(df):
    """Average the per-row METEOR score over a DataFrame.

    Args:
        df: DataFrame with 'summary' (reference) and 'generated_summary'
            columns.

    Returns:
        dict: ``{"Avg METEOR": mean}`` where ``mean`` is the arithmetic mean
        of ``compute_meteor_score`` over all rows. When ``df`` has no rows
        the value is NaN (instead of raising ZeroDivisionError).
    """
    # Iterate the two columns directly rather than materialising a Series per row.
    meteor_results = [
        compute_meteor_score(reference, generated)
        for reference, generated in zip(df['summary'], df['generated_summary'])
    ]
    # No rows -> nothing to average; report NaN rather than dividing by zero.
    if not meteor_results:
        return {"Avg METEOR": float('nan')}
    return {"Avg METEOR": sum(meteor_results) / len(meteor_results)}

# Evaluate METEOR for each loaded frame and print the average under the same
# headings the other metric sections use for df1 / df2 / df3.
meteor_df1 = evaluate_meteor(df1)

print(f"DataFrame 1 Results:\nMETEOR: {meteor_df1['Avg METEOR']:.4f}")

meteor_df2 = evaluate_meteor(df2)

# df2's score is printed under its own heading so it cannot be mistaken for df1's.
print(f"DataFrame 2 Results:\nMETEOR: {meteor_df2['Avg METEOR']:.4f}")

meteor_df3 = evaluate_meteor(df3)

print(f"T5 Base Results: \nMETEOR: {meteor_df3['Avg METEOR']:.4f}")

DataFrame 1 Results:
METEOR: 0.3676
DataFrame 1 Results:
METEOR: 0.3954
T5 Base Results: 
METEOR: 0.1644


# Sentence BERT Embedding Similarity

In [16]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


# function to calculate cosine similarity
def calculate_similarity(summary, generated_summary):
    """
    Cosine similarity between the embeddings of two texts.

    Each text is encoded separately with the module-level ``model`` (which
    must be defined before this function is called), and the two embeddings
    are compared with ``cosine_similarity``.

    Args:
        summary (str): The reference summary.
        generated_summary (str): The generated summary.

    Returns:
        The single entry ``[0][0]`` of the 1x1 matrix that
        ``cosine_similarity`` returns for the pair.
    """
    reference_vector = model.encode(summary)
    candidate_vector = model.encode(generated_summary)

    # cosine_similarity works on 2-D inputs, so each vector is wrapped as a
    # one-row batch and the lone result is pulled back out.
    pairwise = cosine_similarity([reference_vector], [candidate_vector])
    return pairwise[0][0]

# Load the pre-trained Sentence-BERT model used by calculate_similarity.
model_name = 'all-roberta-large-v1'
print(f"Loading model: {model_name}")
model = SentenceTransformer(model_name)

# Run the same steps for every summaries file, in this order: read the CSV,
# score each (summary, generated_summary) pair, then report the mean similarity.
# After the loop `file_path`, `df` and `overall_similarity` refer to the last file.
for file_path in (
    't5_generated_test_summaries.csv',
    'pegasus_generated_test_summaries.csv',
    't5_base_test_summaries.csv',
):
    print(f"Loading dataset from: {file_path}")
    df = pd.read_csv(file_path)

    print("Calculating similarity for each summary pair...")
    df['similarity'] = df.apply(
        lambda row: calculate_similarity(row['summary'], row['generated_summary']),
        axis=1,
    )

    overall_similarity = df['similarity'].mean()
    print(f"Overall Semantic Similarity: {overall_similarity:.4f}")


Loading model: all-roberta-large-v1
Loading dataset from: t5_generated_test_summaries.csv
Calculating similarity for each summary pair...
Overall Semantic Similarity: 0.7616
Loading dataset from: pegasus_generated_test_summaries.csv
Calculating similarity for each summary pair...
Overall Semantic Similarity: 0.7426
Loading dataset from: t5_base_test_summaries.csv
Calculating similarity for each summary pair...
Overall Semantic Similarity: 0.5177
