In [1]:
import pandas as pd

In [2]:
df1 = pd.read_csv("/content/drive/MyDrive/Project/pegasus_generated_test_summaries.csv")
df2 = pd.read_csv("/content/drive/MyDrive/Project/t5_generated_test_summaries.csv")

# Display the first few rows to ensure it's loaded correctly
print("First DataFrame:")
print(df1.head())

print("\nSecond DataFrame:")
print(df2.head())

First DataFrame:
                                             summary  \
0  Tana Jones requests access to the "Other Agree...   
1  Steve Kean has requested that each person in t...   
2  California State Sen. Steve Peace proposed set...   
3  Julie Ferrara asks Tana Jones if she received ...   
4  The email thread discusses the transition of t...   

                                   generated_summary  
0  The email thread discusses the need to open "O...  
1  Steve Kean sent an email to a group of recipie...  
2  The email thread discusses a proposal by Calif...  
3  The email thread discusses an amendment to the...  
4  The email thread discusses various topics rela...  

Second DataFrame:
                             subject  \
0  "Other Agreements" in Lotus Notes   
1               2000 ACCOMPLISHMENTS   
2             A chicken in every pot   
3     Amendment to License Agreement   
4                             Azurix   

                                       summary_input  \


In [3]:
df2.columns

Index(['subject', 'summary_input', 'processed_messages', 'summary',
       'generated_summary'],
      dtype='object')

In [4]:
df1.columns

Index(['summary', 'generated_summary'], dtype='object')

#Metrics

In [5]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
# Install rouge_score
!pip install rouge-score

# Install bert_score
!pip install bert-score

# Install nltk
!pip install nltk


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=4795ad1f5f6d675a7c914a72cb41bf15eb74cbec5fd6ff99eb663071379583f3
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [4]:
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import score

# Initialize ROUGE scorer
rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Helper function to calculate ROUGE
def calculate_rouge(summary, generated_summary):
    scores = rouge_scorer.score(summary, generated_summary)
    return {
        'ROUGE-1': scores['rouge1'].fmeasure,
        'ROUGE-2': scores['rouge2'].fmeasure,
        'ROUGE-L': scores['rougeL'].fmeasure
    }

# Helper function to calculate BLEU
def calculate_bleu(summary, generated_summary):

    smoothing = SmoothingFunction().method4
    reference = summary.split()
    candidate = generated_summary.split()
    return sentence_bleu([reference], candidate, smoothing_function=smoothing)

# Helper function to calculate BERTScore
def calculate_bertscore(summaries, generated_summaries):
    P, R, F1 = score(generated_summaries, summaries, lang="en", verbose=True)
    return F1.mean().item()

# Applying metrics to both DataFrames
def evaluate_metrics(df):
    rouge_scores = []
    bleu_scores = []

    for idx, row in df.iterrows():
        rouge = calculate_rouge(row['summary'], row['generated_summary'])
        bleu = calculate_bleu(row['summary'], row['generated_summary'])

        rouge_scores.append(rouge)
        bleu_scores.append(bleu)

    # ROUGE (average scores)
    avg_rouge1 = sum([r['ROUGE-1'] for r in rouge_scores]) / len(rouge_scores)
    avg_rouge2 = sum([r['ROUGE-2'] for r in rouge_scores]) / len(rouge_scores)
    avg_rougeL = sum([r['ROUGE-L'] for r in rouge_scores]) / len(rouge_scores)

    # BLEU (average score)
    avg_bleu = sum(bleu_scores) / len(bleu_scores)

    # BERTScore
    bertscore = calculate_bertscore(df['summary'].tolist(), df['generated_summary'].tolist())

    return {
        'Avg ROUGE-1': avg_rouge1,
        'Avg ROUGE-2': avg_rouge2,
        'Avg ROUGE-L': avg_rougeL,
        'Avg BLEU': avg_bleu,
        'BERTScore': bertscore
    }

# Evaluate for both DataFrames
results_df1 = evaluate_metrics(df1)
results_df2 = evaluate_metrics(df2)

# Print results
print("Metrics for DataFrame 1:")
print(results_df1)

print("\nMetrics for DataFrame 2:")
print(results_df2)


NameError: name 'df1' is not defined

In [None]:
from rouge_score import rouge_scorer
from transformers import pipeline
import torch

# Define ROUGE scorer
def compute_rouge_scores(reference, generated):
    """
    Compute ROUGE-1, ROUGE-2, and ROUGE-L scores between reference and generated summaries.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference, generated)
    return {
        "ROUGE-1": scores['rouge1'].fmeasure,
        "ROUGE-2": scores['rouge2'].fmeasure,
        "ROUGE-L": scores['rougeL'].fmeasure
    }

# Define GPT-based evaluation
def compute_g_eval_batch(g_eval_pipeline, references, generated_summaries):
    """
    Evaluate generated summaries using a pre-trained language model.
    Scoring for coherence, fluency, and relevance in batch.
    """
    coherence_scores = g_eval_pipeline([f"Coherence: {gen}\n\nReference: {ref}" for ref, gen in zip(references, generated_summaries)])
    fluency_scores = g_eval_pipeline([f"Fluency: {gen}" for gen in generated_summaries])
    relevance_scores = g_eval_pipeline([f"Relevance: {gen}\n\nReference: {ref}" for ref, gen in zip(references, generated_summaries)])

    return {
        "Coherence": coherence_scores,
        "Fluency": fluency_scores,
        "Relevance": relevance_scores
    }

# Evaluate DataFrame
def evaluate_dataframe(df, name, g_eval_pipeline, batch_size=16):
    """
    Compute ROUGE and G-Eval metrics for all rows in a DataFrame.
    """
    rouge_results = []
    g_eval_results = {"Coherence": [], "Fluency": [], "Relevance": []}

    print(f"Evaluating DataFrame: {name}")

    # Process in batches
    for i in range(0, len(df), batch_size):
        batch = df.iloc[i:i+batch_size]
        references = batch['summary'].tolist()
        generated_summaries = batch['generated_summary'].tolist()

        # Compute ROUGE scores for the batch
        for ref, gen in zip(references, generated_summaries):
            rouge_scores = compute_rouge_scores(ref, gen)
            rouge_results.append(rouge_scores)

        # Compute G-Eval scores for the batch
        g_eval_batch_results = compute_g_eval_batch(g_eval_pipeline, references, generated_summaries)
        g_eval_results["Coherence"].extend(g_eval_batch_results["Coherence"])
        g_eval_results["Fluency"].extend(g_eval_batch_results["Fluency"])
        g_eval_results["Relevance"].extend(g_eval_batch_results["Relevance"])

    # Aggregate ROUGE scores
    avg_rouge1 = sum(r['ROUGE-1'] for r in rouge_results) / len(rouge_results)
    avg_rouge2 = sum(r['ROUGE-2'] for r in rouge_results) / len(rouge_results)
    avg_rougeL = sum(r['ROUGE-L'] for r in rouge_results) / len(rouge_results)

    return {
        "ROUGE": {
            "Avg ROUGE-1": avg_rouge1,
            "Avg ROUGE-2": avg_rouge2,
            "Avg ROUGE-L": avg_rougeL
        },
        "G-Eval": g_eval_results
    }

# Initialize the G-Eval pipeline (use GPU if available)
device = 0 if torch.cuda.is_available() else -1
g_eval_pipeline = pipeline("text-classification", model="microsoft/deberta-v3-large", device=device)

# Example usage for df1 and df2
df1_results = evaluate_dataframe(df1, "DataFrame 1", g_eval_pipeline)
df2_results = evaluate_dataframe(df2, "DataFrame 2", g_eval_pipeline)

# Print results
print("Results for DataFrame 1:")
print(df1_results)

print("\nResults for DataFrame 2:")
print(df2_results)


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluating DataFrame: DataFrame 1


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Evaluating DataFrame: DataFrame 2
Results for DataFrame 1:
{'ROUGE': {'Avg ROUGE-1': 0.4760943915392911, 'Avg ROUGE-2': 0.21896795223983684, 'Avg ROUGE-L': 0.30974402171601995}, 'G-Eval': {'Coherence': [{'label': 'LABEL_1', 'score': 0.5355116128921509}, {'label': 'LABEL_1', 'score': 0.5324952602386475}, {'label': 'LABEL_1', 'score': 0.5354558825492859}, {'label': 'LABEL_1', 'score': 0.5327616333961487}, {'label': 'LABEL_1', 'score': 0.5365875363349915}, {'label': 'LABEL_1', 'score': 0.5346201658248901}, {'label': 'LABEL_1', 'score': 0.5357369184494019}, {'label': 'LABEL_1', 'score': 0.5337560176849365}, {'label': 'LABEL_1', 'score': 0.5369905233383179}, {'label': 'LABEL_1', 'score': 0.5346193313598633}, {'label': 'LABEL_1', 'score': 0.5347884297370911}, {'label': 'LABEL_1', 'score': 0.5389056205749512}, {'label': 'LABEL_1', 'score': 0.5303131341934204}, {'label': 'LABEL_1', 'score': 0.5348712205886841}, {'label': 'LABEL_1', 'score': 0.5356088280677795}, {'label': 'LABEL_1', 'score': 0.

In [None]:
from rouge_score import rouge_scorer
from transformers import pipeline
import torch

# Define ROUGE scorer
def compute_rouge_scores(reference, generated):
    """
    Compute ROUGE-1, ROUGE-2, and ROUGE-L scores between reference and generated summaries.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference, generated)
    return {
        "ROUGE-1": scores['rouge1'].fmeasure,
        "ROUGE-2": scores['rouge2'].fmeasure,
        "ROUGE-L": scores['rougeL'].fmeasure
    }

# Define GPT-based evaluation
def compute_g_eval_batch(g_eval_pipeline, references, generated_summaries):
    """
    Evaluate generated summaries using a pre-trained language model.
    Scoring for coherence, fluency, and relevance in batch.
    """
    coherence_prompts = [f"Coherence: {gen}\n\nReference: {ref}" for ref, gen in zip(references, generated_summaries)]
    fluency_prompts = [f"Fluency: {gen}" for gen in generated_summaries]
    relevance_prompts = [f"Relevance: {gen}\n\nReference: {ref}" for ref, gen in zip(references, generated_summaries)]

    coherence_scores = g_eval_pipeline(coherence_prompts)
    fluency_scores = g_eval_pipeline(fluency_prompts)
    relevance_scores = g_eval_pipeline(relevance_prompts)

    # Ensure the output structure is handled correctly
    def extract_avg_score(scores):
        return sum([entry['score'] for entry in scores if entry['label'] == 'LABEL_1']) / len(scores)

    avg_coherence = extract_avg_score(coherence_scores)
    avg_fluency = extract_avg_score(fluency_scores)
    avg_relevance = extract_avg_score(relevance_scores)

    return {
        "Avg Coherence": avg_coherence,
        "Avg Fluency": avg_fluency,
        "Avg Relevance": avg_relevance
    }

# Evaluate DataFrame
def evaluate_dataframe(df, name, g_eval_pipeline, batch_size=16):
    """
    Compute ROUGE and G-Eval metrics for all rows in a DataFrame.
    """
    rouge_results = []

    print(f"Evaluating DataFrame: {name}")

    # Process in batches
    for i in range(0, len(df), batch_size):
        batch = df.iloc[i:i + batch_size]
        references = batch['summary'].tolist()
        generated_summaries = batch['generated_summary'].tolist()

        # Compute ROUGE scores for the batch
        for ref, gen in zip(references, generated_summaries):
            rouge_scores = compute_rouge_scores(ref, gen)
            rouge_results.append(rouge_scores)

        # Compute G-Eval scores for the batch
        g_eval_results = compute_g_eval_batch(g_eval_pipeline, references, generated_summaries)

    # Aggregate ROUGE scores
    avg_rouge1 = sum(r['ROUGE-1'] for r in rouge_results) / len(rouge_results)
    avg_rouge2 = sum(r['ROUGE-2'] for r in rouge_results) / len(rouge_results)
    avg_rougeL = sum(r['ROUGE-L'] for r in rouge_results) / len(rouge_results)

    return {
        "ROUGE": {
            "Avg ROUGE-1": avg_rouge1,
            "Avg ROUGE-2": avg_rouge2,
            "Avg ROUGE-L": avg_rougeL
        },
        "G-Eval": g_eval_results
    }

# Initialize the G-Eval pipeline (use GPU if available)
device = 0 if torch.cuda.is_available() else -1
g_eval_pipeline = pipeline("text-classification", model="microsoft/deberta-v3-large", device=device)

# Example usage for df1 and df2
df1_results = evaluate_dataframe(df1, "DataFrame 1", g_eval_pipeline)
df2_results = evaluate_dataframe(df2, "DataFrame 2", g_eval_pipeline)

# Present results in a concise format
def print_results(results, name):
    print(f"\n{name} Results:")
    print(f"ROUGE-1: {results['ROUGE']['Avg ROUGE-1']:.4f}")
    print(f"ROUGE-2: {results['ROUGE']['Avg ROUGE-2']:.4f}")
    print(f"ROUGE-L: {results['ROUGE']['Avg ROUGE-L']:.4f}")
    print(f"Coherence: {results['G-Eval']['Avg Coherence']:.4f}")
    print(f"Fluency: {results['G-Eval']['Avg Fluency']:.4f}")
    print(f"Relevance: {results['G-Eval']['Avg Relevance']:.4f}")

# Print concise results
print_results(df1_results, "DataFrame 1")
print_results(df2_results, "DataFrame 2")


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluating DataFrame: DataFrame 1
Evaluating DataFrame: DataFrame 2

DataFrame 1 Results:
ROUGE-1: 0.4761
ROUGE-2: 0.2190
ROUGE-L: 0.3097
Coherence: 0.0000
Fluency: 0.0000
Relevance: 0.0000

DataFrame 2 Results:
ROUGE-1: 0.4840
ROUGE-2: 0.2137
ROUGE-L: 0.2912
Coherence: 0.0000
Fluency: 0.0000
Relevance: 0.0000


In [None]:
from rouge_score import rouge_scorer
from bert_score import score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import pandas as pd

# Define ROUGE scorer
def compute_rouge_scores(reference, generated):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference, generated)
    return {
        "ROUGE-1": scores['rouge1'].fmeasure,
        "ROUGE-2": scores['rouge2'].fmeasure,
        "ROUGE-L": scores['rougeL'].fmeasure
    }

# Define BERTScore
def compute_bert_score(references, generated_summaries):
    P, R, F1 = score(generated_summaries, references, lang="en", verbose=True)
    return {
        "Precision": P.mean().item(),
        "Recall": R.mean().item(),
        "F1": F1.mean().item()
    }

# Define BLEU
def compute_bleu(reference, generated):
    smoothing = SmoothingFunction().method4
    reference_tokens = reference.split()
    generated_tokens = generated.split()
    return sentence_bleu([reference_tokens], generated_tokens, smoothing_function=smoothing)

# Evaluate DataFrame
def evaluate_dataframe(df, name):
    rouge_scores = []
    bleu_scores = []

    print(f"Evaluating DataFrame: {name}")

    for _, row in df.iterrows():
        reference = row['summary']
        generated = row['generated_summary']

        # Compute ROUGE
        rouge_scores.append(compute_rouge_scores(reference, generated))

        # Compute BLEU
        bleu_scores.append(compute_bleu(reference, generated))

    # Compute Average ROUGE
    avg_rouge1 = sum(r['ROUGE-1'] for r in rouge_scores) / len(rouge_scores)
    avg_rouge2 = sum(r['ROUGE-2'] for r in rouge_scores) / len(rouge_scores)
    avg_rougeL = sum(r['ROUGE-L'] for r in rouge_scores) / len(rouge_scores)

    # Compute Average BLEU
    avg_bleu = sum(bleu_scores) / len(bleu_scores)

    # Compute BERTScore
    bert_scores = compute_bert_score(df['summary'].tolist(), df['generated_summary'].tolist())

    return {
        "ROUGE": {
            "Avg ROUGE-1": avg_rouge1,
            "Avg ROUGE-2": avg_rouge2,
            "Avg ROUGE-L": avg_rougeL
        },
        "BLEU": avg_bleu,
        "BERTScore": bert_scores
    }

# Example usage
df1_results = evaluate_dataframe(df1, "DataFrame 1")
df2_results = evaluate_dataframe(df2, "DataFrame 2")

# Print results
def print_results(results, name):
    print(f"\n{name} Results:")
    print(f"ROUGE-1: {results['ROUGE']['Avg ROUGE-1']:.4f}")
    print(f"ROUGE-2: {results['ROUGE']['Avg ROUGE-2']:.4f}")
    print(f"ROUGE-L: {results['ROUGE']['Avg ROUGE-L']:.4f}")
    print(f"BLEU: {results['BLEU']:.4f}")
    print(f"BERT Precision: {results['BERTScore']['Precision']:.4f}")
    print(f"BERT Recall: {results['BERTScore']['Recall']:.4f}")
    print(f"BERT F1: {results['BERTScore']['F1']:.4f}")

print_results(df1_results, "DataFrame 1")
print_results(df2_results, "DataFrame 2")


NameError: name 'df1' is not defined

In [None]:
from rouge_score import rouge_scorer
from bert_score import score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from transformers import pipeline
import torch

# Define ROUGE scorer
def compute_rouge_scores(reference, generated):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference, generated)
    return {
        "ROUGE-1": scores['rouge1'].fmeasure,
        "ROUGE-2": scores['rouge2'].fmeasure,
        "ROUGE-L": scores['rougeL'].fmeasure
    }

# Define BERTScore
def compute_bert_score(references, generated_summaries):
    P, R, F1 = score(generated_summaries, references, lang="en", verbose=True)
    return {
        "Precision": P.mean().item(),
        "Recall": R.mean().item(),
        "F1": F1.mean().item()
    }

# Define BLEU
def compute_bleu(reference, generated):
    smoothing = SmoothingFunction().method4
    reference_tokens = reference.split()
    generated_tokens = generated.split()
    return sentence_bleu([reference_tokens], generated_tokens, smoothing_function=smoothing)

# Define SummEval scorer
def compute_summ_eval(reference, generated, summarization_pipeline):
    metrics = {}
    # Consistency
    consistency_prompt = f"Rate the consistency of this summary:\nGenerated: {generated}\nReference: {reference}"
    metrics["Consistency"] = summarization_pipeline(consistency_prompt)[0]["score"]

    # Coherence
    coherence_prompt = f"Rate the coherence of this summary:\nGenerated: {generated}"
    metrics["Coherence"] = summarization_pipeline(coherence_prompt)[0]["score"]

    # Relevance
    relevance_prompt = f"Rate the relevance of this summary:\nGenerated: {generated}\nReference: {reference}"
    metrics["Relevance"] = summarization_pipeline(relevance_prompt)[0]["score"]

    # Fluency
    fluency_prompt = f"Rate the fluency of this summary:\nGenerated: {generated}"
    metrics["Fluency"] = summarization_pipeline(fluency_prompt)[0]["score"]

    return metrics

# Evaluate DataFrame
def evaluate_dataframe(df, name, summarization_pipeline):
    """
    Compute all metrics for a given DataFrame.
    """
    rouge_scores = []
    bleu_scores = []
    summ_eval_scores = {
        "Consistency": [],
        "Coherence": [],
        "Relevance": [],
        "Fluency": []
    }

    print(f"Evaluating DataFrame: {name}")

    for _, row in df.iterrows():
        reference = row['summary']
        generated = row['generated_summary']

        # Compute ROUGE
        rouge_scores.append(compute_rouge_scores(reference, generated))

        # Compute BLEU
        bleu_scores.append(compute_bleu(reference, generated))

        # Compute SummEval
        summ_eval = compute_summ_eval(reference, generated, summarization_pipeline)
        for key in summ_eval:
            summ_eval_scores[key].append(summ_eval[key])

    # Compute Average ROUGE
    avg_rouge1 = sum(r['ROUGE-1'] for r in rouge_scores) / len(rouge_scores)
    avg_rouge2 = sum(r['ROUGE-2'] for r in rouge_scores) / len(rouge_scores)
    avg_rougeL = sum(r['ROUGE-L'] for r in rouge_scores) / len(rouge_scores)

    # Compute Average BLEU
    avg_bleu = sum(bleu_scores) / len(bleu_scores)

    # Compute BERTScore
    bert_scores = compute_bert_score(df['summary'].tolist(), df['generated_summary'].tolist())

    # Compute Average SummEval
    avg_summ_eval = {key: sum(values) / len(values) for key, values in summ_eval_scores.items()}

    return {
        "ROUGE": {
            "Avg ROUGE-1": avg_rouge1,
            "Avg ROUGE-2": avg_rouge2,
            "Avg ROUGE-L": avg_rougeL
        },
        "BLEU": avg_bleu,
        "BERTScore": bert_scores,
        "SummEval": avg_summ_eval
    }

# Initialize SummEval pipeline
device = 0 if torch.cuda.is_available() else -1
summarization_pipeline = pipeline("text-classification", model="microsoft/deberta-v3-large", device=device)

# Evaluate DataFrames
df1_results = evaluate_dataframe(df1, "DataFrame 1", summarization_pipeline)
df2_results = evaluate_dataframe(df2, "DataFrame 2", summarization_pipeline)

# Print results
def print_results(results, name):
    print(f"\n{name} Results:")
    print(f"ROUGE-1: {results['ROUGE']['Avg ROUGE-1']:.4f}")
    print(f"ROUGE-2: {results['ROUGE']['Avg ROUGE-2']:.4f}")
    print(f"ROUGE-L: {results['ROUGE']['Avg ROUGE-L']:.4f}")
    print(f"BLEU: {results['BLEU']:.4f}")
    print(f"BERT Precision: {results['BERTScore']['Precision']:.4f}")
    print(f"BERT Recall: {results['BERTScore']['Recall']:.4f}")
    print(f"BERT F1: {results['BERTScore']['F1']:.4f}")
    print(f"SummEval Metrics:")
    for key, value in results['SummEval'].items():
        print(f"  {key}: {value:.4f}")

print_results(df1_results, "DataFrame 1")
print_results(df2_results, "DataFrame 2")


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluating DataFrame: DataFrame 1


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/14 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/7 [00:00<?, ?it/s]

done in 4.62 seconds, 90.27 sentences/sec
Evaluating DataFrame: DataFrame 2


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/14 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/7 [00:00<?, ?it/s]

done in 4.70 seconds, 88.80 sentences/sec

DataFrame 1 Results:
ROUGE-1: 0.4761
ROUGE-2: 0.2190
ROUGE-L: 0.3097
BLEU: 0.1251
BERT Precision: 0.8829
BERT Recall: 0.8821
BERT F1: 0.8824
SummEval Metrics:
  Consistency: 0.5204
  Coherence: 0.5211
  Relevance: 0.5205
  Fluency: 0.5212

DataFrame 2 Results:
ROUGE-1: 0.4840
ROUGE-2: 0.2137
ROUGE-L: 0.2912
BLEU: 0.1214
BERT Precision: 0.8786
BERT Recall: 0.8857
BERT F1: 0.8820
SummEval Metrics:
  Consistency: 0.5210
  Coherence: 0.5210
  Relevance: 0.5210
  Fluency: 0.5211


# ROUGE SCORES

In [None]:
from rouge_score import rouge_scorer

def compute_rouge_scores(reference, generated):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference, generated)
    return {
        "ROUGE-1": scores['rouge1'].fmeasure,
        "ROUGE-2": scores['rouge2'].fmeasure,
        "ROUGE-L": scores['rougeL'].fmeasure
    }

# Evaluate ROUGE
def evaluate_rouge(df, name):
    rouge_results = []
    for _, row in df.iterrows():
        rouge_results.append(compute_rouge_scores(row['summary'], row['generated_summary']))
    avg_rouge1 = sum(r['ROUGE-1'] for r in rouge_results) / len(rouge_results)
    avg_rouge2 = sum(r['ROUGE-2'] for r in rouge_results) / len(rouge_results)
    avg_rougeL = sum(r['ROUGE-L'] for r in rouge_results) / len(rouge_results)
    return {"Avg ROUGE-1": avg_rouge1, "Avg ROUGE-2": avg_rouge2, "Avg ROUGE-L": avg_rougeL}

rouge_df1 = evaluate_rouge(df1, "DataFrame 1")
rouge_df2 = evaluate_rouge(df2, "DataFrame 2")


print(f"DataFrame 1 Results:\nROUGE-1: {rouge_df1['Avg ROUGE-1']:.4f}\nROUGE-2: {rouge_df1['Avg ROUGE-2']:.4f}\nROUGE-L: {rouge_df1['Avg ROUGE-L']:.4f}")
print(f"DataFrame 2 Results:\nROUGE-1: {rouge_df2['Avg ROUGE-1']:.4f}\nROUGE-2: {rouge_df2['Avg ROUGE-2']:.4f}\nROUGE-L: {rouge_df2['Avg ROUGE-L']:.4f}")


DataFrame 1 Results:
ROUGE-1: 0.4761
ROUGE-2: 0.2190
ROUGE-L: 0.3097
DataFrame 2 Results:
ROUGE-1: 0.4840
ROUGE-2: 0.2137
ROUGE-L: 0.2912


# BLEU Scores

In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def compute_bleu(reference, generated):
    smoothing = SmoothingFunction().method4
    reference_tokens = reference.split()
    generated_tokens = generated.split()
    return sentence_bleu([reference_tokens], generated_tokens, smoothing_function=smoothing)

# Evaluate BLEU
def evaluate_bleu(df, name):
    bleu_scores = [compute_bleu(row['summary'], row['generated_summary']) for _, row in df.iterrows()]
    avg_bleu = sum(bleu_scores) / len(bleu_scores)
    return {"Avg BLEU": avg_bleu}


bleu_df1 = evaluate_bleu(df1, "DataFrame 1")
bleu_df2 = evaluate_bleu(df2, "DataFrame 2")


print(f"DataFrame 1 Results:\nBLEU: {bleu_df1['Avg BLEU']:.4f}")
print(f"DataFrame 2 Results:\nBLEU: {bleu_df2['Avg BLEU']:.4f}")


DataFrame 1 Results:
BLEU: 0.1251
DataFrame 2 Results:
BLEU: 0.1214


# BERTScore

In [None]:
from bert_score import score

def evaluate_bert_score(df, name):
    P, R, F1 = score(df['generated_summary'].tolist(), df['summary'].tolist(), lang="en", verbose=True)
    return {"Precision": P.mean().item(), "Recall": R.mean().item(), "F1": F1.mean().item()}


bert_df1 = evaluate_bert_score(df1, "DataFrame 1")
bert_df2 = evaluate_bert_score(df2, "DataFrame 2")


print(f"DataFrame 1 Results:\nBERT Precision: {bert_df1['Precision']:.4f}\nBERT Recall: {bert_df1['Recall']:.4f}\nBERT F1: {bert_df1['F1']:.4f}")
print(f"DataFrame 2 Results:\nBERT Precision: {bert_df2['Precision']:.4f}\nBERT Recall: {bert_df2['Recall']:.4f}\nBERT F1: {bert_df2['F1']:.4f}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/14 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/7 [00:00<?, ?it/s]

done in 4.66 seconds, 89.45 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/14 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/7 [00:00<?, ?it/s]

done in 4.76 seconds, 87.66 sentences/sec
DataFrame 1 Results:
BERT Precision: 0.8829
BERT Recall: 0.8821
BERT F1: 0.8824
DataFrame 2 Results:
BERT Precision: 0.8786
BERT Recall: 0.8857
BERT F1: 0.8820


# SummEval Metrics

In [None]:
from transformers import pipeline
import torch

# Initialize SummEval pipeline
device = 0 if torch.cuda.is_available() else -1
summarization_pipeline = pipeline("text-classification", model="microsoft/deberta-v3-large", device=device)

def compute_summ_eval(reference, generated, summarization_pipeline):
    metrics = {}
    # Consistency
    consistency_prompt = f"Rate the consistency of this summary:\nGenerated: {generated}\nReference: {reference}"
    metrics["Consistency"] = summarization_pipeline(consistency_prompt)[0]["score"]

    # Coherence
    coherence_prompt = f"Rate the coherence of this summary:\nGenerated: {generated}"
    metrics["Coherence"] = summarization_pipeline(coherence_prompt)[0]["score"]

    # Relevance
    relevance_prompt = f"Rate the relevance of this summary:\nGenerated: {generated}\nReference: {reference}"
    metrics["Relevance"] = summarization_pipeline(relevance_prompt)[0]["score"]

    # Fluency
    fluency_prompt = f"Rate the fluency of this summary:\nGenerated: {generated}"
    metrics["Fluency"] = summarization_pipeline(fluency_prompt)[0]["score"]

    return metrics

def evaluate_summ_eval(df, name, summarization_pipeline):
    results = {
        "Consistency": [],
        "Coherence": [],
        "Relevance": [],
        "Fluency": []
    }
    for _, row in df.iterrows():
        metrics = compute_summ_eval(row['summary'], row['generated_summary'], summarization_pipeline)
        for key in metrics:
            results[key].append(metrics[key])
    avg_results = {key: sum(values) / len(values) for key, values in results.items()}
    return avg_results

# Example usage
summ_eval_df1 = evaluate_summ_eval(df1, "DataFrame 1", summarization_pipeline)
summ_eval_df2 = evaluate_summ_eval(df2, "DataFrame 2", summarization_pipeline)

# Print formatted results
print(f"DataFrame 1 Results:\nConsistency: {summ_eval_df1['Consistency']:.4f}\nCoherence: {summ_eval_df1['Coherence']:.4f}\nRelevance: {summ_eval_df1['Relevance']:.4f}\nFluency: {summ_eval_df1['Fluency']:.4f}")
print(f"DataFrame 2 Results:\nConsistency: {summ_eval_df2['Consistency']:.4f}\nCoherence: {summ_eval_df2['Coherence']:.4f}\nRelevance: {summ_eval_df2['Relevance']:.4f}\nFluency: {summ_eval_df2['Fluency']:.4f}")


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DataFrame 1 Results:
Consistency: 0.5492
Coherence: 0.5567
Relevance: 0.5495
Fluency: 0.5566
DataFrame 2 Results:
Consistency: 0.5485
Coherence: 0.5555
Relevance: 0.5488
Fluency: 0.5554


# Meteor Metrics

In [11]:
import nltk

# Download WordNet for METEOR and other required resources
nltk.download('wordnet')
nltk.download('omw-1.4')  # Additional WordNet package for multilingual support


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [14]:
from nltk.translate.meteor_score import meteor_score

# Define METEOR metric computation
def compute_meteor(reference, generated):
    """
    Compute METEOR score for a single reference and generated summary.
    """
    # Tokenize reference and generated summaries
    reference_tokens = reference.split()  # Tokenize the reference
    generated_tokens = generated.split()  # Tokenize the generated summary

    # Compute METEOR score
    return meteor_score([reference_tokens], generated_tokens)

# Evaluate METEOR for a DataFrame
def evaluate_meteor(df, name):
    """
    Compute average METEOR score for all rows in a DataFrame.
    """
    meteor_scores = [
        compute_meteor(row['summary'], row['generated_summary']) for _, row in df.iterrows()
    ]
    avg_meteor = sum(meteor_scores) / len(meteor_scores)
    return {"Avg METEOR": avg_meteor}

# Example usage for df1 and df2
meteor_df1 = evaluate_meteor(df1, "DataFrame 1")
meteor_df2 = evaluate_meteor(df2, "DataFrame 2")

# Print formatted results
print(f"DataFrame 1 Results:\nMETEOR: {meteor_df1['Avg METEOR']:.4f}")
print(f"DataFrame 2 Results:\nMETEOR: {meteor_df2['Avg METEOR']:.4f}")


DataFrame 1 Results:
METEOR: 0.3275
DataFrame 2 Results:
METEOR: 0.3430


In [23]:
import nltk

# Download required resources for NLTK
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [24]:
import nltk
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize
import pandas as pd


def compute_meteor_score(reference, generated):
    reference_tokens = word_tokenize(reference)
    generated_tokens = word_tokenize(generated)
    return meteor_score([reference_tokens], generated_tokens)

def evaluate_meteor(df):
    meteor_results = []
    for _, row in df.iterrows():
        score = compute_meteor_score(row['summary'], row['generated_summary'])
        meteor_results.append(score)
    avg_meteor = sum(meteor_results) / len(meteor_results)
    return {"Avg METEOR": avg_meteor}

# Evaluate METEOR
meteor_df1 = evaluate_meteor(df1)

print(f"DataFrame 1 Results:\nMETEOR: {meteor_df1['Avg METEOR']:.4f}")

meteor_df2 = evaluate_meteor(df2)

print(f"DataFrame 1 Results:\nMETEOR: {meteor_df2['Avg METEOR']:.4f}")

DataFrame 1 Results:
METEOR: 0.3676
DataFrame 1 Results:
METEOR: 0.3853


# Updated SummEval

In [26]:
pip install torch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1


Collecting torch==1.13.1
  Using cached torch-1.13.1-cp310-cp310-manylinux1_x86_64.whl.metadata (24 kB)
Collecting torchvision==0.14.1
  Using cached torchvision-0.14.1-cp310-cp310-manylinux1_x86_64.whl.metadata (11 kB)
Collecting torchaudio==0.13.1
  Using cached torchaudio-0.13.1-cp310-cp310-manylinux1_x86_64.whl.metadata (1.2 kB)
Collecting nvidia-cuda-runtime-cu11==11.7.99 (from torch==1.13.1)
  Using cached nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu11==8.5.0.96 (from torch==1.13.1)
  Using cached nvidia_cudnn_cu11-8.5.0.96-2-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu11==11.10.3.66 (from torch==1.13.1)
  Using cached nvidia_cublas_cu11-11.10.3.66-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cuda-nvrtc-cu11==11.7.99 (from torch==1.13.1)
  Using cached nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Using cached torch-1.13.1-cp310

In [1]:

# Install summ-eval and fix dependencies
!pip install summ-eval



Collecting summ-eval
  Downloading summ_eval-0.892-py3-none-any.whl.metadata (15 kB)
Collecting moverscore (from summ-eval)
  Downloading moverscore-1.0.3.tar.gz (7.7 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pytorch-pretrained-bert (from summ-eval)
  Downloading pytorch_pretrained_bert-0.6.2-py3-none-any.whl.metadata (86 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.7/86.7 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting stanza (from summ-eval)
  Downloading stanza-1.9.2-py3-none-any.whl.metadata (13 kB)
Collecting sacremoses (from summ-eval)
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Collecting sacrebleu (from summ-eval)
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyemd==0.5.1 (from summ-eval)
  Downloading pyemd-0.5.1.tar.gz (91 kB)
[2K     [90m━

In [None]:
!python -m venv summeval_env

The virtual environment was not created successfully because ensurepip is not
available.  On Debian/Ubuntu systems, you need to install the python3-venv
package using the following command.

    apt install python3.10-venv

You may need to use sudo with that command.  After installing the python3-venv
package, recreate your virtual environment.

Failing command: /content/summeval_env/bin/python3



In [None]:
!source summeval_env/bin/activate

/bin/bash: line 1: summeval_env/bin/activate: No such file or directory


In [None]:
!pip install summ-eval
!pip install torch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1



In [27]:
from summeval.metrics.text_metrics import TextMetrics

# Initialize TextMetrics with CPU-based computation
text_metrics = TextMetrics(device="cpu")


ModuleNotFoundError: No module named 'summeval'

In [None]:
from summeval.metrics.text_metrics import TextMetrics

# Initialize the SummEval text metrics calculator
text_metrics = TextMetrics()

# Function to compute SummEval metrics for a single reference and generated summary
def compute_summ_eval(reference, generated):
    """
    Compute SummEval metrics for a single reference and generated summary.
    """
    return {
        "Coherence": text_metrics.coherence([generated], [reference])[0],
        "Consistency": text_metrics.consistency([generated], [reference])[0],
        "Fluency": text_metrics.fluency([generated])[0],
        "Relevance": text_metrics.relevance([generated], [reference])[0]
    }

# Evaluate SummEval metrics for a DataFrame
def evaluate_summ_eval(df, name):
    """
    Compute average SummEval metrics for all rows in a DataFrame.
    """
    results = {
        "Coherence": [],
        "Consistency": [],
        "Fluency": [],
        "Relevance": []
    }

    print(f"Evaluating SummEval metrics for {name}...")

    for _, row in df.iterrows():
        reference = row['summary']
        generated = row['generated_summary']
        metrics = compute_summ_eval(reference, generated)

        # Append each metric to its corresponding list
        for key, value in metrics.items():
            results[key].append(value)

    # Compute average metrics
    avg_results = {key: sum(values) / len(values) for key, values in results.items()}
    return avg_results

# Example usage for DataFrame 1 and DataFrame 2
summ_eval_df1 = evaluate_summ_eval(df1, "DataFrame 1")
summ_eval_df2 = evaluate_summ_eval(df2, "DataFrame 2")

# Print formatted results
def print_summ_eval_results(results, name):
    print(f"\n{name} SummEval Metrics:")
    for key, value in results.items():
        print(f"{key}: {value:.4f}")

print_summ_eval_results(summ_eval_df1, "DataFrame 1")
print_summ_eval_results(summ_eval_df2, "DataFrame 2")


ModuleNotFoundError: No module named 'summeval'

In [2]:
!pip install summ-eval




In [4]:
!python3.8 -m venv summeval_env
!source summeval_env/bin/activate  # Linux/Mac

/bin/bash: line 1: python3.8: command not found
/bin/bash: line 1: summeval_env/bin/activate: No such file or directory


In [1]:
pip install summ-eval




Exception in thread Thread-5 (attachment_entry):
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/debugpy/server/api.py", line 237, in listen
    sock, _ = endpoints_listener.accept()
  File "/usr/lib/python3.10/socket.py", line 293, in accept
    fd, addr = self._accept()
TimeoutError: timed out

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.10/dist-packages/google/colab/_debugpy.py", line 52, in attachment_entry
    debugpy.listen(_dap_port)
  File "/usr/local/lib/python3.10/dist-packages/debugpy/public_api.py", line 31, in wrapper
    return wrapped(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/debugpy/server/api.py", line 143, in debug
    log.reraise



In [3]:
from summ_eval.metrics.text_metrics import TextMetrics

# Initialize the SummEval text metrics calculator
text_metrics = TextMetrics()

# Function to compute SummEval metrics for a single reference and generated summary
def compute_summ_eval(reference, generated):
    """
    Compute SummEval metrics for a single reference and generated summary.
    """
    return {
        "Coherence": text_metrics.coherence([generated], [reference])[0],
        "Consistency": text_metrics.consistency([generated], [reference])[0],
        "Fluency": text_metrics.fluency([generated])[0],
        "Relevance": text_metrics.relevance([generated], [reference])[0]
    }

# Evaluate SummEval metrics for a DataFrame
def evaluate_summ_eval(df, name):
    """
    Compute average SummEval metrics for all rows in a DataFrame.
    """
    results = {
        "Coherence": [],
        "Consistency": [],
        "Fluency": [],
        "Relevance": []
    }

    print(f"Evaluating SummEval metrics for {name}...")

    for _, row in df.iterrows():
        reference = row['summary']
        generated = row['generated_summary']
        metrics = compute_summ_eval(reference, generated)

        # Append each metric to its corresponding list
        for key, value in metrics.items():
            results[key].append(value)

    # Compute average metrics
    avg_results = {key: sum(values) / len(values) for key, values in results.items()}
    return avg_results

# Example usage for DataFrame 1 and DataFrame 2
summ_eval_df1 = evaluate_summ_eval(df1, "DataFrame 1")
summ_eval_df2 = evaluate_summ_eval(df2, "DataFrame 2")

# Print formatted results
def print_summ_eval_results(results, name):
    print(f"\n{name} SummEval Metrics:")
    for key, value in results.items():
        print(f"{key}: {value:.4f}")

# Display results for both DataFrames
print_summ_eval_results(summ_eval_df1, "DataFrame 1")
print_summ_eval_results(summ_eval_df2, "DataFrame 2")


ModuleNotFoundError: No module named 'summ_eval.metrics'

#Sentence BERT Score

In [9]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


# function to calculate cosine similarity
def calculate_similarity(summary, generated_summary):
    """
    Compute the semantic similarity between two texts.

    Args:
        summary (str): The reference summary.
        generated_summary (str): The generated summary.

    Returns:
        float: Cosine similarity score between the two embeddings.
    """
    # Generate embeddings for both summaries
    summary_embedding = model.encode(summary)
    generated_summary_embedding = model.encode(generated_summary)

    # Compute cosine similarity
    similarity = cosine_similarity([summary_embedding], [generated_summary_embedding])
    return similarity[0][0]

# Load the pre-trained Sentence-BERT model
model_name = 'all-roberta-large-v1'
print(f"Loading model: {model_name}")
model = SentenceTransformer(model_name)

# Load the dataset containing summaries
file_path = '/content/drive/MyDrive/Project/t5_generated_test_summaries.csv'
print(f"Loading dataset from: {file_path}")
df = pd.read_csv(file_path)

# Apply the similarity function to the dataset
print("Calculating similarity for each summary pair...")
df['similarity'] = df.apply(
    lambda row: calculate_similarity(row['summary'], row['generated_summary']), axis=1
)

# Calculate the overall similarity
overall_similarity = df['similarity'].mean()
print(f"Overall Semantic Similarity: {overall_similarity:.4f}")



Loading model: all-roberta-large-v1


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/9.89k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/328 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

Loading dataset from: /content/drive/MyDrive/Project/t5_generated_test_summaries.csv
Calculating similarity for each summary pair...
Overall Semantic Similarity: 0.7591
