In [2]:
# pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting absl-py (from rouge-score)
  Obtaining dependency information for absl-py from https://files.pythonhosted.org/packages/a2/ad/e0d3c824784ff121c03cc031f944bc7e139a8f1870ffd2845cc2dd76f6c4/absl_py-2.1.0-py3-none-any.whl.metadata
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Downloading absl_py-2.1.0-py3-none-any.whl (133 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.7/133.7 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=f7294e487ac02fc65e31855f1bb59133d8189a4aba78695fe2f6f15e1189adb9
  Stored in directory: /home/ziggy/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475

In [1]:
import nltk
# from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer
from transformers import BartTokenizer, BartForConditionalGeneration, BartModel
import torch

import json
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import random

In [2]:
# Download necessary NLTK data
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ziggy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Data Collection

In [None]:
# list of questions
ref_ls = [
        "The cat is on the mat",
        "There is a cat on the mat"
    ]
cad_ls = [
        "The cat is on the mat",
        "There is not any cat on the mat"
    ]

In [5]:

# Function to compute BARTScore
def compute_bart_score(candidates, references):
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
    model = BartModel.from_pretrained('facebook/bart-large-cnn')

    candidate_ids = [tokenizer.encode(cand, return_tensors='pt') for cand in candidates]
    reference_ids = [tokenizer.encode(ref[0], return_tensors='pt') for ref in references]

    with torch.no_grad():
        scores = []
        for cand_id, ref_id in zip(candidate_ids, reference_ids):
            outputs = model(input_ids=cand_id, labels=ref_id)
            score = outputs.loss.item()
            scores.append(score)
    
    return scores

# Compute and print scores
bleu_score = compute_bleu(candidates, references)
# rouge_scores = compute_rouge(candidates, references)
# bart_scores = compute_bart_score(candidates, references)

print(f"BLEU Score: {bleu_score:.4f}")
# print("ROUGE Scores:", rouge_scores)
# print("BART Scores:", bart_scores)


BLEU Score: 0.0000


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


## calculate bleu score

In [6]:
def calculate_bleu(reference_texts, hypothesis_texts, ngram_order=4, smoothing=True):
    """
    Calculate BLEU score for a list of reference and hypothesis texts.
    
    Parameters:
    - reference_texts: List of reference texts.
    - hypothesis_texts: List of hypothesis texts.
    - ngram_order: Maximum n-gram order to use for BLEU calculation.
    - smoothing: Whether to apply smoothing.
    
    Returns:
    - bleu_score: The calculated BLEU score.
    """
    smooth_fn = SmoothingFunction().method1 if smoothing else None
    weights = tuple((1.0 / ngram_order) for _ in range(ngram_order))
    
    total_bleu_score = 0.0
    for ref, hyp in zip(reference_texts, hypothesis_texts):
        ref_tokens = [ref.split()]
        hyp_tokens = hyp.split()
        bleu_score = sentence_bleu(ref_tokens, hyp_tokens, weights=weights, smoothing_function=smooth_fn)
        total_bleu_score += bleu_score
    
    return total_bleu_score / len(hypothesis_texts)



In [10]:
    
# Calculate BLEU score without smoothing
bleu_score_no_smoothing = calculate_bleu(ref_ls, cad_ls, ngram_order=4, smoothing=False)
print(f"BLEU score without smoothing: {bleu_score_no_smoothing}")
    
# Calculate BLEU score with smoothing
bleu_score_with_smoothing = calculate_bleu(ref_ls, cad_ls, ngram_order=4, smoothing=True)
print(f"BLEU score with smoothing: {bleu_score_with_smoothing}")
    
# Calculate BLEU score with lower n-gram order
bleu_score_lower_ngram = calculate_bleu(ref_ls, cad_ls, ngram_order=2, smoothing=True)
print(f"BLEU score with lower n-gram order: {bleu_score_lower_ngram}")

BLEU score without smoothing: 0.7055668084502599
BLEU score with smoothing: 0.7055668084502599
BLEU score with lower n-gram order: 0.8273268353539885


## calculate rough score

In [31]:
def cal_rouge_per(candidates, references):
    """
    calculate rouge score of each sentence in the candidates list
    
    Parameters:
    - candidates: candidate sentence list
    - references: reference sentence list
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = [scorer.score(cand, ref) for cand, ref in zip(candidates, references)]
    return scores

In [28]:
# concatenate every string yo compute 
def cal_rouge_str(candidates, references):
    """
    calculate rouge score of the sentences in the candidates list in the form of a string
    
    Parameters:
    - candidates: candidate sentence list
    - references: reference sentence list
    """
    comb_candi = ""
    for i in candidates:
        comb_candi += i + " " 
    
    comb_ref = ""
    for n in references:
        comb_ref += n + " " 
    
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = [scorer.score(comb_candi, comb_ref)]
    return scores

ref_ls = [
        "The cat is on the mat",
        "There is a cat on the mat"
    ]
cad_ls = [
        "The cat is on the mat",
        "There is not any cat on the mat"
    ]

score = cal_rouge_str(cad_ls, ref_ls)
print("Concatenated Sentence:")
print(score)


Concatenated Sentence:
[{'rouge1': Score(precision=0.9230769230769231, recall=0.8571428571428571, fmeasure=0.888888888888889), 'rouge2': Score(precision=0.8333333333333334, recall=0.7692307692307693, fmeasure=0.8), 'rougeL': Score(precision=0.9230769230769231, recall=0.8571428571428571, fmeasure=0.888888888888889)}]


In [32]:
ref_ls = [
        "The cat is on the mat",
        "There is a cat on the mat"
    ]
cad_ls = [
        "The cat is on the mat",
        "There is not any cat on the mat"
    ]

rough_score = cal_rouge_per(cad_ls, ref_ls)
print(rough_score)

[{'rouge1': Score(precision=1.0, recall=1.0, fmeasure=1.0), 'rouge2': Score(precision=1.0, recall=1.0, fmeasure=1.0), 'rougeL': Score(precision=1.0, recall=1.0, fmeasure=1.0)}, {'rouge1': Score(precision=0.8571428571428571, recall=0.75, fmeasure=0.7999999999999999), 'rouge2': Score(precision=0.6666666666666666, recall=0.5714285714285714, fmeasure=0.6153846153846153), 'rougeL': Score(precision=0.8571428571428571, recall=0.75, fmeasure=0.7999999999999999)}]


In [30]:
from rouge_score import rouge_scorer
import numpy as np

def cal_rouge_average(references, candidates):
    """
    calculate average rouge score (f-measure) of sentences in the candidate list
    """
    # Initialize the ROUGE scorer
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    
    # Initialize lists to store scores for each pair
    rouge1_scores = []
    rouge2_scores = []
    rougeL_scores = []
    
    # Calculate ROUGE scores for each sentence pair
    for ref, cand in zip(references, candidates):
        scores = scorer.score(ref, cand)
        rouge1_scores.append(scores['rouge1'].fmeasure)
        rouge2_scores.append(scores['rouge2'].fmeasure)
        rougeL_scores.append(scores['rougeL'].fmeasure)
    
    # Calculate average scores
    avg_rouge1 = np.mean(rouge1_scores)
    avg_rouge2 = np.mean(rouge2_scores)
    avg_rougeL = np.mean(rougeL_scores)
    
    return {
        'rouge1': avg_rouge1,
        'rouge2': avg_rouge2,
        'rougeL': avg_rougeL
    }


overall_scores = calculate_rouge_scores(ref_ls, cad_ls)
print("Overall ROUGE Scores:", overall_scores)


Overall ROUGE Scores: {'rouge1': 0.8999999999999999, 'rouge2': 0.8076923076923077, 'rougeL': 0.8999999999999999}


In [11]:
pip install evaluate

Collecting evaluate
  Obtaining dependency information for evaluate from https://files.pythonhosted.org/packages/c2/d6/ff9baefc8fc679dcd9eb21b29da3ef10c81aa36be630a7ae78e4611588e1/evaluate-0.4.2-py3-none-any.whl.metadata
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.2
Note: you may need to restart the kernel to use updated packages.


In [18]:
import evaluate
meteor = evaluate.load('meteor')
predictions = ["The quick brown fox jumps over the lazy dog."]
references = ["A quick brown fox jumps over the lazy dog."]
results = meteor.compute(predictions=predictions, references=references)

[nltk_data] Downloading package wordnet to /home/ziggy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/ziggy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ziggy/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [19]:
print(results)

{'meteor': 0.8993827160493828}


In [20]:
from nltk.translate import meteor
from nltk import word_tokenize

score = round(meteor(
       [word_tokenize("A quick brown fox jumps over the lazy dog.")],
       word_tokenize('The quick brown fox jumps over the lazy dog.')
       ), 4)
print(score)

0.8994


In [6]:
from nltk.tokenize import word_tokenize
from nltk.translate import meteor_score

# Example reference and candidate texts
reference_texts = ["A quick brown fox jumps over the lazy dog."]
candidate_texts = ["The quick brown fox jumps over the lazy dog."]

# Tokenize and preprocess (convert to lowercase) the reference and candidate texts
reference_tokens = [word_tokenize(text.lower()) for text in reference_texts]
candidate_tokens = [word_tokenize(text.lower()) for text in candidate_texts]

# Define a preprocess function if needed (e.g., for stemming)
def preprocess(tokens):
    # Example: Perform any preprocessing (e.g., stemming)
    return tokens

# Calculate METEOR score
meteor_score_value = meteor_score(
    reference_tokens,
    candidate_tokens,
    # preprocess=preprocess  # Optional: Pass your preprocess function if needed
)

print(f"METEOR Score: {meteor_score_value}")


TypeError: 'module' object is not callable

In [7]:
meteor = evaluate.load('meteor')
predictions = ["It is a guide to action which ensures that the military always obeys the commands of the party"]
references = ["It is a guide to action that ensures that the military will forever heed Party commands"]
results = meteor.compute(predictions=predictions, references=references)

NameError: name 'evaluate' is not defined

In [4]:
from nltk.tokenize import word_tokenize
from nltk.translate.meteor_score import meteor_score

# Example reference and candidate texts
reference_texts = ["A quick brown fox jumps over the lazy dog."]
candidate_texts = ["The quick brown fox jumps over the lazy dog."]

# Tokenize and convert to lowercase
reference_tokens = [word_tokenize(text.lower()) for text in reference_texts]
candidate_tokens = [word_tokenize(text.lower()) for text in candidate_texts]

print(reference_tokens)
print()
print(candidate_tokens)

# Calculate METEOR score
meteor_score_value = meteor_score(reference_tokens, candidate_tokens)
# print(f"METEOR Score: {meteor_score_value}")


[['a', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.']]

[['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.']]


TypeError: descriptor 'lower' for 'str' objects doesn't apply to a 'list' object

## Meteor Score

In [1]:
from nltk.tokenize import word_tokenize
from nltk.translate.meteor_score import meteor_score

# Example reference and candidate texts
reference_texts = ["A quick brown fox jumps over the lazy dog."]
candidate_texts = ["The quick brown fox jumps over the lazy dog."]

# Tokenize the reference and candidate texts into lists of tokens (list of lists)
reference_tokens = [word_tokenize(text.lower()) for text in reference_texts]
candidate_tokens = [word_tokenize(text.lower()) for text in candidate_texts]

# Calculate METEOR score
meteor_score_value = meteor_score(reference_tokens, candidate_tokens)
print(f"METEOR Score: {meteor_score_value}")


TypeError: descriptor 'lower' for 'str' objects doesn't apply to a 'list' object

## Calculate Bert-score

In [34]:
def compute_bart_score(candidates, references):
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
    model = BartModel.from_pretrained('facebook/bart-large-cnn')

    candidate_ids = [tokenizer.encode(cand, return_tensors='pt') for cand in candidates]
    reference_ids = [tokenizer.encode(ref[0], return_tensors='pt') for ref in references]

    with torch.no_grad():
        scores = []
        for cand_id, ref_id in zip(candidate_ids, reference_ids):
            outputs = model(input_ids=cand_id, labels=ref_id)
            score = outputs.loss.item()
            scores.append(score)
    
    return scores

In [36]:
score = compute_bart_score(cad_ls, ref_ls)
print(score)

Downloading model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

TypeError: BartModel.forward() got an unexpected keyword argument 'labels'

In [37]:
pip install bert-score

Collecting bert-score
  Obtaining dependency information for bert-score from https://files.pythonhosted.org/packages/c6/8c/bc5457de4c004b1a623b31f7bc8d0375fb699b7d67df11879098b4b7b7c8/bert_score-0.3.13-py3-none-any.whl.metadata
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13
Note: you may need to restart the kernel to use updated packages.


In [38]:
from bert_score import score

reference = ["A quick brown fox jumps over a lazy dog."]
candidate = ["The quick brown fox jumps over the lazy dog."]
P, R, F1 = score(candidate, reference, lang="en", verbose=True)
print(f"BERTScore Precision: {P.mean().item()}")
print(f"BERTScore Recall: {R.mean().item()}")
print(f"BERTScore F1: {F1.mean().item()}")

Downloading config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

ReadTimeout: HTTPSConnectionPool(host='cdn-lfs.huggingface.co', port=443): Read timed out. (read timeout=10.0)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Example BLEU and ROUGE scores for multiple candidate-reference pairs
data = {
    "Candidate": ["Candidate 1", "Candidate 2", "Candidate 3"],
    "BLEU-1": [0.5, 0.6, 0.7],
    "BLEU-2": [0.4, 0.5, 0.6],
    "BLEU-4": [0.3, 0.4, 0.5],
    "ROUGE-1": [0.6, 0.7, 0.8],
    "ROUGE-2": [0.5, 0.6, 0.7],
    "ROUGE-L": [0.55, 0.65, 0.75]
}

# Create a DataFrame
df = pd.DataFrame(data)

# Set up the seaborn style
sns.set(style="whitegrid")

# Plot BLEU scores
plt.figure(figsize=(14, 8))

# Reshape the DataFrame for plotting
melted_bleu = df.melt(id_vars="Candidate", value_vars=["BLEU-1", "BLEU-2", "BLEU-4"], var_name="Metric", value_name="Score")

# Plot BLEU scores with seaborn
plt.subplot(2, 1, 1)
sns.barplot(x="Candidate", y="Score", hue="Metric", data=melted_bleu)
plt.title('BLEU Scores')
plt.ylabel('Score')
plt.legend(loc='upper left')

# Reshape the DataFrame for plotting
melted_rouge = df.melt(id_vars="Candidate", value_vars=["ROUGE-1", "ROUGE-2", "ROUGE-L"], var_name="Metric", value_name="Score")

# Plot ROUGE scores with seaborn
plt.subplot(2, 1, 2)
sns.barplot(x="Candidate", y="Score", hue="Metric", data=melted_rouge)
plt.title('ROUGE Scores')
plt.ylabel('Score')
plt.legend(loc='upper left')

plt.tight_layout()
plt.show()
