<a href="https://colab.research.google.com/github/wesslen/llm-evaluations/blob/main/notebooks/01_llm_evaluations_reference_based.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# use uv for virtual environment https://docs.astral.sh/uv/
!uv pip install --system bert-score rouge-score sacrebleu evaluate transformers moverscore comet pytorch-pretrained-bert pyemd unbabel-comet

[2mUsing Python 3.10.12 environment at /usr[0m
[2K[2mResolved [1m87 packages[0m [2min 6.27s[0m[0m
[2K[36m[1mBuilding[0m[39m comet[2m==3.1.0[0m
[2K[1A[36m[1mBuilding[0m[39m comet[2m==3.1.0[0m
[36m[1mBuilding[0m[39m rouge-score[2m==0.1.2[0m
[2K[2A[36m[1mBuilding[0m[39m comet[2m==3.1.0[0m
[36m[1mBuilding[0m[39m rouge-score[2m==0.1.2[0m
[36m[1mBuilding[0m[39m moverscore[2m==1.0.3[0m
[2K[3A[36m[1mBuilding[0m[39m comet[2m==3.1.0[0m
[36m[1mBuilding[0m[39m rouge-score[2m==0.1.2[0m
[36m[1mBuilding[0m[39m moverscore[2m==1.0.3[0m
[37m⠙[0m [2mPreparing packages...[0m (0/34)
[2K[4A[36m[1mBuilding[0m[39m comet[2m==3.1.0[0m
[36m[1mBuilding[0m[39m rouge-score[2m==0.1.2[0m
[36m[1mBuilding[0m[39m moverscore[2m==1.0.3[0m
[37m⠙[0m [2mPreparing packages...[0m (0/34)
[2K[4A[36m[1mBuilding[0m[39m comet[2m==3.1.0[0m
[36m[1mBuilding[0m[39m rouge-score[2m==0.1.2[0m
[36m[1mBuilding[0m[39m moverscor

## Reference-based Metrics

### BLEU

In [2]:
# time
import time

# Machine translation evaluation metric
from sacrebleu.metrics import BLEU

bleu = BLEU()

# Note: SacreBLEU expects references as a list of lists (multiple references per translation)
# List of texts for multi-sample evaluation, see https://github.com/mjpost/sacrebleu/tree/master#using-sacrebleu-from-python
refs = [ # First set of references
         ['The dog bit the man.', 'It was not unexpected.', 'The man bit him first.'],
         # Second set of references
         ['The dog had bit the man.', 'No one was surprised.', 'The man had bitten the dog.'],
       ]
sys = ['The dog bit the man.', "It wasn't surprising.", 'The man had just bitten him.']

start_time = time.time()

bleu_result = bleu.corpus_score(
    sys,
    refs
)

print("BLEU Score Results:")
print(f"BLEU Score: {bleu_result.score:.3f}")
print(f"Precisions: {[f'{p:.1f}' for p in bleu_result.precisions]}")
print(f"Brevity Penalty: {bleu_result.bp:.3f}")
print("\n")

# end time
end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time:.4f} seconds")

BLEU Score Results:
BLEU Score: 48.531
Precisions: ['82.4', '50.0', '45.5', '37.5']
Brevity Penalty: 0.943


Execution time: 0.0059 seconds


In [3]:
# provides how calculated
bleu.get_signature()

nrefs:2|case:mixed|eff:no|tok:13a|smooth:exp|version:2.4.3

In [4]:
# HF evaluate calculation
from evaluate import load

hf_bleu = load("sacrebleu")

# HF uses different specification
# convert refs to list of 3 lists each with 2 entries
refs = [list(ref) for ref in zip(*refs)]
refs

start_time = time.time()
results = hf_bleu.compute(predictions=sys, references=refs)
print(results)

# end time
end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time:.4f} seconds")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

{'score': 48.530827009929865, 'counts': [14, 7, 5, 3], 'totals': [17, 14, 11, 8], 'precisions': [82.3529411764706, 50.0, 45.45454545454545, 37.5], 'bp': 0.9428731438548749, 'sys_len': 17, 'ref_len': 18}
Execution time: 0.0505 seconds


### ROUGE

In [5]:
# Calculate ROUGE scores for text similarity based on n-gram overlap
from rouge_score import rouge_scorer

# Sample texts for evaluation
candidate = "The quick brown fox jumps over the lazy dog"
reference = "A quick brown fox jumped over a lazy dog"

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score(reference, candidate)

print("ROUGE Score Results:")
print(f"ROUGE-1: {scores['rouge1'].fmeasure:.3f}")
print(f"ROUGE-2: {scores['rouge2'].fmeasure:.3f}")
print(f"ROUGE-L: {scores['rougeL'].fmeasure:.3f}")
print("\n")

ROUGE Score Results:
ROUGE-1: 0.778
ROUGE-2: 0.625
ROUGE-L: 0.778




### Bert Score

In [6]:
# Evaluates text similarity using BERT embeddings
from bert_score import score

# List of texts for multi-sample evaluation
candidates = ["The house is small", "The cat sits on the mat"]
references = ["The house is tiny", "There is a cat on the mat"]

P, R, F1 = score(candidates, references, lang='en', verbose=True)

print("BERT Score Results:")
print(f"Precision: {P.mean().item():.3f}")
print(f"Recall: {R.mean().item():.3f}")
print(f"F1: {F1.mean().item():.3f}")
print("\n")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 1.32 seconds, 1.52 sentences/sec
BERT Score Results:
Precision: 0.969
Recall: 0.965
F1: 0.967




In [7]:
# Uses BART model for scoring
import torch
from transformers import BartTokenizer, BartForConditionalGeneration

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn").to(device)


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [8]:

def compute_bartscore(source, target):
    # Tokenize
    batch = bart_tokenizer(source, target, return_tensors="pt", padding=True, truncation=True)
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)

    # Generate score
    with torch.no_grad():
        outputs = bart_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=input_ids
        )

    return -outputs.loss.item()  # Negative log likelihood

bart_scores = [
    compute_bartscore(ref, cand)
    for ref, cand in zip(references, candidates)
]

print("BARTScore Results:")
for i, score in enumerate(bart_scores):
    print(f"Text pair {i+1}: {score:.3f}")
print("\n")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


BARTScore Results:
Text pair 1: -2.283
Text pair 2: -1.591




### MoverScore

In [9]:
# require GPU

from moverscore import word_mover_score  # MoverScore
from collections import defaultdict

# Sample texts for evaluation
# Uses Word Mover Distance with contextual embeddings
# Higher scores indicate better similarity
idf_dict_hyp = defaultdict(lambda: 1.)
idf_dict_ref = defaultdict(lambda: 1.)

mover_scores = word_mover_score(
    references, candidates,
    idf_dict_ref, idf_dict_hyp,
    stop_words=[], n_gram=1,
    remove_subwords=True
)

print("MoverScore Results:")
for i, score in enumerate(mover_scores):
    print(f"Text pair {i+1}: {score:.3f}")
print("\n")

Downloading https://github.com/AIPHES/emnlp19-moverscore/releases/download/0.6/MNLI_BERT.zip to /root/.moverscore/MNLI_BERT.zip
[--------------------------------------------------]


  state_dict = torch.load(weights_path, map_location='cpu')


MoverScore Results:
Text pair 1: 0.926
Text pair 2: 0.586




### COMET

In [14]:
from comet import download_model, load_from_checkpoint  # COMET

# Download and load COMET model
model_path = download_model("wmt20-comet-da")
model = load_from_checkpoint(model_path)

ImportError: cannot import name 'download_model' from 'comet' (/usr/local/lib/python3.10/dist-packages/comet/__init__.py)

In [None]:
# Prepare data for COMET

# input / source -- what should be translated
source_texts = [
    "Hello, my dog is cute",
    "The weather is nice today",
    "I love programming"
]

# hypothetical model predictions
candidates = [
    ["Salut, mon chien est mignon"],
    ["Le temps est beau aujourd'hui"],
    ["J'adore la programmation"]
]

# gold standard (references)
references = [
    ["Bonjour, mon chien est mignon"],
    ["Le temps est beau aujourd'hui"],
    ["J'aime la programmation"]
]

comet_data = [{
    "src": source_texts,
    "mt": cand,
    "ref": references
} for cand, ref in zip(candidates, references)]

# Get COMET scores
comet_scores = model.predict(comet_data, batch_size=8, gpus=1)

comet_scores

### Perplexity

In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Prepare text
text = "This is a sample text to evaluate perplexity."
encodings = tokenizer(text, return_tensors='pt')

# Calculate perplexity
with torch.no_grad():
    outputs = model(encodings.input_ids, labels=encodings.input_ids)
    ppl = torch.exp(outputs.loss)

print("Perplexity Results:")
print(f"Perplexity: {ppl.item():.3f}")

## Libraries

In [None]:
# Now check versions
import pkg_resources
import sys

def get_package_details():
    """Print details of specific packages and Python version"""
    packages_to_check = [
        'torch',
        'transformers',
        'sacrebleu',
        'bert-score',
        'rouge-score',
        'moverscore',
        'bleurt',
        'comet',
        'numpy',
        'sentencepiece'  # Often used by transformers
    ]

    print("Python version:", sys.version.split()[0])
    print("\nPackage versions:")
    print("-" * 50)

    for package in packages_to_check:
        try:
            version = pkg_resources.get_distribution(package).version
            print(f"{package:<15} {version}")
        except pkg_resources.DistributionNotFound:
            print(f"{package:<15} Not installed")

# Check CUDA availability for PyTorch
import torch
print("\nCUDA Status:")
print("-" * 50)
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"Current GPU: {torch.cuda.get_device_name()}")

# Run the check
get_package_details()