In [6]:
import nltk
import ssl

# This block is often needed for NLTK downloads on some systems
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

print("Attempting to download NLTK 'punkt' tokenizer...")
try:
    nltk.data.find('tokenizers/punkt')
    print("'punkt' tokenizer already downloaded.")
except LookupError: # Corrected from nltk.downloader.LookupError
    print("'punkt' tokenizer not found, downloading now...")
    nltk.download('punkt')
    print("'punkt' tokenizer download complete.")

print("\nAttempting to download NLTK 'wordnet' corpus...")
try:
    nltk.data.find('corpora/wordnet')
    print("'wordnet' corpus already downloaded.")
except LookupError: # Corrected from nltk.downloader.LookupError
    print("'wordnet' corpus not found, downloading now...")
    nltk.download('wordnet')
    print("'wordnet' corpus download complete.")

print("\nAttempting to download NLTK 'omw-1.4' corpus...")
try:
    nltk.data.find('corpora/omw-1.4')
    print("'omw-1.4' corpus already downloaded.")
except LookupError: # Corrected from nltk.downloader.LookupError
    print("'omw-1.4' corpus not found, downloading now...")
    nltk.download('omw-1.4')
    print("'omw-1.4' corpus download complete.")

print("\nNLTK data checks complete.")

Attempting to download NLTK 'punkt' tokenizer...
'punkt' tokenizer not found, downloading now...


[nltk_data] Downloading package punkt to /home/ardjano/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /home/ardjano/nltk_data...


'punkt' tokenizer download complete.

Attempting to download NLTK 'wordnet' corpus...
'wordnet' corpus not found, downloading now...
'wordnet' corpus download complete.

Attempting to download NLTK 'omw-1.4' corpus...
'omw-1.4' corpus not found, downloading now...


[nltk_data] Downloading package omw-1.4 to /home/ardjano/nltk_data...


'omw-1.4' corpus download complete.

NLTK data checks complete.


In [9]:
nltk_data_items = ['punkt', 'wordnet', 'omw-1.4', 'punkt_tab'] # <--- ADDED: 'punkt_tab'

for item in nltk_data_items:
    print(f"Attempting to find/download '{item}'...")
    try:
        # For 'punkt_tab', NLTK's find path is slightly different, but the download command is usually `nltk.download('punkt_tab')`
        if item == 'punkt_tab':
            # Punkt_tab is a specific sub-package often found under tokenizers/punkt/
            # Direct find might look for 'tokenizers/punkt_tab/english' as per your error
            # We'll just try to download it directly if it fails.
            pass # We'll rely on the download catching it
        else:
            nltk.data.find(f'tokenizers/{item}') if 'punkt' in item else (nltk.data.find(f'corpora/{item}') if 'wordnet' in item or 'omw' in item else nltk.data.find(item))
        print(f"'{item}' already downloaded.")
    except LookupError: # Catches if resource is not found
        print(f"'{item}' not found, downloading now...")
        try:
            nltk.download(item) # This should handle 'punkt_tab' as well
            print(f"'{item}' download complete.")
        except Exception as e:
            print(f"Error downloading '{item}': {e}")
            print(f"Please try running 'nltk.download('{item}')' manually in your Python environment.")


Attempting to find/download 'punkt'...
'punkt' already downloaded.
Attempting to find/download 'wordnet'...
'wordnet' not found, downloading now...
'wordnet' download complete.
Attempting to find/download 'omw-1.4'...
'omw-1.4' not found, downloading now...
'omw-1.4' download complete.
Attempting to find/download 'punkt_tab'...
'punkt_tab' already downloaded.


[nltk_data] Downloading package wordnet to /home/ardjano/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ardjano/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [1]:
!uv pip install bert_score nltk rouge_score

[2mUsing Python 3.11.13 environment at: /home/ardjano/.pyenv/versions/3.11.13/envs/vllm-env[0m
[2K[2mResolved [1m58 packages[0m [2min 610ms[0m[0m                                        [0m
[2K   [36m[1mBuilding[0m[39m rouge-score[2m==0.1.2[0m                                         
[2K[1A   [36m[1mBuilding[0m[39m rouge-score[2m==0.1.2[0m                                 [1A
[37m⠙[0m [2mPreparing packages...[0m (0/3)
[2K[2A   [36m[1mBuilding[0m[39m rouge-score[2m==0.1.2[0m----[0m[0m     0 B/1.44 MiB            [2A
[37m⠙[0m [2mPreparing packages...[0m (0/3)
[2K[2A   [36m[1mBuilding[0m[39m rouge-score[2m==0.1.2[0m----[0m[0m     0 B/1.44 MiB            [2A
[37m⠙[0m [2mPreparing packages...[0m (0/3)
[2mbert-score          [0m [32m[2m------------------------------[0m[0m     0 B/59.70 KiB
[2K[3A   [36m[1mBuilding[0m[39m rouge-score[2m==0.1.2[0m----[0m[0m     0 B/1.44 MiB            [3A
[37m⠙[0m [2mPreparing packag

In [11]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/ardjano/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [12]:
from bert_score import BERTScorer
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import re
import pandas as pd
import torch
# from nltk.translate.nist_score import corpus_nist # Not used in your provided script
# from nltk.translate.meteor_score import meteor_score # Commented out
from nltk import word_tokenize
from tqdm.auto import tqdm # Import tqdm for progress bars

# Ensure NLTK data is downloaded if you haven't already
# import nltk
# try:
#     nltk.data.find('tokenizers/punkt')
# except nltk.downloader.DownloadError:
#     nltk.download('punkt')
# try:
#     nltk.data.find('corpora/wordnet')
# except nltk.downloader.DownloadError:
#     nltk.download('wordnet')
# try:
#     nltk.data.find('corpora/omw-1.4')
# except nltk.downloader.DownloadError:
#     nltk.download('omw-1.4')


pattern = re.compile(r"[^A-Za-z0-9 '’]")

def calculate_bertscore(candidate, ground_truth):
    # BERTScorer can be initialized once outside the loop for efficiency
    # but for simplicity in a function, we'll keep it here.
    # For better performance on large datasets, initialize it globally or pass it.
    scorer = BERTScorer(model_type='bert-base-uncased', device='cuda' if torch.cuda.is_available() else 'cpu')
    P, R, F1 = scorer.score([candidate], [ground_truth])
    return F1.mean().item() # .item() to get scalar from tensor

def calculate_bleu(candidate, standard1, standard2):
  # Standardize and tokenize for BLEU
  candidate_tokens = word_tokenize(re.sub(pattern, "", str(candidate)))
  reference1_tokens = word_tokenize(re.sub(pattern, "", str(standard1)))
  reference2_tokens = word_tokenize(re.sub(pattern, "", str(standard2)))

  # BLEU score requires list of references.
  # If you only have one true response, you can duplicate it for multiple references.
  # Your current setup passes ground_truth twice for standard1 and standard2.
  bleu_score = sentence_bleu([reference1_tokens, reference2_tokens], candidate_tokens)
  return bleu_score

def calculate_rouge_L(candidate, ground_truth):
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    # Ensure inputs are strings
    rouge_L_score = scorer.score(str(candidate), str(ground_truth))
    return rouge_L_score['rougeL'].fmeasure

# def calculate_meteor(candidate, ground_truth):
#     # Ensure inputs are strings and tokenize
#     return meteor_score([word_tokenize(str(ground_truth))], word_tokenize(str(candidate)))


if __name__ == "__main__":
    # Define your input CSV file paths
    # Assuming these are the CSVs generated by your vLLM script (e.g., 'vllm_model_test_responses_full_testset.csv')
    # and other baseline models if you have them.
    # Adjust these paths as per your file structure.
    # For this example, let's assume we are evaluating the single vLLM output:
    vllm_output_csv = "vllm_model_test_responses_finetuned_gemma3-4b.csv" # Update to your actual vLLM output file
    # deepseek_df_path = "xuelong/deepseek_baseline1.csv" # Example from your original script
    # gemma_df_path = "xuelong/gemma_baseline1.csv"       # Example from your original script
    # qwen_df_path = "xuelong/qwen_baseline1.csv"         # Example from your original script

    # List of dataframes to process
    dataframes_to_process = {
        "vLLM_Generated": pd.read_csv(vllm_output_csv),
        # Add other models if you want to evaluate them too:
        # "DeepSeek_Baseline": pd.read_csv(deepseek_df_path),
        # "Gemma_Baseline": pd.read_csv(gemma_df_path),
        # "Qwen_Baseline": pd.read_csv(qwen_df_path),
    }

    for df_name, df_data in dataframes_to_process.items():
        print(f"\n--- Calculating metrics for {df_name} ---")

        # Ensure 'BERTScore' column exists and initialize it to 0.0
        # This prevents KeyError if the column doesn't exist.
        df_data['BERTScore'] = 0.0
        df_data['BLEU'] = 0.0
        df_data['ROUGE-L'] = 0.0
        # df_data['METEOR'] = 0.0 # Commented out

        # Use tqdm for a progress bar during iteration
        for i, row in tqdm(df_data.iterrows(), total=len(df_data), desc=f"Processing {df_name} rows"):
            # !!! IMPORTANT: Adjust column names here to match your CSV structure !!!
            # Based on previous discussions, 'Generated_Response' is the candidate
            # and 'True_Response' is the ground truth.
            candidate = row['Generated_Response']
            ground_truth = row['True_Response']

            # Handle potential non-string values gracefully
            candidate = str(candidate) if pd.isna(candidate) else candidate
            ground_truth = str(ground_truth) if pd.isna(ground_truth) else ground_truth

            try:
                bertscore = calculate_bertscore(candidate, ground_truth)
            except Exception as e:
                bertscore = 0.0
                print(f"BERTScore error at index {i} for {df_name}: {e}") # Debugging

            try:
                # For BLEU, if you only have one true response, you can still pass it twice
                # as the two "standards" if your function signature expects two.
                # calculate_bleu(candidate, ground_truth, ground_truth) is valid.
                bleu = calculate_bleu(candidate, ground_truth, ground_truth)
            except Exception as e:
                bleu = 0.0
                print(f"BLEU error at index {i} for {df_name}: {e}") # Debugging

            try:
                rouge_L = calculate_rouge_L(candidate, ground_truth)
            except Exception as e:
                rouge_L = 0.0
                print(f"ROUGE-L error at index {i} for {df_name}: {e}") # Debugging

            # try: # Commented out
            #     meteor = calculate_meteor(candidate, ground_truth)
            # except Exception as e:
            #     meteor = 0.0
            #     print(f"METEOR error at index {i} for {df_name}: {e}") # Debugging

            df_data.at[i, 'BERTScore'] = bertscore
            df_data.at[i, 'BLEU'] = bleu
            df_data.at[i, 'ROUGE-L'] = rouge_L
            # df_data.at[i, 'METEOR'] = meteor # Commented out

        # Define output file path
        output_csv_path = f"{df_name}_scores.csv" # Adjust output path as needed, e.g., "results/{df_name}_scores.csv"
        df_data.to_csv(output_csv_path, index=False)

        print(f"Metrics calculated for {df_name}. Saved to {output_csv_path}")
        print(f"Average scores for {df_name}:")
        print(f"  BERTScore: {df_data['BERTScore'].mean():.4f}")
        print(f"  BLEU: {df_data['BLEU'].mean():.4f}")
        print(f"  ROUge-L: {df_data['ROUGE-L'].mean():.4f}")
        # print(f"  METEOR: {df_data['METEOR'].mean():.4f}") # Commented out


--- Calculating metrics for vLLM_Generated ---


Processing vLLM_Generated rows:   0%|          | 0/200 [00:00<?, ?it/s]

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Metrics calculated for vLLM_Generated. Saved to vLLM_Generated_scores.csv
Average scores for vLLM_Generated:
  BERTScore: 0.5297
  BLEU: 0.0036
  ROUge-L: 0.1491
