## BLEU Score Calculation to Compare Similarity Between LLM Response and Ground Truth Response from Human Coaches

In [None]:
# Running this cell may make changes to your environment

# !pip install transformers sacrebleu
# !pip install nltk

In [None]:
import json
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize

In [None]:
# Function to calculate BLEU score for two texts
def calculate_bleu(reference: str, candidate: str, weights: tuple = (1, 0, 0, 0)) -> float:
    """Return the sentence-level BLEU score of *candidate* against *reference*.

    Both texts are lower-cased and tokenized with NLTK's ``word_tokenize``
    before scoring, so the comparison is case-insensitive.

    Args:
        reference: The ground-truth text (a single reference).
        candidate: The text being evaluated.
        weights: Per-n-gram-order weights passed to ``sentence_bleu``. The
            default ``(1, 0, 0, 0)`` puts all weight on unigrams, i.e. the
            score is BLEU-1. Pass e.g. ``(0.25, 0.25, 0.25, 0.25)`` for the
            conventional 4-gram BLEU.

    Returns:
        The BLEU score computed by ``sentence_bleu``.
    """
    # NOTE(review): word_tokenize relies on NLTK tokenizer data being available
    # locally; no download step is visible in this notebook -- confirm.
    # sentence_bleu expects a list of references, each a list of tokens.
    reference_tokens = [word_tokenize(reference.lower())]
    candidate_tokens = word_tokenize(candidate.lower())
    # Applying smoothing function for cases where perfect matches don't occur
    smoothie = SmoothingFunction().method1
    return sentence_bleu(
        reference_tokens,
        candidate_tokens,
        weights=weights,
        smoothing_function=smoothie,
    )

In [None]:
# Load our LLM responses
with open('prompt1_LLM_response.json', 'r', encoding='utf-8') as file:
    llm_data = json.load(file)

# Load ground truth responses.
# We use the testing transcripts as the ground truth to compare with the LLM responses.
# The testing transcripts are real transcripts of leadership coaching sessions that
# happened between a human coach and a leader, so the human coach responses are
# treated as the ground truth.
with open('Testing Transcripts.json', 'r', encoding='utf-8') as file:
    ground_truth_data = json.load(file)

# Keep only the transcript entries that contain a 'coach' response.
filtered_gt_data = [item for item in ground_truth_data if 'coach' in item]

# The pairing below is purely positional and zip() silently stops at the
# shorter list, so surface a length mismatch instead of hiding it.
if len(llm_data) != len(filtered_gt_data):
    print(
        f"Warning: {len(llm_data)} LLM responses but {len(filtered_gt_data)} "
        "ground truth coach responses; only the first "
        f"{min(len(llm_data), len(filtered_gt_data))} pairs will be scored."
    )

# List to hold individual BLEU scores
bleu_scores = []

# Score each LLM response against the ground truth response at the same index.
for llm_item, gt_item in zip(llm_data, filtered_gt_data):
    ground_truth_response = gt_item['coach']
    llm_response = llm_item['response']
    score = calculate_bleu(ground_truth_response, llm_response)
    bleu_scores.append(score)
    print(f"Prompt: {llm_item['prompt']}\nGround Truth: {ground_truth_response}\nLLM Response: {llm_response}\nBLEU Score: {score}\n")

# Calculate average BLEU; guard against an empty score list, which would
# otherwise raise ZeroDivisionError.
if bleu_scores:
    average_bleu_score = sum(bleu_scores) / len(bleu_scores)
    print(f"Average BLEU Score across all prompts: {average_bleu_score}")
else:
    average_bleu_score = None
    print("No response pairs were scored; average BLEU score is undefined.")
