In [1]:
import pandas as pd
import numpy as np

In [2]:
# Evaluation results to analyse; only the first N_ROWS records are read.
RESULTS_CSV = '../data/results-gpt4o-mini-cosine.csv'
N_ROWS = 300
df = pd.read_csv(RESULTS_CSV, nrows=N_ROWS)

## Q1. Getting the embeddings model

In [3]:
from sentence_transformers import SentenceTransformer
# Load the sentence-embedding model used by every later cell.
MODEL_NAME = 'multi-qa-mpnet-base-dot-v1'
embedding_model = SentenceTransformer(MODEL_NAME)

# Embed the LLM answer of the very first record and report its first component.
answer_llm = df.iloc[0]['answer_llm']
vector = embedding_model.encode(answer_llm)
first_component = vector[0]
print("What's the first value of the resulting vector?: ", round(first_component, 2))

  from tqdm.autonotebook import tqdm, trange


What's the first value of the resulting vector?:  -0.42


## Q2. Computing the dot product

In [4]:
# Embedding helper applied row by row
def emb_cal(row, column):
    """Encode the text found in ``row[column]`` with the notebook-level
    ``embedding_model`` and return whatever ``encode`` produces.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Record holding the text to embed.
    column : str
        Key of the text field inside ``row``.
    """
    return embedding_model.encode(row[column])

# Embed the LLM answer and the original answer of every record.
# `column` is forwarded to emb_cal through DataFrame.apply's keyword arguments.
for text_column in ('answer_llm', 'answer_orig'):
    df[f'{text_column}_vector'] = df.apply(emb_cal, axis=1, column=text_column)

In [5]:
# Score every record by the dot product of its two embedding vectors
df['score'] = df.apply(
    lambda record: np.dot(record['answer_llm_vector'], record['answer_orig_vector']),
    axis=1,
)

score_p75 = df['score'].quantile(0.75)
print("What's the 75% percentile of the score?: ", round(score_p75, 2))

What's the 75% percentile of the score?:  31.67


## Q3. Computing the cosine

In [6]:
# Function to normalize vectors
def normalize_vector(v):
    """Scale a vector to unit Euclidean (L2) length.

    Parameters
    ----------
    v : array-like
        Vector to normalize. It is converted with ``np.asarray``, so plain
        lists and tuples are accepted as well as NumPy arrays.

    Returns
    -------
    numpy.ndarray
        ``v`` divided by its L2 norm. When the norm is zero (all-zero or
        empty input) an unscaled copy of the input is returned instead.
    """
    arr = np.asarray(v)
    norm = np.sqrt(np.sum(np.square(arr)))
    if norm == 0:
        # A zero vector has no direction; dividing would produce 0/0 = NaN
        # (plus a RuntimeWarning) and poison every downstream dot product.
        return arr.copy()
    return arr / norm

# Unit-length versions of both embedding columns
for source in ('answer_llm', 'answer_orig'):
    df[f'{source}_vector_norm'] = df[f'{source}_vector'].apply(normalize_vector)

# Cosine similarity is the dot product of the two unit-length vectors
df['cosine_similarity'] = df.apply(
    lambda record: np.dot(record['answer_llm_vector_norm'], record['answer_orig_vector_norm']),
    axis=1,
)

cosine_p75 = df['cosine_similarity'].quantile(0.75)
print("What's the 75% cosine in the scores?: ", round(cosine_p75, 3))

What's the 75% cosine in the scores?:  0.836


## Q4. Rouge

In [7]:
from rouge import Rouge

rouge_scorer = Rouge()

# Score only the record of interest (position 10) instead of running ROUGE
# over every row and discarding all results but one. Passing two strings makes
# get_scores return a one-element list, hence the [0].
record = df.iloc[10]
scores = rouge_scorer.get_scores(record['answer_llm'], record['answer_orig'])[0]
print("What's the F score for rouge-1? ", round(scores['rouge-1']['f'], 2))

What's the F score for rouge-1?  0.45


## Q5. Average rouge score

In [8]:
# F-measures of the Q4 record for each ROUGE variant, then their plain mean
f1_scores = [scores[metric]['f'] for metric in ('rouge-1', 'rouge-2', 'rouge-l')]
rouge_mean = sum(f1_scores) / len(f1_scores)
print("Average between rouge-1, rouge-2 and rouge-l for the same record from Q4 ", round(rouge_mean, 2))

Average between rouge-1, rouge-2 and rouge-l for the same record from Q4  0.35


## Q6. Average rouge score for all the data points

In [9]:
rouge_scorer = Rouge()


# Per-row ROUGE helper
def calculate_rouge_scores(row):
    """Return the ROUGE F-measures of one record as a 4-element Series.

    ``row['answer_llm']`` is scored against ``row['answer_orig']`` with the
    notebook-level ``rouge_scorer``. The returned values are, in order:
    rouge-1 F, rouge-2 F, rouge-l F, and the arithmetic mean of those three.
    """
    per_metric = rouge_scorer.get_scores(row['answer_llm'], row['answer_orig'])[0]
    f_values = [per_metric[metric]['f'] for metric in ('rouge-1', 'rouge-2', 'rouge-l')]
    # Append the mean of the three F-measures as the fourth value
    f_values.append(sum(f_values) / 3)
    return pd.Series(f_values)

# Score every record and store the four values in their own columns
rouge_columns = ['rouge-1_f1', 'rouge-2_f1', 'rouge-l_f1', 'rouge-avg_f1']
df[rouge_columns] = df.apply(calculate_rouge_scores, axis=1)

mean_rouge_2 = df['rouge-2_f1'].mean()
print("What's the agerage rouge_2 across all the records? ", round(mean_rouge_2, 3))

What's the agerage rouge_2 across all the records?  0.207
