In [1]:
import pandas as pd

# Location of the results CSV inside the course repository.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '04-monitoring/data/results-gpt4o-mini.csv'

# NOTE(review): the "?raw=1" suffix presumably makes GitHub serve the raw file
# instead of the HTML "blob" page - confirm.
ground_truth_url = '/'.join([base_url, relative_url]) + '?raw=1'

# Read the CSV straight from the URL into a DataFrame.
df = pd.read_csv(ground_truth_url)

In [2]:
# Keep only the first 300 rows; every later cell works on this subset.
df = df.head(300)

In [3]:
# Display the (truncated) frame to inspect its columns and a few rows.
df

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp
...,...,...,...,...,...
295,An alternative way to load the data using the ...,Above users showed how to load the dataset dir...,8d209d6d,What is an alternative way to load the data us...,machine-learning-zoomcamp
296,You can directly download the dataset from Git...,Above users showed how to load the dataset dir...,8d209d6d,How can I directly download the dataset from G...,machine-learning-zoomcamp
297,You can fetch data for homework using the `req...,Above users showed how to load the dataset dir...,8d209d6d,Could you share a method to fetch data for hom...,machine-learning-zoomcamp
298,If the status code is 200 when downloading dat...,Above users showed how to load the dataset dir...,8d209d6d,What should I do if the status code is 200 whe...,machine-learning-zoomcamp


In [4]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model identified by `model_name`; later cells
# turn answer texts into vectors with `model.encode(...)`.
model_name = 'multi-qa-mpnet-base-dot-v1'
model = SentenceTransformer(model_name)

You try to use a model that was created with version 3.0.0.dev0, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





In [5]:
# Sanity check: embed the LLM answer of the first row and show the first
# component of the resulting vector.
answer_llm = df.iloc[0]['answer_llm']
model.encode(answer_llm)[0]

-0.42244655

In [6]:
# Convert the frame into a list of per-row dicts (column name -> value) so the
# scoring loops can read fields as row['answer_llm'] / row['answer_orig'].
ground_truth = df.to_dict(orient='records')

In [7]:
import numpy as np
from tqdm.auto import tqdm

# Score every record by the raw dot product between the embedding of the
# LLM answer and the embedding of the original answer. The vectors are used
# exactly as returned by `model.encode`, with no normalization applied here.
evaluations = []
for row in tqdm(ground_truth):
    embedding_llm = model.encode(row['answer_llm'])
    embedding_orig = model.encode(row['answer_orig'])

    dot_product = np.dot(embedding_llm, embedding_orig)

    evaluations.append(dot_product)


  0%|          | 0/300 [00:00<?, ?it/s]

In [8]:
# 75th percentile of the raw dot-product scores collected in `evaluations`.
percentile_75 = np.percentile(evaluations, 75)
percentile_75

31.67430877685547

In [9]:
def normalize(v):
    """Scale a vector to unit Euclidean (L2) length.

    Args:
        v: Array-like of numbers (list, tuple or NumPy array).

    Returns:
        A NumPy array holding ``v`` divided by its L2 norm. Floating-point
        inputs keep their dtype; other numeric inputs are converted to
        float64 first. A vector whose norm is zero is returned as a copy of
        itself (all zeros) instead of an array of NaNs.
    """
    np_v = np.asarray(v)
    # Squaring a narrow integer array (e.g. int32) can overflow silently and
    # yield a wrong norm, so do the arithmetic in floating point.
    if not np.issubdtype(np_v.dtype, np.inexact):
        np_v = np_v.astype(np.float64)

    norm = np.sqrt((np_v * np_v).sum())
    if norm == 0:
        # Nothing to scale; dividing would only produce NaNs and a warning.
        return np_v.copy()
    return np_v / norm


In [10]:
# Same scoring as the raw dot-product loop, but both embeddings are scaled to
# unit length first, so each score is the cosine similarity of the two answers.
# NOTE(review): every answer is encoded again here, repeating the model calls
# of the raw dot-product loop.
normalized_evaluations = []
for row in tqdm(ground_truth):
    embedding_llm = model.encode(row['answer_llm'])
    embedding_orig = model.encode(row['answer_orig'])

    dot_product = np.dot(normalize(embedding_llm), normalize(embedding_orig))

    normalized_evaluations.append(dot_product)

  0%|          | 0/300 [00:00<?, ?it/s]

In [11]:
# 75th percentile of the scores in `normalized_evaluations` (dot products of
# the normalized embeddings).
normalized_percentile_75 = np.percentile(normalized_evaluations, 75)
normalized_percentile_75

0.8362348973751068

In [15]:
# Pick the row at position 10 as the example record for the ROUGE comparison.
r = df.iloc[10]
r

answer_llm     Yes, all sessions are recorded, so if you miss...
answer_orig    Everything is recorded, so you won’t miss anyt...
document                                                5170565b
question                    Are sessions recorded if I miss one?
course                                 machine-learning-zoomcamp
Name: 10, dtype: object

In [16]:
from rouge import Rouge
# Compare the two answers of the selected record `r` with ROUGE. The LLM
# answer is passed first and the original answer second.
# NOTE(review): the `rouge` package documents this order as
# (hypothesis, reference) - confirm.
rouge_scorer = Rouge()

# `get_scores` returns an indexable result; `[0]` keeps its first entry, the
# scores for this single pair.
scores = rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])[0]

In [17]:
# Show the score dict: 'r', 'p' and 'f' values for rouge-1, rouge-2 and rouge-l.
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}