In [1]:
!python -V

Python 3.10.13


In [2]:
import requests
import pandas as pd
import numpy as np
from tqdm import tqdm

### Get data

In [4]:
github_url = "https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv"
url = f'{github_url}?raw=1'
df = pd.read_csv(url)

In [5]:
df.head()

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp


In [6]:
df = df.iloc[:300]

In [8]:
# !pip install sentence_transformers

In [9]:
from sentence_transformers import SentenceTransformer

model_name = "multi-qa-mpnet-base-dot-v1"
embedding_model = SentenceTransformer(model_name)

  from tqdm.autonotebook import tqdm, trange


In [10]:
answer_llm = df.iloc[0].answer_llm

### Embedding first value

In [17]:
embedding_model.encode(answer_llm)

array([-4.22446549e-01, -2.24856257e-01, -3.24058414e-01, -2.84758478e-01,
        7.25642918e-03,  1.01186566e-01,  1.03716910e-01, -1.89983174e-01,
       -2.80599259e-02,  2.71588802e-01, -1.15337655e-01,  1.14666030e-01,
       -8.49586725e-02,  3.32365334e-01,  5.52720726e-02, -2.22195774e-01,
       -1.42540857e-01,  1.02519155e-01, -1.52333647e-01, -2.02912465e-01,
        1.98422875e-02,  8.38149190e-02, -5.68632066e-01,  2.32844148e-02,
       -1.67292684e-01, -2.39256918e-01, -8.05464387e-02,  2.57084146e-02,
       -8.15464780e-02, -7.39290118e-02, -2.61550009e-01,  1.92575473e-02,
        3.22909206e-01,  1.90357104e-01, -9.34726413e-05, -2.13165611e-01,
        2.88943425e-02, -1.79530401e-02, -5.92756271e-02,  1.99918285e-01,
       -4.75170948e-02,  1.71634093e-01, -2.45917086e-02, -9.38061550e-02,
       -3.57002735e-01,  1.33263692e-01,  1.94045901e-01, -1.18530318e-01,
        4.56915230e-01,  1.47728190e-01,  3.35945129e-01, -1.86959356e-01,
        2.45954901e-01, -

### Dot product: 75 percentile

In [12]:
records = df.to_dict(orient='records')

In [13]:
def compute_similarity(record):
    answer_orig = record['answer_orig']
    answer_llm = record['answer_llm']
    
    v_llm = embedding_model.encode(answer_llm)
    v_orig = embedding_model.encode(answer_orig)
    
    return v_llm.dot(v_orig)

In [14]:
similarity = []
Average Rouge 
for record in tqdm(records):
    sim = compute_similarity(record)
    similarity.append(sim)

100%|███████████████████████████████████████████████████████████████████████████████████████████| 300/300 [01:23<00:00,  3.59it/s]


In [15]:
df['cosine'] = similarity
df['cosine'].describe()

count    300.000000
mean      27.495996
std        6.384742
min        4.547923
25%       24.307844
50%       28.336870
75%       31.674309
max       39.476013
Name: cosine, dtype: float64

### Cosine 75 percentile

In [20]:
def compute_similarity_norm(record):
    answer_orig = record['answer_orig']
    answer_llm = record['answer_llm']
    
    v_llm = embedding_model.encode(answer_llm)
    v_orig = embedding_model.encode(answer_orig)

    v_llm_norm = np.sqrt((v_llm * v_llm).sum())
    v_llm = v_llm / v_llm_norm

    v_orig_norm = np.sqrt((v_orig * v_orig).sum())
    v_orig = v_orig / v_orig_norm
    
    return v_llm.dot(v_orig)

In [None]:
similarity = []

for record in tqdm(records):
    sim = compute_similarity_norm(record)
    similarity.append(sim)

 52%|███████████████████████████████████████████████▌                                           | 157/300 [00:41<00:50,  2.83it/s]

In [None]:
df['cosine'] = similarity
df['cosine'].describe()

### Rouge 1 F 

In [None]:
# !pip install rouge

In [None]:
r = records[10]
r

In [None]:
from rouge import Rouge
rouge_scorer = Rouge()

scores = rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])[0]

In [None]:
scores

### Average Rouge 

In [None]:
# Extracting the F-scores
f_scores = [scores[key]['f'] for key in scores]

# Calculating the average F-score
average_f_score = sum(f_scores) / len(f_scores)
average_f_score

### Average Rouge 2 

In [None]:
f_scores = []
for r in records:
    scores = rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])[0]
    f_scores.append(scores['rouge-2']['f'])

# Calculating the average F-score for rouge-2
average_f_score = sum(f_scores) / len(f_scores)
average_f_score