###  Loading data for homework-04

In [2]:
import pandas as pd

# CSV of evaluation results hosted in the LLM Zoomcamp GitHub repository.
github_url = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/04-monitoring/data/results-gpt4o-mini.csv'

# Request URL: the base address followed by the '?raw=1' query suffix.
url = github_url + '?raw=1'
df = pd.read_csv(url)

In [3]:
# Keep only the first 300 rows for the rest of the notebook.
df = df.head(300)

In [4]:
# Preview the first three rows to check the loaded columns.
df.head(3)

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp


### Q1. Getting the embeddings model

In [5]:
from sentence_transformers import SentenceTransformer

# Name of the sentence-transformers model passed to the constructor below.
model_name='multi-qa-mpnet-base-dot-v1'
# Instantiate the model once; later cells call `embedding_model.encode(...)`.
embedding_model = SentenceTransformer(model_name)

  from tqdm.autonotebook import tqdm, trange


In [6]:
# Text of the LLM-generated answer in the first row.
answer_llm = df['answer_llm'].iloc[0]

# Embed that answer with the model loaded earlier.
results_llm = embedding_model.encode(answer_llm)

# First component of the embedding, rounded to two decimals.
round(results_llm[0], 2)

np.float32(-0.42)

### Q2. Computing the dot product

In [7]:
# NOTE: despite the `_df` suffix, this is a plain list holding one dict per row.
results_df = df.to_dict('records')

# Inspect the first record.
results_df[0]

{'answer_llm': 'You can sign up for the course by visiting the course page at [http://mlzoomcamp.com/](http://mlzoomcamp.com/).',
 'answer_orig': 'Machine Learning Zoomcamp FAQ\nThe purpose of this document is to capture frequently asked technical questions.\nWe did this for our data engineering course and it worked quite well. Check this document for inspiration on how to structure your questions and answers:\nData Engineering Zoomcamp FAQ\nIn the course GitHub repository there’s a link. Here it is: https://airtable.com/shryxwLd0COOEaqXo\nwork',
 'document': '0227b872',
 'question': 'Where can I sign up for the course?',
 'course': 'machine-learning-zoomcamp'}

In [8]:
def compute_similarity(record):
    """Return the dot product of the embeddings of a record's two answers.

    `record` is indexed by the keys 'answer_orig' and 'answer_llm'; both
    texts are embedded with the notebook-level `embedding_model`.
    """
    orig_text, llm_text = record['answer_orig'], record['answer_llm']

    # Embed the LLM answer first, then the original answer.
    llm_vec = embedding_model.encode(llm_text)
    return llm_vec.dot(embedding_model.encode(orig_text))

In [9]:
from tqdm.autonotebook import tqdm

# Iterate over records taken from `df` rather than from `results_df`: a later
# cell rebinds `results_df` to a DataFrame, and iterating a DataFrame yields
# column names, so re-running this cell after that point would fail.
# `df.to_dict(orient='records')` produces the same list of per-row dicts.
similarity = [
    compute_similarity(record)
    for record in tqdm(df.to_dict(orient='records'))
]

100%|██████████| 300/300 [00:59<00:00,  5.04it/s]


In [10]:
# Turn the per-row records into a DataFrame and attach the scores in one step.
# NOTE: the 'cosine' column holds the values collected in `similarity`, which
# come from compute_similarity, i.e. an un-normalised dot product.
results_df = pd.DataFrame(results_df).assign(cosine=similarity)
results_df['cosine'].describe()

count    300.000000
mean      27.495996
std        6.384743
min        4.547927
25%       24.307841
50%       28.336862
75%       31.674305
max       39.476013
Name: cosine, dtype: float64

### Q3. Computing the cosine

In [11]:
def normalize_vector(v):
    """Scale ``v`` to unit Euclidean (L2) length.

    Parameters
    ----------
    v : array-like
        Vector to normalise. It is passed through ``np.asarray``, so plain
        Python sequences are accepted as well as NumPy arrays.

    Returns
    -------
    numpy.ndarray
        ``v`` divided by its L2 norm. An all-zero input yields an all-zero
        array of the same shape instead of NaNs: its direction is undefined,
        and dividing by a zero norm would poison every later dot product.
    """
    v = np.asarray(v)
    norm = np.linalg.norm(v)
    # Exact comparison is intentional: only a true zero norm is special-cased,
    # so NaN/inf inputs still propagate the way a plain division would.
    if norm == 0:
        # Match the dtype that `v / norm` would have produced.
        return np.zeros(v.shape, dtype=np.result_type(v, norm))
    return v / norm

In [12]:
def norm_compute_similarity(record):
    """Return the dot product of the normalised embeddings of both answers.

    `record` is indexed by the keys 'answer_orig' and 'answer_llm'. Each text
    is embedded with the notebook-level `embedding_model` and passed through
    `normalize_vector` before the dot product is taken.
    """
    orig_text, llm_text = record['answer_orig'], record['answer_llm']

    # Embed and normalise the LLM answer first, then the original answer.
    llm_unit = normalize_vector(embedding_model.encode(llm_text))
    orig_unit = normalize_vector(embedding_model.encode(orig_text))
    return llm_unit.dot(orig_unit)

In [13]:
import numpy as np
from tqdm.autonotebook import tqdm

# Normalised similarity for every row, computed with a progress bar.
norm_similarity = [
    norm_compute_similarity(row)
    for row in tqdm(results_df.to_dict(orient='records'))
]

100%|██████████| 300/300 [00:15<00:00, 18.92it/s]


In [14]:
# Store the values collected in `norm_similarity` as a new column.
results_df['norm_similarity']= norm_similarity

# Summary statistics of that column.
results_df['norm_similarity'].describe()

count    300.000000
mean       0.728392
std        0.157755
min        0.125357
25%        0.651273
50%        0.763761
75%        0.836235
max        0.958796
Name: norm_similarity, dtype: float64

### Q4. Rouge

In [15]:
from rouge import Rouge
# Single scorer instance reused by the ROUGE cells below.
rouge_scorer = Rouge()

In [16]:
# Rows whose 'document' value is '5170565b'.
r = results_df.loc[results_df['document'].eq('5170565b')]

In [17]:
# Pass the first matching row's texts as plain strings so that exactly one
# (hypothesis, reference) pair is scored. Passing the whole columns would
# score every row of `r` and then discard all but the first result.
scores = rouge_scorer.get_scores(
    r['answer_llm'].iloc[0],
    r['answer_orig'].iloc[0],
)[0]

# ROUGE-1 F-score of that pair, rounded to two decimals.
round(scores['rouge-1']['f'], 2)

0.45

### Q5. Average rouge score

In [18]:
import numpy as np

# Average the F-scores of ROUGE-1, ROUGE-2 and ROUGE-L for the record held in
# `scores`. The 'f' entry is used (not recall, 'r') so this agrees with the
# ROUGE-1 F-score reported for the same record and with the per-row averages
# computed over the whole dataset.
rouge_scores = [scores[metric]['f'] for metric in ('rouge-1', 'rouge-2', 'rouge-l')]
round(np.mean(rouge_scores), 2)

np.float64(0.35)

### Q6. Average rouge score for all the data points

In [19]:
# One list per metric; entry i belongs to row i of `results_df`.
rouge_1_scores = []
rouge_2_scores = []
rouge_l_scores = []
rouge_avg_scores = []

for record in tqdm(results_df.to_dict(orient='records')):
    # A single (hypothesis, reference) pair is passed, so take element 0 of
    # the returned list.
    scores = rouge_scorer.get_scores(record['answer_llm'], record['answer_orig'])[0]

    rouge_1 = scores['rouge-1']['f']
    rouge_2 = scores['rouge-2']['f']
    rouge_l = scores['rouge-l']['f']
    # Unweighted mean of the three F-scores for this row.
    rouge_avg = (rouge_1 + rouge_2 + rouge_l) / 3

    rouge_1_scores.append(rouge_1)
    rouge_2_scores.append(rouge_2)
    rouge_l_scores.append(rouge_l)
    rouge_avg_scores.append(rouge_avg)

# No DataFrame is built inside the loop: constructing one per iteration from
# the growing lists is quadratic work whose result is never kept. The lists
# are combined once, after the loop, where `df_rouge_scores` is created.

100%|██████████| 300/300 [00:00<00:00, 367.29it/s]


In [23]:
 # Combine the four per-row ROUGE lists into one DataFrame, one column per list.
 # NOTE(review): this cell is indented by one space. It ran in the notebook,
 # but the stray indent should be removed if the code is exported to a script.
 df_rouge_scores = pd.DataFrame(
     data={
         "rouge_1_scores": rouge_1_scores,
         "rouge_2_scores": rouge_2_scores,
         "rouge_l_scores": rouge_l_scores,
         "rouge_avg_scores": rouge_avg_scores,
     },
 )

In [24]:
# Summary statistics of the 'rouge_2_scores' column.
df_rouge_scores['rouge_2_scores'].describe()

count    300.000000
mean       0.206965
std        0.153550
min        0.000000
25%        0.097809
50%        0.178671
75%        0.286181
max        0.739130
Name: rouge_2_scores, dtype: float64