# LLM Homework 04 - Evaluation and Monitoring

### What's the first value of the resulting vector?

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
# Location of the evaluation results CSV inside the course repository.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '04-monitoring/data/results-gpt4o-mini.csv'

# The `?raw=1` query is appended exactly once, here. Appending it a second
# time when building `url` would produce the malformed `...csv?raw=1?raw=1`.
docs_url = f'{base_url}/{relative_url}?raw=1'

# `url` is kept as its own name so any code referring to it keeps working.
url = docs_url
df = pd.read_csv(url)

In [3]:
# Keep only the first 300 rows for the rest of the notebook.
df = df.head(300)

In [4]:
from sentence_transformers import SentenceTransformer

# Name of the sentence-transformers model used for every embedding below.
model_name = 'multi-qa-mpnet-base-dot-v1'
# Instantiate the model once; later cells call `embedding_model.encode(...)`.
embedding_model = SentenceTransformer(model_name)

  from tqdm.autonotebook import tqdm, trange


In [5]:
# Encode the LLM answer of the first record and show the first component
# of the resulting vector.
answer_llm = df['answer_llm'].iloc[0]
embedding = embedding_model.encode(answer_llm)

print(embedding[0])

-0.4224466


### What's the 75th percentile of the score?

In [6]:
# Dot product between the embedding of each LLM answer and the embedding of
# the corresponding original answer, one score per record.
evaluations = []

for answer_llm, answer_orig in tqdm(zip(df['answer_llm'], df['answer_orig']), total=len(df)):
    # Encode the LLM answer first, then the original answer.
    embedding1, embedding2 = (
        embedding_model.encode(text) for text in (answer_llm, answer_orig)
    )
    evaluations.append(embedding1.dot(embedding2))

100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [02:02<00:00,  2.44it/s]


In [7]:
# 75th percentile of the dot-product scores collected in `evaluations`.
percentile_75 = np.percentile(evaluations, q=75)
print(percentile_75)

31.6743106842041


### What's the 75th percentile of the cosine similarity scores?

In [8]:
def normalize_vector(v):
    """Scale ``v`` to unit Euclidean length.

    Parameters
    ----------
    v : array-like
        Vector to normalize. Lists and tuples are accepted and converted
        with ``np.asarray``; an ndarray is used as-is (its dtype is kept).

    Returns
    -------
    numpy.ndarray
        ``v`` divided by its L2 norm. A vector whose norm is zero is returned
        as all zeros instead of being divided by zero (which would give NaNs).
    """
    v = np.asarray(v)
    norm = np.sqrt((v * v).sum())
    if norm == 0:
        # A zero vector has no direction; skip the 0/0 division.
        return np.zeros_like(v)
    return v / norm


def cosine_similarity(v1, v2):
    """Return the cosine similarity of ``v1`` and ``v2``.

    Both vectors are normalized with ``normalize_vector`` and the dot product
    of the normalized vectors is returned. If either vector has zero norm the
    result is 0 rather than NaN.

    Parameters
    ----------
    v1, v2 : array-like
        Vectors of the same length.

    Returns
    -------
    numpy scalar
        Dot product of the two unit-length vectors.
    """
    v1_norm = normalize_vector(v1)
    v2_norm = normalize_vector(v2)

    return np.dot(v1_norm, v2_norm)

In [9]:
# Cosine similarity between the embedding of each LLM answer and the
# embedding of the corresponding original answer, one score per record.
cosine_similarities = []

for answer_llm, answer_orig in tqdm(zip(df['answer_llm'], df['answer_orig']), total=len(df)):
    # Encode the LLM answer first, then the original answer.
    embedding1, embedding2 = (
        embedding_model.encode(text) for text in (answer_llm, answer_orig)
    )
    cosine_similarities.append(cosine_similarity(embedding1, embedding2))

100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [02:02<00:00,  2.45it/s]


In [10]:
# 75th percentile of the cosine similarities collected above.
percentile_75_cosine = np.percentile(cosine_similarities, q=75)
print(percentile_75_cosine)

0.8362348079681396


### What's the F score for 'rouge-1'?

In [11]:
!pip install rouge



In [12]:
from rouge import Rouge

rouge_scorer = Rouge()

# ROUGE scores for the record at position 10: the LLM answer is passed first,
# the original answer second.
r = df.iloc[10]
scores = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]

print(scores['rouge-1']['f'])

0.45454544954545456


### Let's compute the average between rouge-1, rouge-2 and rouge-l for the same record from Q4

In [13]:
# Pull the F-score of each ROUGE variant out of `scores`, in a fixed order.
rouge_1, rouge_2, rouge_l = (
    scores[metric]['f'] for metric in ('rouge-1', 'rouge-2', 'rouge-l')
)

# Plain arithmetic mean of the three F-scores.
rouge_avg = (rouge_1 + rouge_2 + rouge_l) / 3

print(rouge_avg)

0.35490034990035496


### What's the average rouge_2 across all the records?

In [14]:
# Per-record ROUGE F-scores (LLM answer vs. original answer), one list per variant.
rouge_1_scores = []
rouge_2_scores = []
rouge_l_scores = []

for index, row in df.iterrows():
    # Use a loop-specific name here. Rebinding the notebook-level `scores`
    # would overwrite the single-record result that the rouge-1 / rouge-2 /
    # rouge-l average reads, so re-running that cell after this one would
    # silently use the last record instead.
    row_scores = rouge_scorer.get_scores(row['answer_llm'], row['answer_orig'])[0]

    rouge_1_scores.append(row_scores['rouge-1']['f'])
    rouge_2_scores.append(row_scores['rouge-2']['f'])
    rouge_l_scores.append(row_scores['rouge-l']['f'])

In [15]:
# Mean ROUGE-2 F-score over all records.
average_rouge_2 = np.asarray(rouge_2_scores).mean()
print(average_rouge_2)

0.20696501983423318
