## Load data

In [19]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.met

In [33]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [36]:
import requests
import pandas as pd
import numpy as np
from rouge import Rouge

In [17]:
# Location of the CSV holding the LLM answers next to the original answers
url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv?raw=1'

# Download the file and keep only its first 300 rows
df = pd.read_csv(url).iloc[:300]
df.head()

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp


## Index data

In [20]:
# get embedding model
from sentence_transformers import SentenceTransformer

# Identifier of the sentence-embedding model to load; `model` is reused by the
# cells below to encode the answers.
model_name = 'multi-qa-mpnet-base-dot-v1'
model = SentenceTransformer(model_name)

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.71k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [21]:
# Embed the LLM answer of the very first row and display the resulting vector
first_answer_llm = df['answer_llm'].iloc[0]
first_answer_llm_embedding = model.encode(first_answer_llm)
first_answer_llm_embedding

array([-4.22446579e-01, -2.24856094e-01, -3.24058473e-01, -2.84758508e-01,
        7.25637982e-03,  1.01186477e-01,  1.03716679e-01, -1.89983442e-01,
       -2.80597862e-02,  2.71588922e-01, -1.15337484e-01,  1.14666171e-01,
       -8.49585980e-02,  3.32365513e-01,  5.52722663e-02, -2.22195774e-01,
       -1.42540708e-01,  1.02519087e-01, -1.52333796e-01, -2.02912658e-01,
        1.98424123e-02,  8.38147700e-02, -5.68632305e-01,  2.32842825e-02,
       -1.67292669e-01, -2.39256859e-01, -8.05463567e-02,  2.57082582e-02,
       -8.15465227e-02, -7.39290714e-02, -2.61549950e-01,  1.92575473e-02,
        3.22909385e-01,  1.90357044e-01, -9.34726413e-05, -2.13165492e-01,
        2.88941171e-02, -1.79531835e-02, -5.92757724e-02,  1.99918330e-01,
       -4.75172400e-02,  1.71634063e-01, -2.45916881e-02, -9.38061625e-02,
       -3.57002705e-01,  1.33263960e-01,  1.94045797e-01, -1.18530668e-01,
        4.56915230e-01,  1.47728100e-01,  3.35945100e-01, -1.86959475e-01,
        2.45955020e-01, -

## Dot product metric

In [25]:
# Create embeddings for each answer pair (one row per record, returned as torch tensors)
embeddings_llm = model.encode(df['answer_llm'].tolist(), convert_to_tensor=True)
embeddings_orig = model.encode(df['answer_orig'].tolist(), convert_to_tensor=True)

# Row-wise dot product of every (LLM answer, original answer) pair.
# The product is computed in torch and only the result is moved to the CPU:
# passing the tensors straight to np.dot raises a TypeError when the model
# runs on a GPU, because NumPy cannot read device memory.
evaluations = (embeddings_llm * embeddings_orig).sum(dim=1).cpu().tolist()

# Compute the 75th percentile of the evaluations
percentile_75 = np.percentile(evaluations, 75)

# Output the results
print("Evaluations:", evaluations)
print("75th Percentile Score:", percentile_75)

Evaluations: [17.515987396240234, 13.418399810791016, 25.313255310058594, 12.147415161132812, 18.74773597717285, 33.97040557861328, 30.251705169677734, 29.521575927734375, 35.27219772338867, 27.751771926879883, 32.34471130371094, 31.441843032836914, 36.38071823120117, 33.34049987792969, 30.606159210205078, 32.50304412841797, 29.674449920654297, 24.35346221923828, 20.132469177246094, 23.995479583740234, 30.880279541015625, 32.69243621826172, 30.049171447753906, 16.078163146972656, 31.796417236328125, 37.980003356933594, 20.839046478271484, 32.61286544799805, 38.894195556640625, 34.05182647705078, 28.263877868652344, 27.124832153320312, 23.975263595581055, 26.34014892578125, 18.65811538696289, 25.016403198242188, 21.101131439208984, 33.72679901123047, 29.340347290039062, 28.654504776000977, 29.608585357666016, 30.810733795166016, 33.331199645996094, 26.22047996520996, 26.550073623657227, 13.148595809936523, 12.962546348571777, 12.275609016418457, 9.974442481994629, 10.883928298950195, 29

In [32]:
# another way of computing dot product (same answer as above)
def compute_similarity(record):
    """Return the dot product between the embeddings of a record's two answers.

    `record` is looked up by the keys 'answer_orig' and 'answer_llm'; each text
    is encoded on its own with the notebook-level `model`.
    """
    answer_orig, answer_llm = record['answer_orig'], record['answer_llm']

    llm_vector = model.encode(answer_llm)
    orig_vector = model.encode(answer_orig)
    return llm_vector.dot(orig_vector)


# Score every row of the dataframe, one record at a time
similarity = [compute_similarity(record) for _, record in df.iterrows()]

# 75th percentile of the per-record similarity scores
per_75 = np.percentile(similarity, 75)
per_75

31.674307823181152

## Cosine similarity metric

In [27]:
# Scale a vector to unit length
def normalize_vector(v):
    """Return `v` divided by its Euclidean (L2) norm.

    The norm is the square root of the sum of the squared components. No guard
    is applied for an all-zero vector.
    """
    squared_length = (v * v).sum()
    return v / np.sqrt(squared_length)

In [28]:
# Normalize embeddings
# The embeddings are torch tensors; copy them to host memory as NumPy arrays
# first, because NumPy cannot read tensors that live on a GPU.
embeddings_llm_norm = np.array([normalize_vector(v) for v in embeddings_llm.cpu().numpy()])
embeddings_orig_norm = np.array([normalize_vector(v) for v in embeddings_orig.cpu().numpy()])

# Cosine similarity of each pair = dot product of the two unit-length rows.
# Multiplying element-wise and summing per row scores every pair in one step.
evaluations_norm = (embeddings_llm_norm * embeddings_orig_norm).sum(axis=1).tolist()

# Compute the 75th percentile of the evaluations
percentile_75_norm = np.percentile(evaluations_norm, 75)

# Output the results
print("Evaluations:", evaluations_norm)
print("75th Percentile Score:", percentile_75_norm)

Evaluations: [0.5067539215087891, 0.38854867219924927, 0.7185990214347839, 0.33726629614830017, 0.5217923521995544, 0.8305321335792542, 0.7462832927703857, 0.6944060325622559, 0.8468860387802124, 0.6559075713157654, 0.7779558897018433, 0.7835663557052612, 0.9046880602836609, 0.8063029050827026, 0.7275962233543396, 0.7751896381378174, 0.7151663899421692, 0.5890557765960693, 0.5332295894622803, 0.5857593417167664, 0.8123271465301514, 0.8371443152427673, 0.7661154866218567, 0.4333398640155792, 0.8155858516693115, 0.9266785979270935, 0.5526159405708313, 0.7622108459472656, 0.9452981948852539, 0.8478370904922485, 0.7192838788032532, 0.6864790916442871, 0.610093891620636, 0.6491081118583679, 0.4855499863624573, 0.6549568176269531, 0.5297187566757202, 0.8489029407501221, 0.7395622730255127, 0.7609682083129883, 0.7015317678451538, 0.7140964269638062, 0.7781699299812317, 0.6202107071876526, 0.6221020221710205, 0.3347295820713043, 0.33249256014823914, 0.3134307563304901, 0.25845351815223694, 0.2

## ROUGE

In [60]:
# Look up the document id of the row labelled 10
df.loc[10, 'document']

'5170565b'

In [61]:
# compute ROUGE score between the answers at index 10 of our dataframe
rouge_scorer = Rouge()
# Score only the requested pair instead of scoring every row and discarding all
# but one result. For a single hypothesis/reference pair get_scores returns a
# one-element list, hence the [0].
scores = rouge_scorer.get_scores(df['answer_llm'].iloc[10], df['answer_orig'].iloc[10])[0]

In [62]:
scores  # per ROUGE variant: r = recall, p = precision, f = F1 score

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

## Average ROUGE score

In [67]:
# Collect the F1 ('f') entry of every ROUGE variant held in `scores`
f_values = [variant['f'] for variant in scores.values()]

# Average the collected F1 values
mean_f_value = np.mean(f_values)

# Show the individual values followed by their mean
print("F Values:", f_values)
print("Mean F Value:", mean_f_value)

F Values: [0.45454544954545456, 0.21621621121621637, 0.393939388939394]
Mean F Value: 0.35490034990035496


## Average ROUGE score for all the data points

In [72]:
# Compute ROUGE scores for each pair and store results in a list
rouge_results = []

for llm_answer, orig_answer in zip(df['answer_llm'], df['answer_orig']):
    # get_scores returns a one-element list for a single pair. A dedicated name
    # is used so the notebook-level `scores` dict (the ROUGE result for record
    # 10) is not overwritten, which would break re-running the cell that
    # averages its 'f' values.
    pair_scores = rouge_scorer.get_scores(llm_answer, orig_answer)[0]

    rouge_results.append({
        'rouge_1_f': pair_scores['rouge-1']['f'],
        'rouge_2_f': pair_scores['rouge-2']['f'],
        'rouge_l_f': pair_scores['rouge-l']['f']
    })

# Create a DataFrame from the ROUGE results (one row per answer pair)
rouge_df = pd.DataFrame(rouge_results)

# Compute the average ROUGE-2 F1 score across all records
average_rouge_2 = rouge_df['rouge_2_f'].mean()

# Output the average ROUGE-2 F1 score
print("Average ROUGE-2 F1 Score:", average_rouge_2)

Average ROUGE-2 F1 Score: 0.20696501983423318


In [71]:
# Per-record ROUGE-1 / ROUGE-2 / ROUGE-L F1 scores
rouge_df

Unnamed: 0,rouge_1_f,rouge_2_f,rouge_l_f
0,0.095238,0.028169,0.095238
1,0.125000,0.055556,0.093750
2,0.415584,0.177778,0.389610
3,0.216216,0.047059,0.189189
4,0.142076,0.033898,0.120219
...,...,...,...
295,0.654545,0.540984,0.618182
296,0.590164,0.460432,0.557377
297,0.654867,0.564516,0.637168
298,0.304762,0.132231,0.304762
