In [1]:
# Getting the data
import pandas as pd

# GitHub "blob" page of the evaluation results; the `?raw=1` query string is
# appended so that the CSV file itself is requested instead of the HTML page.
github_url = "https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv"
url = github_url + '?raw=1'

# Keep only the first 300 records for the questions below.
df = pd.read_csv(url).head(300)

In [6]:
# Q1. Getting the embeddings model
# Requires the sentence-transformers package: !pip install sentence_transformers
from sentence_transformers import SentenceTransformer

model_name = "multi-qa-mpnet-base-dot-v1"
embedding_model = SentenceTransformer(model_name)

# Embed the LLM answer of the first record; Q1 is the first component
# of the resulting embedding vector.
first_record = df.iloc[0]
answer_llm = first_record.answer_llm
answer_vector = embedding_model.encode(answer_llm)

Q1 = answer_vector[0]

Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.34.0 (from sentence_transformers)
  Downloading transformers-4.43.4-py3-none-any.whl.metadata (43 kB)
Collecting huggingface-hub>=0.15.1 (from sentence_transformers)
  Downloading huggingface_hub-0.24.5-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers<5.0.0,>=4.34.0->sentence_transformers)
  Downloading regex-2024.7.24-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting safetensors>=0.4.1 (from transformers<5.0.0,>=4.34.0->sentence_transformers)
  Downloading safetensors-0.4.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers<5.0.0,>=4.34.0->sentence_transformers)
  Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading sentence_transfor

  from tqdm.autonotebook import tqdm, trange


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.71k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [12]:
# Q2. Computing the dot product
# Requires numpy: !pip install numpy
import numpy as np

# One dot-product score per record: embedding of the LLM answer
# against the embedding of the original answer.
evaluations = []

for _, record in df.iterrows():
    reference_vec = embedding_model.encode(record['answer_orig'])
    generated_vec = embedding_model.encode(record['answer_llm'])

    score = np.dot(generated_vec, reference_vec)
    evaluations.append(score)

# 75th percentile of the per-record scores.
Q2 = np.percentile(evaluations, 75)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [15]:
# Q3. Computing the cosine

def normalise(v):
    """Scale a vector to unit Euclidean length.

    Parameters
    ----------
    v : numpy.ndarray
        Vector to normalise.

    Returns
    -------
    numpy.ndarray
        ``v`` divided by its Euclidean norm. A vector whose norm is zero
        is returned as an all-zero array of the same shape and dtype.
    """
    norm = np.sqrt((v * v).sum())
    if norm == 0:
        # A zero vector has no direction; dividing by the zero norm would
        # produce NaNs that propagate into every downstream statistic.
        return np.zeros_like(v)
    return v / norm

# One cosine-similarity score per record: both embeddings are scaled to
# unit length first, so their dot product is the cosine of the angle
# between them.
evaluations = []

for _, record in df.iterrows():
    reference_unit = normalise(embedding_model.encode(record['answer_orig']))
    generated_unit = normalise(embedding_model.encode(record['answer_llm']))

    cosine = np.dot(generated_unit, reference_unit)
    evaluations.append(cosine)

# 75th percentile of the per-record cosine scores.
Q3 = np.percentile(evaluations, 75)

In [35]:
# Q4. Rouge
!pip install rouge
import rouge
from rouge import Rouge
# print(rouge.__version__)

rouge_scorer = Rouge()
scores = rouge_scorer.get_scores(df.iloc[10]['answer_llm'], df.iloc[10]['answer_orig'])[0]

Q4 = scores['rouge-1']['f']

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [54]:
# Q5. Average rouge score
# Mean of the rouge-1, rouge-2 and rouge-l F-scores held in `scores`.
rouge_metrics = ('rouge-1', 'rouge-2', 'rouge-l')
Q5 = sum(scores[metric]['f'] for metric in rouge_metrics) / 3

In [56]:
# Q6. Average rouge score for all the data points
# Collects the rouge-2 F-score of every record and averages them.
evaluations = []

for _, record in df.iterrows():
    # Use a loop-local name here: rebinding the notebook-level `scores`
    # would change the result of the Q5 cell if it is re-run after this one.
    record_scores = rouge_scorer.get_scores(record['answer_llm'], record['answer_orig'])[0]

    evaluations.append(record_scores['rouge-2']['f'])

# Mean rouge-2 F-score over all records.
Q6 = np.mean(evaluations)