In [2]:
from pydantic import BaseModel
from openai import OpenAI
from dotenv import load_dotenv
from enum import Enum
import os
from rouge import Rouge
from sentence_transformers import SentenceTransformer
import numpy as np

# Load settings via python-dotenv (typically from a local .env file) so the API
# key never has to be written into the notebook itself.
load_dotenv()

# The key is looked up in the process environment; it is None when unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Single shared client used by every get_completion() helper below.
client = OpenAI(api_key=OPENAI_API_KEY)

  from .autonotebook import tqdm as notebook_tqdm


### Exact match evals
Exact match evals measure whether the model’s output exactly matches a predefined correct answer. It’s a simple, unambiguous metric that’s perfect for tasks with clear-cut, categorical answers like sentiment analysis (positive, negative, neutral, mixed).

In [4]:

# Closed set of sentiment labels the classifier is allowed to return.
# Mixing in `str` makes each member a string, so a member compares equal to its
# plain value (e.g. Options.option1 == "negative").
# NOTE(review): documented with `#` comments rather than a docstring on purpose:
# pydantic may copy a class docstring into the JSON schema that is sent as
# `response_format`, which would change the request. Confirm before converting.
class Options(str, Enum):
    option1 = "negative"
    option2 = "positive"
    option3 = "mixed"
    option4 = "neutral"


# Response schema for the sentiment eval: one field whose value must be an
# Options member. It is passed to the OpenAI client as `response_format`.
# NOTE(review): the class name is lower-case and identical to its only field.
# `Sentiment` would be the conventional spelling, but renaming also requires
# updating the `response_format=sentiment` call site.
class sentiment(BaseModel):
    sentiment: Options


# Labelled evaluation set for the exact-match eval: "text" is the tweet shown to
# the model and "sentiment" is the expected label, compared to the model's answer
# by plain string equality.
# Note: none of the examples carries the "neutral" label, so that class is not
# exercised by this eval.
tweets = [
    {"text": "This movie was a total waste of time. 👎", "sentiment": "negative"},
    {"text": "The new album is 🔥! Been on repeat all day.", "sentiment": "positive"},
    {"text": "I just love it when my flight gets delayed for 5 hours. #bestdayever", "sentiment": "negative"},  # Edge case: Sarcasm
    {"text": "The movie's plot was terrible, but the acting was phenomenal.", "sentiment": "mixed"},  # Edge case: Mixed sentiment
]


def get_completion(prompt: str) -> "sentiment":
    """Classify `prompt` with gpt-4o-mini and return the parsed structured answer.

    The prompt is sent as a single user message and the reply is constrained to
    the `sentiment` schema through `response_format`.

    Args:
        prompt: Full instruction text, including the tweet to classify.

    Returns:
        The parsed `sentiment` instance from the first choice.

    Raises:
        ValueError: If the first choice carries no parsed object (for example
            when the model refuses). Failing here gives a clear message instead
            of an AttributeError on `None` further down the cell.
    """
    completion = client.beta.chat.completions.parse(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        response_format=sentiment,
    )
    reply = completion.choices[0].message
    if reply.parsed is None:
        # No structured payload came back; surface the refusal text (if any).
        raise ValueError(
            f"Model returned no structured answer (refusal: {reply.refusal!r})"
        )
    return reply.parsed

# Step 1: one structured completion per tweet.
outputs = [
    get_completion(f"Classify this as 'positive', 'negative', 'neutral', or 'mixed': {tweet['text']}")
    for tweet in tweets
]

# Step 2: reduce predictions and references to plain label strings.
model_output = [parsed.sentiment.value for parsed in outputs]
ground_truth = [example['sentiment'] for example in tweets]

# Step 3: exact-match accuracy = share of positions where both labels agree.
accuracy = sum(
    predicted == expected
    for predicted, expected in zip(model_output, ground_truth)
) / len(ground_truth)

print(f"Sentiment Analysis Accuracy: {accuracy * 100}%")
print(f"the model output is : {model_output}")
print(f"the ground truth is : {ground_truth}")

Sentiment Analysis Accuracy: 75.0%
the model output is : ['negative', 'positive', 'mixed', 'mixed']
the ground truth is : ['negative', 'positive', 'negative', 'mixed']


### Cosine similarity evals
Cosine similarity measures the similarity between two vectors (in this case, SBERT sentence embeddings of the model’s output and of the reference answer) by computing the cosine of the angle between them. Values closer to 1 indicate higher similarity. It’s ideal for evaluating consistency because similar questions should yield semantically similar answers, even if the wording varies.

In [5]:
# Reference Q&A pairs for the cosine-similarity eval. Each entry holds one
# question (stored under the key "questions", although the value is a single
# string) and the reference "answer" that the model's reply is compared against.
faq_variations = [
    {"questions": "What's the mission of Singapore Ministry of Education?", "answer": "The mission of MOE is to mould the future of our nation by moulding the people who will determine our future."},  # NOTE(review): previously tagged "Edge case: Typos", but this question contains no typo; confirm intent
    {"questions":  "What's the mission of Singapore Inland Revenue Authority of Singapore", "answer": "Act as an agent of the Government and provide service in the administration of taxes and enterprise disbursements. Advise the Government, and represent Singapore internationally, on matters relating to taxation."}
    # Disabled template for an entry with several phrasings (edge case: irrelevant info):
    # {"questions": ["I'm Jane's cousin, and she said you guys have great customer service. Can I return this?", "Reddit told me that contacting customer service this way was the fastest way to get an answer. I hope they're right! What is the return window for a jacket?"], "answer": "Our return policy allows..."},
    # NOTE(review): only the two entries above exist; the former "... 47 more FAQs" placeholder was never filled in.
]

def get_completion(prompt: str):
    """Send `prompt` to gpt-4o as a single user message and return the reply text.

    This reuses the name `get_completion` from the sentiment cell, so once this
    cell has run the name refers to this free-text version, not the structured one.
    """
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
    )
    first_choice = response.choices[0]
    return first_choice.message.content

# Collect one free-text answer per FAQ question.
outputs = [get_completion(faq["questions"]) for faq in faq_variations]

# Sentence-embedding model that turns both texts into vectors.
model = SentenceTransformer('all-MiniLM-L6-v2')

cosine_similarities_results = []
for output, faq_variation in zip(outputs, faq_variations):
    output_embeddings = model.encode(output)
    ground_truth_embeddings = model.encode(faq_variation['answer'])

    # cos(theta) = (a . b) / (|a| * |b|)
    dot_product = np.dot(output_embeddings, ground_truth_embeddings)
    norm_product = np.linalg.norm(output_embeddings) * np.linalg.norm(ground_truth_embeddings)
    cosine_similarities = dot_product / norm_product

    cosine_similarities_results.append(cosine_similarities)

print(f"the cosine similarity results is : {cosine_similarities_results}")
print(f"the outputs are : {outputs}")


the cosine similarity results is : [0.57449484, 0.66036063]
the outputs are : ['The mission of the Singapore Ministry of Education (MOE) is to mold the future of the nation by shaping the young. MOE aims to provide a well-rounded education that helps students discover their talents, pursue their interests, and reach their full potential. The Ministry focuses on nurturing a passion for learning and fostering values like resilience and responsibility to prepare individuals for the challenges of the future. Additionally, MOE is committed to working closely with educators, parents, and the community to achieve their educational goals.', "The mission of the Inland Revenue Authority of Singapore (IRAS) is typically focused on the administration of the country's tax system. While specific wording can vary over time, generally, IRAS's mission includes objectives such as collecting taxes that are due in a fair and efficient manner, ensuring compliance with tax laws, and fostering a competitive 

### ROUGE-L evals
ROUGE-L (Recall-Oriented Understudy for Gisting Evaluation - Longest Common Subsequence) evaluates the quality of generated summaries. It is based on the length of the longest common subsequence of words shared by the candidate and reference summaries; the score reported below is its F1 variant. High ROUGE-L scores indicate that the generated summary captures key information in a coherent order.

In [7]:

# Evaluation set for the ROUGE-L eval: "text" is the article to summarise and
# "summary" is the reference that the model's summary is scored against.
# NOTE(review): this reference is word-for-word the gpt-4o answer printed by the
# cosine-similarity cell, so it looks model-generated rather than human-written.
# It is also four sentences long while the prompt asks for 1-2. Confirm it is the
# intended gold summary before reading much into the resulting score.
articles = [
    {"text": """The mission of MOE is to mould the future of our nation by moulding the people who will determine our future.
The wealth of a nation lies in its people. Their commitment to country and community, their willingness to strive and persevere, their ability to think, achieve and excel. How we raise our young at home and teach them in school will shape our society in the next generation. Our future depends on the continuous renewal and regeneration of our leadership and citizenry, building upon the experience of the past, learning from the circumstances of the present, and preparing to seize the opportunities of the future.
To achieve our mission, MOE will provide our children with a balanced and well-rounded education, develop them to their full potential, and nurture them into lifelong learners and good citizens, conscious of their responsibilities to family, community and country.
MOE's vision of “Thinking Schools, Learning Nation” (TSLN) was first announced by then-Prime Minister Goh Chok Tong in 1997.This vision describes a nation of thinking and committed citizens capable of seizing future opportunities, and an education system ready to ride the waves of change in the 21st century.
Thinking Schools will be learning organisations in every sense, constantly challenging assumptions, and seeking better ways of doing things through participation, creativity and innovation. Thinking Schools will be the cradle of thinking students as well as thinking adults. This spirit of learning should accompany our students throughout their lives, even after they have graduated from the system.
A Learning Nation envisions a national culture and social environment that promotes lifelong learning in our people. The capacity of Singaporeans to continually learn, both for professional development and for personal enrichment, will determine our collective success as a society and nation.""", 
"summary": "The mission of the Singapore Ministry of Education (MOE) is to mold the future of the nation by shaping the young. MOE aims to provide a well-rounded education that helps students discover their talents, pursue their interests, and reach their full potential. The Ministry focuses on nurturing a passion for learning and fostering values like resilience and responsibility to prepare individuals for the challenges of the future. Additionally, MOE is committed to working closely with educators, parents, and the community to achieve their educational goals."}, 
]


def get_completion(prompt: str):
    """Return gpt-4o's text reply to `prompt`, with the completion capped at 1024 tokens.

    The prompt is sent as a single user message. This rebinds the name
    `get_completion` used by the earlier cells to the summarisation variant.
    """
    user_turn = {"role": "user", "content": prompt}
    response = client.chat.completions.create(
        model="gpt-4o",
        max_tokens=1024,
        messages=[user_turn],
    )
    return response.choices[0].message.content

def evaluate_rouge_l(model_output: "str | None", true_summary: str) -> float:
    """Return the ROUGE-L F1 score of `model_output` measured against `true_summary`.

    Args:
        model_output: Candidate summary produced by the model. ``None``, an
            empty string, or whitespace-only text is scored as 0.0 instead of
            being passed to the scorer.
        true_summary: Reference summary the candidate is compared with.

    Returns:
        The ``'f'`` entry of the ``'rouge-l'`` result for the pair, or 0.0 when
        there is no candidate text.
    """
    # A blank candidate shares no subsequence with the reference. Score it as 0.0
    # here: the rouge package rejects an empty hypothesis with an error, which
    # would abort the whole evaluation run because of one blank completion.
    if model_output is None or not model_output.strip():
        return 0.0

    rouge = Rouge()
    # get_scores takes (hypothesis, reference) and returns one result dict per pair.
    scores = rouge.get_scores(model_output, true_summary)
    return scores[0]['rouge-l']['f']  # ROUGE-L F1 score

# Ask the model for a short summary of every article.
outputs = [
    get_completion(f"Summarize this article in 1-2 sentences:\n\n{article['text']}")
    for article in articles
]

# Score each generated summary against its reference summary.
relevance_scores = [
    evaluate_rouge_l(candidate, item['summary'])
    for candidate, item in zip(outputs, articles)
]

# Report the mean ROUGE-L F1 over all articles.
print(f"Average ROUGE-L F1 Score: {sum(relevance_scores) / len(relevance_scores)}")


Average ROUGE-L F1 Score: 0.2956521689164462
