In [None]:
import json
from datasets import Dataset, load_dataset
from openai import OpenAI
from tqdm.auto import tqdm
import concurrent.futures

In [4]:
def evaluate_answer(
    instruction: str, answer: str, client: OpenAI
) -> dict:
    prompt = f"""You are an expert judge. Please evaluate the quality of a given answer to an instruction based on two criteria: \
        1. Accuracy: How factually correct is the information presented in the answer? You are a technical expert in this topic.
        2. Style: Is the tone and writing style appropriate for a blog post or social media content? It should use simple but technical words and avoid formal or complex language.

        Accuracy Scale:
        1 (Poor): Contains factual errors or misleading information.
        2 (Good): Mostly accurate but has minor errors or omissions.
        3 (Excellent): Highly accurate and comprehensive, with no factual errors.

        Style Scale:
        1 (Poor): Too formal, uses some overly complex words.
        2 (Good): Good balance of technical content and accessibility, but still uses formal words and expressions.
        3 (Excellent): Perfectly accessible langugage for blog/social media, uses simple but precise technical terms when necessary.

        Example of bad style: The 4-2-4 formation is instrumental in cultivating an offensive style of play, achieved by deploying a substantial contingent of players in advanced positions, thereby presenting a formidable challenge to the opposing defense.
        Example of excellent style: The 4 2 4 formation facilitates attacking play by positioning significant numbers of players high up the pitch, directly challenging the opposition's defense.

        Instruction: {instruction}

        Answer: {answer}

        Provide your evaluation in JSON format with the following structure:

        {{
            "accuracy": {{
                "analysis": "...",
                "score": 0
            }},
            "style": {{
                "analysis": "...",
                "score" : 0
            }}
        }}
        """
    completion = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant who evaluates answers based on accuracy and style. Provide your response in JSON format with a short analysis and score for each criterion.",
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            response_format={
                "type": "json_object",
            },
            max_tokens=1000,
            temperature=0.8
        )
    return json.loads(completion.choices[0].message.content)

NameError: name 'OpenAI' is not defined

In [11]:
from dotenv import load_dotenv

load_dotenv()

True

In [None]:
def evaluate_batch(batch, start_index):
    client = OpenAI()
    return [(i, evaluate_answer(instr, ans, client)) for i, (instr, ans) in enumerate(batch, start=start_index)]


def evaluate_answers(model_id: str, num_threads: int = 10, batch_size: int = 5) -> Dataset:
    # Load the dataset
    dataset = load_dataset(f"tunahankilic/{model_id.split('/')[-1]}-results", split="all")

    # Create batches of instruction-answer pairs with their original indices
    batches = [
        (i, list(zip(dataset["instruction"][i : i + batch_size], dataset["answers"][i : i + batch_size], strict=False)))
        for i in range(0, len(dataset), batch_size)
    ]

    evaluations = [None] * len(dataset)

    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [executor.submit(evaluate_batch, batch, start_index) for start_index, batch in batches]

        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            for index, evaluation in future.result():
                evaluations[index] = evaluation

    # Replace the 'evaluation' column if it exists, otherwise add it
    if "evaluation" in dataset.column_names:
        dataset = dataset.remove_columns(["evaluation"])
    dataset = dataset.add_column("evaluation", evaluations)

    # Post-process evaluations
    accuracy_scores = []
    style_scores = []

    for evaluation in dataset["evaluation"]:
        try:
            eval_dict = json.loads(evaluation) if isinstance(evaluation, str) else evaluation
            accuracy_score = eval_dict["accuracy"]["score"]
            style_score = eval_dict["style"]["score"]

            accuracy_scores.append(accuracy_score)
            style_scores.append(style_score)

        except (json.JSONDecodeError, KeyError, TypeError):
            # If there's an error, append None to maintain alignment
            accuracy_scores.append(None)
            style_scores.append(None)

    # Add new columns to the dataset
    if "accuracy" in dataset.column_names:
        dataset = dataset.remove_columns(["accuracy"])
    dataset = dataset.add_column("accuracy", accuracy_scores)
    if "style" in dataset.column_names:
        dataset = dataset.remove_columns(["style"])
    dataset = dataset.add_column("style", style_scores)

    dataset.push_to_hub(f"tunahankilic/{model_id.split('/')[-1]}-results")

    return dataset

In [34]:
model_ids = [
    "tunahankilic/LlamaFootball-3.1-8B",
    "meta-llama/Llama-3.1-8B-Instruct"
]

In [35]:
for model_id in model_ids:
    print(f"Evaluating answers for model: {model_id}")
    evaluate_answers(model_id)
    print(f"Evaluation completed for model: {model_id}")

Evaluating answers for model: tunahankilic/LlamaFootball-3.1-8B


  0%|          | 0/12 [00:00<?, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Evaluation completed for model: tunahankilic/LlamaFootball-3.1-8B
Evaluating answers for model: meta-llama/Llama-3.1-8B-Instruct


README.md:   0%|          | 0.00/488 [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/62.3k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/58 [00:00<?, ? examples/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Evaluation completed for model: meta-llama/Llama-3.1-8B-Instruct


In [36]:
    # Analyze results
    for model_id in model_ids:
        dataset = load_dataset(f"tunahankilic/{model_id.split('/')[-1]}-results", split="all")

        score = sum(dataset["accuracy"]) / len(dataset["accuracy"])
        print(f"{model_id.split('/')[-1]} - Accuracy: {score:.2f}")  # noqa

        score = sum(dataset["style"]) / len(dataset["style"])
        print(f"{model_id.split('/')[-1]} - Style: {score:.2f}")  # noqa

README.md:   0%|          | 0.00/719 [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/64.8k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/58 [00:00<?, ? examples/s]

LlamaFootball-3.1-8B - Accuracy: 2.81
LlamaFootball-3.1-8B - Style: 2.52


README.md:   0%|          | 0.00/719 [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/84.1k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/58 [00:00<?, ? examples/s]

Llama-3.1-8B-Instruct - Accuracy: 2.64
Llama-3.1-8B-Instruct - Style: 2.21


In [1]:
from llm_engineering.domain.queries import Query
from llm_engineering.application.rag.query_expansion import QueryExpansion

ModuleNotFoundError: No module named 'transformers'

In [3]:
query = Query.from_str("Write an article about the effective areas of the 3 5 2 formation.")
query_expander = QueryExpansion()
expanded_queries = query_expander.generate(query, expand_to_n=3)
for expanded_query in expanded_queries:
    print(expanded_query.content)

[37mHTTP Request: POST [0m[34mhttps://api.openai.com/v1/chat/completions[37m "HTTP/1.1 200 OK"[0m
Write an article about the effective areas of the 3 5 2 formation.
What are the key strengths and effective zones of the 3-5-2 formation in football?
Can you explain how the 3-5-2 formation optimizes player positioning and tactical advantages on the field?
