In [None]:
from chunking import extract_text_from_pdf

# Load the per-page payloads via extract_text_from_pdf (called with start_page=1).
pages = extract_text_from_pdf(start_page=1)

# Flatten the mapping into one {"page_num", "text"} record per page.
# NOTE(review): assumes `pages` maps page number -> dict-like payload — confirm in chunking.py.
page_docs = [
    {"page_num": page_num, "text": content.get("text")}
    for page_num, content in pages.items()
]
page_docs


In [4]:
# results = []
# for page_num, payload in pages.items():
#     page = {
#         "page": page_num,
#         "chapter": payload["chapter"],
#         "text": payload["text"],
#     }
#     result = process_document(page)
#     result["page"] = page_num
#     results.append(result)

In [2]:
from openai import OpenAI
openai_client = OpenAI()

def llm_structured(instructions, user_prompt, output_format, model="gpt-4o-mini"):
    """Send a system + user message pair to the model and parse the reply.

    Args:
        instructions: Text placed in the "system" message.
        user_prompt: Text placed in the "user" message.
        output_format: Forwarded as ``text_format`` to ``responses.parse``.
        model: Model name forwarded to the API; defaults to "gpt-4o-mini".

    Returns:
        A ``(response.output_parsed, response.usage)`` tuple.
    """
    system_message = {"role": "system", "content": instructions}
    user_message = {"role": "user", "content": user_prompt}

    reply = openai_client.responses.parse(
        model=model,
        input=[system_message, user_message],
        text_format=output_format,
    )
    return reply.output_parsed, reply.usage

In [3]:
from pydantic import BaseModel, Field
from typing import List, Literal

# Schema for a single generated query; used as the element type of
# GeneratedQuestions.questions.
# NOTE(review): pydantic normally carries the class docstring and the Field
# descriptions into the generated JSON schema, so they presumably reach the
# model as prompt text — confirm before rewording any of them.
class Question(BaseModel):
    """
    Represents a realistic search-engine-style query a user might type before finding the article.
    """

    # The search-style query text itself.
    question: str = Field(
        ...,
        description="A natural, short search query — not a full-sentence question — phrased like something typed into Google.",
    )
    # Short summary of how the source text answers the query.
    summary_answer: str = Field(
        ...,
        description="A concise 1–2 sentence summary of how the article addresses the query.",
    )
    # Restricted to exactly these three labels by the Literal type.
    difficulty: Literal["beginner", "intermediate", "expert"] = Field(
        ..., description="The assumed knowledge level of the ai engineer making the query."
    )



# Top-level structured-output schema: process_document passes this class as
# `output_format` to llm_structured.
# NOTE(review): pydantic normally carries the class docstring and the Field
# descriptions into the generated JSON schema, so they presumably reach the
# model as prompt text — confirm before rewording any of them.
class GeneratedQuestions(BaseModel):
    """
    A structured collection of human-like search queries derived from a given article.
    """

    # Free-text summary of the source the questions were generated from.
    description: str = Field(
        ...,
        description="A summary of the article or topic these search-style questions were generated for.",
    )
    # The description lists only what Question actually carries (question,
    # summary_answer, difficulty); it previously also promised "user intent",
    # which no field on Question provides.
    questions: List[Question] = Field(
        ...,
        description="A list of realistic search queries with short summaries and difficulty levels.",
    )

# System prompt for question generation. The difficulty labels used here must
# match the Literal values on Question.difficulty ("beginner", "intermediate",
# "expert"); the distribution rule previously said "advanced-level", a label
# that is not an allowed value.
# NOTE(review): the "expert: familiar with evaluating" bullet looks truncated —
# confirm the intended wording with the author.
instructions = """ 
    You are given a page from a chapter called Evaluating AI Systems from a technical AI Engineering textbook.
    Your task is to imagine what an AI engineer might type into a search engine to learn about evaluating LLMs. 

    Generate realistic, human-like search queries — not formal questions. 

    Guidelines:
  
    - Make queries varied and spontaneous, not repetitive or over-polished.
    - Assume users of different knowledge levels from beginner (think junior level) to expert (think ai researcher or senior level):
        - beginner: broad or basic understanding
        - intermediate: knows basic terms but seeks clarification 
        - expert: familiar with evaluating 

    Distribution rules:
    - 50% of the queries should target beginner-level ai engineers
    - 30% should target intermediate-level ai engineers
    - 20% should target expert-level ai engineers
 

    For each generated query, include:
    - question: the natural, human-style search phrase
    - summary_answer: a short 1–2 sentence summary of how the page addresses it
    - difficulty: one of ["beginner", "intermediate", "expert"]

    Also include a description summarizing what kind of article the questions are about.
    """.strip()

In [4]:
from tqdm import tqdm

def map_progress(pool, seq, f):
    """Run ``f`` on every element of ``seq`` through ``pool`` with a progress bar.

    One task is submitted per element; the tqdm bar (sized with ``len(seq)``)
    advances by one each time a task finishes. Results are collected by
    waiting on the futures in the order they were submitted, so the returned
    list lines up with ``seq``. An exception raised by ``f`` surfaces when the
    corresponding future's result is read.
    """
    collected = []

    with tqdm(total=len(seq)) as bar:

        def _tick(_finished):
            # Done-callbacks receive the finished future; only the count matters.
            bar.update()

        pending = []
        for item in seq:
            task = pool.submit(f, item)
            task.add_done_callback(_tick)
            pending.append(task)

        # Reading results in submission order keeps output aligned with input.
        collected.extend(task.result() for task in pending)
        return collected

In [5]:
import json

def process_document(doc: dict, chars_per_question: int = 400):
    """Generate search-style questions for one page record via the LLM.

    Args:
        doc: Page record; its "text" value sizes the request and the whole
            record is JSON-serialised into the user prompt.
        chars_per_question: How many characters of text warrant one question.
            Defaults to 400.

    Returns:
        A dict with the original record under 'page', the parsed model output
        under 'questions', and the usage object under 'usage'.

    Raises:
        ValueError: If ``chars_per_question`` is not positive.
    """
    if chars_per_question <= 0:
        raise ValueError("chars_per_question must be a positive integer")

    # `doc.get("text", "")` still yields None when the key exists with a None
    # value, which would make len() raise; normalise to an empty string.
    content = doc.get("text") or ""

    num_questions = len(content) // chars_per_question
    if num_questions == 0 and content.strip():
        # A short but non-empty page would otherwise ask for zero questions
        # and spend an API call for nothing.
        num_questions = 1

    user_prompt = f"""
    generate {num_questions} questions for this document:
    {json.dumps(doc)}
    """.strip()

    output, usage = llm_structured(
        instructions=instructions,
        user_prompt=user_prompt,
        output_format=GeneratedQuestions,
    )

    final_output = {'page': doc, 'questions': output, 'usage': usage}
    return final_output

In [6]:
from concurrent.futures import ThreadPoolExecutor

# Run process_document over every page record on a pool of six worker threads;
# leaving the `with` block shuts the executor down.
with ThreadPoolExecutor(max_workers=6) as executor:
    results = map_progress(pool=executor, seq=page_docs, f=process_document)

results

100%|██████████| 100/100 [01:20<00:00,  1.24it/s]


[{'page': {'page_num': 1,
   'text': 'Chapter 4. Evaluate AI Systems\nA model is only useful if it works for its intended purposes. You need to\nevaluate models in the context of your application. Chapter 3 discusses\ndifferent approaches to automatic evaluation. This chapter discusses how to\nuse these approaches to evaluate models for your applications.\nThis chapter contains three parts. It starts with a discussion of the criteria\nyou might use to evaluate your applications and how these criteria are\ndefined and calculated. For example, many people worry about AI making\nup facts—how is factual consistency detected? How are domain-specific\ncapabilities like math, science, reasoning, and summarization measured?\nThe second part focuses on model selection. Given an increasing number of\nfoundation models to choose from, it can feel overwhelming to choose the\nright model for your application. Thousands of benchmarks have been\nintroduced to evaluate these models along different cri

In [7]:
from toyaikit.pricing import PricingConfig

pricing = PricingConfig()

# Total the token counts reported on each result's usage object.
input_tokens = sum(r['usage'].input_tokens for r in results)
output_tokens = sum(r['usage'].output_tokens for r in results)

# Price the whole run using the same model name as llm_structured's default.
pricing.calculate_cost('gpt-4o-mini', input_tokens, output_tokens)

CostInfo(input_cost=Decimal('0.01202535'), output_cost=Decimal('0.01017'), total_cost=Decimal('0.02219535'))

In [10]:
# Flatten the per-page results into one record per generated question, each
# carrying the text of the page it was generated from.
final_questions = []

for r in results:
    page = r.get("page")
    doc_text = page.get("text", "")
    questions = r.get("questions")
    if questions is None:
        # The parsed output can be missing (e.g. the model returned nothing
        # parseable); skip the page instead of failing on `.questions`.
        continue
    for q in questions.questions:
        entry = q.model_dump()
        entry["text"] = doc_text
        final_questions.append(entry)
final_questions

[{'question': 'how to evaluate ai model performance',
  'summary_answer': 'The chapter outlines essential criteria for evaluating AI models, including how to measure factual consistency and domain-specific capabilities relevant to the application.',
  'difficulty': 'beginner',
  'text': 'Chapter 4. Evaluate AI Systems\nA model is only useful if it works for its intended purposes. You need to\nevaluate models in the context of your application. Chapter 3 discusses\ndifferent approaches to automatic evaluation. This chapter discusses how to\nuse these approaches to evaluate models for your applications.\nThis chapter contains three parts. It starts with a discussion of the criteria\nyou might use to evaluate your applications and how these criteria are\ndefined and calculated. For example, many people worry about AI making\nup facts—how is factual consistency detected? How are domain-specific\ncapabilities like math, science, reasoning, and summarization measured?\nThe second part focuse

In [11]:
# Sanity check: total number of question records collected across all results.
len(final_questions)

252

In [12]:
import pandas as pd

# Lift the column-width limit so long text cells are displayed in full.
pd.options.display.max_colwidth = None

# One row per generated question record.
df = pd.DataFrame(data=final_questions)
df


Unnamed: 0,question,summary_answer,difficulty,text
0,how to evaluate ai model performance,"The chapter outlines essential criteria for evaluating AI models, including how to measure factual consistency and domain-specific capabilities relevant to the application.",beginner,"Chapter 4. Evaluate AI Systems\nA model is only useful if it works for its intended purposes. You need to\nevaluate models in the context of your application. Chapter 3 discusses\ndifferent approaches to automatic evaluation. This chapter discusses how to\nuse these approaches to evaluate models for your applications.\nThis chapter contains three parts. It starts with a discussion of the criteria\nyou might use to evaluate your applications and how these criteria are\ndefined and calculated. For example, many people worry about AI making\nup facts—how is factual consistency detected? How are domain-specific\ncapabilities like math, science, reasoning, and summarization measured?\nThe second part focuses on model selection. Given an increasing number of\nfoundation models to choose from, it can feel overwhelming to choose the\nright model for your application. Thousands of benchmarks have been\nintroduced to evaluate these models along different criteria. Can these\nbenchmarks be trusted? How do you select what benchmarks to use? How\nabout public leaderboards that aggregate multiple benchmarks?\nThe model landscape is teeming with proprietary models and open source\nmodels. A question many teams will need to visit over and over again is\nwhether to host their own models or to use a model API. This question has\nbecome more nuanced with the introduction of model API services built on\ntop of open source models."
1,criteria for model evaluation in ai,"It discusses various criteria used to evaluate AI applications, detailing how these criteria can be defined and calculated to ensure model effectiveness.",beginner,"Chapter 4. Evaluate AI Systems\nA model is only useful if it works for its intended purposes. You need to\nevaluate models in the context of your application. Chapter 3 discusses\ndifferent approaches to automatic evaluation. This chapter discusses how to\nuse these approaches to evaluate models for your applications.\nThis chapter contains three parts. It starts with a discussion of the criteria\nyou might use to evaluate your applications and how these criteria are\ndefined and calculated. For example, many people worry about AI making\nup facts—how is factual consistency detected? How are domain-specific\ncapabilities like math, science, reasoning, and summarization measured?\nThe second part focuses on model selection. Given an increasing number of\nfoundation models to choose from, it can feel overwhelming to choose the\nright model for your application. Thousands of benchmarks have been\nintroduced to evaluate these models along different criteria. Can these\nbenchmarks be trusted? How do you select what benchmarks to use? How\nabout public leaderboards that aggregate multiple benchmarks?\nThe model landscape is teeming with proprietary models and open source\nmodels. A question many teams will need to visit over and over again is\nwhether to host their own models or to use a model API. This question has\nbecome more nuanced with the introduction of model API services built on\ntop of open source models."
2,how to choose the right ai model from benchmarks,"The chapter explains the challenges of selecting an appropriate model given numerous benchmarks, including how to determine trustworthy benchmarks and the role of leaderboards in model selection.",intermediate,"Chapter 4. Evaluate AI Systems\nA model is only useful if it works for its intended purposes. You need to\nevaluate models in the context of your application. Chapter 3 discusses\ndifferent approaches to automatic evaluation. This chapter discusses how to\nuse these approaches to evaluate models for your applications.\nThis chapter contains three parts. It starts with a discussion of the criteria\nyou might use to evaluate your applications and how these criteria are\ndefined and calculated. For example, many people worry about AI making\nup facts—how is factual consistency detected? How are domain-specific\ncapabilities like math, science, reasoning, and summarization measured?\nThe second part focuses on model selection. Given an increasing number of\nfoundation models to choose from, it can feel overwhelming to choose the\nright model for your application. Thousands of benchmarks have been\nintroduced to evaluate these models along different criteria. Can these\nbenchmarks be trusted? How do you select what benchmarks to use? How\nabout public leaderboards that aggregate multiple benchmarks?\nThe model landscape is teeming with proprietary models and open source\nmodels. A question many teams will need to visit over and over again is\nwhether to host their own models or to use a model API. This question has\nbecome more nuanced with the introduction of model API services built on\ntop of open source models."
3,how to evaluate AI applications,The chapter outlines methods for creating an evaluation pipeline that helps assess the effectiveness and ROI of deployed AI applications over time.,beginner,"The last part discusses developing an evaluation pipeline that can guide the\ndevelopment of your application over time. This part brings together the\ntechniques we’ve learned throughout the book to evaluate concrete\napplications.\nEvaluation Criteria\nWhich is worse—an application that has never been deployed or an\napplication that is deployed but no one knows whether it’s working? When I\nasked this question at conferences, most people said the latter. An\napplication that is deployed but can’t be evaluated is worse. It costs to\nmaintain, but if you want to take it down, it might cost even more.\nAI applications with questionable returns on investment are, unfortunately,\nquite common. This happens not only because the application is hard to\nevaluate but also because application developers don’t have visibility into\nhow their applications are being used. An ML engineer at a used car\ndealership told me that his team built a model to predict the value of a car\nbased on the specs given by the owner. A year after the model was\ndeployed, their users seemed to like the feature, but he had no idea if the\nmodel’s predictions were accurate. At the beginning of the ChatGPT fever,\ncompanies rushed to deploy customer support chatbots. Many of them are\nstill unsure if these chatbots help or hurt their user experience."
4,importance of evaluating deployed AI models,It highlights that an application in the field without proper evaluation can be more detrimental than one that has never been deployed at all.,intermediate,"The last part discusses developing an evaluation pipeline that can guide the\ndevelopment of your application over time. This part brings together the\ntechniques we’ve learned throughout the book to evaluate concrete\napplications.\nEvaluation Criteria\nWhich is worse—an application that has never been deployed or an\napplication that is deployed but no one knows whether it’s working? When I\nasked this question at conferences, most people said the latter. An\napplication that is deployed but can’t be evaluated is worse. It costs to\nmaintain, but if you want to take it down, it might cost even more.\nAI applications with questionable returns on investment are, unfortunately,\nquite common. This happens not only because the application is hard to\nevaluate but also because application developers don’t have visibility into\nhow their applications are being used. An ML engineer at a used car\ndealership told me that his team built a model to predict the value of a car\nbased on the specs given by the owner. A year after the model was\ndeployed, their users seemed to like the feature, but he had no idea if the\nmodel’s predictions were accurate. At the beginning of the ChatGPT fever,\ncompanies rushed to deploy customer support chatbots. Many of them are\nstill unsure if these chatbots help or hurt their user experience."
...,...,...,...,...
247,Why might Meta support open source AI models?,The excerpt suggests that Meta's support for open source might be a strategy to maintain competitiveness against rivals like Google and Microsoft while fostering broader societal benefits.,expert,"reason Microsoft invested in OpenAI.\n4\nInterestingly enough, some companies with strict data privacy requirements have told me that even\nthough they can’t usually send data to third-party services, they’re okay with sending their data to\nmodels hosted on GCP, AWS, and Azure. For these companies, the data privacy policy is more about\nwhat services they can trust. They trust big cloud providers but don’t trust other startups.\n5\nThe story was reported by several outlets, including TechRadar (see “Samsung Workers Made a\nMajor Error by Using ChatGPT”, by Lewis Maddison (April 2023).\n6\nAs regulations are evolving around the world, requirements for auditable information of models and\ntraining data may increase. Commercial models may be able to provide certifications, saving\ncompanies from the effort.\n7\nUsers want models to be open source because open means more information and more options, but\nwhat’s in it for model developers? Many companies have sprung up to capitalize on open source\nmodels by providing inference and finetuning services. It’s not a bad thing. Many people need these\nservices to leverage open source models. But, from model developers’ perspective, why invest\nmillions, if not billions, into building models just for others to make money?It might be argued that\nMeta supports open source models only to keep their competitors (Google, Microsoft/OpenAI) in\ncheck. Both Mistral and Cohere have open source models, but they also have APIs. At some point,\ninference services on top of Mistral and Cohere models become their competitors.There’s the\nargument that open source is better for society, and maybe that’s enough as an incentive. 
People who\nwant what’s good for society will continue to push for open source, and maybe there will be enough\ncollective goodwill to help open source prevail. I certainly hope so.\n8\nThe companies that get hit the most by API costs are probably not the biggest companies. The\nbiggest companies might be important enough to service providers to negotiate favorable terms.\n9\nThis is similar to the philosophy in software infrastructure to always use the most popular tools that\nhave been extensively tested by the community."
248,tools for evaluating AI benchmarks,The article describes how Hugging Face has improved transparency in benchmark selection and emphasizes the importance of evaluating benchmarks for AI systems.,beginner,"0\nWhen I posted a question on Hugging Face’s Discord about why they chose certain benchmarks,\nLewis Tunstall responded that they were guided by the benchmarks that the then popular models\nused. Thanks to the Hugging Face team for being so wonderfully responsive and for their great\ncontributions to the community.\n1\nI’m really glad to report that while I was writing this book, leaderboards have become much more\ntransparent about their benchmark selection and aggregation process. When launching their new\nleaderboard, Hugging Face shared a great analysis of the benchmarks correlation (2024).\n2\nIt’s both really cool and intimidating to see that in just a couple of years, benchmarks had to change\nfrom grade-level questions to graduate-level questions.\n3\nIn gaming, there’s the concept of a neverending game where new levels can be procedurally\ngenerated as players master all the existing levels. It’d be really cool to design a neverending\nbenchmark where more challenging problems are procedurally generated as models level up.\n4\nReading about other people’s experience is educational, but it’s up to us to discern an anecdote from\nthe universal truth. The same model update can cause some applications to degrade and some to\nimprove. For example, migrating from GPT-3.5-turbo-0301 to GPT-3.5-turbo-1106 led to a 10% drop\nin Voiceflow’s intent classification task but an improvement in GoDaddy’s customer support chatbot.\n5\nIf there is a publicly available score, check how reliable the score is.\n6\nThe HELM paper reported that the total cost is $38,000 for commercial APIs and 19,500 GPU hours\nfor open models. 
If an hour of GPU costs between $2.15 and $3.18, the total cost comes out to\n$80,000–$100,000.\n7\nA friend quipped: “A benchmark stops being useful as soon as it becomes public.”\n8\nThis is because the square root of 10 is approximately 3.3."
249,why have AI benchmarks changed recently,"It explains that AI benchmarks have evolved from simpler to more complex standards, reflecting the advances in model capabilities over time.",beginner,"0\nWhen I posted a question on Hugging Face’s Discord about why they chose certain benchmarks,\nLewis Tunstall responded that they were guided by the benchmarks that the then popular models\nused. Thanks to the Hugging Face team for being so wonderfully responsive and for their great\ncontributions to the community.\n1\nI’m really glad to report that while I was writing this book, leaderboards have become much more\ntransparent about their benchmark selection and aggregation process. When launching their new\nleaderboard, Hugging Face shared a great analysis of the benchmarks correlation (2024).\n2\nIt’s both really cool and intimidating to see that in just a couple of years, benchmarks had to change\nfrom grade-level questions to graduate-level questions.\n3\nIn gaming, there’s the concept of a neverending game where new levels can be procedurally\ngenerated as players master all the existing levels. It’d be really cool to design a neverending\nbenchmark where more challenging problems are procedurally generated as models level up.\n4\nReading about other people’s experience is educational, but it’s up to us to discern an anecdote from\nthe universal truth. The same model update can cause some applications to degrade and some to\nimprove. For example, migrating from GPT-3.5-turbo-0301 to GPT-3.5-turbo-1106 led to a 10% drop\nin Voiceflow’s intent classification task but an improvement in GoDaddy’s customer support chatbot.\n5\nIf there is a publicly available score, check how reliable the score is.\n6\nThe HELM paper reported that the total cost is $38,000 for commercial APIs and 19,500 GPU hours\nfor open models. 
If an hour of GPU costs between $2.15 and $3.18, the total cost comes out to\n$80,000–$100,000.\n7\nA friend quipped: “A benchmark stops being useful as soon as it becomes public.”\n8\nThis is because the square root of 10 is approximately 3.3."
250,HELM paper evaluation metrics explained,"The text mentions the HELM paper's significant insights on the financial costs associated with evaluating commercial and open AI models, vital for resource planning.",intermediate,"0\nWhen I posted a question on Hugging Face’s Discord about why they chose certain benchmarks,\nLewis Tunstall responded that they were guided by the benchmarks that the then popular models\nused. Thanks to the Hugging Face team for being so wonderfully responsive and for their great\ncontributions to the community.\n1\nI’m really glad to report that while I was writing this book, leaderboards have become much more\ntransparent about their benchmark selection and aggregation process. When launching their new\nleaderboard, Hugging Face shared a great analysis of the benchmarks correlation (2024).\n2\nIt’s both really cool and intimidating to see that in just a couple of years, benchmarks had to change\nfrom grade-level questions to graduate-level questions.\n3\nIn gaming, there’s the concept of a neverending game where new levels can be procedurally\ngenerated as players master all the existing levels. It’d be really cool to design a neverending\nbenchmark where more challenging problems are procedurally generated as models level up.\n4\nReading about other people’s experience is educational, but it’s up to us to discern an anecdote from\nthe universal truth. The same model update can cause some applications to degrade and some to\nimprove. For example, migrating from GPT-3.5-turbo-0301 to GPT-3.5-turbo-1106 led to a 10% drop\nin Voiceflow’s intent classification task but an improvement in GoDaddy’s customer support chatbot.\n5\nIf there is a publicly available score, check how reliable the score is.\n6\nThe HELM paper reported that the total cost is $38,000 for commercial APIs and 19,500 GPU hours\nfor open models. 
If an hour of GPU costs between $2.15 and $3.18, the total cost comes out to\n$80,000–$100,000.\n7\nA friend quipped: “A benchmark stops being useful as soon as it becomes public.”\n8\nThis is because the square root of 10 is approximately 3.3."


In [None]:
import pandas as pd

# Reload the generated questions from disk and convert them to a list of
# per-row dicts (orient="records").
# NOTE(review): no cell visible here writes generated_questions_chap4.csv before
# this read — presumably it was saved from `df` in a cell that has since been
# removed; confirm so the notebook survives Restart & Run All.
gt = pd.read_csv("generated_questions_chap4.csv")
gt_dict = gt.to_dict(orient="records")

In [15]:
# Spot-check one reloaded record (index 2).
gt_dict[2]

{'question': 'how to choose the right ai model from benchmarks',
 'summary_answer': 'The chapter explains the challenges of selecting an appropriate model given numerous benchmarks, including how to determine trustworthy benchmarks and the role of leaderboards in model selection.',
 'difficulty': 'intermediate',
 'text': 'Chapter 4. Evaluate AI Systems\nA model is only useful if it works for its intended purposes. You need to\nevaluate models in the context of your application. Chapter 3 discusses\ndifferent approaches to automatic evaluation. This chapter discusses how to\nuse these approaches to evaluate models for your applications.\nThis chapter contains three parts. It starts with a discussion of the criteria\nyou might use to evaluate your applications and how these criteria are\ndefined and calculated. For example, many people worry about AI making\nup facts—how is factual consistency detected? How are domain-specific\ncapabilities like math, science, reasoning, and summarizatio

In [20]:
import hashlib
import pandas as pd

def generate_chunk_id(chunk):
    """Return a short deterministic id for a question record.

    The id is the first 8 hex characters of the MD5 digest of
    ``"<question>-<first 10 characters of text>"``.

    Args:
        chunk: Mapping with a "question" value and a sliceable "text" value.

    Returns:
        An 8-character lowercase hex string.
    """
    # Single quotes inside the f-string: reusing the enclosing double quotes is
    # only valid syntax from Python 3.12 onward.
    combined = f"{chunk['question']}-{chunk['text'][:10]}"
    hash_hex = hashlib.md5(combined.encode()).hexdigest()
    # MD5 serves only as a stable fingerprint here; 8 hex chars keeps ids short.
    return hash_hex[:8]

# Stamp every record with its id in place, computed before the key is set.
for record in gt_dict:
    record.update(id=generate_chunk_id(record))

# Write the id-augmented records back over the same CSV, without the index.
gt_with_id = pd.DataFrame(data=gt_dict)
gt_with_id.to_csv("generated_questions_chap4.csv", index=False)



In [1]:
import pandas as pd
# Restore the default column-width limit for DataFrame display.
pd.reset_option("display.max_colwidth")
# Load eval_chunk_300_250.json and show it as the cell output.
pd.read_json("eval_chunk_300_250.json")

Unnamed: 0,question,input_tokens,output_tokens,total_tokens,input_cost,output_cost,total_cost,answer_relevant,completeness,grounded_accuracy,context_utilization,chunk_coverage,consistency,focused,uncertainty_handling
0,importance of reliable benchmarks in AI,1579,2927,4506,7.9e-05,0.001171,0.00125,True,True,True,True,True,True,True,True
1,what is evaluation-driven development in ai?,1410,2699,4109,7.1e-05,0.00108,0.00115,True,True,True,True,True,True,True,True
2,performance differences in model APIs,1587,2418,4005,7.9e-05,0.000967,0.001047,True,True,True,True,True,True,True,True
3,why do OpenAI models seem worse after updates,1597,2574,4171,8e-05,0.00103,0.001109,True,True,True,True,True,True,True,True
4,Why did Samsung ban ChatGPT?,1402,2452,3854,7e-05,0.000981,0.001051,True,True,True,True,True,True,True,True
5,methods for decontaminating evaluation data,1444,1832,3276,7.2e-05,0.000733,0.000805,True,True,True,True,False,True,True,False
6,best practices for evaluating generative AI re...,1545,1835,3380,7.7e-05,0.000734,0.000811,True,True,True,True,True,True,True,True
7,limitations of public benchmarks in AI evaluation,1493,2260,3753,7.5e-05,0.000904,0.000979,True,False,True,True,True,True,True,False
8,evaluating generative AI models,1593,2850,4443,8e-05,0.00114,0.00122,True,True,True,True,True,True,True,True
9,measuring safety and factual consistency in ge...,1643,2798,4441,8.2e-05,0.001119,0.001201,True,True,True,True,True,True,True,False
