In [1]:
import pandas as pd

sample_questions = pd.read_csv("sample_gt.csv")
sample_dict = sample_questions.to_dict(orient="records")
sample_dict[2]

{'question': 'performance differences in model APIs',
 'summary_answer': 'The excerpt mentions that the same AI model may perform differently across various APIs due to optimization techniques used, which necessitates thorough testing when switching APIs.',
 'difficulty': 'intermediate',
 'text': 'After developing a model, a developer can choose to open source it, make it\naccessible via an API, or both. Many model developers are also model\nservice providers. Cohere and Mistral open source some models and\nprovide APIs for some. OpenAI is typically known for their commercial\nmodels, but they’ve also open sourced models (GPT-2, CLIP). Typically,\nmodel providers open source weaker models and keep their best models\nbehind paywalls, either via APIs or to power their products.\nModel APIs can be available through model providers (such as OpenAI and\nAnthropic), cloud service providers (such as Azure and GCP [Google Cloud\nPlatform]), or third-party API providers (such as Databricks Mosa

In [2]:
from chunking import extract_text_from_pdf, chunk_sliding_window

pages = extract_text_from_pdf(start_page=1)
chunks = chunk_sliding_window(pages, size=300, step=250)
chunks[:2]

[{'start': 0,
  'text': 'Chapter 4. Evaluate AI Systems\nA model is only useful if it works for its intended purposes. You need to\nevaluate models in the context of your application. Chapter 3 discusses\ndifferent approaches to automatic evaluation. This chapter discusses how to\nuse these approaches to evaluate models for you',
  'chapter': 4},
 {'start': 250,
  'text': 'to\nuse these approaches to evaluate models for your applications.\nThis chapter contains three parts. It starts with a discussion of the criteria\nyou might use to evaluate your applications and how these criteria are\ndefined and calculated. For example, many people worry about AI making\nup facts—how ',
  'chapter': 4}]

In [3]:
from minsearch import VectorSearch
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import numpy as np

embedding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')
v_index = VectorSearch(keyword_fields = [])


embeddings = []
for chunk in tqdm(chunks):
    vector = embedding_model.encode(chunk["text"])
    embeddings.append(vector)

embeddings_array = np.array(embeddings)
vector_store = v_index.fit(embeddings_array, chunks)


  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 468/468 [00:08<00:00, 53.07it/s]


In [12]:
user_question = "what benchmarks to use for evaluating LLMs"
user_question_embedding = embedding_model.encode(user_question)
vector_store.search(user_question_embedding)

[{'start': 69000,
  'text': 's\nsaturate, necessitating the introduction of new benchmarks.\nA tool that helps you evaluate a model on multiple benchmarks is an\nevaluation harness. As of this writing, EleutherAI’s lm-evaluation-harness\nsupports over 400 benchmarks. OpenAI’s evals lets you run any of the\napproximately 500 existing',
  'chapter': 4},
 {'start': 71750,
  'text': ' in late 2023, Hugging Face updated their Open LLM Leaderboard\nto use the average of six different benchmarks to rank models:\n1. ARC-C (Clark et al., 2018): Measuring the ability to solve complex,\ngrade school-level science questions.\n2. MMLU (Hendrycks et al., 2020): Measuring knowledge and reasoni',
  'chapter': 4},
 {'start': 75500,
  'text': '23): a graduate-level Q&A benchmark\nMuSR (Sprague et al., 2023): a chain-of-thought, multistep reasoning benchmark\nBBH (BIG-bench Hard) (Srivastava et al., 2023): another reasoning benchmark\nIFEval (Zhou et al., 2023): an instruction-following benchmark\nI have

In [16]:
def search(user_question:str):
    user_question_embedding = embedding_model.encode(user_question)
    results = vector_store.search(user_question_embedding, num_results=10)
    return results

In [None]:
instructions = """ 

You are an AI Researcher with 10 years of experience as a senior AI Engineer. 
You are instructing a course based on the "AI Engineering book" published by Chip Huyen.
Answer the QUESTION and base your response ONLY on the CONTEXT provided by this book.
""".strip()

prompt_template = """ 
<QUESTION>
{user_question}
</QUESTION>

<CONTEXT>
{context}
</CONTEXT>
"""

In [19]:
import json
def build_prompt(user_question, search_results):
    search_json = json.dumps(search_results)
    prompt = prompt_template.format(user_question=user_question, context=search_json).strip()
    return prompt

In [None]:
user_question = "what benchmarks to use for evaluating LLMs"
search_results = search(user_question = user_question)

user_prompt = build_prompt(user_question, search_results)

'<QUESTION>\nwhat benchmarks to use for evaluating LLMs\n</QUESTION>\n\n<CONTEXT>\n[{"start": 69000, "text": "s\\nsaturate, necessitating the introduction of new benchmarks.\\nA tool that helps you evaluate a model on multiple benchmarks is an\\nevaluation harness. As of this writing, EleutherAI\\u2019s lm-evaluation-harness\\nsupports over 400 benchmarks. OpenAI\\u2019s evals lets you run any of the\\napproximately 500 existing", "chapter": 4}, {"start": 71750, "text": " in late 2023, Hugging Face updated their Open LLM Leaderboard\\nto use the average of six different benchmarks to rank models:\\n1. ARC-C (Clark et al., 2018): Measuring the ability to solve complex,\\ngrade school-level science questions.\\n2. MMLU (Hendrycks et al., 2020): Measuring knowledge and reasoni", "chapter": 4}, {"start": 75500, "text": "23): a graduate-level Q&A benchmark\\nMuSR (Sprague et al., 2023): a chain-of-thought, multistep reasoning benchmark\\nBBH (BIG-bench Hard) (Srivastava et al., 2023): anoth

In [25]:
from openai import OpenAI
client = OpenAI()

def ask_llm(user_prompt, instructions=None, model="gpt-4o-mini"):
    messages = []

    if instructions:
        messages.append({
            "role": "system",
            "content": instructions
        })

    messages.append({
        "role": "user",
        "content": user_prompt
    })

    response = client.responses.create(
        model=model,
        input=messages
    )

    return response.output_text

In [None]:
def rag(user_question):
    search_results = search(user_question)
    user_prompt = build_prompt(user_question, search_results)
    response = ask_llm(user_prompt)
    return response
