In [1]:
from enum import Enum
import json
from typing import List, Optional

from ollama import chat
import pandas as pd
from pydantic import BaseModel, Field

In [8]:
# Load the sampled document chunks that serve as the source for Q-A generation.
df = pd.read_parquet('./data/eval_sampled.parquet')
df.head()

Unnamed: 0,id,doc_id,vector,text,metadata,cluster,cluster_tsne
783,871e39f3-ad80-413d-9353-93b39da8adf5,aa4c9403-c960-442a-aca3-31ad8ae64f6e,"[-0.06531426, 0.0017810601, 0.0071745906, -0.0...",## Concept\n\nA data connector (aka `Reader`) ...,"{'_node_content': '{""id_"": ""871e39f3-ad80-413d...",19,0
851,45e0ab38-6280-4862-be9f-b57ce7f96492,5f858553-f1ec-4828-88df-b6dce5754a75,"[-0.07814654, -0.0054140342, 0.03389499, -0.05...",## Relation-Based Node Parsers,"{'_node_content': '{""id_"": ""45e0ab38-6280-4862...",15,0
600,a65609e2-2297-447c-8779-cc312612f445,dc1397d8-b136-4894-91a1-e292a39b15c2,"[-0.078950115, -0.04367002, 0.023431662, -0.01...",## LlamaIndex Ecosystem\n\nThere's more to the...,"{'_node_content': '{""id_"": ""a65609e2-2297-447c...",7,0
1432,d8e50d48-3935-4a1d-8387-c5f2fd504656,7359399a-1e56-4443-9a70-fc33645355e2,"[-0.08533687, -0.019117128, 0.01595358, 0.0007...",## Additional Resources\n\n- [A Guide to Extra...,"{'_node_content': '{""id_"": ""d8e50d48-3935-4a1d...",7,0
746,cd3836ad-9707-4411-90a9-12c1f5d7ae44,84e9995b-6686-49f4-82ca-c207dd25900e,"[-0.10055203, -0.034588337, -0.016951276, -0.0...",## Usage\n\nYou can create an index on LlamaCl...,"{'_node_content': '{""id_"": ""cd3836ad-9707-4411...",19,0


In [24]:
# Show the raw text of the first chunk; it is reused below as the test input.
print(df['text'].iloc[0])

## Concept

A data connector (aka `Reader`) ingest data from different data sources and data formats into a simple `Document` representation (text and simple metadata).

!!! tip
    Once you've ingested your data, you can build an [Index](../../indexing/index.md) on top, ask questions using a [Query Engine](../../deploying/query_engine/index.md), and have a conversation using a [Chat Engine](../../deploying/chat_engines/index.md).


## Define prompt & response structure

In [80]:
def generate_qa_prompt_structured(context: str) -> str:
    """Build the Q-A generation prompt for a single document chunk.

    Args:
        context: Text of the chunk; inserted verbatim between the
            ``<start documentation>`` and ``<end documentation>`` markers.

    Returns:
        The fixed instruction text followed by the wrapped chunk, ending with
        a newline.
    """
    # The instructions are spelled out piece by piece so that the trailing
    # spaces and blank lines that are part of the prompt stay visible.
    instructions = (
        "You are an expert at creating diverse and challenging Question-Answer pairs from text that \n"
        "will be used to evaluate a Retrieval Augmented Generation system.\n"
        "\n"
        "Given the following document chunk, generate 3 question-answer pairs that mimic realistic questions that \n"
        "a user might ask about this documentation and:\n"
        "\n"
        "1. Include different question types (factual, inferential, analytical)\n"
        "2. Vary in difficulty (basic recall, complex reasoning)\n"
        "3. Test understanding of key concepts, relationships, and implications\n"
        " \n"
        "The questions should be directly answerable from the documentation and should not require any external knowledge.\n"
        "\n"
        "\n"
    )
    return f"{instructions}<start documentation>\n{context}\n<end documentation>\n"

In [81]:
# Question types the prompt asks the model to mix.
# Documented with `#` comments instead of a docstring so the JSON schema
# derived from the models is not affected by documentation text.
class QTypeEnum(str, Enum):
    # The `str` mix-in makes each member compare equal to its string value.
    factual = "factual"
    inferential = "inferential"
    analytical = "analytical"

# Difficulty levels a generated question can be labelled with.
# Documented with `#` comments instead of a docstring so the JSON schema
# derived from the models is not affected by documentation text.
class DifficultyEnum(str, Enum):
    # The `str` mix-in makes each member compare equal to its string value.
    easy = "easy"
    medium = "medium"
    hard = "hard"

# Grades the generating model can assign to its own Q-A pair.
# Documented with `#` comments instead of a docstring so the JSON schema
# derived from the models is not affected by documentation text.
class QAQualityEnum(str, Enum):
    # The `str` mix-in makes each member compare equal to its string value.
    good = "good"
    fair = "fair"
    poor = "poor"

# Annotations the model must produce alongside each question.
# Kept as `#` comments on purpose: pydantic puts a model's docstring into the
# generated JSON schema, which would change what is passed as `format`.
# The `description` strings below are part of that schema, so they act as
# instructions to the model and must not be edited casually.
class QAPairMetaData(BaseModel):
    question_type: QTypeEnum = Field(description="Question type")
    difficulty: DifficultyEnum = Field(description="Question difficulty")
    required_context: str = Field(
        description="Specific quote string from the chunk needed to answer the question",
    )
    reasoning: str = Field(
        description="A brief description of how you arrived at the answer from the context",
    )
    q_a_quality: QAQualityEnum = Field(
        description="Your assessment of the quality of the Q-A pair for evaluating a RAG system",
    )


# One generated item: the question, its annotations, and the answer.
# Fields are declared in the order question -> metadata -> answer; keep that
# order unless a change to the generated schema is intended.
class QAPair(BaseModel):
    question: str
    metadata: QAPairMetaData
    answer: str

# Top-level response model: its JSON schema is what gets passed to `chat` as
# `format`, and replies are validated against it.
# The number of pairs is not constrained here; the prompt asks for 3.
class QAPairList(BaseModel):
    qa_pairs: List[QAPair]

## Generation test

In [82]:
# Build the prompt for the first chunk and print it for a visual check.
first_chunk_text = df['text'].iloc[0]
input_ = generate_qa_prompt_structured(first_chunk_text)
print(input_)

You are an expert at creating diverse and challenging Question-Answer pairs from text that 
will be used to evaluate a Retrieval Augmented Generation system.

Given the following document chunk, generate 3 question-answer pairs that mimic realistic questions that 
a user might ask about this documentation and:

1. Include different question types (factual, inferential, analytical)
2. Vary in difficulty (basic recall, complex reasoning)
3. Test understanding of key concepts, relationships, and implications
 
The questions should be directly answerable from the documentation and should not require any external knowledge.


<start documentation>
## Concept

A data connector (aka `Reader`) ingest data from different data sources and data formats into a simple `Document` representation (text and simple metadata).

!!! tip
    Once you've ingested your data, you can build an [Index](../../indexing/index.md) on top, ask questions using a [Query Engine](../../deploying/query_engine/index.md), 

### Llama3.1-8b

In [83]:
# Send the sample prompt to llama3.1; the QAPairList JSON schema is passed as
# `format` to request a reply in that structure.
r_llama = chat(
    model='llama3.1:latest',
    messages=[{'role': 'user', 'content': input_}],
    format=QAPairList.model_json_schema(),
)

In [84]:
# Parse the raw JSON reply and validate it against QAPairList.
qa_pairs_llama = QAPairList.model_validate_json(r_llama.message.content)

In [89]:
# Pretty-print the validated pairs for manual review of the llama3.1 output.
print(json.dumps(qa_pairs_llama.model_dump()['qa_pairs'], indent=4))

[
    {
        "question": "What is the primary function of a data connector in this system?",
        "metadata": {
            "question_type": "factual",
            "difficulty": "easy",
            "required_context": "concept section",
            "reasoning": " basic recall ",
            "q_a_quality": "good"
        },
        "answer": "to ingest data from different data sources and data formats"
    },
    {
        "question": "What kind of document representation is used after ingesting data?",
        "metadata": {
            "question_type": "factual",
            "difficulty": "medium",
            "required_context": "concept section",
            "reasoning": " basic recall ",
            "q_a_quality": "good"
        },
        "answer": "a simple Document representation (text and simple metadata)"
    },
    {
        "question": "What are the potential benefits of building an index on top of ingested data?",
        "metadata": {
            "question_type": "inf

### Scaling up LLM size with gemma3-12b

In [86]:
# Same prompt and schema as the llama3.1 run, sent to gemma3:12b for comparison.
r_gemma = chat(
    model='gemma3:12b',
    messages=[{'role': 'user', 'content': input_}],
    format=QAPairList.model_json_schema(),
)

In [87]:
# Parse the raw JSON reply and validate it against QAPairList.
qa_pairs_gemma = QAPairList.model_validate_json(r_gemma.message.content)

In [88]:
# Pretty-print the validated pairs for manual review of the gemma3:12b output.
print(json.dumps(qa_pairs_gemma.model_dump()['qa_pairs'], indent=4))

[
    {
        "question": "What is a data connector referred to as in this documentation?",
        "metadata": {
            "question_type": "factual",
            "difficulty": "easy",
            "required_context": "Definition of data connector",
            "reasoning": "Direct recall from the text.",
            "q_a_quality": "good"
        },
        "answer": "A data connector is referred to as a 'Reader'."
    },
    {
        "question": "After data is ingested, what are the next steps outlined for utilizing the data?",
        "metadata": {
            "question_type": "inferential",
            "difficulty": "medium",
            "required_context": "Flow of data processing steps",
            "reasoning": "Requires understanding the sequence of actions mentioned \u2013 ingest, build an index, query, chat.",
            "q_a_quality": "good"
        },
        "answer": "After ingesting data, the next steps are to build an Index, use a Query Engine, and utilize a Chat E

## Scale to all chunks

In [96]:
# Rough runtime estimate: rows * 15 (the 15 is presumably seconds per chunk --
# TODO confirm against a timed run), converted to hours.
print(f"Expect to take about {len(df) * 15 / 3600:.2f} hours")

Expect to take about 0.83 hours


In [None]:
def generate_qa_dataset(
    filepath: str,
    df: pd.DataFrame,
    model: str = 'gemma3:12b',
    max_evals: Optional[int] = None,
    verbose: bool = False,
):
    """Generate Q-A pairs for the rows of ``df`` and write them as JSON Lines.

    One line is written per processed row, in row order. A successful row
    produces the JSON dump of the validated ``QAPairList`` plus two extra
    keys: ``idx`` (the row's index label) and ``id`` (the row's ``id`` value).
    A row whose reply cannot be validated or serialized produces an empty
    line, so line position keeps matching row position.

    Args:
        filepath: Output path. Opened in write mode, so an existing file is
            overwritten as soon as the column check passes.
        df: Source chunks; must contain the columns ``id`` and ``text``.
        model: Model name passed to ``chat``.
        max_evals: Maximum number of rows to process. ``None`` processes every
            row; ``0`` (or a negative value) processes none.
        verbose: When true, print a progress line per row and the reason for
            each failed row.

    Raises:
        ValueError: If ``df`` lacks an ``id`` or ``text`` column.

    Errors raised by the ``chat`` call itself are deliberately not caught, so
    an unreachable server or unknown model stops the run instead of silently
    producing a file of empty lines.
    """
    for col in ('id', 'text'):
        if col not in df.columns:
            raise ValueError(f"Column '{col}' not found in DataFrame")

    with open(filepath, 'w', encoding='utf-8') as f:
        for i, (idx, row) in enumerate(df.iterrows()):
            # Compare against None explicitly: a plain truthiness test would
            # treat max_evals=0 as "no limit" and process every row.
            if max_evals is not None and i >= max_evals:
                break

            if verbose:
                print(f"Generating for row {i}")
            input_ = generate_qa_prompt_structured(row['text'])
            r = chat(
                messages=[
                    {
                        'role': 'user',
                        'content': input_,
                    }
                ],
                model=model,
                format=QAPairList.model_json_schema(),
            )
            try:
                qa_pairs = QAPairList.model_validate_json(r.message.content)
                gen_dict = qa_pairs.model_dump()
                gen_dict.update({
                    'idx': idx,
                    'id': row['id'],
                })
                out_ = json.dumps(gen_dict)
            except Exception as exc:
                # `Exception` rather than a bare `except`, so Ctrl-C
                # (KeyboardInterrupt) and SystemExit still stop a long run.
                if verbose:
                    print(f"Failed for row {i}: {exc!r}")
                out_ = ""
            f.write(out_ + '\n')
            # Persist each row immediately; a crash late in a long run then
            # keeps everything generated so far.
            f.flush()
         

In [2]:
# JSON Lines file holding the generated Q-A pairs (written by
# generate_qa_dataset, read back further down).
eval_filepath = './data/qa_pairs_gemma.jsonl'

In [None]:
# Full run over every chunk. This is the slow cell (see the runtime estimate
# above), and it overwrites any existing file at `eval_filepath`.
generate_qa_dataset(
    filepath=eval_filepath,
    df=df,
    model='gemma3:12b',
    max_evals=None,
    verbose=True,
)

Generating for row 0
Generating for row 1
Generating for row 2
Generating for row 3
Generating for row 4
Generating for row 5
Generating for row 6
Generating for row 7
Generating for row 8
Generating for row 9
Generating for row 10
Generating for row 11
Generating for row 12
Generating for row 13
Generating for row 14
Generating for row 15
Generating for row 16
Generating for row 17
Generating for row 18
Generating for row 19
Generating for row 20
Generating for row 21
Generating for row 22
Generating for row 23
Generating for row 24
Generating for row 25
Generating for row 26
Generating for row 27
Generating for row 28
Generating for row 29
Generating for row 30
Generating for row 31
Generating for row 32
Generating for row 33
Generating for row 34
Generating for row 35
Generating for row 36
Generating for row 37
Generating for row 38
Generating for row 39
Generating for row 40
Generating for row 41
Generating for row 42
Generating for row 43
Generating for row 44
Generating for row 4

In [124]:
# Re-display the source chunks so their index labels and ids can be compared
# with the `idx` / `id` columns of the generated file.
df.head()

Unnamed: 0,id,doc_id,vector,text,metadata,cluster,cluster_tsne
783,871e39f3-ad80-413d-9353-93b39da8adf5,aa4c9403-c960-442a-aca3-31ad8ae64f6e,"[-0.06531426, 0.0017810601, 0.0071745906, -0.0...",## Concept\n\nA data connector (aka `Reader`) ...,"{'_node_content': '{""id_"": ""871e39f3-ad80-413d...",19,0
851,45e0ab38-6280-4862-be9f-b57ce7f96492,5f858553-f1ec-4828-88df-b6dce5754a75,"[-0.07814654, -0.0054140342, 0.03389499, -0.05...",## Relation-Based Node Parsers,"{'_node_content': '{""id_"": ""45e0ab38-6280-4862...",15,0
600,a65609e2-2297-447c-8779-cc312612f445,dc1397d8-b136-4894-91a1-e292a39b15c2,"[-0.078950115, -0.04367002, 0.023431662, -0.01...",## LlamaIndex Ecosystem\n\nThere's more to the...,"{'_node_content': '{""id_"": ""a65609e2-2297-447c...",7,0
1432,d8e50d48-3935-4a1d-8387-c5f2fd504656,7359399a-1e56-4443-9a70-fc33645355e2,"[-0.08533687, -0.019117128, 0.01595358, 0.0007...",## Additional Resources\n\n- [A Guide to Extra...,"{'_node_content': '{""id_"": ""d8e50d48-3935-4a1d...",7,0
746,cd3836ad-9707-4411-90a9-12c1f5d7ae44,84e9995b-6686-49f4-82ca-c207dd25900e,"[-0.10055203, -0.034588337, -0.016951276, -0.0...",## Usage\n\nYou can create an index on LlamaCl...,"{'_node_content': '{""id_"": ""cd3836ad-9707-4411...",19,0


In [3]:
# Read the generated JSON Lines file back, one record per line, to check the
# result of the generation run.
df_tmp = pd.read_json(eval_filepath, lines=True)
df_tmp.head()

Unnamed: 0,qa_pairs,idx,id
0,[{'question': 'What is the primary function of...,783,871e39f3-ad80-413d-9353-93b39da8adf5
1,[{'question': 'What is the primary subject mat...,851,45e0ab38-6280-4862-be9f-b57ce7f96492
2,[{'question': 'What is the primary function of...,600,a65609e2-2297-447c-8779-cc312612f445
3,[{'question': 'What resource is provided to he...,1432,d8e50d48-3935-4a1d-8387-c5f2fd504656
4,[{'question': 'What environment variable is us...,746,cd3836ad-9707-4411-90a9-12c1f5d7ae44


In [4]:
# Number of records read back from the generated file.
len(df_tmp)

200