In [58]:
import os
import time
import re

import uuid
import pandas as pd

# Local path the questions CSV is downloaded to and later read from.
filepath = 'oz_questions.csv'

In [None]:
%%capture
# Authenticate against Google Drive with PyDrive2; %%capture hides the cell's output.
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive

gauth = GoogleAuth();
# Client configuration is read from a client_secrets.json file in the working directory.
gauth.LoadClientConfigFile("client_secrets.json");
# NOTE(review): presumably runs a browser-based OAuth flow via a local web server — confirm in the PyDrive2 docs.
gauth.LocalWebserverAuth();

# Drive client used by the download cell below.
drive = GoogleDrive(gauth);

In [None]:


# Remove any previous local copy so the download below always writes a fresh file.
if os.path.exists(filepath):
    os.remove(filepath)
    print(f"Replacing existing file: {filepath}")
    # NOTE(review): the reason for this 1 s pause is not evident from this cell — confirm it is needed.
    time.sleep(1)

# Reference the Drive file by id and save its content to `filepath`, requesting text/csv.
file = drive.CreateFile({'id': '1aa61xgSOBXu6qH1chEiFqUxgFmBNyvr4Q5bSCFUOkt8'})
file.GetContentFile(filepath, mimetype='text/csv')

In [4]:
# Load the downloaded questions CSV into a DataFrame.
oz_q_df = pd.read_csv(filepath)

In [5]:
# Preview the first rows of the questions table.
oz_q_df.head()

Unnamed: 0,Question,Book #,Book Title,Best Answer,length,Note
0,What color are Dorothy's shoes?,1,The Wonderful Wizard of Oz,"“She was so old,” explained the Witch of the N...",285.0,
1,How old is the Scarecrow when Dorothy finds him?,1,The Wonderful Wizard of Oz,“My life has been so short that I really know ...,412.0,
2,Which are the first antagonistic creatures the...,1,The Wonderful Wizard of Oz,In the morning they traveled on until they cam...,1085.0,
3,"When is the first time we read ""There's no pla...",1,The Wonderful Wizard of Oz,“That is because you have no brains” answered ...,236.0,
4,What is the wizard's secret in the Wonderful W...,1,The Wonderful Wizard of Oz,"“No, you are all wrong,” said the little man m...",546.0,


In [6]:
import numpy as np

def select_questions(oz_q_df, random_seed=42,
                     questions_per_book=2, additional_random_questions=4):
    """Pick a reproducible evaluation subset of questions.

    Up to ``questions_per_book`` rows are sampled for every distinct value of
    the ``'Book #'`` column. Then up to ``additional_random_questions`` more
    rows are sampled from the rows that were not picked in the first pass.

    Args:
        oz_q_df: DataFrame of questions; must contain a ``'Book #'`` column.
            Rows already picked are excluded by index label, so the index
            should be unique.
        random_seed: Seed used as ``random_state`` for every ``sample`` call
            and passed to ``np.random.seed``.
        questions_per_book: Maximum number of rows sampled per book.
        additional_random_questions: Maximum number of extra rows sampled
            from the remaining rows.

    Returns:
        DataFrame with the per-book picks first, followed by the extra picks.
        Original index labels are preserved. An empty input yields an empty
        frame with the same columns.

    Raises:
        KeyError: If ``oz_q_df`` has no ``'Book #'`` column.
    """
    # Kept for backward compatibility: this also seeds NumPy's global RNG.
    # The sampling below is already deterministic through ``random_state``.
    np.random.seed(random_seed)

    # First pass: up to `questions_per_book` rows from each book.
    per_book_samples = []
    for book_num in oz_q_df['Book #'].unique():
        book_questions = oz_q_df[oz_q_df['Book #'] == book_num]
        n_pick = min(questions_per_book, len(book_questions))
        per_book_samples.append(
            book_questions.sample(n=n_pick, random_state=random_seed)
        )

    # pd.concat raises on an empty list, so fall back to an empty slice that
    # keeps the input's columns when there are no books at all.
    if per_book_samples:
        selected_from_books = pd.concat(per_book_samples)
    else:
        selected_from_books = oz_q_df.iloc[0:0]

    # Second pass: up to `additional_random_questions` rows from the rest.
    remaining_questions = oz_q_df[~oz_q_df.index.isin(selected_from_books.index)]
    n_extra = min(additional_random_questions, len(remaining_questions))
    if n_extra == 0:
        additional_questions = remaining_questions.iloc[0:0]
    else:
        additional_questions = remaining_questions.sample(n=n_extra, random_state=random_seed)

    # Combine all selected questions
    final_selection = pd.concat([selected_from_books, additional_questions])

    print(f"Selected {len(selected_from_books)} questions from books ({questions_per_book} per book)")
    print(f"Selected {len(additional_questions)} additional random questions")
    print(f"Total: {len(final_selection)} questions\n")

    return final_selection

# Draw the evaluation subset using the default seed and sample sizes.
selected_questions = select_questions(oz_q_df)

Selected 16 questions from books (2 per book)
Selected 4 additional random questions
Total: 20 questions



In [None]:
# API PARAMETERS
# Base URL of the backend under test; the request helpers below post to paths under it.
BASE_URL = "http://localhost:8080"

#### IMPORTANT
Start your backend locally before running the cells below:
> Run `docker compose up --build`

In [33]:
import aiohttp
import asyncio

# Project Gutenberg HTML URLs of the Oz books that are sent to the backend for chunking.
BOOK_URLS = [
    "https://www.gutenberg.org/cache/epub/55/pg55-images.html",       # The Wonderful Wizard of Oz
    "https://www.gutenberg.org/cache/epub/54/pg54-images.html",       # The Marvelous Land of Oz
    "https://www.gutenberg.org/cache/epub/33361/pg33361-images.html", # Ozma of Oz
    "https://www.gutenberg.org/cache/epub/22566/pg22566-images.html", # Dorothy and the Wizard in Oz
    "https://www.gutenberg.org/cache/epub/26624/pg26624-images.html", # The Road to Oz
    "https://www.gutenberg.org/cache/epub/41667/pg41667-images.html", # The Emerald City of Oz
    "https://www.gutenberg.org/cache/epub/32094/pg32094-images.html", # The Patchwork Girl of Oz
    "https://www.gutenberg.org/cache/epub/75720/pg75720-images.html", # Jack Pumpkinhead of Oz
]

async def process_book(session, url, target_chunk_size=800, sentence_overlap=2,
                       small_paragraph_length=200, small_paragraph_overlap=2):
    """Ask the backend to chunk one book and return the filename it reports.

    Posts the book URL together with the chunking parameters to
    ``{BASE_URL}/v1/book-data``.

    Args:
        session: Open HTTP client session used for the POST.
        url: URL of the book to process.
        target_chunk_size: Forwarded to the backend as ``target_chunk_size``.
        sentence_overlap: Forwarded as ``sentence_overlap``.
        small_paragraph_length: Forwarded as ``small_paragraph_length``.
        small_paragraph_overlap: Forwarded as ``small_paragraph_overlap``.

    Returns:
        The ``"filename"`` entry of the JSON reply, or ``None`` if absent.

    Raises:
        Exception: If the reply's ``"status"`` is ``"error"``.
    """
    endpoint = f"{BASE_URL}/v1/book-data"
    request_body = dict(
        url=url,
        target_chunk_size=target_chunk_size,
        sentence_overlap=sentence_overlap,
        small_paragraph_length=small_paragraph_length,
        small_paragraph_overlap=small_paragraph_overlap,
    )

    async with session.post(endpoint, json=request_body) as reply:
        reply_json = await reply.json()
        failed = reply_json.get("status") == "error"
        if failed:
            raise Exception(f"Error uploading {url}: {reply_json['message']}")
        return reply_json.get("filename")


async def process_all_books(book_urls, **chunking_params):
    """
    Submit every book concurrently and wait until all of them are done.

    This mimics Promise.all in JavaScript (as done in the frontend).

    Args:
        book_urls: Iterable of book URLs, each handed to ``process_book``.
        **chunking_params: Keyword arguments forwarded to ``process_book``.

    Returns:
        List of filenames in the order the uploads finished, which may
        differ from the order of ``book_urls``.
    """
    async with aiohttp.ClientSession() as session:
        # One coroutine per book, all sharing the same session.
        uploads = [process_book(session, book_url, **chunking_params) for book_url in book_urls]
        total = len(uploads)

        finished_names = []
        done_count = 0
        # as_completed yields results in completion order, so progress is
        # reported as soon as each book is ready.
        for next_done in asyncio.as_completed(uploads):
            done_count += 1
            name = await next_done
            print(f"[{done_count}/{total}] Completed chunking and embedding: {name}")
            finished_names.append(name)

        print(f"Successfully processed {len(finished_names)} books")
        return finished_names


async def run_test_async(
    session,
    test_query=None,
    filenames=None,
    book_urls=None,
    target_chunk_size=None,
    sentence_overlap=None,
    small_paragraph_length=None,
    small_paragraph_overlap=None,
    skip_book_upload=True
):
    """Run one query end to end: model response first, then search.

    The query is posted to ``/v1/model-response``. The ``"search_query"``
    from that reply is then posted to ``/v1/search-response`` together with
    the book filenames.

    Args:
        session: Open HTTP client session used for both POST requests.
        test_query: The user question to evaluate (required).
        filenames: Book filenames to search; required when
            ``skip_book_upload`` is True.
        book_urls: Book URLs to process when ``skip_book_upload`` is False.
        target_chunk_size: Chunking parameter, required when uploading.
        sentence_overlap: Chunking parameter, required when uploading.
        small_paragraph_length: Chunking parameter, required when uploading.
        small_paragraph_overlap: Chunking parameter, required when uploading.
        skip_book_upload: When False, the books are (re)processed first and
            the resulting filenames replace ``filenames``.

    Returns:
        Tuple ``(search_results, filenames)``.

    Raises:
        ValueError: If a required argument for the chosen mode is missing.
        Exception: If either backend reply has ``"status" == "error"``.
    """
    if not test_query:
        raise ValueError("test_query must be provided")

    if skip_book_upload:
        if not filenames:
            raise ValueError("filenames must be provided if skip_book_upload is True")
    else:
        required_for_upload = (
            target_chunk_size,
            sentence_overlap,
            small_paragraph_length,
            small_paragraph_overlap,
            book_urls,
        )
        for value in required_for_upload:
            if value is None:
                raise ValueError("Chunking parameters must be provided when skip_book_upload is False")

        filenames = await process_all_books(
            book_urls=book_urls,
            target_chunk_size=target_chunk_size,
            sentence_overlap=sentence_overlap,
            small_paragraph_length=small_paragraph_length,
            small_paragraph_overlap=small_paragraph_overlap
        )

    query_id = str(uuid.uuid4())

    # Step 1: the model reply supplies the search query used in step 2.
    model_endpoint = f"{BASE_URL}/v1/model-response"
    async with session.post(model_endpoint, json={"user_query": test_query}) as model_reply:
        result = await model_reply.json()
        if result.get("status") == "error":
            raise Exception(f"Error in model response: {result['message']}")

    # Step 2: search the given books, asking for the top 3 hits.
    search_endpoint = f"{BASE_URL}/v1/search-response"
    search_payload = {
        "query": result["search_query"],
        "filenames": filenames,
        "top_k": 3,
        "query_id": query_id,
        "enhanced_query": True
    }
    async with session.post(search_endpoint, json=search_payload) as search_reply:
        search_results = await search_reply.json()
        if search_results.get("status") == "error":
            raise Exception(f"Error in search response: {search_results['message']}")
        return search_results, filenames

In [None]:
# Define parameter combinations to test.
# Each tuple is unpacked in run_all_tests in the order given on the first line of the list.
PARAM_COMBOS = [
    # (target_chunk_size, sentence_overlap, small_paragraph_length, small_paragraph_overlap)
    (800, 2, 200, 2),   # Default
    (1000, 2, 200, 2),  # Larger chunks
    (1000, 2, 200, 3),  # Larger chunks with more paragraph overlap
    (1000, 2, 150, 3),  # Larger chunks with smaller paragraph threshold
    (600, 2, 200, 2),   # Smaller chunks
    (600, 2, 200, 3),   # Smaller chunks with more paragraph overlap
    (600, 2, 150, 3),   # Smaller chunks with smaller paragraph threshold
    (600, 2, 250, 3),   # Smaller chunks with larger paragraph threshold
    (700, 2, 200, 2),   # Medium chunks
    (700, 2, 200, 3),   # Medium chunks with more paragraph overlap
    (700, 2, 150, 3),   # Medium chunks with smaller paragraph threshold
    (800, 3, 200, 2),   # More sentence overlap
    (800, 1, 200, 2),   # Less sentence overlap
    (800, 2, 150, 2),   # Smaller paragraph threshold
    (800, 2, 250, 2),   # Larger paragraph threshold
    (800, 2, 200, 3),   # More paragraph overlap
    (800, 2, 200, 1),   # Less paragraph overlap
]

def normalize_text(text):
    """Return ``text`` in a canonical form for fuzzy comparison.

    Removes straight and curly quote characters (double and single),
    lower-cases the text, trims it, and collapses every run of whitespace
    to a single space.

    Curly single quotes are stripped as well as straight ones, so that
    ``Dorothy’s`` and ``Dorothy's`` normalise to the same string.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    text = re.sub(r'["\'“”‘’]', '', text)
    return re.sub(r'\s+', ' ', text.lower().strip())

def remove_chapter_chunk_tag(text):
    """Strip a ``From Chapter <...>:`` tag and return the text after it.

    The pattern is searched anywhere in ``text``. Everything after the
    first colon that follows ``From Chapter`` is returned, trimmed. If no
    such tag is found, ``text`` is returned unchanged.
    """
    tagged = re.search(r"From Chapter\s+.+?:\s*(.+)", text, re.DOTALL)
    return tagged.group(1).strip() if tagged else text

def find_chunk_locations_with_continuity(filepath, expected_text):
    """
    Find the run of consecutive chunks that contains ``expected_text``.

    The chunk table is read from the pickle at ``filepath`` and its
    ``'text'`` column is scanned. The first chunk containing the first five
    normalised words of the expected text is the starting point. From there
    the expected words are consumed in order, moving to the next chunk
    whenever the current one has no further match.

    Matching is deliberately lenient: within a chunk the expected words only
    have to appear in order (gaps are allowed), and a word that occurs only
    as a substring of the remaining chunk text is accepted too.

    Args:
        filepath: Path to a pickled DataFrame with a ``'text'`` column.
        expected_text: The passage to locate.

    Returns:
        List of 0-based row positions of the chunks the text spans, or an
        empty list if it could not be found. Positions (not index labels)
        are used throughout, so the result is valid for ``DataFrame.iloc``
        whatever index the pickled frame carries.
    """
    # NOTE(review): unpickling can execute arbitrary code; only read trusted files.
    temp_df = pd.read_pickle(filepath)
    chunk_texts = temp_df['text'].tolist()
    total_chunks = len(chunk_texts)

    def _normalized_chunk(position):
        # Chunk body without its chapter tag, normalised like the expected text.
        return normalize_text(remove_chapter_chunk_tag(chunk_texts[position]))

    # Normalize and split expected text into words
    expected_words = normalize_text(expected_text).split()

    if not expected_words:
        print("WARNING: No words found in expected text")
        return []

    print(f"Looking for {len(expected_words)} words from expected text")

    # Find the chunk containing the first few words (at most five)
    first_phrase = ' '.join(expected_words[:5])
    start_chunk_idx = None

    for position in range(total_chunks):
        if first_phrase in _normalized_chunk(position):
            start_chunk_idx = position
            break

    if start_chunk_idx is None:
        print(f"WARNING: First phrase not found: '{first_phrase}'")
        return []

    # Now verify continuity word by word from start_chunk_idx
    matched_chunks = [start_chunk_idx]
    current_chunk_idx = start_chunk_idx
    word_idx = 0

    while word_idx < len(expected_words):
        if current_chunk_idx >= total_chunks:
            print(f"WARNING: Ran out of chunks at word {word_idx}/{len(expected_words)}")
            return []

        current_text = _normalized_chunk(current_chunk_idx)
        chunk_words = current_text.split()

        # Number of words of this chunk already consumed by whole-word matches
        words_matched_in_chunk = 0

        for i in range(word_idx, len(expected_words)):
            expected_word = expected_words[i]
            remaining_words = chunk_words[words_matched_in_chunk:]

            if expected_word in remaining_words:
                # Whole-word hit: skip past it in the chunk.
                words_matched_in_chunk += remaining_words.index(expected_word) + 1
            elif expected_word not in ' '.join(remaining_words):
                # Not even a substring of what is left in this chunk.
                break
            # Whole-word or substring hit: the expected word is consumed.
            word_idx = i + 1

        if word_idx == 0 or words_matched_in_chunk == 0:
            # Couldn't match any words in this chunk - continuity broken
            print(f"WARNING: Continuity broken at word {word_idx}: '{expected_words[word_idx] if word_idx < len(expected_words) else 'END'}'")
            print(f"Current chunk text: {current_text}")
            print(f"For filepath: {filepath}, chunk index: {current_chunk_idx}")

            return []

        # If there are more words to find, move to next chunk
        if word_idx < len(expected_words):
            current_chunk_idx += 1
            if current_chunk_idx < total_chunks and current_chunk_idx not in matched_chunks:
                matched_chunks.append(current_chunk_idx)

    # Successfully found all words with continuity
    print(f"SUCCESS: Found text spanning {len(matched_chunks)} chunk(s): {matched_chunks}")
    return matched_chunks


def find_chunk_location_from_text(filepath, expected_text):
    """Locate ``expected_text`` in a book and describe the first chunk hit.

    Args:
        filepath: Path to the pickled chunk table of the book.
        expected_text: The passage to locate.

    Returns:
        Tuple ``(primary_chunk_idx, chapter_number, chapter_title,
        chunk_in_chapter_index, matched_chunks)``. The chapter fields come
        from the ``'chapter_index'`` and ``'title'`` columns of the first
        matched chunk. A trailing ``(<digits>)`` group in the title is split
        off as ``chunk_in_chapter_index``, which is ``None`` when the title
        has no such group. If nothing matches, the result is
        ``(None, None, None, None, [])``.
    """
    chunk_hits = find_chunk_locations_with_continuity(filepath, expected_text)

    if not chunk_hits:
        print("WARNING: No chunk was matched to the given text in 'BEST ANSWER'!")
        return (None, None, None, None, [])

    # Metadata is taken from the first chunk of the matched span.
    first_hit = chunk_hits[0]
    hit_row = pd.read_pickle(filepath).iloc[first_hit]

    chapter_no = int(hit_row['chapter_index'])
    title = str(hit_row['title'])

    # Titles may end in "(n)"; split that counter off when present.
    position_in_chapter = None
    counter = re.search(r"(.*)\((\d+)\)", title)
    if counter is not None and counter.group(2).isdigit():
        position_in_chapter = int(counter.group(2))
        title = counter.group(1).strip()

    return first_hit, chapter_no, title, position_in_chapter, chunk_hits

async def run_all_tests(
    selected_questions,
    param_combos:list=PARAM_COMBOS,
    book_urls:list=BOOK_URLS,
    skip_book_processing:bool=False,
):
    """Evaluate retrieval quality for every chunking parameter combination.

    For each combination the books are optionally re-chunked by the backend.
    Every selected question is then run through ``run_test_async``, and each
    returned search hit is compared with the location of the question's
    ``'Best Answer'`` passage in the expected book's chunk pickle.

    Args:
        selected_questions: DataFrame with ``'Question'``, ``'Book Title'``
            and ``'Best Answer'`` columns.
        param_combos: Sequence of ``(target_chunk_size, sentence_overlap,
            small_paragraph_length, small_paragraph_overlap)`` tuples.
        book_urls: Book URLs handed to ``process_all_books`` when the books
            are (re)processed.
        skip_book_processing: When True, no books are uploaded and a fixed
            list of filenames is used; their pickles must already exist
            under ``../temp/``.

    Returns:
        DataFrame with one row per (parameter combination, question, search
        hit). Values that cannot be computed are stored as the string
        ``"NaN"``.
    """
    import datetime

    run_id = str(uuid.uuid4())[:8]

    def _log(message):
        # Stamp each line when it is printed so the log shows real progress
        # instead of repeating the time the test started.
        stamp = datetime.datetime.now().strftime("%H:%M:%S.%f")[:-3]
        print(f"[{stamp}] [{run_id}] {message}")

    all_results = []

    async with aiohttp.ClientSession() as session:
        for test_num, params in enumerate(param_combos, 1):
            target_chunk_size, sentence_overlap, small_paragraph_length, small_paragraph_overlap = params

            _log(f"TEST {test_num}: chunk_size={target_chunk_size}")

            if not skip_book_processing:
                start_time = datetime.datetime.now()
                filenames = await process_all_books(
                    book_urls=book_urls,
                    target_chunk_size=target_chunk_size,
                    sentence_overlap=sentence_overlap,
                    small_paragraph_length=small_paragraph_length,
                    small_paragraph_overlap=small_paragraph_overlap
                )
                elapsed = (datetime.datetime.now() - start_time).total_seconds()
                _log(f"Completed book processing for TEST {test_num}, "
                     f"it took {elapsed:.2f} seconds")
            else:
                print(
                    "Skipping book processing. Assumption is dfs are already processed correctly and available in the directory."
                )
                # fallback to hardcoded filenames
                filenames = [
                    "the_wonderful_wizard_of_oz",
                    "the_marvelous_land_of_oz",
                    "ozma_of_oz",
                    "dorothy_and_the_wizard_in_oz",
                    "the_road_to_oz",
                    "the_emerald_city_of_oz",
                    "the_patchwork_girl_of_oz",
                    "jack_pumpkinhead_of_oz",
                ]

            # Chunk-length statistics depend only on this combination's
            # pickles, so load them once here instead of once per question.
            chunk_lengths = {}
            avg_chunk_length = {}
            for filename in filenames:
                temp_df = pd.read_pickle(f"../temp/{filename}.pkl")
                chunk_lengths[filename] = temp_df.chunk_length.tolist()
                avg_chunk_length[filename] = temp_df.chunk_length.mean()

            _log(f"Starting tests per query for TEST {test_num}")

            for question_number, (_, question_row) in enumerate(selected_questions.iterrows(), 1):

                question = question_row['Question']
                _log(f"  Question {question_number}: {question}")

                search_results, _ = await run_test_async(
                    session,
                    test_query=question,
                    filenames=filenames
                )

                # Locate the expected passage in the expected book's chunks.
                expected_filename = question_row['Book Title'].lower().replace(' ', '_').strip()
                expected_book_filepath = f"../temp/{expected_filename}.pkl"

                if os.path.exists(expected_book_filepath):
                    (
                        expected_chunk_index,
                        expected_chapter_number,
                        expected_chapter_title,
                        expected_chunk_in_chapter_index,
                        all_expected_chunks
                    ) = find_chunk_location_from_text(
                        expected_book_filepath, question_row['Best Answer']
                    )
                else:
                    # Same five-field shape as a "not found" result, so the
                    # code below handles a missing pickle without crashing.
                    expected_chunk_index = None
                    expected_chapter_number = None
                    expected_chapter_title = None
                    expected_chunk_in_chapter_index = None
                    all_expected_chunks = []

                expected_avg_length = avg_chunk_length.get(expected_filename)

                for result_num, result in enumerate(search_results["search_results"], 1):
                    hit = result['data']

                    matched_hits = [m.get('text', '') for m in hit['matched_texts'] if m.get('is_match') is True]
                    match_text = " ".join(matched_hits) if matched_hits else ""

                    book_match = hit['book_title'].lower().strip() == question_row['Book Title'].lower().strip()
                    text_match = question_row['Best Answer'] in match_text

                    # chunk distances (only meaningful within the expected book)
                    if expected_chunk_index is not None and book_match:
                        chunk_distance_from_expected = abs(hit['chunk_index'] - expected_chunk_index)
                    else:
                        chunk_distance_from_expected = "NaN"

                    if chunk_distance_from_expected != "NaN" and expected_avg_length is not None:
                        character_distance_from_expected = round(chunk_distance_from_expected * expected_avg_length)
                    else:
                        character_distance_from_expected = "NaN"

                    # chapter distances
                    chapter_match = (hit['chapter_number'] == expected_chapter_number) if book_match else "NaN"

                    if expected_chapter_number is not None and book_match:
                        chapter_distance_from_expected = abs(hit['chapter_number'] - expected_chapter_number)
                    else:
                        chapter_distance_from_expected = "NaN"

                    if expected_chunk_in_chapter_index is not None and chapter_match is True:
                        chunk_in_chap_distance_from_expected = abs(
                            hit['chunk_in_chapter_index'] - expected_chunk_in_chapter_index
                        )
                    else:
                        chunk_in_chap_distance_from_expected = "NaN"

                    result_row = {
                        'test_number': test_num,
                        'question_number': question_number,
                        'result_rank': result_num,
                        'original_query': question,
                        'enhanced_query': hit['query'],
                        'target_chunk_size': target_chunk_size,
                        'sentence_overlap': sentence_overlap,
                        'small_paragraph_length': small_paragraph_length,
                        'small_paragraph_overlap': small_paragraph_overlap,
                        'matched_chapter_title': hit['chapter_title'],
                        'expected_chapter_title': expected_chapter_title,
                        'matched_chapter_number': hit['chapter_number'],
                        'expected_chapter_number': expected_chapter_number,
                        'correct_chapter_found': chapter_match,
                        'chapter_distance_from_expected': chapter_distance_from_expected,
                        'matched_chunk_in_chapter_index': hit['chunk_in_chapter_index'],
                        # Misspelled key kept as-is so existing consumers of this column keep working.
                        'exptected_chunk_in_chapter_index': expected_chunk_in_chapter_index,
                        'chunk_in_chap_distance_from_expected': chunk_in_chap_distance_from_expected,
                        'score': hit['score'],
                        'matched_text': match_text,
                        'expected_text': question_row['Best Answer'],
                        'correct_text_found': text_match,
                        'matched_chunk_index': hit['chunk_index'],
                        # `is not None` rather than truthiness: chunk index 0 is a valid location.
                        'expected_primary_chunk_index': expected_chunk_index if expected_chunk_index is not None else "NaN",
                        'expected_all_chunk_indices': all_expected_chunks if all_expected_chunks else "NaN",
                        'chunk_distance_from_expected': chunk_distance_from_expected,
                        'char_distance_from_expected': character_distance_from_expected,
                        'matched_book_title': hit['book_title'],
                        'expected_book_title': question_row['Book Title'],
                        'correct_book_found': book_match,
                        'avg_chunk_length': avg_chunk_length,
                        'all_chunks': chunk_lengths,
                    }
                    all_results.append(result_row)

    results_df = pd.DataFrame(all_results)
    print(f"\n{'='*80}")
    _log(f"COMPLETED {len(param_combos)} TESTS")
    _log(f"Total results collected: {len(results_df)}")
    print(f"{'='*80}")
    return results_df

In [152]:
# Evaluate only the first parameter combination and skip book processing, reusing the chunk files already on disk.
results_df = await run_all_tests(
    selected_questions=selected_questions, param_combos=PARAM_COMBOS[0:1], book_urls=BOOK_URLS, skip_book_processing=True
  )

[00:34:32.711] [ba25585d] TEST 1: chunk_size=800
Skipping book processing. Assumption is dfs are already processed correctly and available in the directory.
[00:34:32.711] [ba25585d] Starting tests per query for TEST 1
[00:34:32.711] [ba25585d]   Question 1: How is Glinda's appearance described when Dorothy meets her?


Looking for 43 words from expected text
SUCCESS: Found text spanning 1 chunk(s): [247]
[00:34:32.711] [ba25585d]   Question 2: How old is the Scarecrow when Dorothy finds him?
Looking for 81 words from expected text
SUCCESS: Found text spanning 1 chunk(s): [38]
[00:34:32.711] [ba25585d]   Question 3: How does one use the Powder of Life?
Looking for 133 words from expected text
SUCCESS: Found text spanning 2 chunk(s): [11, 12]
[00:34:32.711] [ba25585d]   Question 4: When is Wogglebug introduced?
Looking for 62 words from expected text
SUCCESS: Found text spanning 1 chunk(s): [141]
[00:34:32.711] [ba25585d]   Question 5: What happens to the Scarecrow and the Sawhorse that they get hurt?
Looking for 96 words from expected text
SUCCESS: Found text spanning 1 chunk(s): [150]
[00:34:32.711] [ba25585d]   Question 6: When do Dorothy and Ozma meet?
Looking for 104 words from expected text
SUCCESS: Found text spanning 1 chunk(s): [100]
[00:34:32.711] [ba25585d]   Question 7: Why doesn't Eureka i

In [153]:
# Preview the first result rows.
results_df.head()

Unnamed: 0,test_number,question_number,result_rank,original_query,enhanced_query,target_chunk_size,sentence_overlap,small_paragraph_length,small_paragraph_overlap,matched_chapter_title,...,matched_chunk_index,expected_primary_chunk_index,expected_all_chunk_indices,chunk_distance_from_expected,char_distance_from_expected,matched_book_title,expected_book_title,correct_book_found,avg_chunk_length,all_chunks
0,1,1,1,How is Glinda's appearance described when Doro...,description of Glinda's appearance when Doroth...,800,2,200,2,Glinda The Good Witch Grants Dorothy’s Wish,...,247,247,[247],0.0,0.0,The Wonderful Wizard of Oz,The Wonderful Wizard of Oz,True,{'the_wonderful_wizard_of_oz': 894.97286821705...,"{'the_wonderful_wizard_of_oz': [883, 321, 873,..."
1,1,1,2,How is Glinda's appearance described when Doro...,description of Glinda's appearance when Doroth...,800,2,200,2,Princess Ozma Of Oz,...,297,247,[247],,,The Marvelous Land of Oz,The Wonderful Wizard of Oz,False,{'the_wonderful_wizard_of_oz': 894.97286821705...,"{'the_wonderful_wizard_of_oz': [883, 321, 873,..."
2,1,1,3,How is Glinda's appearance described when Doro...,description of Glinda's appearance when Doroth...,800,2,200,2,How Glinda Worked A Magic Spell,...,378,247,[247],,,The Emerald City of Oz,The Wonderful Wizard of Oz,False,{'the_wonderful_wizard_of_oz': 894.97286821705...,"{'the_wonderful_wizard_of_oz': [883, 321, 873,..."
3,1,2,1,How old is the Scarecrow when Dorothy finds him?,the Scarecrow's age or description when Doroth...,800,2,200,2,How Dorothy Saved The Scarecrow,...,28,38,[38],10.0,8950.0,The Wonderful Wizard of Oz,The Wonderful Wizard of Oz,True,{'the_wonderful_wizard_of_oz': 894.97286821705...,"{'the_wonderful_wizard_of_oz': [883, 321, 873,..."
4,1,2,2,How old is the Scarecrow when Dorothy finds him?,the Scarecrow's age or description when Doroth...,800,2,200,2,"The Discovery Of Oz, The Terrible",...,185,38,[38],147.0,131561.0,The Wonderful Wizard of Oz,The Wonderful Wizard of Oz,True,{'the_wonderful_wizard_of_oz': 894.97286821705...,"{'the_wonderful_wizard_of_oz': [883, 321, 873,..."


In [176]:
# Count the distinct questions that have at least one search hit within 2 chunks of the
# expected primary chunk, then list those hits. The isinstance check skips rows where the
# distance is the "NaN" string placeholder.
print("Number of results where at least one result is within 2 chunks of the expected answer:", results_df[
    results_df['chunk_distance_from_expected'].apply(lambda x: isinstance(x, (int, float)) and x <= 2)
]['question_number'].nunique())
results_df[
    results_df['chunk_distance_from_expected'].apply(lambda x: isinstance(x, (int, float)) and x <= 2)
][['original_query', 'result_rank', 'matched_chunk_index', 'expected_primary_chunk_index', 'expected_all_chunk_indices', 'chunk_distance_from_expected']]

Number of results where at least one result is within 2 chunks of the expected answer: 10


Unnamed: 0,original_query,result_rank,matched_chunk_index,expected_primary_chunk_index,expected_all_chunk_indices,chunk_distance_from_expected
0,How is Glinda's appearance described when Doro...,1,247,247,[247],0
6,How does one use the Powder of Life?,1,11,11,"[11, 12]",0
8,How does one use the Powder of Life?,3,10,11,"[11, 12]",1
12,What happens to the Scarecrow and the Sawhorse...,1,150,150,[150],0
19,Why doesn't Eureka immediately tell Ozma where...,2,277,279,[279],2
21,How does the Wizard demonstrate his magic to t...,1,77,76,"[76, 77]",1
22,How does the Wizard demonstrate his magic to t...,2,76,76,"[76, 77]",0
24,What is the function of Santa Claus in The Roa...,1,237,239,"[239, 240]",2
25,What is the function of Santa Claus in The Roa...,2,238,239,"[239, 240]",1
40,What is the Patchwork Girl's name?,2,54,54,[54],0


In [177]:
# Count the distinct questions that have at least one search hit within 1 chapter of the
# expected chapter, then list those hits. The isinstance check skips rows where the
# distance is the "NaN" string placeholder.
print("Number of results where at least one result is within 1 chapter of the expected answer:", results_df[
    results_df['chapter_distance_from_expected'].apply(lambda x: isinstance(x, (int, float)) and x <= 1)
]['question_number'].nunique())
results_df[
    results_df['chapter_distance_from_expected'].apply(lambda x: isinstance(x, (int, float)) and x <= 1)
][['original_query', 'result_rank', 'matched_chunk_index', 'expected_primary_chunk_index', 'expected_all_chunk_indices', 'chapter_distance_from_expected']]

Number of results where at least one result is within 1 chapter of the expected answer: 13


Unnamed: 0,original_query,result_rank,matched_chunk_index,expected_primary_chunk_index,expected_all_chunk_indices,chapter_distance_from_expected
0,How is Glinda's appearance described when Doro...,1,247,247,[247],0
3,How old is the Scarecrow when Dorothy finds him?,1,28,38,[38],1
6,How does one use the Powder of Life?,1,11,11,"[11, 12]",0
8,How does one use the Powder of Life?,3,10,11,"[11, 12]",0
9,When is Wogglebug introduced?,1,144,141,[141],0
10,When is Wogglebug introduced?,2,150,141,[141],1
11,When is Wogglebug introduced?,3,147,141,[141],1
12,What happens to the Scarecrow and the Sawhorse...,1,150,150,[150],0
18,Why doesn't Eureka immediately tell Ozma where...,1,276,279,[279],0
19,Why doesn't Eureka immediately tell Ozma where...,2,277,279,[279],0


In [157]:
# Count how many questions had at least one result landing exactly on the expected
# primary chunk (chunk distance of 0); non-numeric "NaN" placeholders are ignored.
correct_questions = results_df[
    results_df['chunk_distance_from_expected'].apply(lambda x: isinstance(x, (int, float)) and x <= 0)
]['question_number'].nunique()

# Denominator: every question that produced at least one result row.
total_questions = results_df['question_number'].nunique()

print(f"Questions with correct answer in top 3: {correct_questions}/{total_questions}")
print(f"Accuracy: {correct_questions/total_questions*100:.1f}%")
# results_df[["original_query", "matched_book_title", "matched_chapter_title", "expected_chapter_title", "matched_chapter_number", "expected_chapter_number", "correct_chapter_found", "chapter_distance_from_expected", "matched_chunk_in_chapter_index", "exptected_chunk_in_chapter_index", "chunk_in_chap_distance_from_expected"]]

Questions with correct answer in top 3: 7/20
Accuracy: 35.0%


There are two main parameters we aim to adjust for:
1. Semantic similarity of chunks
2. Chunk size consistency

Aiming for semantic similarity lets each chunk keep its contextual information, but the chunk sizes can then vary drastically. Aiming for chunk size consistency reduces bias towards longer chunks: very long chunks dominate rankings simply because they contain more tokens that may match your query. Additionally, a chunk that is too small gives a poor semantic representation, while a chunk that is too large adds noise and can dilute the signal. Lastly, more consistent chunks mean better embedding and search performance, since the resources required are more predictable chunk by chunk.

Therefore we can take advantage of book structure but also tune for more consistent chunks...

In [126]:
# Write the results to results.csv, leaving out the two columns that hold per-book dictionaries.
cols_to_exclude = ['all_chunks', 'avg_chunk_length']
cols_to_keep = [col for col in results_df.columns if col not in cols_to_exclude]
results_df[cols_to_keep].to_csv("results.csv", index=False)