In [58]:
import os
import time
import re

import uuid
import pandas as pd

filepath = 'oz_questions.csv'

In [None]:
%%capture
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive

gauth = GoogleAuth();
gauth.LoadClientConfigFile("client_secrets.json");
gauth.LocalWebserverAuth();

drive = GoogleDrive(gauth);

In [None]:


if os.path.exists(filepath):
    os.remove(filepath)
    print(f"Replacing existing file: {filepath}")
    time.sleep(1)

file = drive.CreateFile({'id': '1aa61xgSOBXu6qH1chEiFqUxgFmBNyvr4Q5bSCFUOkt8'})
file.GetContentFile(filepath, mimetype='text/csv')

In [4]:
oz_q_df = pd.read_csv(filepath)

In [5]:
oz_q_df.head()

Unnamed: 0,Question,Book #,Book Title,Best Answer,length,Note
0,What color are Dorothy's shoes?,1,The Wonderful Wizard of Oz,"“She was so old,” explained the Witch of the N...",285.0,
1,How old is the Scarecrow when Dorothy finds him?,1,The Wonderful Wizard of Oz,“My life has been so short that I really know ...,412.0,
2,Which are the first antagonistic creatures the...,1,The Wonderful Wizard of Oz,In the morning they traveled on until they cam...,1085.0,
3,"When is the first time we read ""There's no pla...",1,The Wonderful Wizard of Oz,“That is because you have no brains” answered ...,236.0,
4,What is the wizard's secret in the Wonderful W...,1,The Wonderful Wizard of Oz,"“No, you are all wrong,” said the little man m...",546.0,


In [6]:
import numpy as np

def select_questions(oz_q_df, random_seed=42,
                     questions_per_book=2, additional_random_questions=4):
    # Set random seed for reproducibility
    np.random.seed(random_seed)

    # Get unique books
    unique_books = oz_q_df['Book #'].unique()

    # Select 2 questions from each book
    selected_per_book = []
    for book_num in unique_books:
        book_questions = oz_q_df[oz_q_df['Book #'] == book_num]
        sample = book_questions.sample(n=min(questions_per_book, len(book_questions)), random_state=random_seed)
        selected_per_book.append(sample)

    selected_from_books = pd.concat(selected_per_book)

    # Select 4 additional random questions from remaining questions
    remaining_questions = oz_q_df[~oz_q_df.index.isin(selected_from_books.index)]
    additional_questions = remaining_questions.sample(n=min(additional_random_questions, len(remaining_questions)), random_state=random_seed)

    # Combine all selected questions
    final_selection = pd.concat([selected_from_books, additional_questions])

    print(f"Selected {len(selected_from_books)} questions from books ({questions_per_book} per book)")
    print(f"Selected {len(additional_questions)} additional random questions")
    print(f"Total: {len(final_selection)} questions\n")

    return final_selection

selected_questions = select_questions(oz_q_df)

Selected 16 questions from books (2 per book)
Selected 4 additional random questions
Total: 20 questions



In [7]:
# API PARAMETERS
BASE_URL = "http://localhost:8080"
TEST_URL = "https://www.gutenberg.org/cache/epub/55/pg55-images.html"

#### IMPORTANT
Start up your backend locally for testing
> run docker compose up --build

In [33]:
import aiohttp
import asyncio

# List of book URLs
BOOK_URLS = [
    "https://www.gutenberg.org/cache/epub/55/pg55-images.html",       # The Wonderful Wizard of Oz
    "https://www.gutenberg.org/cache/epub/54/pg54-images.html",       # Marvelous Land of Oz
    "https://www.gutenberg.org/cache/epub/33361/pg33361-images.html", # Ozma of Oz
    "https://www.gutenberg.org/cache/epub/22566/pg22566-images.html", # Dorothy and the Wizard in Oz
    "https://www.gutenberg.org/cache/epub/26624/pg26624-images.html", # The Road to Oz
    "https://www.gutenberg.org/cache/epub/41667/pg41667-images.html", # The Emerald City of Oz
    "https://www.gutenberg.org/cache/epub/32094/pg32094-images.html", # The Patchwork Girl of Oz
    "https://www.gutenberg.org/cache/epub/75720/pg75720-images.html", # Jack Pumpkinhead of Oz
]

async def process_book(session, url, target_chunk_size=800, sentence_overlap=2,
                       small_paragraph_length=200, small_paragraph_overlap=2):
    """Process a single book and return its filename"""
    book_data_payload = {
        "url": url,
        "target_chunk_size": target_chunk_size,
        "sentence_overlap": sentence_overlap,
        "small_paragraph_length": small_paragraph_length,
        "small_paragraph_overlap": small_paragraph_overlap
    }

    async with session.post(f"{BASE_URL}/v1/book-data", json=book_data_payload) as response:
        result = await response.json()
        if result.get("status") == "error":
            raise Exception(f"Error uploading {url}: {result['message']}")
        return result.get("filename")


async def process_all_books(book_urls, **chunking_params):
    """
    Process all books in parallel and wait for all to complete

    This mimics Promise.all in JavaScript (as done in the frontend)
    """
    async with aiohttp.ClientSession() as session:
        # Create tasks for all uploads
        tasks = [
            process_book(session, url, **chunking_params)
            for url in book_urls
        ]

        filenames = []
        # Process tasks as they complete
        for i, coroutine in enumerate(asyncio.as_completed(tasks), 1):
            filename = await coroutine
            print(f"[{i}/{len(tasks)}] Completed chunking and embedding: {filename}")
            filenames.append(filename)

        print(f"Successfully processed {len(filenames)} books")
        return filenames


async def run_test_async(
    session,
    test_query=None,
    filenames=None,
    book_urls=None,
    target_chunk_size=None,
    sentence_overlap=None,
    small_paragraph_length=None,
    small_paragraph_overlap=None,
    skip_book_upload=True
):

    if not test_query:
        raise ValueError("test_query must be provided")

    if not skip_book_upload:
        if any(param is None for param in [
            target_chunk_size, sentence_overlap,
            small_paragraph_length, small_paragraph_overlap,
            book_urls
        ]):
            raise ValueError("Chunking parameters must be provided when skip_book_upload is False")

        filenames = await process_all_books(
            book_urls=book_urls,
            target_chunk_size=target_chunk_size,
            sentence_overlap=sentence_overlap,
            small_paragraph_length=small_paragraph_length,
            small_paragraph_overlap=small_paragraph_overlap
        )
    elif not filenames:
        raise ValueError("filenames must be provided if skip_book_upload is True")

    query_id = str(uuid.uuid4())
    model_payload = {"user_query": test_query}

    async with session.post(f"{BASE_URL}/v1/model-response", json=model_payload) as response:
        result = await response.json()
        if result.get("status") == "error":
            raise Exception(f"Error in model response: {result['message']}")

    search_payload = {
        "query": result["search_query"],
        "filenames": filenames,
        "top_k": 3,
        "query_id": query_id,
        "enhanced_query": True
    }

    async with session.post(f"{BASE_URL}/v1/search-response", json=search_payload) as response:
        search_results = await response.json()
        if search_results.get("status") == "error":
            raise Exception(f"Error in search response: {search_results['message']}")
        return search_results, filenames

In [None]:
# Define parameter combinations to test
PARAM_COMBOS = [
    # (target_chunk_size, sentence_overlap, small_paragraph_length, small_paragraph_overlap)
    (800, 2, 200, 2),   # Default
    (1000, 2, 200, 2),  # Larger chunks
    (1000, 2, 200, 3),  # Larger chunks with more paragraph overlap
    (1000, 2, 150, 3),  # Larger chunks with smaller paragraph threshold
    (600, 2, 200, 2),   # Smaller chunks
    (600, 2, 200, 3),   # Smaller chunks with more paragraph overlap
    (600, 2, 150, 3),   # Smaller chunks with smaller
    (600, 2, 250, 3),   # Smaller chunks with larger paragraph threshold
    (700, 2, 200, 2),   # Medium chunks
    (700, 2, 200, 3),   # Medium chunks with more paragraph overlap
    (700, 2, 150, 3),   # Medium chunks with smaller paragraph threshold
    (800, 3, 200, 2),   # More sentence overlap
    (800, 1, 200, 2),   # Less sentence overlap
    (800, 2, 150, 2),   # Smaller paragraph threshold
    (800, 2, 250, 2),   # Larger paragraph threshold
    (800, 2, 200, 3),   # More paragraph overlap
    (800, 2, 200, 1),   # Less paragraph overlap
]

def normalize_text(text):
    return re.sub(r'\s+', ' ', text.lower().replace('“', '"').replace('”', '"').strip())

def get_first_sentence_partial_match(text):
    return text.replace('“', '"').replace('”', '"').lower().split("\n")[0]

def find_chunk_location_from_text(filepath, expected_text, partial_match_ratio=3):
    temp_df = pd.read_pickle(filepath)

    for chunk_idx, row in temp_df.iterrows():
        normalized_expected_text = normalize_text(expected_text)
        normalized_df_text = normalize_text(row['text'])

        full_text_location_match = normalized_expected_text in normalized_df_text
        partial_text_location_match = (
            normalized_expected_text[:len(normalized_expected_text)//partial_match_ratio] in normalized_df_text
            or normalized_expected_text in normalized_df_text[-len(normalized_expected_text)//partial_match_ratio:]
        )
        first_sentence_partial_match = get_first_sentence_partial_match(expected_text) in normalized_df_text

        if partial_text_location_match and not full_text_location_match:
            print("NOTE: taking first chunk location where ~partial~ text matched.")

        if first_sentence_partial_match and not (full_text_location_match or partial_text_location_match):
            print("NOTE: taking first chunk location where ~first sentence partial~ text matched.")

        if full_text_location_match or partial_text_location_match or first_sentence_partial_match:
            chapter_number = int(row['chapter_index']) # potential to change this name
            chapter_title = str(row['title'])
            chunk_in_chapter_match = re.search(r"(.*)\((\d+)\)", chapter_title)
            if chunk_in_chapter_match and chunk_in_chapter_match.group(2).isdigit():
                chunk_in_chapter_index = int(chunk_in_chapter_match.group(2))
                chapter_title = chunk_in_chapter_match.group(1).strip()
            else:
                chunk_in_chapter_index = None
            return chunk_idx, chapter_number, chapter_title, chunk_in_chapter_index

    print("WARNING: no chunk was matched to the given text in 'BEST ANSWER'!")
    return (None, None, None, None)

async def run_all_tests(
    selected_questions,
    param_combos:list=PARAM_COMBOS,
    book_urls:list=BOOK_URLS,
    skip_book_processing:bool=False,
):
    run_id = str(uuid.uuid4())[:8]
    import datetime

    all_results = []

    async with aiohttp.ClientSession() as session:
        for test_num, params in enumerate(param_combos, 1):
            target_chunk_size, sentence_overlap, small_paragraph_length, small_paragraph_overlap = params

            timestamp = datetime.datetime.now().strftime("%H:%M:%S.%f")[:-3]
            print(f"[{timestamp}] [{run_id}] TEST {test_num}: chunk_size={target_chunk_size}")

            if not skip_book_processing:
                start_time = datetime.datetime.now()
                filenames = await process_all_books(
                    book_urls=book_urls,
                    target_chunk_size=target_chunk_size,
                    sentence_overlap=sentence_overlap,
                    small_paragraph_length=small_paragraph_length,
                    small_paragraph_overlap=small_paragraph_overlap
                )
                end_time = datetime.datetime.now()
                elapsed = (end_time - start_time).total_seconds()
                print(f"[{timestamp}] [{run_id}] Completed book processing for TEST {test_num}, "
                  f"it took {elapsed:.2f} seconds")
            else:
                print(
                    "Skipping book processing. Assumption is dfs are already processed correctly and available in the directory."
                )
                # fallback to hardcoded filenames
                filenames = [
                    "the_wonderful_wizard_of_oz",
                    "the_marvelous_land_of_oz",
                    "ozma_of_oz",
                    "dorothy_and_the_wizard_in_oz",
                    "the_road_to_oz",
                    "the_emerald_city_of_oz",
                    "the_patchwork_girl_of_oz",
                    "jack_pumpkinhead_of_oz",
                ]

            print(f"[{timestamp}] [{run_id}] Starting tests per query for TEST {test_num}")

            for question_number, (_, question_row) in enumerate(selected_questions.iterrows(), 1):

                question = question_row['Question']
                print(f"[{timestamp}] [{run_id}]   Question {question_number}: {question}")

                search_results, _ = await run_test_async(
                    session,
                    test_query=question,
                    filenames=filenames
                )

                # Process results...
                chunk_lengths = {}
                avg_chunk_length = {}
                for filename in filenames:
                    temp_df = pd.read_pickle(f"../temp/{filename}.pkl")
                    chunk_lengths.update({filename: temp_df.chunk_length.tolist()})
                    avg_chunk_length.update({filename: temp_df.chunk_length.mean()})

                # Find the expected chunk index
                expected_filename = question_row['Book Title'].lower().replace(' ', '_').strip()
                expected_book_filepath = f"../temp/{expected_filename}.pkl"

                (
                    expected_chunk_index,
                    expected_chapter_number,
                    expected_chapter_title,
                    expected_chunk_in_chapter_index
                ) = find_chunk_location_from_text(
                        expected_book_filepath, question_row['Best Answer']
                    ) if os.path.exists(expected_book_filepath) else (None, None, None, None)

                for result_num, result in enumerate(search_results["search_results"], 1):
                    matched_text = result['data']['matched_texts']
                    matched_hits = [m.get('text', '') for m in matched_text if m.get('is_match') is True]
                    match_text = " ".join(matched_hits) if matched_hits else ""

                    book_match = result['data']['book_title'].lower().strip() == question_row['Book Title'].lower().strip()
                    text_match = question_row['Best Answer'] in match_text

                    # chunk distances
                    chunk_distance_from_expected = abs(
                        result['data']['chunk_index'] - expected_chunk_index
                        ) if expected_chunk_index is not None and book_match else "NaN"
                    character_distance_from_expected = (round(chunk_distance_from_expected * avg_chunk_length[expected_filename])) if os.path.exists(expected_book_filepath) and chunk_distance_from_expected != "NaN" else "NaN"

                    # chapter distances
                    chapter_match = (result['data']['chapter_number'] == expected_chapter_number) if book_match else "NaN"
                    chapter_distance_from_expected = abs(
                        result['data']['chapter_number'] - expected_chapter_number
                        ) if expected_chapter_number is not None and book_match else "NaN"
                    chunk_in_chap_distance_from_expected = abs(
                        result['data']['chunk_in_chapter_index'] - expected_chunk_in_chapter_index
                    ) if expected_chunk_in_chapter_index is not None and chapter_match is True else "NaN"

                    result_row = {
                        'test_number': test_num,
                        'question_number': question_number,
                        'result_rank': result_num,
                        'original_query': question,
                        'enhanced_query': result['data']['query'],
                        'target_chunk_size': target_chunk_size,
                        'sentence_overlap': sentence_overlap,
                        'small_paragraph_length': small_paragraph_length,
                        'small_paragraph_overlap': small_paragraph_overlap,
                        'matched_chapter_title': result['data']['chapter_title'],
                        'expected_chapter_title': expected_chapter_title,
                        'matched_chapter_number': result['data']['chapter_number'],
                        'expected_chapter_number': expected_chapter_number,
                        'correct_chapter_found': chapter_match,
                        'chapter_distance_from_expected': chapter_distance_from_expected,
                        'matched_chunk_in_chapter_index': result['data']['chunk_in_chapter_index'],
                        'exptected_chunk_in_chapter_index': expected_chunk_in_chapter_index,
                        'chunk_in_chap_distance_from_expected': chunk_in_chap_distance_from_expected,
                        'score': result['data']['score'],
                        'matched_text': match_text,
                        'expected_text': question_row['Best Answer'],
                        'correct_text_found': text_match,
                        'matched_chunk_index': result['data']['chunk_index'],
                        'expected_chunk_index': expected_chunk_index if expected_chunk_index else "NaN",
                        'chunk_distance_from_expected': chunk_distance_from_expected,
                        'char_distance_from_expected': character_distance_from_expected,
                        'matched_book_title': result['data']['book_title'],
                        'expected_book_title': question_row['Book Title'],
                        'correct_book_found': book_match,
                        'avg_chunk_length': avg_chunk_length,
                        'all_chunks': chunk_lengths,
                    }
                    all_results.append(result_row)

    results_df = pd.DataFrame(all_results)
    timestamp = datetime.datetime.now().strftime("%H:%M:%S.%f")[:-3]
    print(f"\n{'='*80}")
    print(f"[{timestamp}] [{run_id}] COMPLETED {len(param_combos)} TESTS")
    print(f"[{timestamp}] [{run_id}] Total results collected: {len(results_df)}")
    print(f"{'='*80}")
    return results_df

In [117]:
results_df = await run_all_tests(
    selected_questions=selected_questions, param_combos=PARAM_COMBOS[0:1], book_urls=BOOK_URLS, skip_book_processing=True
  )

[00:39:48.682] [425d6589] TEST 1: chunk_size=800
Skipping book processing. Assumption is dfs are already processed correctly and available in the directory.
[00:39:48.682] [425d6589] Starting tests per query for TEST 1
[00:39:48.682] [425d6589]   Question 1: How is Glinda's appearance described when Dorothy meets her?
[00:39:48.682] [425d6589]   Question 2: How old is the Scarecrow when Dorothy finds him?
[00:39:48.682] [425d6589]   Question 3: How does one use the Powder of Life?
NOTE: taking first chunk location where ~partial~ text matched.
[00:39:48.682] [425d6589]   Question 4: When is Wogglebug introduced?
[00:39:48.682] [425d6589]   Question 5: What happens to the Scarecrow and the Sawhorse that they get hurt?
[00:39:48.682] [425d6589]   Question 6: When do Dorothy and Ozma meet?
[00:39:48.682] [425d6589]   Question 7: Why doesn't Eureka immediately tell Ozma where the missing piglet is?
[00:39:48.682] [425d6589]   Question 8: How does the Wizard demonstrate his magic to the Man

In [100]:
results_df.head()

Unnamed: 0,test_number,question_number,result_rank,original_query,enhanced_query,target_chunk_size,sentence_overlap,small_paragraph_length,small_paragraph_overlap,matched_chapter_title,...,correct_text_found,matched_chunk_index,expected_chunk_index,chunk_distance_from_expected,char_distance_from_expected,matched_book_title,expected_book_title,correct_book_found,avg_chunk_length,all_chunks
0,1,1,1,How is Glinda's appearance described when Doro...,description of Glinda's appearance when Doroth...,800,2,200,2,Glinda The Good Witch Grants Dorothy’s Wish,...,True,247,247,0.0,0.0,The Wonderful Wizard of Oz,The Wonderful Wizard of Oz,True,{'the_wonderful_wizard_of_oz': 894.97286821705...,"{'the_wonderful_wizard_of_oz': [883, 321, 873,..."
1,1,1,2,How is Glinda's appearance described when Doro...,description of Glinda's appearance when Doroth...,800,2,200,2,Princess Ozma Of Oz,...,False,297,247,,,The Marvelous Land of Oz,The Wonderful Wizard of Oz,False,{'the_wonderful_wizard_of_oz': 894.97286821705...,"{'the_wonderful_wizard_of_oz': [883, 321, 873,..."
2,1,1,3,How is Glinda's appearance described when Doro...,description of Glinda's appearance when Doroth...,800,2,200,2,How Glinda Worked A Magic Spell,...,False,378,247,,,The Emerald City of Oz,The Wonderful Wizard of Oz,False,{'the_wonderful_wizard_of_oz': 894.97286821705...,"{'the_wonderful_wizard_of_oz': [883, 321, 873,..."
3,1,2,1,How old is the Scarecrow when Dorothy finds him?,the Scarecrow's age or description when Doroth...,800,2,200,2,How Dorothy Saved The Scarecrow,...,False,28,38,10.0,8950.0,The Wonderful Wizard of Oz,The Wonderful Wizard of Oz,True,{'the_wonderful_wizard_of_oz': 894.97286821705...,"{'the_wonderful_wizard_of_oz': [883, 321, 873,..."
4,1,2,2,How old is the Scarecrow when Dorothy finds him?,the Scarecrow's age or description when Doroth...,800,2,200,2,"The Discovery Of Oz, The Terrible",...,False,185,38,147.0,131561.0,The Wonderful Wizard of Oz,The Wonderful Wizard of Oz,True,{'the_wonderful_wizard_of_oz': 894.97286821705...,"{'the_wonderful_wizard_of_oz': [883, 321, 873,..."


In [127]:
# Count how many questions had at least one result with chunk_distance <= 2
correct_questions = results_df[
    results_df['chunk_distance_from_expected'].apply(lambda x: isinstance(x, (int, float)) and x <= 2)
]['question_number'].nunique()

total_questions = results_df['question_number'].nunique()

print(f"Questions with correct answer in top 3: {correct_questions}/{total_questions}")
print(f"Accuracy: {correct_questions/total_questions*100:.1f}%")
# results_df[["original_query", "matched_book_title", "matched_chapter_title", "expected_chapter_title", "matched_chapter_number", "expected_chapter_number", "correct_chapter_found", "chapter_distance_from_expected", "matched_chunk_in_chapter_index", "exptected_chunk_in_chapter_index", "chunk_in_chap_distance_from_expected"]]

Questions with correct answer in top 3: 10/20
Accuracy: 50.0%


There are two main parameters we aim to adjust for:
1. Semantic similarity of chunks
2. Chunk size consistency

Aiming for semantic similarity has benefits of each chunk maintaining contextual information, but then the sizes of each chunk can vary drastically. Aiming for chunk size consistency is to reduce bias towards longer chunks. Very long chunks dominate rankings simply because they contain more tokens that may match your query. Additionally, too small of a chunk and you will find poor semantic representation. Too large of a chunk and you are increasing the noise and can dilute the signal. Lastly, more constitent chunks means better embedding and search performance since the resources required are more predictable chunk by chunk.

Therefore we can take advantage of book structure and but also tune for more consistent chunks...

In [126]:
cols_to_exclude = ['all_chunks', 'avg_chunk_length']
cols_to_keep = [col for col in results_df.columns if col not in cols_to_exclude]
results_df[cols_to_keep].to_csv("results.csv", index=False)