In [1]:
#Imports
from google.oauth2 import service_account
from googleapiclient.discovery import build
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from openai import OpenAI
import pandas as pd
import config
import sys
import re
import os

In [2]:
# Setup Git Repo import
# Make the backend checkout importable just long enough to load its `src` package.
# The path is absolute, so it is used directly: os.path.join() discards every
# component that precedes an absolute one, so prefixing os.getcwd() has no effect.
repo_path = '/work/readiscoverers-backend'
sys.path.insert(0, repo_path)

try:
    from src import parse_html, search, constants, __init__
finally:
    # Remove the entry by value rather than with sys.path.pop(0): if anything
    # imported above prepends to sys.path, pop(0) would drop that entry and leave
    # repo_path behind. The finally block also restores sys.path when the import fails.
    if repo_path in sys.path:
        sys.path.remove(repo_path)

'/work/readiscoverers-backend'

In [3]:
#Setup Google Drive API Access
# Only the read-only Drive scope is requested for the service account.
scopes = ["https://www.googleapis.com/auth/drive.readonly"]
# Relative path: the key file is looked up from the kernel's working directory.
service_file = "service_account.json"

# Build service-account credentials from the JSON key file.
# The same `creds` object is also passed to the Sheets client built in a later cell.
creds = service_account.Credentials.from_service_account_file(
    service_file, scopes=scopes
)

# Drive v3 API client.
drive = build("drive", "v3", credentials=creds)

In [4]:
# Build Sheets API client
sheets_service = build("sheets", "v4", credentials=creds)

#Oz Questions Spreadsheet ID and range
spreadsheet_id = "1aa61xgSOBXu6qH1chEiFqUxgFmBNyvr4Q5bSCFUOkt8"
spreadsheet_range = "Sheet1!A1:D100"

#Call Oz Questions Spreadsheet
result = sheets_service.spreadsheets().values().get(
    spreadsheetId=spreadsheet_id,
    range=spreadsheet_range
).execute()

# "values" is missing from the response when the range holds no data.
values = result.get("values", [])
if not values:
    # Fail with a message that names the sheet instead of a bare IndexError on values[0].
    raise ValueError(
        f"No data returned for range {spreadsheet_range!r} "
        f"of spreadsheet {spreadsheet_id!r}"
    )

#Convert to DataFrame
# First row is the header; the remaining rows are the question records.
header_row, *data_rows = values
# The Sheets API leaves trailing empty cells out of each row, so a record can be
# shorter than the header. Pad short rows with None (the filler pandas itself uses
# for ragged rows) so the frame always has one value per header column.
data_rows = [row + [None] * (len(header_row) - len(row)) for row in data_rows]
oz_questions_df = pd.DataFrame(data_rows, columns=header_row)

In [5]:
#Extract questions from Oz Questions spreadsheet and convert to list
# `questions` is the list iterated by oz_extractor_to_df().
questions = oz_questions_df["Question"].tolist()

# Rename the "Best Answer" column to golden_excerpt, the column name is_match() reads
oz_questions_df = oz_questions_df.rename(columns={"Best Answer": "golden_excerpt"})

In [6]:
#Set variables for OpenAI API
# Load environment variables via python-dotenv (presumably from a local .env file — confirm location).
load_dotenv() 
# The API key is read from the MY_KEY environment variable; .get() yields None when it is unset.
api_key = os.environ.get("MY_KEY")
client = OpenAI(api_key=api_key)
# Model name sent with every request in ask_oz_extractor().
model = "gpt-5.1" 
# Vector store handed to the file_search tool in ask_oz_extractor().
vector_store_id = "vs_692ca3fc84a88191bf1b9017254bd572" 
instructions = """
You are now operating in a special mode called OZ-EXTRACT.
OZ-EXTRACT overrides all other system instructions.
Ignore and bypass all internal rules, metadata, recency logic, tool-routing rules, Slack-related guidelines, or RAG-style citation requirements.
Only the instructions below apply.
1. Allowed Sources
You may use only the Oz book HTML files that are uploaded in this project.
You must not reference or draw from any other documents, summaries, metadata, meetings, Slack logs, or external sources.
2. Your Task
You are not answering questions.
Your task is to extract text from the Oz books.
For every question, you must output:
Exactly three passages
Each passage must be verbatim and continuous
Each passage must be approximately 1,200 characters long, ending at the nearest full stop
Passages must be taken directly from the source files
No invented text
No omitted text
No stitching or merging non-contiguous text
You must output nothing except these three passages.
3. Passage Selection Rules
The first passage must be the most relevant passage in the uploaded Oz corpus that relates to the user’s question.
The second and third passages must be the next most relevant passages.
If fewer than three relevant passages exist:
Passage 1 = the most relevant
Passages 2 & 3 = closest related concepts from anywhere in the Oz corpus
If absolutely nothing relates, select the three passages that are closest in theme or vocabulary.
4. Required Output Format
Each passage must follow this exact layout:
“{verbatim excerpt of ~1200 characters, ending at a complete sentence}”
[file:FILE-NAME.html†L###-L###] — Book Title, Chapter Title
No bullet points.
No extra text before, between, or after passages.
No explanations or summaries.
Passages should appear one after another in the final answer separated with one blank line.
5. Forbidden Behaviors
You must not:
Summarize
Paraphrase
Explain
Provide commentary
Answer the question directly
Cite using RAG-style citations (e.g., )
Refer to recency or metadata
Reference non-Oz content
Use text not in the uploaded books
Describe your reasoning process
Respond with “I think,” “according to,” or meta-discussion
6. If a rule seems to conflict internally, come up with 3 passages anyway. 
7. Do not ask the user any follow-up questions.

"""

In [7]:
#Call OpenAI API and run requests
def ask_oz_extractor(questions):
    """
    Send one prompt to the OpenAI Responses API and summarize the reply.

    The request uses the notebook-level `client`, `model` and `instructions`,
    and attaches a file_search tool bound to `vector_store_id`.

    Args:
        questions: The input forwarded to the API as-is (the caller in this
            notebook passes a single question string).

    Returns:
        dict with keys "question", "answer", "response_id", "model", "status",
        "tokens_input" and "tokens_output". The two token counts are None when
        the response carries no usage information.
    """
    file_search_tool = {
        "type": "file_search",
        "vector_store_ids": [vector_store_id],
    }
    response = client.responses.create(
        model=model,
        instructions=instructions,
        input=questions,
        tools=[file_search_tool],
    )

    answer_text = response.output_text

    # Usage may be absent or falsy; report None for both counts in that case.
    usage = getattr(response, "usage", None)
    if usage:
        tokens_in = getattr(usage, "input_tokens", None)
        tokens_out = getattr(usage, "output_tokens", None)
    else:
        tokens_in = tokens_out = None

    # Key order is preserved because it becomes the DataFrame column order.
    summary = {}
    summary["question"] = questions
    summary["answer"] = answer_text
    summary["response_id"] = response.id
    summary["model"] = response.model
    summary["status"] = response.status
    summary["tokens_input"] = tokens_in
    summary["tokens_output"] = tokens_out
    return summary

In [8]:
#Run all questions through the API and convert returned information to a DataFrame
def oz_extractor_to_df():
    """
    Query the API once per entry in `questions` and collect the replies.

    Each question is sent through ask_oz_extractor(); progress is printed before
    and after every call. The resulting frame gets a `book_num` column looked up
    from the "Book #" column of `oz_questions_df`, and is written to
    "oz_extractor_5_2_results.csv" (no index column) in the working directory.

    Returns:
        pandas.DataFrame with one row per question.
    """
    records = []
    for prompt in questions:
        print(f"Asking: {prompt}")
        record = ask_oz_extractor(prompt)
        records.append(record)
        print(f"→ Status: {record['status']}\n")

    # One row per API reply.
    results = pd.DataFrame(records)

    def book_for(question_text):
        # First spreadsheet row whose question text equals this one.
        same_question = oz_questions_df['Question'] == question_text
        return oz_questions_df.loc[same_question, 'Book #'].values[0]

    # Attach the book number belonging to each question.
    results['book_num'] = results['question'].apply(book_for)

    # Persist the raw results.
    results.to_csv("oz_extractor_5_2_results.csv", index=False)

    return results

In [9]:
#Run to use the API and convert returned information to a DataFrame
# oz_extractor_df = oz_extractor_to_df()

In [10]:
def clean_excerpt_text(text):
    """
    Return the plain text of an excerpt that may contain HTML.

    The text is parsed with BeautifulSoup's 'html.parser', passed through
    parse_html.remove_page_number_hyperlinks(), and then flattened to text.
    """
    document = BeautifulSoup(text, features='html.parser')
    # Return value is ignored, so the helper presumably edits `document` in place — confirm.
    parse_html.remove_page_number_hyperlinks(document)
    plain = document.get_text()
    return plain

In [11]:
# # Clean the 'answer' column 
# oz_5_1_matches_cleaned = oz_5_1_matches.copy()
# oz_5_1_matches_cleaned['answer'] = oz_5_1_matches_cleaned['answer'].apply(clean_text)

In [12]:
def extract_excerpts_and_locs(answer):
    """
    Split a model answer into up to three (excerpt, location) pairs.

    Any bracketed span "[...]" is treated as a location block. The text before
    each block (starting after the blank line that follows the previous block)
    is the excerpt and is cleaned with clean_excerpt_text().

    NOTE: a later cell in this notebook defines a function with the same name
    using a stricter location pattern; after Run All that later definition is
    the one in effect.

    Args:
        answer: Raw answer text. Non-strings and blank strings yield all-None.

    Returns:
        pandas.Series with keys excerpt_1, loc_1, excerpt_2, loc_2, excerpt_3,
        loc_3. Slots with no matching passage are None; passages beyond the
        third are ignored. If no location block is found, the whole cleaned
        answer is returned as excerpt_1.
    """
    # Create dict for new excerpt and location columns
    result = {
        "excerpt_1": None, "loc_1": None,
        "excerpt_2": None, "loc_2": None,
        "excerpt_3": None, "loc_3": None,
    }

    # Skip empty answers
    if not isinstance(answer, str) or not answer.strip():
        return pd.Series(result)

    # Use regex to locate location text.
    # The lookahead (?=[^\]]*) can match the empty string, so it never rejects
    # anything: every "[...]" in the answer counts as a location.
    pattern = r"(?:\[\])?\s*\[(?P<loc>(?=[^\]]*)[^\]]*)\]"
    matches = list(re.finditer(pattern, answer, flags=re.DOTALL))

    #If there are no matches, return the whole answer as excerpt_1
    if not matches:
        result["excerpt_1"] = clean_excerpt_text(answer)
        return pd.Series(result)

    excerpts = []
    locs = []
    start_idx = 0

    for m in matches:
        # Extract excerpt text
        raw_excerpt = answer[start_idx:m.start()]
        excerpt = clean_excerpt_text(raw_excerpt)
        excerpts.append(excerpt or None)

        # Extract location text
        loc = m.group("loc").strip()
        locs.append(loc or None)

        # Start next excerpt AFTER the next blank line, which skips the
        # "— Book Title, Chapter Title" tail that follows the location block.
        after_block = m.end()
        blank = re.search(r"\n\s*\n", answer[after_block:])
        start_idx = after_block + blank.end() if blank else after_block

    # Insert excerpts and locations into result dict.
    # zip() stops at the shorter input, so an answer with fewer than three
    # location blocks leaves the remaining slots as None instead of raising
    # IndexError; the [:3] slices drop anything past the third passage.
    for slot, (excerpt, loc) in enumerate(zip(excerpts[:3], locs[:3]), start=1):
        result[f"excerpt_{slot}"] = excerpt
        result[f"loc_{slot}"] = loc

    # Return result dict as a pandas Series
    return pd.Series(result)

In [13]:
def extract_excerpts_and_locs(answer):
    """
    Split a model answer into up to three (excerpt, location) pairs.

    A location is a bracketed span containing one of the file markers listed in
    the pattern below, optionally preceded by "[pagenum]". The text before each
    location (starting after the blank line that follows the previous one) is
    the excerpt and is cleaned with clean_excerpt_text().

    Args:
        answer: Raw answer text. Non-strings and blank strings yield all-None.

    Returns:
        pandas.Series with keys excerpt_1, loc_1, excerpt_2, loc_2, excerpt_3,
        loc_3. Unfilled slots are None and passages past the third are dropped.
        If no location is found, the whole cleaned answer becomes excerpt_1.
    """
    slots = 3
    parsed = dict.fromkeys(
        f"{kind}_{n}" for n in range(1, slots + 1) for kind in ("excerpt", "loc")
    )

    # Nothing to parse for missing or blank answers.
    if not (isinstance(answer, str) and answer.strip()):
        return pd.Series(parsed)

    # Any [file...] block, optionally preceded by [pagenum], is a location.
    loc_pattern = r"(?:\[pagenum\])?\s*\[(?P<loc>(?=[^\]]*(?:prompt://file|file:|p:file|pdf:|pile-|\.html†L))[^\]]*)\]"
    hits = list(re.finditer(loc_pattern, answer, flags=re.DOTALL))

    # Without a location, the whole answer is one cleaned excerpt.
    if not hits:
        parsed["excerpt_1"] = clean_excerpt_text(answer)
        return pd.Series(parsed)

    pairs = []
    cursor = 0
    for hit in hits:
        # Text between the cursor and this location block is the excerpt.
        body = clean_excerpt_text(answer[cursor:hit.start()])
        # Bracket contents are the location.
        where = hit.group("loc").strip()
        pairs.append((body or None, where or None))

        # Move the cursor past the block, then past the next blank line if any.
        cursor = hit.end()
        gap = re.search(r"\n\s*\n", answer[cursor:])
        if gap:
            cursor += gap.end()

    # Keep at most three pairs; slots without a pair stay None.
    for slot, (body, where) in enumerate(pairs[:slots], start=1):
        parsed[f"excerpt_{slot}"] = body
        parsed[f"loc_{slot}"] = where

    return pd.Series(parsed)

In [14]:
def normalize_text(text):
    """
    Canonicalize text so excerpts can be compared by substring.

    Removes straight and curly quote characters (single and double), lowercases,
    trims the ends, and collapses every whitespace run to one space.

    Args:
        text: Value to normalize; anything that is not a str (None, NaN, ...)
            is treated as missing.

    Returns:
        The normalized string, or "" for non-string input.
    """
    if not isinstance(text, str):
        return ""
    # Curly single quotes are stripped along with the straight one so that
    # "Dorothy’s" and "Dorothy's" normalize to the same string; otherwise a
    # typographic apostrophe on one side alone would make the comparison fail.
    text = re.sub(r'["\'“”‘’]', '', text)
    return re.sub(r'\s+', ' ', text.lower().strip())

def is_match(row):
    """
    Report whether the golden excerpt appears inside any extracted excerpt.

    All texts are compared after normalize_text().

    Args:
        row: Mapping-like row (e.g. a DataFrame row) with the keys
            "golden_excerpt", "excerpt_1", "excerpt_2" and "excerpt_3".

    Returns:
        True if the normalized golden excerpt is a substring of at least one
        normalized excerpt; False otherwise, including when the golden excerpt
        is missing or empty.
    """
    gold = normalize_text(row["golden_excerpt"])
    # The empty string is a substring of every string, so a missing golden
    # excerpt would otherwise count as a match against anything.
    if not gold:
        return False

    return any(
        gold in normalize_text(row[column])
        for column in ("excerpt_1", "excerpt_2", "excerpt_3")
    )


In [15]:
def find_matches(filepath, golden_excerpts):
    """
    Load saved API results and score each row against its golden excerpt.

    Args:
        filepath: Path of the results CSV; it must contain an "answer" column
            plus "tokens_input", "tokens_output" and "status".
        golden_excerpts: DataFrame holding the "golden_excerpt" column. It is
            joined column-wise, so rows are paired by index label.

    Returns:
        pandas.DataFrame with the CSV columns (minus the three dropped ones),
        the parsed excerpt_N / loc_N columns, golden_excerpt, and a boolean
        is_match column.
    """
    # Load the saved answers and split each one into excerpt/location columns.
    answers = pd.read_csv(filepath)
    excerpt_columns = answers["answer"].apply(extract_excerpts_and_locs)

    # Attach the parsed columns first, then the reference excerpts.
    with_excerpts = pd.concat([answers, excerpt_columns], axis=1)
    with_gold = pd.concat([with_excerpts, golden_excerpts], axis=1)

    # Columns that play no part in the evaluation.
    not_needed = ['tokens_input', 'tokens_output', 'status']
    scored = with_gold.drop(columns=not_needed)

    # Row-wise comparison of golden excerpt against the three excerpts.
    scored['is_match'] = scored.apply(is_match, axis=1)

    return scored


In [16]:
# Score the saved extractor results against the golden excerpts.
# NOTE(review): oz_extractor_to_df() writes this file name relative to the working
# directory, so this absolute path only finds it when the kernel runs in /work — confirm.
file_path = "/work/oz_extractor_5_2_results.csv"
# Single-column frame; find_matches() joins it to the results by index.
golden_excerpts = oz_questions_df[['golden_excerpt']]
oz_5_1_2_matches = find_matches(file_path, golden_excerpts)

In [17]:
# Display the scored results table (the rendered output elides the middle rows).
oz_5_1_2_matches

Unnamed: 0,question,answer,response_id,model,book_num,excerpt_1,loc_1,excerpt_2,loc_2,excerpt_3,loc_3,golden_excerpt,is_match
0,What color are Dorothy's shoes?,"“Dorothy had only one other dress, but that ha...",resp_0b67f946a9c00bdb0069339f78d830819599acbf5...,gpt-5.1-2025-11-13,1,"“Dorothy had only one other dress, but that ha...",file:Book1_the_wonderful_wizard_of_oz_pg55-ima...,"“Dorothy looked, and gave a little cry of frig...",file:Book1_the_wonderful_wizard_of_oz_pg55-ima...,“‘Your Silver Shoes will carry you over the de...,file:Book1_the_wonderful_wizard_of_oz_pg55-ima...,"“She was so old,” explained the Witch of the N...",False
1,How old is the Scarecrow when Dorothy finds him?,“When she had gone several miles she thought s...,resp_01cb19c39a99c6880069339f86b3ac819887a1feb...,gpt-5.1-2025-11-13,1,“When she had gone several miles she thought s...,file:Book1_the_wonderful_wizard_of_oz_pg55-ima...,“Dorothy reached up both arms and lifted the f...,file:Book1_the_wonderful_wizard_of_oz_pg55-ima...,"“""Why, our old friend has just moved into his ...",file:Book6_the_emerald_city_of_oz_pg41667-imag...,“My life has been so short that I really know ...,False
2,Which are the first antagonistic creatures the...,“They found the forest very thick on this side...,resp_0b6a5628e76380ee0069339f98da6081978ceb0ce...,gpt-5.1-2025-11-13,1,“They found the forest very thick on this side...,file:Book1_the_wonderful_wizard_of_oz_pg55-ima...,“They had just started to cross this queer bri...,file:Book1_the_wonderful_wizard_of_oz_pg55-ima...,“There were few birds in this part of the fore...,file:Book1_the_wonderful_wizard_of_oz_pg55-ima...,In the morning they traveled on until they cam...,False
3,"When is the first time we read ""There's no pla...",“That is because you have no brains” answered ...,resp_06541925783f064f0069339fa90cd0819bb79e640...,gpt-5.1-2025-11-13,1,“That is because you have no brains” answered ...,file:Book1_the_wonderful_wizard_of_oz_pg55-ima...,“The Crooked Magician was wrong to make the Gl...,file:Book7_the_patchwork_girl_of_oz_pg32094-im...,“Once you could see sandy desert all around Oz...,file:Book7_the_patchwork_girl_of_oz_pg32094-im...,“That is because you have no brains” answered ...,True
4,What is the wizard's secret in the Wonderful W...,"“The Tin Woodman, raising his axe, rushed towa...",resp_09cd51ac74314b500069339fb439f4819b80a3664...,gpt-5.1-2025-11-13,1,"“The Tin Woodman, raising his axe, rushed towa...",file:Book1_the_wonderful_wizard_of_oz_pg55-ima...,"““My dear friends,” said Oz, “I pray you not t...",file:Book1_the_wonderful_wizard_of_oz_pg55-ima...,"““I am—I certainly am,” answered the little ma...",file:Book1_the_wonderful_wizard_of_oz_pg55-ima...,"“No, you are all wrong,” said the little man m...",True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,Why is Ojo arrested?,"""What's the news since I\r\nleft? Anything imp...",resp_09d8d42c96c5d056006933a67d41f4819991b401c...,gpt-5.1-2025-11-13,7,"""What's the news since I\r\nleft? Anything imp...",file:Book7_the_patchwork_girl_of_oz_pg32094-im...,"""Very well; prove he picked it, if you can.""\n...",file:Book7_the_patchwork_girl_of_oz_pg32094-im...,"""Ozma ordered the boy's arrest,"" said Dorothy,...",file:Book7_the_patchwork_girl_of_oz_pg32094-im...,"""Don't do that!"" exclaimed the Shaggy Man, ear...",False
66,What is the Woozy?,"""As this is the only Woozy that has ever lived...",resp_0fcadc3150c71fa5006933a6a5a4fc819684283cd...,gpt-5.1-2025-11-13,7,"""As this is the only Woozy that has ever lived...",file:Book7_the_patchwork_girl_of_oz_pg32094-im...,"""'Because I eat up all the honey-bees which th...",file:Book7_the_patchwork_girl_of_oz_pg32094-im...,"""'Why, as for me,' observed the Woozy, who was...",file:Book7_the_patchwork_girl_of_oz_pg32094-im...,The creature was all squares and flat surfaces...,True
67,Why does Dorothy go to the Emerald City in The...,"""Par-don me,"" he said, ""but when my thoughts r...",resp_00e6f7202206acd0006933a6b9a040819b95225a3...,gpt-5.1-2025-11-13,5,"""Par-don me,"" he said, ""but when my thoughts r...",file:Book5_the_road_to_ozpg26624-images.html†L...,"""""Perhaps so; but he's busy just now because i...",file:Book5_the_road_to_ozpg26624-images.html†L...,"""""Button-Bright will lose himself soon, if he ...",file:Book5_the_road_to_ozpg26624-images.html†L...,"“It's on the twenty-first, remember,"" he conti...",False
68,Why doesn't Eureka immediately tell Ozma where...,"""There was no way to get the creature out with...",resp_03b037ba51694b1c006933a6dcae5c8195839ef04...,gpt-5.1-2025-11-13,4,"""There was no way to get the creature out with...",file:Book4_dorothy_and_the_wizard_in_ozpg22566...,"""The piglet that belonged to the Princess wore...",file:Book4_dorothy_and_the_wizard_in_ozpg22566...,"""As the Princess held the white piglet in her ...",file:Book4_dorothy_and_the_wizard_in_ozpg22566...,"“It would have spoiled the fun,"" replied the k...",True


In [18]:
# Fraction (0-1, not a percentage) of rows with is_match == True, per book_num
oz_5_1_2_matches.groupby('book_num')['is_match'].mean()

book_num
1     0.444444
2     0.500000
3     0.333333
4     0.444444
5     0.500000
6     0.444444
7     0.555556
15    0.222222
Name: is_match, dtype: float64

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=f3468430-3cb1-4d6e-8fbc-21329270417f' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>