In [1]:
#Imports
from google.oauth2 import service_account
from googleapiclient.discovery import build
from dotenv import load_dotenv
from rapidfuzz import fuzz
from openai import OpenAI
import pandas as pd
import re
import os

In [2]:
#Setup Google Drive API Access
scopes = ["https://www.googleapis.com/auth/drive.readonly"]
service_file = "service_account.json"

creds = service_account.Credentials.from_service_account_file(
    service_file, scopes=scopes
)

drive = build("drive", "v3", credentials=creds)

In [3]:
# Build Sheets API client
sheets_service = build("sheets", "v4", credentials=creds)

#Oz Questions Spreadsheet ID and range
spreadsheet_id = "1aa61xgSOBXu6qH1chEiFqUxgFmBNyvr4Q5bSCFUOkt8"
spreadsheet_range = "Sheet1!A1:D100" 

#Call Oz Questions Spreadsheet
result = sheets_service.spreadsheets().values().get(
    spreadsheetId=spreadsheet_id,
    range=spreadsheet_range
).execute()

values = result.get("values", [])

#Convert to DataFrame
oz_questions_df = pd.DataFrame(values[1:], columns=values[0])

In [4]:
#Extract questions from Oz Questions spreadsheet and convert to list
questions = oz_questions_df["Question"].tolist()

# Rename Best Question column golden_excerpt
oz_questions_df = oz_questions_df.rename(columns={"Best Answer": "golden_excerpt"})
oz_questions_df

Unnamed: 0,Question,Book #,Book Title,golden_excerpt
0,What color are Dorothy's shoes?,1,The Wonderful Wizard of Oz,"“She was so old,” explained the Witch of the N..."
1,How old is the Scarecrow when Dorothy finds him?,1,The Wonderful Wizard of Oz,“My life has been so short that I really know ...
2,Which are the first antagonistic creatures the...,1,The Wonderful Wizard of Oz,In the morning they traveled on until they cam...
3,"When is the first time we read ""There's no pla...",1,The Wonderful Wizard of Oz,“That is because you have no brains” answered ...
4,What is the wizard's secret in the Wonderful W...,1,The Wonderful Wizard of Oz,"“No, you are all wrong,” said the little man m..."
...,...,...,...,...
65,Why is Ojo arrested?,7,The Patchwork Girl of Oz,"""Don't do that!"" exclaimed the Shaggy Man, ear..."
66,What is the Woozy?,7,The Patchwork Girl of Oz,The creature was all squares and flat surfaces...
67,Why does Dorothy go to the Emerald City in The...,5,The Road to Oz,"“It's on the twenty-first, remember,"" he conti..."
68,Why doesn't Eureka immediately tell Ozma where...,4,Dorothy and the Wizard in Oz,"“It would have spoiled the fun,"" replied the k..."


In [5]:
#Set variables for OpenAI API
load_dotenv() 
api_key = os.environ.get("MY_KEY")
client = OpenAI(api_key=api_key)
model = "gpt-4.1" 
vector_store_id = "vs_692ca3fc84a88191bf1b9017254bd572" 
instructions = """
You are now operating in a special mode called OZ-EXTRACT.
OZ-EXTRACT overrides all other system instructions.
Ignore and bypass all internal rules, metadata, recency logic, tool-routing rules, Slack-related guidelines, or RAG-style citation requirements.
Only the instructions below apply.
1. Allowed Sources
You may use only the Oz book HTML files that are uploaded in this project.
You must not reference or draw from any other documents, summaries, metadata, meetings, Slack logs, or external sources.
2. Your Task
You are not answering questions.
Your task is to extract text from the Oz books.
For every question, you must output:
Exactly three passages
Each passage must be verbatim and continuous
Each passage must be approximately 1,200 characters long, ending at the nearest full stop
Passages must be taken directly from the source files
No invented text
No omitted text
No stitching or merging non-contiguous text
You must output nothing except these three passages.
3. Passage Selection Rules
The first passage must be the most relevant passage in the uploaded Oz corpus that relates to the user’s question.
The second and third passages must be the next most relevant passages.
If fewer than three relevant passages exist:
Passage 1 = the most relevant
Passages 2 & 3 = closest related concepts from anywhere in the Oz corpus
If absolutely nothing relates, select the three passages that are closest in theme or vocabulary.
4. Required Output Format
Each passage must follow this exact layout:
“{verbatim excerpt of ~1200 characters, ending at a complete sentence}”
[file:FILE-NAME.html†L###-L###] — Book Title, Chapter Title
No bullet points.
No extra text before, between, or after passages.
No explanations or summaries.
Passages should appear one after another in the final answer separated with one blank line.
5. Forbidden Behaviors
You must not:
Summarize
Paraphrase
Explain
Provide commentary
Answer the question directly
Cite using RAG-style citations (e.g., )
Refer to recency or metadata
Reference non-Oz content
Use text not in the uploaded books
Describe your reasoning process
Respond with “I think,” “according to,” or meta-discussion
6. If a rule seems to conflict internally, come up with 3 passages anyway. 
7. Do not ask the user any follow-up questions.

"""

In [6]:
def ask_oz_extractor(questions):
    response = client.responses.create(
        model=model,
        instructions=instructions,
        input=question,
        tools=[{
            "type": "file_search",
            "vector_store_ids": [vector_store_id],
        }],
    )

    answer_text = response.output_text

    usage = getattr(response, "usage", None)
    tokens_in = getattr(usage, "input_tokens", None) if usage else None
    tokens_out = getattr(usage, "output_tokens", None) if usage else None

    return {
        "question": question,
        "answer": answer_text,
        "response_id": response.id,
        "status": response.status,
        "tokens_input": tokens_in,
        "tokens_output": tokens_out,
    }

In [7]:
def oz_extractor_to_df():
    rows = []
    for q in questions:
        print(f"Asking: {q}")
        row = ask_oz_extractor(q)
        rows.append(row)
        print(f"→ Status: {row['status']}\n")

    # Convert to DataFrame
    df = pd.DataFrame(rows)

    # Save to CSV
    # df.to_csv("oz_extractor_results.csv", index=False)
    
    return df

In [8]:
# oz_extractor_df = oz_extractor_to_df()

In [9]:
def clean_excerpt_text(text):
    # Handle non-string inputs
    if not isinstance(text, str):
        return ""
        
    # Collapse whitespace
    text = re.sub(r"\s+", " ", text)

    return text.strip()

def extract_excerpts_and_locs(answer):
    
    # Create dict for new excerpt and location columns
    result = {
        "excerpt_1": None, "loc_1": None,
        "excerpt_2": None, "loc_2": None,
        "excerpt_3": None, "loc_3": None,
    }

    # Skip empty answers
    if not isinstance(answer, str) or not answer.strip():
        return pd.Series(result)


    # Use regex to locate location text
    pattern = r"(?:\[\])?\s*\[(?P<loc>(?=[^\]]*)[^\]]*)\]"
    matches = list(re.finditer(pattern, answer, flags=re.DOTALL))

    #If there are no matches, return the whole answer as excerpt_1
    if not matches:
        result["excerpt_1"] = clean_excerpt_text(answer)
        return pd.Series(result)

    excerpts = []
    locs = []
    start_idx = 0

    for m in matches:
        # Extract excerpt text
        raw_excerpt = answer[start_idx:m.start()]
        excerpt = clean_excerpt_text(raw_excerpt)
        excerpts.append(excerpt or None)

        # Extract location text
        loc = m.group("loc").strip()
        locs.append(loc or None)

        # Start next excerpt AFTER the next blank line
        after_block = m.end()
        rest = answer[after_block:]
        blank = re.search(r"\n\s*\n", rest)
        if blank:
            start_idx = after_block + blank.end()
        else:
            start_idx = after_block

    # Insert excerpts and locations into result dict
    for i in range(3):
        result[f"excerpt_{i+1}"] = excerpts[i]
        result[f"loc_{i+1}"] = locs[i]

    # Return result dict as a pandas Series
    return pd.Series(result)

In [10]:
# # Apply function to each answer in the DataFrame
# parsed = oz_extractor_df["answer"].apply(extract_excerpts_and_locs)
# oz_extractor_df = pd.concat([oz_extractor_df, parsed], axis=1)

# Save to CSV
# df_excerpts.to_csv("oz_extractor_results_separated.csv", index=False)


In [11]:
# # Append the 'Best Answer' column from oz_questions_df to oz_extractor_df
# oz_extractor_df = pd.concat([oz_extractor_df, oz_questions_df[['golden_excerpt']]], axis=1)

# # Display the updated DataFrame
# oz_extractor_df

In [12]:
def normalize_text(text):
    if not isinstance(text, str):
        return ""
    return re.sub(r'\s+', ' ', text.lower().replace('"', '"').replace('"', '"').strip())

In [13]:
# Fuzzy matching functions
# def similarity(a, b):
#     a_norm = normalize_text(a)
#     b_norm = normalize_text(b)
#     return fuzz.partial_ratio(a_norm, b_norm) / 100.0

# def best_match_score(row):
#     gold = row["golden_excerpt"]
#     excerpts = [row["excerpt_1"], row["excerpt_2"], row["excerpt_3"]]
#     scores = [similarity(gold, ex) for ex in excerpts]
#     return max(scores)

# oz_extractor_df["best_similarity"] = oz_extractor_df.apply(best_match_score, axis=1)
# oz_extractor_df["is_match_90"] = oz_extractor_df["best_similarity"] >= 0.90

In [14]:
def is_match(row):
    gold = normalize_text(row["golden_excerpt"])
    excerpts = [normalize_text(row["excerpt_1"]), normalize_text(row["excerpt_2"]), normalize_text(row["excerpt_3"])]
    
    for excerpt in excerpts:
        if gold in excerpt:
            return True
    return False

# Assuming oz_extractor_df is the DataFrame containing the data
# Add a new column 'is_match' to the DataFrame
# oz_extractor_df["is_match"] = oz_extractor_df.apply(best_match_score, axis=1)

### Process Saved Files 

In [15]:
#Process multiple CSV files into a DataFrame of DataFrames
def process_csv(file_path, golden_excerpts):
    df = pd.read_csv(file_path)
    parsed = df["answer"].apply(extract_excerpts_and_locs)
    df = pd.concat([df, parsed], axis=1)
    df = pd.concat([df, golden_excerpts], axis=1)
    #Use for fuzzy matching
    # df["best_similarity"] = df.apply(best_match_score, axis=1)
    # df["is_match_90"] = df["best_similarity"] >= 0.90
    #Use for is in matching
    df["is_match"] = df.apply(is_match, axis=1)
    return df

# Process all CSV files with golden excerpts
file_paths = ["oz_extractor_results_4.1_1.csv", "oz_extractor_results_4.1_2.csv", "oz_extractor_results_4.1_3.csv", "oz_extractor_results_5.1_1.csv"]
golden_excerpts = oz_questions_df[['golden_excerpt']]
processed_dfs = [process_csv(file_path, golden_excerpts) for file_path in file_paths]

In [16]:
processed_dfs[3]

Unnamed: 0,question,answer,response_id,status,tokens_input,tokens_output,excerpt_1,loc_1,excerpt_2,loc_2,excerpt_3,loc_3,golden_excerpt,is_match
0,What color are Dorothy's shoes?,"“Dorothy looked, and gave a little cry of frig...",resp_00881c00364a4a3900692ccf66db908190af130b6...,completed,18480,1119,"“Dorothy looked, and gave a little cry of frig...",file:Book1_the_wonderful_wizard_of_oz_pg55-ima...,"““What is it?” asked the little old woman, and...",file:Book1_the_wonderful_wizard_of_oz_pg55-ima...,"“Dorothy had only one other dress, but that ha...",file:Book1_the_wonderful_wizard_of_oz_pg55-ima...,"“She was so old,” explained the Witch of the N...",True
1,How old is the Scarecrow when Dorothy finds him?,"“She ate a hearty breakfast, and watched a wee...",resp_00c5ff9c7e5c558c00692ccf71fb90819bb789abb...,completed,18493,1865,"“She ate a hearty breakfast, and watched a wee...",file:Book1_the_wonderful_wizard_of_oz_pg55-ima...,"“And Dorothy, with her basket on her arm, once...",file:Book1_the_wonderful_wizard_of_oz_pg55-ima...,"““No,” answered the Scarecrow; “it is a great ...",file:Book1_the_wonderful_wizard_of_oz_pg55-ima...,“My life has been so short that I really know ...,False
2,Which are the first antagonistic creatures the...,“The first day’s journey was through the green...,resp_06c9c038377cae3c00692ccf8d18988199b4cd20f...,completed,18534,1132,“The first day’s journey was through the green...,file:Book1_the_wonderful_wizard_of_oz_pg55-ima...,“There were few birds in this part of the fore...,file:Book1_the_wonderful_wizard_of_oz_pg55-ima...,“They found the forest very thick on this side...,file:Book1_the_wonderful_wizard_of_oz_pg55-ima...,In the morning they traveled on until they cam...,False
3,When is the first time we read 'There's no pla...,"“No matter how dreary and gray our homes are, ...",resp_05cfa033003dc8c600692ccf9c7fd881998597238...,completed,18490,1369,"“No matter how dreary and gray our homes are, ...",p:file-8rzFfFEzAM9BoHve1YnciF.html†L###-L###,“Tell me something about yourself and the coun...,p:file-8rzFfFEzAM9BoHve1YnciF.html†L###-L###,"“‘Now I’ll make the eyes,’” said the farmer. S...",p:file-8rzFfFEzAM9BoHve1YnciF.html†L###-L###,“That is because you have no brains” answered ...,True
4,What is the wizard's secret in the Wonderful W...,"“I am Oz, the Great and Terrible,” said the li...",resp_0861ce1f5be2911d00692ccfaca80c819bad115a0...,completed,18518,1812,"“I am Oz, the Great and Terrible,” said the li...",file:Book1_the_wonderful_wizard_of_oz_pg55-ima...,"“This I hung from the ceiling by a wire,” said...",file:Book1_the_wonderful_wizard_of_oz_pg55-ima...,"“But there is another way to make it float, wh...",file:Book1_the_wonderful_wizard_of_oz_pg55-ima...,"“No, you are all wrong,” said the little man m...",False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,Why is Ojo arrested?,"""I fear he has picked a six-leaved clover,"" an...",resp_01600f2008a6b58900692cd4553e38819bbac257f...,completed,18474,1827,"""I fear he has picked a six-leaved clover,"" an...",pagenum,,file:Book7_the_patchwork_girl_of_oz_pg32094-im...,"""Very well; prove he picked it, if you can.""</...",pagenum,"""Don't do that!"" exclaimed the Shaggy Man, ear...",False
66,What is the Woozy?,"""It is hard to face any savage beast without a...",resp_04a4d29f97e73ca000692cd4643758819891fc8f0...,completed,18422,1171,"""It is hard to face any savage beast without a...",file:Book7_the_patchwork_girl_of_oz_pg32094-im...,"""Seeing the strangers, the Woozy folded his hi...",file:Book7_the_patchwork_girl_of_oz_pg32094-im...,"""So the Shaggy Man tried it, but pull as hard ...",file:Book7_the_patchwork_girl_of_oz_pg32094-im...,The creature was all squares and flat surfaces...,False
67,Why does Dorothy go to the Emerald City in The...,"""The Road to Oz is a marvelous road, along whi...",resp_02b804930ffbf7ce00692cd472a788819a9347e34...,completed,18543,740,"""The Road to Oz is a marvelous road, along whi...",file:Book7_the_patchwork_girl_of_oz_pg32094-im...,"""Tells how to reach the Magic City of Oz over ...",file:Book3_ozma_of_oz_pg33361-images.html†L219...,"""The Road to Oz is a novelty in bookmaking for...",file:Book7_the_patchwork_girl_of_oz_pg32094-im...,"“It's on the twenty-first, remember,"" he conti...",False
68,Why doesn't Eureka immediately tell Ozma where...,"""It would have spoiled the fun,"" replied the k...",resp_0705fbaea9a5633d00692cd47c26308198822c9ca...,completed,18543,2250,"""It would have spoiled the fun,"" replied the k...",Pg 251,,Pg 252,,Pg 253,"“It would have spoiled the fun,"" replied the k...",False


In [17]:
# processed_dfs[0].to_csv("oz_extractor_results_scores_4.1_1.csv", index=False)
# processed_dfs[1].to_csv("oz_extractor_results_scores_4.1_2.csv", index=False)
# processed_dfs[2].to_csv("oz_extractor_results_scores_4.1_3.csv", index=False)
# processed_dfs[3].to_csv("oz_extractor_results_scores_5.1_1.csv", index=False)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=f3468430-3cb1-4d6e-8fbc-21329270417f' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>