# Part 15

# File Search Annotations

Universal code for the entire notebook

In [1]:
# Uncomment the line below to make sure you have all the packages needed
# %pip install -r requirements.txt

In [2]:
# Import necessary libraries
from openai import OpenAI  # Used for interacting with OpenAI's API
from typing_extensions import override  # Used for overriding methods in subclasses
from openai import AssistantEventHandler  # Used for handling events related to OpenAI assistants

In [3]:
# Create an instance of the OpenAI class to interact with the API.
# This assumes you have set the OPENAI_API_KEY environment variable.
client = OpenAI() 

### Special Event Handler (Doesn't Work Properly)

Ironically, the example for streaming, at the time of this writing, doesn't actually stream. You can see the post I made in the dev forum:
[Streaming Example for Assistants in Documentation Doesn’t Stream](https://community.openai.com/t/streaming-example-for-assistants-in-documentation-doesnt-stream/834370) 

In [4]:
# Event handler class that will be used to handle events related to the assistant
# This version doesn't actually stream and is supposed to according to the documentation
# I'm putting it here for reference and to show how it would be implemented later
# https://platform.openai.com/docs/assistants/tools/file-search/step-5-create-a-run-and-check-the-output
class EventHandler(AssistantEventHandler):
    @override
    def on_text_created(self, text) -> None:
        print(f"\nassistant > ", end="", flush=True)

    @override
    def on_tool_call_created(self, tool_call):
        print(f"\nassistant > {tool_call.type}\n", flush=True)

    @override
    def on_message_done(self, message) -> None:
        # print a citation to the file searched
        message_content = message.content[0].text
        annotations = message_content.annotations
        citations = []
        for index, annotation in enumerate(annotations):
            message_content.value = message_content.value.replace(
                annotation.text, f"[{index}]"
            )
            if file_citation := getattr(annotation, "file_citation", None):
                cited_file = client.files.retrieve(file_citation.file_id)
                citations.append(f"[{index}] {cited_file.filename}")

        print(message_content.value)
        print("\n".join(citations))


### Creating an Assistant with a Vector Store Already Attached

Now we will get a file reference and then create an assistant with a vector store that we will use for the rest of this notebook. It's a straightforward approach that we have seen before. 

In [5]:
drac_file = client.files.create(file=open("./artifacts/Dracula.pdf","rb"), purpose="assistants")

# Create an assistant using the client library.
try:
    assistant = client.beta.assistants.create(
        model="gpt-4o",  # Specify the model to be used.
        instructions=(
            "You are a helpful assistant that answers questions about the stories in your files. "
            "The stories are from a variety of authors. "
            "You will answer questions from the user about the stories. All you will do is answer questions about the stories in the files and provide related information. "
            "If the user asks you a question that is not related to the stories in the files, you should let them know that you can only answer questions about the stories."
        ),
        name="Quick Assistant and Vector Store at Once",  # Give the assistant a name.
        tools=[{"type": "file_search"}],  # Add the file search capability to the assistant.
        # Create a vector store and attach it to the assistant in one step.
        tool_resources={
            "file_search": {
                "vector_stores": [
                    {
                        "name": "Vector Store Auto Attached to Assistant",
                        "file_ids": [
                            drac_file.id,
                        ],
                        "metadata": {
                            "Book1": "Dracula", 
                        }
                    }
                ]
            }
        },
        metadata={  # Add metadata about the assistant's capabilities.
            "can_be_used_for_file_search": "True",
            "has_vector_store": "True",
        },
        temperature=1,  # Set the temperature for response variability.
        top_p=1,  # Set the top_p for nucleus sampling.
    )
except Exception as e:
    print(f"An error occurred while creating the assistant: {e}")
else:
    # Print the details of the created assistant to check its properties.
    print(assistant)  # Print the full assistant object.
    print("\n\n")
    print("Assistant Name: " + assistant.name)  # Print the name of the assistant.
    print("\n")
    
    # get the vector store information
    unnamed_assistant_vector_store = client.beta.vector_stores.retrieve(assistant.tool_resources.file_search.vector_store_ids[0])
    print("Vector Store Name: " + str(unnamed_assistant_vector_store.name))
    print("Vector Store Id: " + unnamed_assistant_vector_store.id)
    print("Vector Store Metadata: " + str(unnamed_assistant_vector_store.metadata))

Assistant(id='asst_7ukrvlxxPtELmNeAMrsAB1Ab', created_at=1719318801, description=None, instructions='You are a helpful assistant that answers questions about the stories in your files. The stories are from a variety of authors. You will answer questions from the user about the stories. All you will do is answer questions about the stories in the files and provide related information. If the user asks you a question that is not related to the stories in the files, you should let them know that you can only answer questions about the stories.', metadata={'can_be_used_for_file_search': 'True', 'has_vector_store': 'True'}, model='gpt-4o', name='Quick Assistant and Vector Store at Once', object='assistant', tools=[FileSearchTool(type='file_search', file_search=None)], response_format='auto', temperature=1.0, tool_resources=ToolResources(code_interpreter=None, file_search=ToolResourcesFileSearch(vector_store_ids=['vs_iZGvCjTqjIP50Jnw9tm21tsr'])), top_p=1.0)



Assistant Name: Quick Assistant

In [6]:
# Always name your vector stores
updated_vector_store = client.beta.vector_stores.update(
    vector_store_id=unnamed_assistant_vector_store.id,
    name="Dracula Vector Store",
    metadata={"Book1": "Dracula"}
)

print("Vector Store Name: " + str(updated_vector_store.name))
print("Vector Store Id: " + updated_vector_store.id)
print("Vector Store Metadata: " + str(updated_vector_store.metadata))

Vector Store Name: Dracula Vector Store
Vector Store Id: vs_iZGvCjTqjIP50Jnw9tm21tsr
Vector Store Metadata: {'Book1': 'Dracula'}


### Create a Thread and Run the Stream

Finally, we create a thread that we will use for the entire notebook and stream the output using the OLD event handler that doesn't really stream. 

In [7]:
# Create a thread and attach the file to the message
thread = client.beta.threads.create(
    messages=[
    {
    "role": "user",
    "content": "Who are all the main characters in Dracula? Cite the location they are first introduced in the book. Every character should have a separate citation.",
    }
]
)

In [8]:
# Stream the output from our assistant 
# using the event handler code from the documentation
with client.beta.threads.runs.stream(
    thread_id=thread.id,
    assistant_id=assistant.id,
    event_handler=EventHandler(),
) as stream:
    stream.until_done()


assistant > file_search


assistant > It appears that detailed searches did not yield specific excerpts about the main characters' introductions in "Dracula." Given this, I'll provide general locations within the book where these characters are introduced:

1. **Jonathan Harker**: Introduced in the first chapter as he travels to Transylvania to meet Count Dracula.
2. **Count Dracula**: Introduced in the first chapter when Jonathan Harker arrives at his castle.
3. **Mina Murray (later Mina Harker)**: Introduced through letters to her fiancé, Jonathan Harker, in the early chapters.
4. **Lucy Westenra**: Introduced early in the novel through Mina Murray’s letters and diary entries.
5. **Abraham Van Helsing**: Introduced around the midway point of the novel, after Lucy Westenra's health issues become apparent.
6. **John Seward**: Appears early in the novel as a suitor of Lucy Westenra and through his own diary entries.
7. **Arthur Holmwood**: Also introduced early in the novel as a suitor

### New Event Handler that Actually Streams

Having seen how the old event handler worked, or didn't in this case, we will create a new event handler to actually stream the output and then stream it

In [9]:
# Modified event handler that will actually stream the response from the assistant
class EventHandler(AssistantEventHandler):
    """Custom event handler for processing assistant events."""

    def __init__(self):
        super().__init__()
        self.results = []  # Initialize the results list

    @override
    def on_text_created(self, text) -> None:
        """Handle the event when text is first created."""
        # Print the created text to the console
        print("\nassistant text > ", end="", flush=True)
        # Append the created text to the results list
        self.results.append(text)

    @override
    def on_text_delta(self, delta, snapshot):
        """Handle the event when there is a text delta (partial text)."""
        # Print the delta value (partial text) to the console
        print(delta.value, end="", flush=True)
        # Append the delta value to the results list
        self.results.append(delta.value)

    def on_tool_call_created(self, tool_call):
        """Handle the event when a tool call is created."""
        # Print the type of the tool call to the console
        print(f"\nassistant tool > {tool_call.type}\n", flush=True)

    def on_tool_call_delta(self, delta, snapshot):
        """Handle the event when there is a delta (update) in a tool call."""
        if delta.type == 'code_interpreter':
            # Check if there is an input in the code interpreter delta
            if delta.code_interpreter.input:
                # Print the input to the console
                print(delta.code_interpreter.input, end="", flush=True)
                # Append the input to the results list
                self.results.append(delta.code_interpreter.input)
            # Check if there are outputs in the code interpreter delta
            if delta.code_interpreter.outputs:
                # Print a label for outputs to the console
                print("\n\noutput >", flush=True)
                # Iterate over each output and handle logs specifically
                for output in delta.code_interpreter.outputs or []:
                    if output.type == "logs":
                        # Print the logs to the console
                        print(f"\n{output.logs}", flush=True)
                        # Append the logs to the results list
                        self.results.append(output.logs)

In [10]:
# Stream the output from our assistant
with client.beta.threads.runs.stream(
    thread_id=thread.id,
    assistant_id=assistant.id,
    event_handler=EventHandler(),
) as stream:
    stream.until_done()


assistant tool > file_search


assistant text > Based on the search results, here are the main characters in "Dracula" along with the locations where they are first introduced:

1. **Jonathan Harker**: Introduced in the first chapter as he travels to Transylvania to meet Count Dracula.
   - Citation: 【12:14†Dracula】

2. **Count Dracula**: Introduced when Jonathan Harker arrives at his castle.
   - Citation: 【12:14†Dracula】

3. **Mina Murray (later Mina Harker)**: Introduced through a letter concerning Jonathan's condition.
   - Citation: 【12:17†Dracula】

4. **Lucy Westenra**: Mentioned early in the text when discussing letters and diary entries.
   - Citation: 【12:17†Dracula】

5. **Abraham Van Helsing**: He is first introduced through discussions and letters as a scholar and doctor who is called upon to help Lucy.
   - Citation: 【12:16†Dracula】

6. **John Seward**: Introduced as he describes his interactions with Renfield.
   - Citation: 【12:2†Dracula】

7. **Arthur Holmwood**: Introdu

### Cleaning Up the Annotations
You can see we get the list of characters and an indicator of where they are first introduced so that is a good thing. However, as of the time of this writing, dealing with annotations like 【4:13†Dracula.pdf】are still not an ideal visual output and confuse the user. To that end, we can make some adjustments to clean up the citations a bit. Unfortunately, as of the time of this writing there was no way to get the actual page number or other more detailed information from the citations. The OpenAI Dev Forums have an ongoing thread about that specific issue and you can check back to see if it has been resolved yet. 

[How can I access the specific text of the file that the annotation is referencing?](https://community.openai.com/t/how-can-i-access-the-specific-text-of-the-file-that-the-annotation-is-referencing/726723) 

First, let's see what is inside the message that is getting sent back to us. Run the code below and you will see all kinds of interesting things going on in the message. For example, there are two main parts annotations and value

In [11]:
# Get the last message from the thread
message = client.beta.threads.messages.retrieve(
    thread_id=thread.id,
    message_id=client.beta.threads.messages.list(thread_id=thread.id,order="desc").data[0].id
)

print(message.content[0].text)

Text(annotations=[FileCitationAnnotation(end_index=269, file_citation=FileCitation(file_id='file-zwjNvQHztLT7cp4271lplM9L'), start_index=254, text='【12:14†Dracula】', type='file_citation'), FileCitationAnnotation(end_index=378, file_citation=FileCitation(file_id='file-zwjNvQHztLT7cp4271lplM9L'), start_index=363, text='【12:14†Dracula】', type='file_citation'), FileCitationAnnotation(end_index=511, file_citation=FileCitation(file_id='file-zwjNvQHztLT7cp4271lplM9L'), start_index=496, text='【12:17†Dracula】', type='file_citation'), FileCitationAnnotation(end_index=636, file_citation=FileCitation(file_id='file-zwjNvQHztLT7cp4271lplM9L'), start_index=621, text='【12:17†Dracula】', type='file_citation'), FileCitationAnnotation(end_index=808, file_citation=FileCitation(file_id='file-zwjNvQHztLT7cp4271lplM9L'), start_index=793, text='【12:16†Dracula】', type='file_citation'), FileCitationAnnotation(end_index=918, file_citation=FileCitation(file_id='file-zwjNvQHztLT7cp4271lplM9L'), start_index=904, tex

### Breaking Down the Annotations
Now we will break down the annotations to make them easier to understand.

In [12]:
# Extract the message content and annotations
message_text_object = message.content[0]
message_text_content = message_text_object.text.value  # Access the value attribute for the actual text
annotations = message_text_object.text.annotations  # Access annotations directly

# Print the annotations in a cleaner format
for index, annotation in enumerate(annotations):
    print(f"Annotation {index + 1}:")
    print(f"  End Index: {annotation.end_index}")
    print(f"  Start Index: {annotation.start_index}")
    print(f"  Text: {annotation.text}")
    print(f"  Type: {annotation.type}")
    if hasattr(annotation, 'file_citation'):
        file_citation = annotation.file_citation
        print(f"  File Citation:")
        print(f"    File ID: {file_citation.file_id}")
    print("")  # Add a blank line for readability

Annotation 1:
  End Index: 269
  Start Index: 254
  Text: 【12:14†Dracula】
  Type: file_citation
  File Citation:
    File ID: file-zwjNvQHztLT7cp4271lplM9L

Annotation 2:
  End Index: 378
  Start Index: 363
  Text: 【12:14†Dracula】
  Type: file_citation
  File Citation:
    File ID: file-zwjNvQHztLT7cp4271lplM9L

Annotation 3:
  End Index: 511
  Start Index: 496
  Text: 【12:17†Dracula】
  Type: file_citation
  File Citation:
    File ID: file-zwjNvQHztLT7cp4271lplM9L

Annotation 4:
  End Index: 636
  Start Index: 621
  Text: 【12:17†Dracula】
  Type: file_citation
  File Citation:
    File ID: file-zwjNvQHztLT7cp4271lplM9L

Annotation 5:
  End Index: 808
  Start Index: 793
  Text: 【12:16†Dracula】
  Type: file_citation
  File Citation:
    File ID: file-zwjNvQHztLT7cp4271lplM9L

Annotation 6:
  End Index: 918
  Start Index: 904
  Text: 【12:2†Dracula】
  Type: file_citation
  File Citation:
    File ID: file-zwjNvQHztLT7cp4271lplM9L

Annotation 7:
  End Index: 1033
  Start Index: 1019
  Text:

#### Parts of the Annotations
Notice that each annotation comes with several parts:

**End Index:**

- Definition: This is the position in the text where the annotation ends.

- Example: end_index=269 means the annotation ends at the 269th character of the text.

**File Citation:**

- Definition: This is an object containing details about the file being cited.

- Example: file_citation=FileCitation(file_id='file-Ju5xvF4scTcfDfEu6cIbMY85') means this annotation references a file with the ID file-Ju5xvF4scTcfDfEu6cIbMY85.

**Start Index:**

- Definition: This is the position in the text where the annotation starts.

- Example: start_index=257 means the annotation starts at the 257th character of the text.

**Text:**

- Definition: This is the actual text of the annotation that appears in the document.

- Example: text=' ' is the annotation text that you see in the document.

**Type:**

- Definition: This indicates the type of annotation.

- Example: type='file_citation' means this annotation is a citation to a file.

### Final Formatting

Finally, let's return the best citations we can manage given the information that is available. It's not perfect and I don't suggest including character locations because those aren't accurate based on my tests. Rather, for now, I would just include the citation and the file name of where the citation comes from. I know it is not ideal by a long shot but that is what we have to work with at the time of this writing. 

In [13]:

# Retrieve the message object (replace this part with your actual message retrieval code)
message = client.beta.threads.messages.retrieve(
    thread_id=thread.id,
    message_id=client.beta.threads.messages.list(thread_id=thread.id, order="desc").data[0].id
)

# Extract the message content and annotations
message_text_object = message.content[0]
message_text_content = message_text_object.text.value  # Access the value attribute for the actual text
annotations = message_text_object.text.annotations  # Access annotations directly

# Create a list to store annotations with a dictionary for citation replacement
annotated_citations = []
citation_replacements = {}

# Iterate over the annotations, retrieve file names, and store the details
for index, annotation in enumerate(annotations):
    annotation_number = index + 1

    # Retrieve the file name using the file ID
    file_info = client.files.retrieve(annotation.file_citation.file_id)
    file_name = file_info.filename

    annotation_details = {
        "number": annotation_number,
        "text": f"[{annotation_number}]",
        "file_name": file_name,
        "start_index": annotation.start_index,
        "end_index": annotation.end_index,
    }
    annotated_citations.append(annotation_details)
    citation_replacements[annotation.text] = f"[{annotation_number}]"

# Replace the inline citations in the message text with numbered identifiers
for original_text, replacement_text in citation_replacements.items():
    message_text_content = message_text_content.replace(original_text, replacement_text)

# Print the message text with the annotations including file name and character positions
print("Message Text with Annotations:")
print(message_text_content)
print("\nAnnotations:")
for annotation in annotated_citations:
    print(f"Annotation {annotation['number']}:")
    print(f"  File Name: {annotation['file_name']}")
    print(f"  Character Positions: {annotation['start_index']} - {annotation['end_index']}")
    print("")  # Add a blank line for readability


Message Text with Annotations:
Based on the search results, here are the main characters in "Dracula" along with the locations where they are first introduced:

1. **Jonathan Harker**: Introduced in the first chapter as he travels to Transylvania to meet Count Dracula.
   - Citation: [2]

2. **Count Dracula**: Introduced when Jonathan Harker arrives at his castle.
   - Citation: [2]

3. **Mina Murray (later Mina Harker)**: Introduced through a letter concerning Jonathan's condition.
   - Citation: [4]

4. **Lucy Westenra**: Mentioned early in the text when discussing letters and diary entries.
   - Citation: [4]

5. **Abraham Van Helsing**: He is first introduced through discussions and letters as a scholar and doctor who is called upon to help Lucy.
   - Citation: [5]

6. **John Seward**: Introduced as he describes his interactions with Renfield.
   - Citation: [6]

7. **Arthur Holmwood**: Introduced early in the novel as a suitor of Lucy Westenra.
   - Citation: [7]

8. **Quincey Mor