# Part 12

# Using File Search

Universal code for the entire notebook

In [1]:
# make sure we all the packages we need
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Import necessary libraries
from openai import OpenAI  # Used for interacting with OpenAI's API
from typing_extensions import override  # Used for overriding methods in subclasses
from openai import AssistantEventHandler  # Used for handling events related to OpenAI assistants

# Additional libraries for time and date manipulation
import time
import pytz
import datetime


In [3]:
# Create an instance of the OpenAI class to interact with the API.
# This assumes you have set the OPENAI_API_KEY environment variable.
client = OpenAI() 

In [4]:
# Event handler class to handle events related to streaming output from the assistant
class EventHandler(AssistantEventHandler):
    @override
    def on_text_created(self, text) -> None:
        print(f"\nassistant > ", end="", flush=True)

    @override
    def on_tool_call_created(self, tool_call):
        print(f"\nassistant > {tool_call.type}\n", flush=True)

    @override
    def on_message_done(self, message) -> None:
        # print a citation to the file searched
        message_content = message.content[0].text
        annotations = message_content.annotations
        citations = []
        for index, annotation in enumerate(annotations):
            message_content.value = message_content.value.replace(
                annotation.text, f"[{index}]"
            )
            if file_citation := getattr(annotation, "file_citation", None):
                cited_file = client.files.retrieve(file_citation.file_id)
                citations.append(f"[{index}] {cited_file.filename}")

        print(message_content.value)
        print("\n".join(citations))

### Creating an Assistant with File Search enabled

Our first step is to create an Assistant that can do file searching regardless of where the vector store resides (Assistant or Thread)

In [5]:
# Create an assistant using the client library.
assistant = client.beta.assistants.create(
    model="gpt-4o",  # Specify the model to be used.
    
    instructions=""" 
        You are a helpful assistant that answers questions about the stories in your files. The stories are from a variety of authors. 
        You will answer questions from the user about the stories. All you will do is answer questions about the stories in the files and provide related information.
        If the user asks you a question that is not related to the stories in the files, you should let them know that you can only answer questions about the stories.
    """,
    
    name="File Search Demo Assistant - Stories",  # Give the assistant a name.
    
    tools=[{"type": "file_search"}], # Add the file search capability to the assistant.
    
    metadata={  # Add metadata about the assistant's capabilities.
        "can_be_used_for_file_search": "True",
        "can_hold_vector_store": "True",
    },
    temperature=1,  # Set the temperature for response variability.
    top_p=1,  # Set the top_p for nucleus sampling.
)

# Print the details of the created assistant to check its properties.
print(assistant)  # Print the full assistant object.
print("\n\n")
print(assistant.name)  # Print the name of the assistant.
print(assistant.metadata)  # Print the metadata of the assistant.

Assistant(id='asst_EfJEoy9jvTaBfj6UcGzUMOW6', created_at=1717504193, description=None, instructions=' \n        You are a helpful assistant that answers questions about the stories in your files. The stories are from a variety of authors. \n        You will answer questions from the user about the stories. All you will do is answer questions about the stories in the files and provide related information.\n        If the user asks you a question that is not related to the stories in the files, you should let them know that you can only answer questions about the stories.\n    ', metadata={'can_be_used_for_file_search': 'True', 'can_hold_vector_store': 'True'}, model='gpt-4o', name='File Search Demo Assistant - Stories', object='assistant', tools=[FileSearchTool(type='file_search')], response_format='auto', temperature=1.0, tool_resources=ToolResources(code_interpreter=None, file_search=ToolResourcesFileSearch(vector_store_ids=[])), top_p=1.0)



File Search Demo Assistant - Stories
{'ca

## Creating a Vector Store

Now we will create our vector store to hold our files and add files at the same time.

In [6]:
from contextlib import ExitStack

# Create a vector store with a name for the store.
vector_store = client.beta.vector_stores.create(name="Great Fiction Stories")

# Ready the files for upload to the vector store.
file_paths = ["./artifacts/I_Am_Legend.pdf", "./artifacts/The_Veldt.pdf"]

# Using ExitStack to manage multiple context managers and ensure they are properly closed.
with ExitStack() as stack:
    # Open each file in binary read mode and add the file stream to the list
    file_streams = [stack.enter_context(open(path, "rb")) for path in file_paths]

    # Use the upload and poll helper method to upload the files, add them to the vector store,
    # and poll the status of the file batch for completion.
    file_batch = client.beta.vector_stores.file_batches.upload_and_poll(
        vector_store_id=vector_store.id, files=file_streams
    )

    # Print the vector store information
    print(vector_store.name)
    print(vector_store.id)
    
    # Print the status and the file counts of the batch to see the results
    print(file_batch.status)
    print(file_batch.file_counts)


Great Fiction Stories
vs_pNAfDuyGKVmllX2o5tCYhhax
completed
FileCounts(cancelled=0, completed=2, failed=0, in_progress=0, total=2)


### Attaching the Vector Store to the Assistant

We have an Assistant that has File Search enabled and we have a Vector Store with files in them. It's time to join the two up. 

In [7]:
try:
    # Attach the vector store to the assistant to enable file search capabilities.
    assistant = client.beta.assistants.update(
        assistant_id=assistant.id,
        tool_resources={"file_search": {"vector_store_ids": [vector_store.id]}},
    )

    # Print the assistant's tools and tool resources to verify the attachment of the vector store.
    print("Assistant Tools:")
    for tool in assistant.tools:
        print(f" - {tool}")

    # Print the assistant's tool resources to verify the attachment of the vector store
    print("\nAssistant Tool Resources:")
    for resource, details in assistant.tool_resources:
        print(f" - {resource}: {details}")

except Exception as e:
    print(f"An error occurred while updating the assistant: {e}")


Assistant Tools:
 - FileSearchTool(type='file_search')

Assistant Tool Resources:
 - code_interpreter: None
 - file_search: ToolResourcesFileSearch(vector_store_ids=['vs_pNAfDuyGKVmllX2o5tCYhhax'])


### Creating an Assistant and Vector Store at the Same Time

If we have file id's we can just feed them in when creating an assistant to get the Assistant and the Vector Store at the same time using the vector_stores option.

In [8]:

# Create an assistant using the client library.
try:
    assistant_with_vector_store = client.beta.assistants.create(
        model="gpt-4o",  # Specify the model to be used.
        instructions=(
            "You are a helpful assistant that answers questions about the stories in your files. "
            "The stories are from a variety of authors. "
            "You will answer questions from the user about the stories. All you will do is answer questions about the stories in the files and provide related information. "
            "If the user asks you a question that is not related to the stories in the files, you should let them know that you can only answer questions about the stories."
        ),
        name="Quick Assistant and Vector Store at Once",  # Give the assistant a name.
        tools=[{"type": "file_search"}],  # Add the file search capability to the assistant.
        # Create a vector store and attach it to the assistant in one step.
        tool_resources={
            "file_search": {
                "vector_stores": [
                    {
                        "file_ids": [
                            "file-UY7uzH3SMK0ALbwQeRk5OE0i",
                            "file-ZN71rhzhlvLQWiU5ZWEf3WP0"
                        ],
                        "metadata": {
                            "Book1": "Wizard of Oz", 
                            "Book2": "Alice in Wonderland"
                        }
                    }
                ]
            }
        },
        metadata={  # Add metadata about the assistant's capabilities.
            "can_be_used_for_file_search": "True",
            "has_vector_store": "True",
        },
        temperature=1,  # Set the temperature for response variability.
        top_p=1,  # Set the top_p for nucleus sampling.
    )
except Exception as e:
    print(f"An error occurred while creating the assistant: {e}")
else:
    # Print the details of the created assistant to check its properties.
    print(assistant_with_vector_store)  # Print the full assistant object.
    print("\n\n")
    print("Assistant Name: " + assistant_with_vector_store.name)  # Print the name of the assistant.
    print("\n")
    
    # get the vector store information
    unnamed_assistant_vector_store = client.beta.vector_stores.retrieve(assistant_with_vector_store.tool_resources.file_search.vector_store_ids[0])
    print("Vector Store Name: " + str(unnamed_assistant_vector_store.name))
    print("Vector Store Id: " + unnamed_assistant_vector_store.id)
    print("Vector Store Metadata: " + str(unnamed_assistant_vector_store.metadata))


Assistant(id='asst_ZoSIF8bPLQgsasNAwnwWFWbK', created_at=1717504198, description=None, instructions='You are a helpful assistant that answers questions about the stories in your files. The stories are from a variety of authors. You will answer questions from the user about the stories. All you will do is answer questions about the stories in the files and provide related information. If the user asks you a question that is not related to the stories in the files, you should let them know that you can only answer questions about the stories.', metadata={'can_be_used_for_file_search': 'True', 'has_vector_store': 'True'}, model='gpt-4o', name='Quick Assistant and Vector Store at Once', object='assistant', tools=[FileSearchTool(type='file_search')], response_format='auto', temperature=1.0, tool_resources=ToolResources(code_interpreter=None, file_search=ToolResourcesFileSearch(vector_store_ids=['vs_CWOoi3og998iGaUat85hLOol'])), top_p=1.0)



Assistant Name: Quick Assistant and Vector Store 

## Creating Threads with Vector Stores

### Creating Vector Stores with Thread Messages

You can create a vector store in threads with one of two ways: messages or during thread creation. If you create a thread with a vector store then it will also be searched when looking for information during the run. First, let's take a look at the most common scenario, vector stores created with messages in the thread.

In [9]:
# We assume that the user has given us a file to upload
message_file = client.files.create(
    file=open("./artifacts/War_of_the_Worlds.txt", "rb"), purpose="assistants"
)

# Create a thread and attach the file to the message
thread_with_file_attachment = client.beta.threads.create(
    messages=[
    {
    "role": "user",
    "content": "What is the name of the main character in War of the Worlds?",
    
    # Attach the new file to the message.
    "attachments": [
        { "file_id": message_file.id, "tools": [{"type": "file_search"}] }
    ],
    }
]
)

# The thread now has a vector store with that file in its tool resources.
print(thread_with_file_attachment.tool_resources.file_search.vector_store_ids[0])


vs_UJHfkOTWcnuHd1MFMlQhcF8F


### Creating Vector Stores at Thread Creation Time

Now, let's look at creating a vector store when we create our thread. 

In [10]:
thread_with_vector_store = client.beta.threads.create(
    tool_resources={
            "file_search": {
                "vector_stores": [
                    {
                        "file_ids": [
                            "file-fHR8COgusTxv2fy5paVHJmWW",
                            "file-EJecfCFkiHefsYQOJO7FOb8I"
                        ],
                        "metadata": {
                            "Book1": "Dracula", 
                            "Book2": "Frankenstein"
                        }
                    }
                ]
            }
        },
        metadata={  # Add metadata about the assistant's capabilities.
            "can_be_used_for_file_search": "True",
            "has_vector_store": "True",
        },
)

# Print the details of the created thread to check its properties
print(thread_with_vector_store)  # Print the full thread object.
print("\n")
print("Thread ID: " + thread_with_vector_store.id)  # Print the ID of the thread.
print("Thread Metadata: " + str(thread_with_vector_store.metadata))  # Print the metadata of the thread.
print("Thread Tool Resources: " + str(thread_with_vector_store.tool_resources))  # Print the tool resources of the thread.

Thread(id='thread_og68yr2b4s7zG8wwXKtGN6aD', created_at=1717504200, metadata={'can_be_used_for_file_search': 'True', 'has_vector_store': 'True'}, object='thread', tool_resources=ToolResources(code_interpreter=None, file_search=ToolResourcesFileSearch(vector_store_ids=['vs_rjFwhVZisM88vLt5Tt201yVn'])))


Thread ID: thread_og68yr2b4s7zG8wwXKtGN6aD
Thread Metadata: {'can_be_used_for_file_search': 'True', 'has_vector_store': 'True'}
Thread Tool Resources: ToolResources(code_interpreter=None, file_search=ToolResourcesFileSearch(vector_store_ids=['vs_rjFwhVZisM88vLt5Tt201yVn']))


## Getting the Results

We have built up Assistants and Threads with vector stores and now we want to use them. We need to set up a run to make this happen. Recall there are two approaches: streaming or non-streaming. We will do both.

### Streaming Run

First, let's do the, more common, streaming run to get our output.

In [11]:
with client.beta.threads.runs.stream(
    thread_id=thread_with_vector_store.id,
    assistant_id=assistant.id,
    instructions="List all the books you have access to in your files.",
    event_handler=EventHandler(),
) as stream:
    stream.until_done()


assistant > file_search


assistant > Here are the books I have access to in your files:

1. "Frankenstein" by Mary Wollstonecraft Shelley       
2. "Dracula" by Bram Stoker         
3. "The Veldt" by Ray Bradbury    

