# Tutorial: Prepare your own documents for vector search


In [None]:
%pip install langchain pypdf langchain-openai --quiet

## 1. Upload your documents
First, remove the existing files in the `./docs` folder (relative to the notebook's working directory) and add your own PDF files. Then, run the cells below.

In [None]:
# Load every PDF in the docs directory into LangChain Document objects
import os
from langchain_community.document_loaders import PyPDFLoader

# Path to docs directory
docs_dir = "./docs"

# Collect only PDF files (PyPDFLoader cannot parse anything else), sorted so the
# load order is the same on every run instead of following os.listdir's
# arbitrary ordering.
all_files = sorted(
    os.path.join(docs_dir, f)
    for f in os.listdir(docs_dir)
    if f.lower().endswith(".pdf") and os.path.isfile(os.path.join(docs_dir, f))
)

# Make an empty folder obvious instead of silently producing zero documents
if not all_files:
    print(f"No PDF files found in {docs_dir}")

# Load each PDF; a failure on one file is reported and does not stop the rest
documents = []
for file_path in all_files:
    try:
        loader = PyPDFLoader(
            file_path=file_path,
        )
        # By default PyPDFLoader yields one Document per PDF page
        docs = loader.load()
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
    else:
        documents.extend(docs)
        print(f"Loaded {len(docs)} pages from {file_path}")

print(f"Loaded total of {len(documents)} pages")

## 2. Chunk documents
Split large documents into smaller chunks for better embedding quality.

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking parameters, as passed to the splitter: a chunk size of 1000 characters
# with a 200-character overlap between chunks.
splitter_options = {
    "chunk_size": 1000,
    "chunk_overlap": 200,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Break the loaded documents into chunks and report how many were produced
chunks = text_splitter.split_documents(documents)
print(f"Created {len(chunks)} chunks")

In [None]:
# Preview the first few chunks rather than printing all of them, which floods
# the cell output for any non-trivial set of documents.
max_preview = 5  # set to None to print every chunk
for chunk in chunks[:max_preview]:
    print(chunk.page_content)
    print("-" * 100)

# Tell the reader how much was left out of the preview
remaining = len(chunks) - len(chunks[:max_preview])
if remaining > 0:
    print(f"... {remaining} more chunks not shown")

## 3. Generate embeddings
Use OpenAI embeddings to encode each chunk into a vector.

In [4]:
import getpass
import os

from langchain_openai import OpenAIEmbeddings

# OpenAIEmbeddings() is created without an explicit key, so it relies on the
# OPENAI_API_KEY environment variable. Prompt for it (without echoing) when it
# is missing rather than hardcoding a secret in the notebook.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

# define embeddings as default OpenAI embeddings
embeddings = OpenAIEmbeddings()

## 4. Store embeddings in Chroma
Initialize a Chroma vector store and persist it locally.

If you run into an "OperationalError: attempt to write a readonly database" error, restart the kernel and rerun the notebook.


In [None]:
# Chroma lives in langchain_community (the same package the PDF loader is
# imported from); `langchain.vectorstores` is only a deprecated re-export.
from langchain_community.vectorstores import Chroma

# Embed every chunk and store the vectors in a Chroma collection saved under "db".
# NOTE(review): re-running this cell against an existing "db" directory
# presumably adds the chunks again rather than replacing them; delete the
# directory first for a clean rebuild (confirm).
vectordb = Chroma.from_documents(
    chunks,
    embedding=embeddings,
    persist_directory="db",
    collection_name="my_custom_index"
)
# Chroma 0.4+ writes to persist_directory automatically and reports this call
# as deprecated; it is kept so that older Chroma installs still flush to disk.
vectordb.persist()


## 5. Example similarity search
Perform a similarity search query on your vector store.

In [None]:
# Run a sample query and print the matches (at most five), numbered from 1
query = "robotics"
results = vectordb.similarity_search(query, k=5)
for rank, doc in enumerate(results, start=1):
    print(f"Result {rank}: {doc.page_content}...\n")