#### In this notebook, documents are indexed in a vectorstore offline to be used in the application for retrieval. The LangChain indexing API is used to assist with the management of the indexing of documents. Documentation link: https://python.langchain.com/docs/modules/data_connection/indexing

In [1]:
import pickle

from dotenv import load_dotenv

from langchain.indexes import SQLRecordManager, index
from langchain.retrievers import BM25Retriever
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Load variables from a local .env file into the process environment before any client is built
# (presumably this is where the OpenAI API key comes from — confirm against the project's .env).
load_dotenv()

True

In [2]:
# Directory handed to Chroma as its persistence location.
persist_directory = "vectordb"

# Embedding model used for every chunk written to the vectorstore; whatever reads the
# store later should embed queries with the same model.
embedding = OpenAIEmbeddings(model="text-embedding-ada-002")

#### Initialise vectorstore and SQL record manager

In [3]:
collection_name = "test_index"

# Namespace under which the record manager tracks what has been indexed.
namespace = f"chromadb/{collection_name}"

# NOTE(review): `collection_name` is only used to build the record-manager namespace;
# it is not passed to Chroma, so the store presumably uses Chroma's default collection
# name rather than "test_index" — confirm this is intended before relying on the name.
vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embedding)
vectorstore.persist()

# The record manager keeps its bookkeeping in a local SQLite file; the schema must be
# created before the first call to `index`.
record_manager = SQLRecordManager(namespace, db_url="sqlite:///record_manager_cache.sql")
record_manager.create_schema()

In [4]:
def _clear():
    """Empty the index, then print whatever is still stored.

    Hacky helper: it indexes an empty batch with ``cleanup="full"``, relying on
    `full` mode removing previously indexed content that is absent from the
    current batch (see the `full` mode section of the LangChain indexing docs).
    Afterwards the vectorstore's metadatas and documents are printed, in that
    order, so the reader can check that both lists are empty.
    """
    index([], record_manager, vectorstore, cleanup="full", source_id_key="source")
    remaining = vectorstore.get(include=["metadatas", "documents"])
    for field in ("metadatas", "documents"):
        print(remaining[field])

In [5]:
def do_indexing(documents: list[Document], cleanup: str = "full") -> dict:
    """Index `documents` into the vectorstore and return the indexing summary.

    Args:
        documents: Chunks to index. The "source" metadata key is used as the
            source id (``source_id_key="source"``).
        cleanup: Cleanup mode forwarded to `index`; should be None,
            "incremental" or "full".

    Returns:
        The summary returned by `index` (counts such as ``num_added``,
        ``num_updated``, ``num_skipped`` and ``num_deleted``). It is returned
        rather than printed so that the declared ``-> dict`` contract holds;
        a notebook cell ending in a call to this function displays it.
    """
    return index(
        documents,
        record_manager,
        vectorstore,
        cleanup=cleanup,
        source_id_key="source",
    )

#### Load source file and create documents

In [6]:
# Source PDF, addressed relative to the notebook's working directory.
loader = PyPDFLoader(r"./files/FIFA_World_Cup.pdf")
# Presumably one Document per PDF page — TODO confirm against the loader docs.
pages = loader.load()
# Sanity check: number of loaded page documents.
len(pages)

21

In [7]:
# Chunk size and overlap are measured with `len`, i.e. in characters; vary both as appropriate.
# Author's rule of thumb: 1000 characters is about 150 words or 200 tokens.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)

# Split the loaded pages into the chunks that get indexed below.
documents = text_splitter.split_documents(pages)
len(documents)

131

#### Initialise BM25 retriever and save to disk

In [8]:
# Build the BM25 retriever over the same chunks that go into the vectorstore.
bm25_retriever = BM25Retriever.from_documents(documents)

# Serialise the retriever to disk for later reuse. Pickle files can execute
# arbitrary code when loaded, so only unpickle this file from a trusted location.
with open("bm25_retriever", mode="wb") as retriever_file:
    pickle.dump(bm25_retriever, retriever_file)

#### Embed and store documents in vectorstore

In [9]:
# Start from an empty index; both printed lists should be empty afterwards.
_clear()

[]
[]


In [10]:
# Embed and index every chunk, using do_indexing's default cleanup mode ("full").
do_indexing(documents)

{'num_added': 131, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}
