# Parse the PDFs, make a vector database

In [4]:
import pdfplumber
from os import listdir
from os.path import isfile, join

In [None]:
folder_path = "./demo-pdfs/"

# Collect the PDF files that sit directly in the folder. Sub-directories and
# non-PDF files (e.g. .DS_Store, checkpoints) are skipped because pdfplumber
# cannot open them. The names are sorted because listdir() returns entries in
# arbitrary order, and the chunk order should be reproducible between runs.
pdfs = sorted(
    pdf
    for pdf in listdir(folder_path)
    if isfile(join(folder_path, pdf)) and pdf.lower().endswith(".pdf")
)

# list of strings: one entry per non-blank line of extracted text
text_chunks = []

for pdf_path in pdfs:
    # join() instead of string concatenation so the path stays valid even if
    # folder_path is later written without a trailing slash
    with pdfplumber.open(join(folder_path, pdf_path)) as pdf:
        for page in pdf.pages:
            # Pages without a text layer (scans, images) yield no text --
            # None on older pdfplumber releases -- so fall back to "".
            page_text = page.extract_text() or ""
            # split the page text into one chunk per line, dropping blank
            # lines since they carry nothing worth embedding
            page_chunks = [
                line for line in page_text.split(sep='\n') if line.strip()
            ]
            text_chunks.extend(page_chunks)

text_chunks

In [None]:
# take these chunks & convert them into a vector database using chromadb
import chromadb
from chromadb.utils import embedding_functions

CHROMA_VECTORS_PATH = "./chroma_vectors/"
DEMO_COLLECTION_NAME = "Demo-docs"
# Chroma rejects a single write above its maximum batch size, so the chunks
# are written in slices that stay comfortably below that limit.
UPSERT_BATCH_SIZE = 1000

# PersistentClient() stores the collection on disk under `path`, so it is
# still there after the kernel restarts; a plain Client() keeps everything in
# memory and loses it when the process exits.
client = chromadb.PersistentClient(path=CHROMA_VECTORS_PATH)

# by default it uses the model all-MiniLM-L6-v2
embedding_fx = embedding_functions.DefaultEmbeddingFunction()

# Reuse the collection if an earlier run already created it on disk.
collection = client.get_or_create_collection(
    name=DEMO_COLLECTION_NAME,
    embedding_function=embedding_fx,
)

# Store the parsed data, generating a positional ID for each chunk.
# upsert() is used instead of add() because the store is persistent: on a
# re-run the ids id0..idN already exist, and add() would not refresh them.
# NOTE: ids beyond the current chunk count left over from a previous, larger
# run are not removed here.
chunk_ids = [f"id{i}" for i in range(len(text_chunks))]
for start in range(0, len(text_chunks), UPSERT_BATCH_SIZE):
    stop = start + UPSERT_BATCH_SIZE
    collection.upsert(
        documents=text_chunks[start:stop],
        ids=chunk_ids[start:stop],
    )


In [None]:
# Ask the collection for the ten stored chunks that best match the question;
# the question text is embedded by the collection's embedding function.
query_results = collection.query(n_results=10, query_texts=["What is the procedure?"])

# Echo the raw result as the cell output for inspection.
query_results

# Open the model, run the query

# Cache the result for performance, store it into the same session based on the PDFs chosen

# Optional: add support to immediately load up PDF text

In [None]:
""" 
Flow of communication:
1. Ask to load up a particular PDF (or just ask for which PDFs are available)
2. Ask which page of the PDF (and report how many pages the PDF has)
3. Ask for summaries of a particular page in the PDF
4. Return the text of the PDF (scrolling element) - front end mostly
"""

# Optional: add database support

In [None]:
""" 
Flow of communication:
1. Ask to retrieve a particular table (based on ID/name) - front end
2. Retrieve the list of column names, return those
3. Input one entry at a time (listing out the different attributes) - front end
4. Delete previous entry/undo - front end/back end
"""
