In [None]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
import json

In [None]:
# Load environment variables from a .env file, if one is found.
# NOTE(review): presumably this supplies the API key that OpenAIEmbeddings needs later — confirm.
load_dotenv()

In [None]:
# Use the kernel's current working directory as the root for the input paths built below.
base_path = os.getcwd()

In [None]:
# Locate the source PDF under <base_path>/pdfs/ and load it into `pages`.
pdf_path = os.path.join(base_path, "pdfs", "metak10.pdf")

# isfile() rather than exists(): a directory at this path must not pass the
# check, otherwise the problem only surfaces later as a loader error.
if not os.path.isfile(pdf_path):
    raise FileNotFoundError(f"PDF file not found: {pdf_path}")

pdf_loader = PyPDFLoader(pdf_path)

try:
    pages = pdf_loader.load()
except Exception as e:
    # Report the failure in the cell output, then re-raise so execution stops here.
    print(f"Error loading PDF: {e}")
    raise
else:
    # Only reached when load() succeeded; kept out of the try so that the
    # handler above covers the load call alone.
    print(f"PDF has been loaded and has {len(pages)} pages")

In [None]:
# Preview the first element of `pages` as a quick check of what the loader produced.
pages[0]

In [None]:
# Map 1-based page numbers (as string keys) to the text content of each page.
page_dict = {
    str(number): doc.page_content
    for number, doc in enumerate(pages, start=1)
}

# Write the mapping to disk as indented UTF-8 JSON, leaving non-ASCII characters unescaped.
output_path = "meta_k10.json"
with open(output_path, mode="w", encoding="utf-8") as json_file:
    json.dump(page_dict, json_file, ensure_ascii=False, indent=4)

print(f"Saved JSON to {output_path}")

In [None]:
# Model name passed to OpenAIEmbeddings, named so it can be changed in one place.
EMBEDDING_MODEL = "text-embedding-3-small"

embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)

In [None]:
# Collection name and on-disk directory handed to Chroma.
COLLECTION_NAME = "example_collection"
PERSIST_DIRECTORY = "./chroma_langchain_db"

# Construct the Chroma vector store with `embeddings` as its embedding function.
# NOTE(review): no active cell in view adds documents to this store — confirm it is populated elsewhere.
vector_store = Chroma(
    persist_directory=PERSIST_DIRECTORY,
    collection_name=COLLECTION_NAME,
    embedding_function=embeddings,
)

In [None]:
# Chunking parameters forwarded to RecursiveCharacterTextSplitter.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)

# Split the loaded pages into chunks using the splitter configured above.
pages_split = text_splitter.split_documents(pages)

In [None]:
# persist_directory="./chroma_langchain_db"
# collection_name="prototype"
# try:
#     # Here, we actually create the Chroma database using our embeddings model
#     vectorstore = Chroma.from_documents(
#         documents=pages_split,
#         embedding=embeddings,
#         persist_directory=persist_directory,
#         collection_name=collection_name
#     )
#     print(f"Created ChromaDB vector store!")
    
# except Exception as e:
#     print(f"Error setting up ChromaDB: {str(e)}")
#     raise


In [None]:
# retriever = vectorstore.as_retriever(
#     search_type="similarity",
#     search_kwargs={"k": 5} # K is the amount of chunks to return
# )

In [None]:
# docs = retriever.invoke("tell me the details from balancesheet")