In [26]:
from dotenv import load_dotenv
import gradio as gr
import os

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.callbacks import StdOutCallbackHandler
from langchain_chroma import Chroma


from langchain.chains import create_retrieval_chain
from pathlib import Path


# Load environment variables from a local .env file
# (presumably the OpenAI API key used by the embedding/chat clients below).
load_dotenv()

# Each sub-directory of knowledge-base/ is treated as one document type.
folders = Path(".").glob("knowledge-base/*")

documents = []
for folder in folders:
    if folder.is_dir():
        doc_type = folder.name
        loader = DirectoryLoader(str(folder), glob="**/*.md", loader_cls=TextLoader)
        folder_docs = loader.load()
        # Tag every document with its folder name so chunks keep their origin.
        for doc in folder_docs:
            doc.metadata["doc_type"] = doc_type
            documents.append(doc)

# Split into chunks of roughly 1000 characters with a 200-character overlap.
# The splitter may still emit longer chunks (the recorded run logged one of
# size 1088).
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)

doc_types = set(chunk.metadata['doc_type'] for chunk in chunks)
# Sort only for display: set iteration order is not stable between runs.
print(f"Document types found: {', '.join(sorted(doc_types))}")

embeddings = OpenAIEmbeddings()

# Create the vector store from the chunks.
vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings)

# NOTE: `_collection` is a private attribute of the Chroma wrapper and may
# change between library versions.
# The name must match the one used in the print below; it was previously
# assigned as `total_vector`, which raised NameError on a fresh kernel.
total_vectors = vectorstore._collection.count()
# Fetch a single stored embedding just to read its length.
dimensions = len(vectorstore._collection.get(limit=1, include=["embeddings"])["embeddings"][0])

print(f"There are {total_vectors} vectors with {dimensions:,} dimensions in the vector store")
print()




Created a chunk of size 1088, which is longer than the specified 1000


Document types found: company, contracts, products, employees
There are 123 vectors with 1,536 dimensions in the vector store



In [67]:
import re

class CustomStdOutHandler(StdOutCallbackHandler):
    """Stdout callback handler with extra debug printing around retrieval.

    Adds retriever start/end hooks on top of ``StdOutCallbackHandler`` so the
    retrieval step of a chain can be inspected from the cell output.
    """

    def on_retriever_start(
        self,
        serialized,
        query,
        *,
        run_id,
        parent_run_id=None,
        tags=None,
        metadata=None,
        **kwargs,
    ):
        """Print a marker line followed by the ``serialized`` argument.

        Only ``serialized`` is used; ``query`` and the run identifiers are
        accepted but ignored.
        """
        print("FROM HANDLER")
        print(serialized)

    def on_retriever_end(
        self,
        documents,
        *,
        run_id=None,
        parent_run_id=None,
        **kwargs,
    ):
        """Print a marker line, then the text of every document mentioning "IIOTY".

        Documents whose ``page_content`` does not contain the substring
        "IIOTY" are skipped silently.
        """
        print("FROM HANDLER")
        # Lazily select the matching documents, preserving retrieval order.
        matches = (doc for doc in documents if "IIOTY" in doc.page_content)
        for match in matches:
            print(match.page_content)
        

# Prompt that tells the model to answer strictly from the retrieved context.
# `{context}` and `{input}` are filled in by the retrieval chain.
template = """Answer the following question based only on the provided context:
\n\n
{context}
\n\n
Question: {input}
"""

# Retrieve the 25 nearest chunks per query rather than the retriever default.
retriever = vectorstore.as_retriever(search_kwargs={"k": 25})


def _with_plain_text_context(inputs):
    """Return a copy of ``inputs`` whose "context" is the joined document text.

    The retrieval chain supplies "context" as a list of Document objects.
    Interpolating that list straight into the prompt would send the Python
    repr (including metadata) to the model, so the page contents are joined
    into plain text first. The incoming dict is not mutated.
    """
    joined = "\n\n".join(doc.page_content for doc in inputs["context"])
    return {**inputs, "context": joined}


prompt = ChatPromptTemplate.from_template(template)
model = ChatOpenAI(model="gpt-4o-mini")
# The plain function is coerced into a runnable by the `|` operator.
model_chain = _with_plain_text_context | prompt | model
chain = create_retrieval_chain(retriever, model_chain).with_config(callbacks=[CustomStdOutHandler()])

# Spelling of the query matters: it is embedded for similarity search.
resp = chain.invoke({"input": "Who won the prestigious award IIOTY 2023?"})
# `answer` is the chat model's message object (no output parser in the chain),
# so the text lives in `.content`.
print(resp['answer'].content)



[1m> Entering new retrieval_chain chain...[0m


[1m> Entering new RunnableAssign<context> chain...[0m


[1m> Entering new RunnableParallel<context> chain...[0m


[1m> Entering new retrieve_documents chain...[0m


[1m> Entering new RunnableLambda chain...[0m

[1m> Finished chain.[0m
FROM HANDLER
None
FROM HANDLER
## Insurellm Career Progression
- **January 2017 - October 2018**: **Junior Data Engineer**  
  * Maxine joined Insurellm as a Junior Data Engineer, focusing primarily on ETL processes and data integration tasks. She quickly learned Insurellm's data architecture, collaborating with other team members to streamline data workflows.  
- **November 2018 - December 2020**: **Data Engineer**  
  * In her new role, Maxine expanded her responsibilities to include designing comprehensive data models and improving data quality measures. Though she excelled in technical skills, communication issues with non-technical teams led to some project delays.  
- **January 2021 - Pre