In [1]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [2]:
def load_pdfs(path):
    """Load the PDFs found under ``path`` into LangChain documents.

    Args:
        path: Directory to search for PDF files. Files are selected with the
            glob ``**/[!.]*.pdf`` (names ending in ``.pdf`` that do not start
            with a dot). Image extraction is disabled.

    Returns:
        The value returned by ``PyPDFDirectoryLoader.load()`` for ``path``.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        NotADirectoryError: If ``path`` exists but is not a directory.
    """
    # Imported locally so this cell does not depend on an import that the
    # notebook's import cell does not provide.
    from pathlib import Path

    # Fail fast on a bad path so a typo is reported here, at the call site.
    # NOTE(review): without this check the loader presumably returns an empty
    # list for a missing directory, which only surfaces later when the vector
    # index is built -- confirm against the loader implementation.
    root = Path(path)
    if not root.exists():
        raise FileNotFoundError(f"PDF directory does not exist: {path!r}")
    if not root.is_dir():
        raise NotADirectoryError(f"PDF path is not a directory: {path!r}")

    loader = PyPDFDirectoryLoader(
        path,
        glob='**/[!.]*.pdf',
        extract_images=False,
    )
    return loader.load()

In [3]:
def text_split(extracted_data, chunk_size=400, chunk_overlap=40):
    """Split loaded documents into smaller chunks.

    Args:
        extracted_data: Documents to split (in this notebook, the output of
            ``load_pdfs``).
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 400, the value that was previously
            hard-coded.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to 40, the value that was previously
            hard-coded.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents`` for
        ``extracted_data``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)


In [4]:
def download_embeddings():
    """Create the OpenAI embeddings object used to embed the text chunks.

    Only the model name is configured; the ``dimensions`` argument is not
    passed.

    Returns:
        An ``OpenAIEmbeddings`` instance for ``text-embedding-3-small``.
    """
    return OpenAIEmbeddings(model="text-embedding-3-small")

In [5]:
# Load the PDFs under ../data/ (relative to the notebook's working directory).
extracted_data = load_pdfs("../data/")

In [6]:
# Sanity check: how many documents the loader produced.
len(extracted_data)

46

In [7]:
# Break the loaded documents into smaller chunks; these are what get embedded
# and indexed further down.
text_chunks = text_split(extracted_data)

In [8]:
# Sanity check: number of chunks that will be passed to the vector store.
len(text_chunks)

414

In [9]:
# Embedding model handle, passed to FAISS when the index is built.
# NOTE(review): no API key is set anywhere in this notebook; presumably it is
# read from the environment (e.g. OPENAI_API_KEY) -- confirm before sharing.
embeddings = download_embeddings()

In [12]:
# Build a FAISS vector store from the chunks using the embedding model.
# NOTE(review): this presumably embeds every chunk again on each run; consider
# saving and reloading the index if re-runs become slow or costly.
docsearch = FAISS.from_documents(text_chunks, embeddings)

In [13]:
# Similarity-search retriever over the index; k=5 asks for five chunks per query.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":5})

In [14]:
# Sample question, reused by the standalone retrieval check and by the first
# call to the RAG chain.
query = "Who is the CEO of the company?"

In [15]:
# Run retrieval on its own (no LLM involved) to see which chunks come back
# for the sample question.
retrieved_docs = retriever.invoke(query)

In [16]:
# Display the retrieved documents (metadata and page content) for inspection.
retrieved_docs

[Document(metadata={'source': '..\\data\\handbook.pdf', 'page': 44}, page_content='Closing Statement\nThank you for reading our handbook. We hope it has provided you with an understanding of our mission, history, and\nstructure as well as our current policies and guidelines. We look forward to working with you to create a successful\nCompany and a safe, productive, and pleasant workplace.\nShruti Gupta, CEO\nZania, Inc.\n 45'),
 Document(metadata={'source': '..\\data\\handbook.pdf', 'page': 17}, page_content='Media Contacts\nIf you are not authorized to speak on behalf of the Company, do not speak to the media on behalf of the Company. Direct all\nmedia inquiries for official Company responses to Human Resources. 18'),
 Document(metadata={'source': '..\\data\\handbook.pdf', 'page': 3}, page_content='Nothing in this handbook or any other Company document should be understood as creating a contract, guaranteed or\ncontinued employment, a right to termination only "for cause," or any othe

In [17]:
# Chat model that generates the final answer from the retrieved context.
# NOTE(review): temperature=0.6 lets answers vary between runs, so the recorded
# outputs below may not reproduce; a lower value (e.g. 0) may suit grounded
# question answering better -- confirm the intent.
llm = ChatOpenAI(
    model = "gpt-4o-mini", 
    temperature=0.6
)

In [18]:
# System message for the question-answering chain. The adjacent string literals
# are concatenated into a single string, so every sentence that is followed by
# another one must end with a trailing space. "{context}" is the placeholder
# that receives the retrieved document text.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If the information is not available in the provided context, say that Data Not Available. "
    "Keep the answer concise."
    "\n\n"
    "{context}"
)

In [19]:
# Two-message chat prompt: the system message carries the instructions plus the
# "{context}" placeholder, and the human message is the user's question
# supplied through the "{input}" placeholder.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [20]:
# Chain that combines the LLM with the prompt; the retrieved documents are
# supplied to it as the prompt's context.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
# Full RAG pipeline: the retriever feeds the question-answering chain above.
# It is invoked with a dict holding an "input" key, and the cells below read
# the "answer" key of the result.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [21]:
# Ask the sample question end to end and print only the generated answer.
response = rag_chain.invoke({"input": query})
print(response["answer"])

The CEO of the company is Shruti Gupta.


In [23]:
# Simple factual lookup: the company name.
response = rag_chain.invoke({"input": "What is the name of the company?"})
print(response["answer"])

The name of the company is Zania, Inc.


In [24]:
# Open-ended policy question; the answer can only draw on the five retrieved chunks.
response = rag_chain.invoke({"input": "What is their vacation policy?"})
print(response["answer"])

Zania, Inc. provides paid vacation to all full-time regular employees, with eligibility starting immediately upon hire or after completing a specified introductory period. Vacation time is accrued based on length of service, with part-time employees receiving proportional vacation. Employees must take vacation in increments of at least a certain number of hours or days. Vacation during the first year is prorated based on the hire date, and there may be a maximum accrual limit. The company may require unused vacation to be used during certain leaves of absence. Specific details such as the amount of vacation accrued and the maximum accrual limit are not provided.


In [25]:
# Open-ended policy question about termination.
# NOTE(review): the recorded output for this cell is empty -- confirm whether
# the model returned an empty answer or the output was cleared, and re-run.
response = rag_chain.invoke({"input": "What is the termination policy?"})
print(response["answer"])



In [26]:
# Reverse lookup: ask about a person by name instead of by role.
response = rag_chain.invoke({"input": "Who is Shruti Gupta?"})
print(response["answer"])

Shruti Gupta is the CEO of Zania, Inc.


In [27]:
# Probe the fallback instruction in the system prompt with a name that is
# presumably absent from the indexed documents -- verify against the data.
response = rag_chain.invoke({"input": "Who is Rahul Kumar?"})
print(response["answer"])

Data Not Available.


In [28]:
# Ask for the mission and vision statement.
# NOTE(review): a "Data Not Available" answer here could mean the content is
# missing or that retrieval did not surface the right chunks -- check which.
response = rag_chain.invoke({"input": "What is the company's mission and vision statement?"})
print(response["answer"])

Data Not Available.


In [29]:

# Ask for a summary of the Ethics Code.
response = rag_chain.invoke({"input": "What are the Ethics Code?"})
print(response["answer"])

The Ethics Code of Zania, Inc. states that the company will conduct business honestly and ethically wherever operations are maintained. It emphasizes improving the quality of services, products, and operations while maintaining a reputation for honesty, fairness, respect, responsibility, integrity, trust, and sound business judgment. Employees, including officers and directors, are expected to adhere to high standards of integrity and loyalty to the company, avoid misrepresentation, and maintain the confidentiality of trade secrets and proprietary information.


In [30]:
# Related question on consequences of violating the Ethics Code. Only "input"
# is passed to the chain (no chat history), so the question is phrased to
# stand on its own.
response = rag_chain.invoke({"input": "What if the Ethics Code is violated?"})
print(response["answer"])

If the Ethics Code is violated, it can result in discipline, which may include termination of employment. The degree of discipline may be influenced by voluntary disclosure of the violation and cooperation in any subsequent investigation.
