In [9]:
from langchain.document_loaders import (
    TextLoader,
    PyPDFLoader,
    Docx2txtLoader,
    CSVLoader,
    UnstructuredHTMLLoader,  # This one is fairly light
    UnstructuredExcelLoader  # Optional – consider replacing
)
from pathlib import Path
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings


def choose_loader(file_path):
    extension = Path(file_path).suffix.lower()
    if extension == ".txt":
        return TextLoader(file_path)
    elif extension == ".pdf":
        return PyPDFLoader(file_path)
    elif extension == ".html":
        return UnstructuredHTMLLoader(file_path)
    elif extension in [".doc", ".docx"]:
        return Docx2txtLoader(file_path)
    elif extension in [".csv"]:
        return CSVLoader(file_path)
    elif extension in [".xlsx", ".xls"]:
        return UnstructuredExcelLoader(file_path)
    else:
        raise ValueError(f"Unsupported File Type: {extension}")
    

def load_and_embed_documents(file_path):
    loader = choose_loader(file_path)
    docs = loader.load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    split_docs = splitter.split_documents(docs)

    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    db = Chroma.from_documents(split_docs, embedding=embeddings)
    return db

In [14]:
import os
from dotenv import load_dotenv,find_dotenv

from langchain_core.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

_ = load_dotenv(find_dotenv())

gemini_api_key =  os.environ["GOOGLE_API_KEY"]

prompt = ChatPromptTemplate.from_template(
    """
    You are a helpful assistant. Use the following context to answer the question at the end.
    
    Context:
    {context}
    
    Question: {input}
    
    Answer:
    """
)

file_path = "data\MyResume.pdf" 
db = load_and_embed_documents(file_path)
retriever = db.as_retriever()

llm = ChatGoogleGenerativeAI(
    model = "gemini-1.5-flash",
)

combine_docs_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

# Create the retrieval chain
retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)


# 6. Ask your question here (could also be user input)
query = "What is the purpose of this document?"
response = retrieval_chain.invoke({"input":"Explain what the document is about?"})

# 7. Print the answer
print("Answer:", response)

  file_path = "data\MyResume.pdf"


Answer: {'input': 'Explain what the document is about?', 'context': [Document(metadata={'total_pages': 1, 'creationdate': '2024-05-07T03:35:49+00:00', 'moddate': '2024-05-07T03:35:49+00:00', 'author': 'RISHABH NARAYAN', 'creator': 'Microsoft® Word 2016', 'source': 'data\\MyResume.pdf', 'producer': 'www.ilovepdf.com', 'page_label': '1', 'page': 0}, page_content="solution for users seeking detailed insights into our loan portfolio, borrower profiles, and loan performance. \n\uf0b7 It provides a high-level summary of the bank's loan portfolio and monitors the performance of individual \nloans and assess risk. \n \nCERTIFICATIONS \n\uf0b7 Advanced Python with Django, Mindrisers"), Document(metadata={'creator': 'Microsoft® Word 2016', 'creationdate': '2024-05-07T03:35:49+00:00', 'page': 0, 'author': 'RISHABH NARAYAN', 'producer': 'www.ilovepdf.com', 'moddate': '2024-05-07T03:35:49+00:00', 'page_label': '1', 'total_pages': 1, 'source': 'data\\MyResume.pdf'}, page_content="solution for users 