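# In [None]:  (Jupyter cell prompt left over from the notebook export; commented out so the file parses as Python)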
import os
from langchain.document_loaders import UnstructuredFileLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from pdf2image import convert_from_path
import pytesseract

from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the process
# environment so the API keys below can be read with os.getenv.
load_dotenv()

groq_api_key = os.getenv("GROQ_API_KEY")
open_ai_key = os.getenv("OPENAI_API_KEY")

# Fail fast: both keys are needed further down (the Groq key by the chat
# model, the OpenAI key by the embedding step). Checking here avoids running
# the slow PDF -> image -> OCR stage only to crash afterwards on a missing key.
_missing_keys = [
    name
    for name, value in (
        ("GROQ_API_KEY", groq_api_key),
        ("OPENAI_API_KEY", open_ai_key),
    )
    if not value
]
if _missing_keys:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_keys)
    )

from langchain.schema import Document

# Input PDF to OCR, and the directory where the Chroma index is persisted.
pdf_path = "C:/Users/hp/Downloads/PhilipsPC.pdf"
persist_directory = "C:/Users/hp/Downloads/chroma_embeddings"

# Step 1: Render every page of the PDF as an image.
# (pdf2image needs Poppler installed and reachable on PATH.)
images = convert_from_path(pdf_path)

# Step 2: OCR each page image and wrap the non-empty results in Document
# objects, tagging each with its 1-based page number.
documents = []
for page_number, page_image in enumerate(images, start=1):
    page_text = pytesseract.image_to_string(page_image).strip()
    if not page_text:
        # Nothing recognised on this page; leave it out of the corpus.
        continue
    documents.append(
        Document(page_content=page_text, metadata={"page": page_number})
    )

print(f"Extracted {len(documents)} documents:")
for document in documents:
    print(document)

# Step 3: Break the page-level documents into overlapping chunks.
# Any failure is reported on stdout and then re-raised unchanged.
try:
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=5000,
        chunk_overlap=500,
    )
    splits = text_splitter.split_documents(documents)
except Exception as split_error:
    print("Error during text splitting:", split_error)
    raise

# These two names are already imported at the top of the file; the repeated
# imports are harmless and are kept so this step can be run on its own.
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Guard: if OCR produced no text there is nothing to embed. Stop here with a
# clear message instead of letting the embedding / vector-store call fail
# with a less helpful error.
if not splits:
    raise ValueError(
        f"No text chunks to index: OCR extracted no text from {pdf_path!r}."
    )

# Step 4: Create the embedding model.
# The key is passed explicitly (mirroring how the Groq key is handed to the
# chat model). If it is None the library falls back to OPENAI_API_KEY from
# the environment, so the original behaviour is preserved.
embeddings = OpenAIEmbeddings(openai_api_key=open_ai_key)

# Step 5: Embed the chunks and store them in a Chroma vector database that is
# persisted under `persist_directory`.
# NOTE(review): re-running this against an existing persist directory
# presumably adds the same chunks again - confirm, and clear or reuse the
# directory if duplicates matter.
vector_store = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
    persist_directory=persist_directory,
)



from langchain import hub
from langchain.agents import AgentExecutor
from langchain.agents import create_openai_tools_agent
from langchain.tools.retriever import create_retriever_tool
from langchain_community.tools import ArxivQueryRun, WikipediaQueryRun
from langchain_community.utilities import ArxivAPIWrapper, WikipediaAPIWrapper
from langchain_groq import ChatGroq

# Expose the Chroma index as a retriever and wrap it as an agent tool.
# (The variable names keep their original spelling so that any later code
# referring to them continues to work.)
retreiver = vector_store.as_retriever()

# The tool name is sent to the model as a function name. OpenAI-style
# tool-calling APIs only accept names made of letters, digits, "_" and "-"
# (max 64 chars), so the original "Trimmer search" (with a space) is replaced
# by "trimmer_search".
# NOTE(review): the description is what the model reads when deciding which
# tool to call; consider making it specific to the PDF's content.
retreivertool = create_retriever_tool(
    retreiver,
    "trimmer_search",
    "search what i need",
)

# Wikipedia lookup tool, limited to the top result and 250 characters.
wikiwrapper = WikipediaAPIWrapper(top_k_results=1, doc_content_chars_max=250)
wiki = WikipediaQueryRun(api_wrapper=wikiwrapper)

# Tools the agent may call.
tools = [wiki, retreivertool]

# Chat model served through the Groq API.
model = ChatGroq(model="Gemma2-9b-It", groq_api_key=groq_api_key)

# Agent prompt template pulled from LangChain Hub.
prompt = hub.pull("hwchase17/openai-functions-agent")

# Build the tool-calling agent and the executor that runs its loop.
agent = create_openai_tools_agent(model, tools, prompt)
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)

# Last expression of the notebook cell: displays the executor's repr.
agent_executor





