In [19]:
# Load all necessary packages

import pandas as pd
import os
from typing import List, Dict
from pathlib import Path
from openai import AzureOpenAI
from langchain_openai import ChatOpenAI, OpenAIEmbeddings, AzureOpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader, TextLoader, PyPDFDirectoryLoader
from langchain_classic.agents import AgentExecutor, create_tool_calling_agent, create_react_agent, create_openai_tools_agent
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.tools import Tool
from dotenv import load_dotenv
import time
from tqdm import tqdm



In [10]:
# Load variables from a local .env file into os.environ before any client is built;
# DeepResearchAgent.__init__ reads OPENAI_AZURE_API_KEY from the environment
# (presumably supplied by that .env file - confirm).
load_dotenv()

True

In [11]:
# Directory holding the GIZ PDF collection. Both names point at the same folder,
# so the literal is written once and the second name is derived from it.
documents_path = r"C:\Users\yannik_sassmann\Documents\YASA\Fortbildungen\Data_Science_Bootcamp\Final_Project\Ironhack_Capstone_Project\pdfs\giz"
file_path = documents_path

In [None]:
class DeepResearchAgent:
    """Tool-calling research agent that answers questions from a vector store of documents.

    The agent is given three retrieval tools (plain similarity search, a
    multi-query "deep" search and an MMR-based "diverse" search) and a system
    prompt instructing it to cite document name and page for every finding.
    """

    # Retrieval sizes, collected here so they can be tuned in one place
    # (per subclass or per instance) instead of being scattered magic numbers.
    SEARCH_K = 20                 # hits returned by search_documents
    DEEP_SEARCH_K = 10            # hits fetched per query inside deep_search
    DEEP_SEARCH_MAX_RESULTS = 10  # merged hits returned by deep_search
    DIVERSE_K = 10                # hits returned by diverse_search
    DIVERSE_FETCH_K = 20          # candidate pool size handed to MMR

    NO_RESULTS_MESSAGE = "No relevant documents found."

    def __init__(self, documents_path: str, persist_directory: str = "./chroma_db"):
        """
        Initialize the deep research agent.

        Args:
            documents_path: Path to a directory containing PDF files
            persist_directory: Where to store the vector database

        Raises:
            KeyError: If the OPENAI_AZURE_API_KEY environment variable is not set.
        """

        # Using GPT-5
        self.llm = ChatOpenAI(model="gpt-5",
                              base_url="https://bootcampai.openai.azure.com/openai/v1/",
                              api_key=os.environ["OPENAI_AZURE_API_KEY"])

        # Create/load vector store.
        # NOTE(review): VectorStoreCreator is not defined in the visible part of this
        # notebook; it must be defined or imported before this class is instantiated.
        print("Loading vector store...")
        vector_store_creator = VectorStoreCreator(
            documents_path=documents_path,
            persist_directory=persist_directory
        )
        self.vectorstore = vector_store_creator.vectorstore
        print("✓ Vector store loaded!\n")

        # Create tools
        self.tools = self._create_tools()

        # Create agent
        self.agent_executor = self._create_agent()

    @staticmethod
    def _format_results(docs: "List[Document]") -> str:
        """Render retrieved documents as a numbered, source-annotated text block.

        Args:
            docs: Retrieved documents; each must expose ``metadata`` and ``page_content``.

        Returns:
            One "Result i (Source: <file>, Page: <page>)" entry per document,
            or the no-results message when ``docs`` is empty.
        """
        if not docs:
            return DeepResearchAgent.NO_RESULTS_MESSAGE

        entries = []
        for i, doc in enumerate(docs, 1):
            source = doc.metadata.get('source', 'Unknown')
            filename = os.path.basename(source)
            # NOTE(review): if the store was built with PyPDFLoader, 'page' is a
            # zero-based index, so cited pages would be off by one - confirm.
            page = doc.metadata.get('page', 'N/A')
            entries.append(f"Result {i} (Source: {filename}, Page: {page}):\n{doc.page_content}\n")
        return "\n".join(entries)

    @staticmethod
    def _parse_numbered_list(text: str) -> List[str]:
        """Extract the items of a numbered list ("1. foo", " 2) bar") from free text.

        Lines that do not start with digits followed by '.', ')' or ':' and
        whitespace are ignored, as are items with no text, so chatter around
        the list (or a line such as "2024 was ...") cannot raise or leak in.
        """
        items = []
        for raw_line in text.splitlines():
            line = raw_line.strip()
            rest = line.lstrip("0123456789")
            if rest == line or not rest:
                continue  # no leading number, or nothing after it
            if rest[0] not in ".):":
                continue  # digits are part of a word/number, not a list marker
            if len(rest) > 1 and not rest[1].isspace():
                continue  # e.g. "1.5 degrees" - a decimal, not an item
            item = rest[1:].strip()
            if item:
                items.append(item)
        return items

    @staticmethod
    def _interleave_unique(ranked_lists: "List[List[Document]]", limit: int) -> "List[Document]":
        """Merge several ranked hit lists round-robin, dropping duplicate chunks.

        Taking rank 0 of every list, then rank 1, and so on guarantees that each
        query contributes to the final selection instead of the first list
        filling the whole quota. Duplicates are detected by ``page_content``.
        """
        seen = set()
        merged = []
        depth = max((len(ranking) for ranking in ranked_lists), default=0)
        for rank in range(depth):
            for ranking in ranked_lists:
                if rank >= len(ranking):
                    continue
                doc = ranking[rank]
                if doc.page_content in seen:
                    continue
                seen.add(doc.page_content)
                merged.append(doc)
                if len(merged) >= limit:
                    return merged
        return merged

    def _search_documents(self, query: str) -> str:
        """Search the document collection for relevant information."""
        docs = self.vectorstore.similarity_search(query, k=self.SEARCH_K)
        return self._format_results(docs)

    def _deep_search(self, query: str) -> str:
        """Perform a deep search by generating multiple query variations.

        The LLM is asked for three alternative queries; the original query and
        the variations are each searched, and the hits are merged round-robin so
        every query is represented in the returned results.
        """
        variations_prompt = f"""Given this research question: "{query}"

Generate 3 different search queries that would help gather comprehensive information.
Focus on different aspects or angles of the question.

Format your response as a numbered list:
1. [query 1]
2. [query 2]
3. [query 3]"""

        response = self.llm.invoke(variations_prompt)
        # Message content may be a list of content blocks rather than a string;
        # in that case fall back to searching with the original query only.
        content = response.content if isinstance(response.content, str) else ""
        variations = self._parse_numbered_list(content)[:3]

        all_queries = [query] + variations
        hits_per_query = [
            self.vectorstore.similarity_search(q, k=self.DEEP_SEARCH_K)
            for q in all_queries
        ]
        merged = self._interleave_unique(hits_per_query, self.DEEP_SEARCH_MAX_RESULTS)
        return self._format_results(merged)

    def _diverse_search(self, query: str) -> str:
        """Search for diverse, relevant documents using Maximum Marginal Relevance."""
        docs = self.vectorstore.max_marginal_relevance_search(
            query,
            k=self.DIVERSE_K,
            fetch_k=self.DIVERSE_FETCH_K
        )
        return self._format_results(docs)

    def _create_tools(self) -> "List[Tool]":
        """Create tools for the agent.

        Returns:
            The three retrieval tools, each taking a query string and returning
            formatted, source-annotated text.
        """
        return [
            Tool(
                name="search_documents",
                func=self._search_documents,
                description="Search the document collection for relevant information. Use this for straightforward queries."
            ),
            Tool(
                name="deep_search",
                func=self._deep_search,
                description="Perform a comprehensive search using multiple query variations. Use this when you need thorough, multi-faceted information."
            ),
            Tool(
                name="diverse_search",
                func=self._diverse_search,
                description="Search for diverse perspectives on a topic. Use this when you want varied viewpoints or comprehensive coverage."
            )
        ]

    def _create_agent(self) -> "AgentExecutor":
        """Create the research agent.

        Returns:
            An AgentExecutor wired to ``self.llm`` and ``self.tools`` that runs at
            most 10 iterations and returns its intermediate steps.
        """

        prompt = ChatPromptTemplate.from_messages([
            ("system", """You are a deep research agent with access to a collection of PDF documents.

Your goal is to provide comprehensive, well-researched answers by:
1. Breaking down complex questions into searchable components
2. Using multiple search strategies to gather information
3. Synthesizing information from multiple sources
4. Identifying gaps and conducting follow-up searches
5. Providing well-sourced, detailed answers with page numbers

When researching:
- Start with a deep_search for comprehensive coverage
- Use diverse_search if you need different perspectives
- Use search_documents for specific follow-up questions
- Always cite sources with document name and page number when presenting findings
- If information is incomplete, explicitly state what's missing

Be thorough but concise. Focus on accuracy and completeness."""),
            ("human", "{input}"),
            MessagesPlaceholder(variable_name="agent_scratchpad"),
        ])

        agent = create_openai_tools_agent(
            llm=self.llm,
            tools=self.tools,
            prompt=prompt
        )

        return AgentExecutor(
            agent=agent,
            tools=self.tools,
            verbose=True,
            max_iterations=10,
            return_intermediate_steps=True
        )

    def research(self, question: str) -> Dict:
        """Conduct research on a question.

        Args:
            question: The research question passed to the agent as ``input``.

        Returns:
            The executor's result dict; the example usage in this notebook reads
            the final answer from its ``'output'`` key.
        """
        return self.agent_executor.invoke({"input": question})

In [None]:
# Example usage: build the agent once, then answer questions interactively
# until the user types 'quit', 'exit' or 'q'.
if __name__ == "__main__":
    # Path to your giz folder with 20 PDFs
    # NOTE(review): absolute, machine-specific path; the notebook only runs
    # as-is on the author's machine.
    documents_path = r"C:\Users\yannik_sassmann\Documents\YASA\Fortbildungen\Data_Science_Bootcamp\Final_Project\Ironhack_Capstone_Project\pdfs\giz"

    # Or use forward slashes (works on Windows too)
    # documents_path = "Final_Project/Ironhack_Capstone_Project/giz"

    # Initialize agent
    print("Initializing Deep Research Agent...")
    agent = DeepResearchAgent(
        documents_path=documents_path,
        persist_directory="./chroma_db_giz"
    )
    print("Agent ready!\n")

    banner = '=' * 80

    # Conduct research
    while True:
        question = input("\nYour question (or 'quit' to exit): ").strip()
        if question.lower() in ['quit', 'exit', 'q']:
            break
        if not question:
            # Blank input: ask again rather than spending an agent run on it.
            continue

        print(f"\n{banner}")
        print(f"RESEARCHING: {question}")
        print(f"{banner}\n")

        # Run the agent exactly once per question; each run triggers several
        # LLM and vector-store calls, so a repeated call doubles latency and cost.
        result = agent.research(question)

        print(f"\n{banner}")
        print("FINAL ANSWER:")
        print(f"{banner}\n")
        print(result['output'])
