In [1]:
# Load all necessary packages

import pandas as pd
import os
from typing import List, Dict
from pathlib import Path
from openai import AzureOpenAI
from langchain_openai import ChatOpenAI, OpenAIEmbeddings, AzureOpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader, TextLoader, PyPDFDirectoryLoader
from langchain_classic.agents import AgentExecutor, create_tool_calling_agent, create_react_agent, create_openai_tools_agent
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.tools import Tool
from dotenv import load_dotenv
from tqdm import tqdm






In [10]:
load_dotenv()

True

In [8]:
file_path =r"C:\Users\yannik_sassmann\Documents\YASA\Fortbildungen\Data_Science_Bootcamp\Final_Project\Ironhack_Capstone_Project\pdfs\giz"
documents_path=r"C:\Users\yannik_sassmann\Documents\YASA\Fortbildungen\Data_Science_Bootcamp\Final_Project\Ironhack_Capstone_Project\pdfs\giz"

In [4]:
class RateLimitedAzureOpenAIEmbeddings(AzureOpenAIEmbeddings):
    """Azure OpenAI Embeddings with rate limiting."""
    
    def __init__(self, requests_per_minute: int = 400, **kwargs):
        super().__init__(**kwargs)
        self.requests_per_minute = requests_per_minute
        self.min_seconds_between_requests = 60.0 / requests_per_minute
        self.last_request_time = 0
    
    def _rate_limit(self):
        """Enforce rate limiting between requests."""
        current_time = time.time()
        time_since_last_request = current_time - self.last_request_time
        
        if time_since_last_request < self.min_seconds_between_requests:
            sleep_time = self.min_seconds_between_requests - time_since_last_request
            time.sleep(sleep_time)
        
        self.last_request_time = time.time()
    
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed documents with rate limiting."""
        self._rate_limit()
        return super().embed_documents(texts)
    
    def embed_query(self, text: str) -> List[float]:
        """Embed query with rate limiting."""
        self._rate_limit()
        return super().embed_query(text)

In [5]:
class DeepResearchAgent:
    def __init__(self, documents_path: str, persist_directory: str = "./chroma_db"):
        """
        Initialize the deep research agent.
        
        Args:
            documents_path: Path to a directory containing PDF files
            persist_directory: Where to store the vector database
        """
        # Using text-embedding-3-large for better quality embeddings
        self.embeddings = OpenAIEmbeddings(model="text-embedding-3-large",
                                           base_url="https://bootcampai.openai.azure.com/openai/v1/",
                                           api_key=os.environ["OPENAI_AZURE_API_KEY"])
        
        # Using GPT-5
        self.llm = ChatOpenAI(model="gpt-5",
                              base_url="https://bootcampai.openai.azure.com/openai/v1/",
                              api_key=os.environ["OPENAI_AZURE_API_KEY"])
        
        self.persist_directory = persist_directory
        
        # Load and process documents
        self.vectorstore = self._setup_vectorstore(documents_path)
        
        # Create tools
        self.tools = self._create_tools()
        
        # Create agent
        self.agent_executor = self._create_agent()
    
    def _setup_vectorstore(self, documents_path: str):
        """Load PDF documents and create vector store with batching."""
        print(f"\n{'='*80}")
        print("LOADING PDF DOCUMENTS")
        print(f"{'='*80}")
        print(f"Path: {documents_path}")
        
        path = Path(documents_path)
        
        if path.is_dir():
            print(f"Loading PDFs from directory...")
            documents = self._load_pdfs_from_directory(documents_path)
        elif path.is_file() and str(path).lower().endswith('.pdf'):
            print(f"Loading single PDF...")
            documents = self._load_single_pdf(documents_path)
        else:
            print(f"Trying glob pattern...")
            documents = self._load_pdfs_from_pattern(documents_path)
        
        print(f"\n✓ Total pages loaded: {len(documents)}")
        
        if len(documents) == 0:
            raise ValueError("No PDF documents loaded!")
        
        # Split documents into chunks
        print("\nSplitting documents into chunks...")
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1500,
            chunk_overlap=300,
            length_function=len,
        )
        chunks = text_splitter.split_documents(documents)
        
        print(f"✓ Created {len(chunks)} chunks")
        
        # Create vector store with batching
        print("\nCreating vector embeddings with rate limiting...")
        print(f"Processing {len(chunks)} chunks in batches...")
        print("This will take approximately {:.1f} minutes".format(len(chunks) / 370 + 1))
        
        vectorstore = self._create_vectorstore_with_batching(chunks)
        
        print("✓ Vector store created successfully!\n")
        return vectorstore
    
    def _create_vectorstore_with_batching(self, chunks: List[Document], batch_size: int = 100):
        """Create vector store by processing chunks in batches to avoid rate limits."""
        
        # Check if vectorstore already exists
        if os.path.exists(self.persist_directory):
            print(f"Loading existing vector store from {self.persist_directory}")
            vectorstore = Chroma(
                persist_directory=self.persist_directory,
                embedding_function=self.embeddings
            )
            print("✓ Loaded existing vector store")
            return vectorstore
        
        # Create new vectorstore with batching
        vectorstore = None
        total_batches = (len(chunks) + batch_size - 1) // batch_size
        
        print(f"\nProcessing {total_batches} batches of {batch_size} chunks each...")
        
        for i in tqdm(range(0, len(chunks), batch_size), desc="Creating embeddings"):
            batch = chunks[i:i + batch_size]
            batch_num = i // batch_size + 1
            
            try:
                if vectorstore is None:
                    # Create initial vectorstore
                    print(f"\nBatch {batch_num}/{total_batches}: Creating initial vector store...")
                    vectorstore = Chroma.from_documents(
                        documents=batch,
                        embedding=self.embeddings,
                        persist_directory=self.persist_directory
                    )
                else:
                    # Add to existing vectorstore
                    print(f"\nBatch {batch_num}/{total_batches}: Adding to vector store...")
                    vectorstore.add_documents(batch)
                
                # Small delay between batches to be extra safe
                if i + batch_size < len(chunks):
                    time.sleep(1)
                    
            except Exception as e:
                if "RateLimitReached" in str(e):
                    print(f"\nRate limit hit at batch {batch_num}. Waiting 60 seconds...")
                    time.sleep(60)
                    # Retry this batch
                    if vectorstore is None:
                        vectorstore = Chroma.from_documents(
                            documents=batch,
                            embedding=self.embeddings,
                            persist_directory=self.persist_directory
                        )
                    else:
                        vectorstore.add_documents(batch)
                else:
                    raise e
        
        return vectorstore
    
    def _load_single_pdf(self, pdf_path: str) -> List[Document]:
        """Load a single PDF file."""
        try:
            loader = PyPDFLoader(pdf_path)
            documents = loader.load()
            print(f"  ✓ Loaded: {os.path.basename(pdf_path)} ({len(documents)} pages)")
            return documents
        except Exception as e:
            print(f"  ✗ Failed to load {pdf_path}: {e}")
            return []
    
    def _load_pdfs_from_directory(self, directory_path: str) -> List[Document]:
        """Load all PDF files from a directory."""
        return self._load_pdfs_manually(directory_path)
    
    def _load_pdfs_manually(self, directory_path: str) -> List[Document]:
        """Manually load all PDFs from directory."""
        documents = []
        
        pdf_files = []
        for file in os.listdir(directory_path):
            if file.lower().endswith('.pdf'):
                pdf_files.append(os.path.join(directory_path, file))
        
        print(f"Found {len(pdf_files)} PDF files\n")
        
        for i, pdf_file in enumerate(pdf_files, 1):
            try:
                print(f"[{i}/{len(pdf_files)}] Loading {os.path.basename(pdf_file)}...", end=" ")
                loader = PyPDFLoader(pdf_file)
                docs = loader.load()
                documents.extend(docs)
                print(f"✓ ({len(docs)} pages)")
            except Exception as e:
                print(f"✗ Error: {str(e)[:50]}")
        
        return documents
    
    def _load_pdfs_from_pattern(self, pattern: str) -> List[Document]:
        """Load PDFs matching a glob pattern."""
        pdf_files = glob.glob(pattern, recursive=True)
        documents = []
        
        print(f"Found {len(pdf_files)} files matching pattern")
        
        for pdf_file in pdf_files:
            if pdf_file.lower().endswith('.pdf'):
                docs = self._load_single_pdf(pdf_file)
                documents.extend(docs)
        
        return documents
    
    def _create_tools(self) -> List[Tool]:
        """Create tools for the agent."""
        
        def search_documents(query: str) -> str:
            """Search the document collection for relevant information."""
            docs = self.vectorstore.similarity_search(query, k=20)
            
            results = []
            for i, doc in enumerate(docs, 1):
                source = doc.metadata.get('source', 'Unknown')
                filename = os.path.basename(source)
                page = doc.metadata.get('page', 'N/A')
                results.append(f"Result {i} (Source: {filename}, Page: {page}):\n{doc.page_content}\n")
            
            return "\n".join(results) if results else "No relevant documents found."
        
        def deep_search(query: str) -> str:
            """Perform a deep search by generating multiple query variations."""
            variations_prompt = f"""Given this research question: "{query}"

Generate 3 different search queries that would help gather comprehensive information.
Focus on different aspects or angles of the question.

Format your response as a numbered list:
1. [query 1]
2. [query 2]
3. [query 3]"""
            
            response = self.llm.invoke(variations_prompt)
            queries = [line.split('. ', 1)[1] for line in response.content.split('\n') 
                      if line.strip() and line[0].isdigit()]
            
            all_queries = [query] + queries[:3]
            
            all_results = {}
            for q in all_queries:
                docs = self.vectorstore.similarity_search(q, k=10)
                for doc in docs:
                    all_results[doc.page_content] = doc
            
            results = []
            for i, doc in enumerate(list(all_results.values())[:10], 1):
                source = doc.metadata.get('source', 'Unknown')
                filename = os.path.basename(source)
                page = doc.metadata.get('page', 'N/A')
                results.append(f"Result {i} (Source: {filename}, Page: {page}):\n{doc.page_content}\n")
            
            return "\n".join(results) if results else "No relevant documents found."
        
        def diverse_search(query: str) -> str:
            """Search for diverse, relevant documents using Maximum Marginal Relevance."""
            docs = self.vectorstore.max_marginal_relevance_search(
                query, 
                k=10,
                fetch_k=20
            )
            
            results = []
            for i, doc in enumerate(docs, 1):
                source = doc.metadata.get('source', 'Unknown')
                filename = os.path.basename(source)
                page = doc.metadata.get('page', 'N/A')
                results.append(f"Result {i} (Source: {filename}, Page: {page}):\n{doc.page_content}\n")
            
            return "\n".join(results) if results else "No relevant documents found."
        
        return [
            Tool(
                name="search_documents",
                func=search_documents,
                description="Search the document collection for relevant information. Use this for straightforward queries."
            ),
            Tool(
                name="deep_search",
                func=deep_search,
                description="Perform a comprehensive search using multiple query variations. Use this when you need thorough, multi-faceted information."
            ),
            Tool(
                name="diverse_search",
                func=diverse_search,
                description="Search for diverse perspectives on a topic. Use this when you want varied viewpoints or comprehensive coverage."
            )
        ]
    
    def _create_agent(self) -> AgentExecutor:
        """Create the research agent."""
        
        prompt = ChatPromptTemplate.from_messages([
            ("system", """You are a deep research agent with access to a collection of PDF documents.

Your goal is to provide comprehensive, well-researched answers by:
1. Breaking down complex questions into searchable components
2. Using multiple search strategies to gather information
3. Synthesizing information from multiple sources
4. Identifying gaps and conducting follow-up searches
5. Providing well-sourced, detailed answers with page numbers

When researching:
- Start with a deep_search for comprehensive coverage
- Use diverse_search if you need different perspectives
- Use search_documents for specific follow-up questions
- Always cite sources with document name and page number when presenting findings
- If information is incomplete, explicitly state what's missing

Be thorough but concise. Focus on accuracy and completeness."""),
            ("human", "{input}"),
            MessagesPlaceholder(variable_name="agent_scratchpad"),
        ])
        
        agent = create_openai_tools_agent(
            llm=self.llm,
            tools=self.tools,
            prompt=prompt
        )
        
        return AgentExecutor(
            agent=agent,
            tools=self.tools,
            verbose=True,
            max_iterations=10,
            return_intermediate_steps=True
        )
    
    def research(self, question: str) -> Dict:
        """Conduct research on a question."""
        result = self.agent_executor.invoke({"input": question})
        return result

In [9]:
# Example usage
if __name__ == "__main__":
    # Path to your giz folder with 20 PDFs
    documents_path = r"C:\Users\yannik_sassmann\Documents\YASA\Fortbildungen\Data_Science_Bootcamp\Final_Project\Ironhack_Capstone_Project\pdfs\giz"
    
    # Or use forward slashes (works on Windows too)
    # documents_path = "Final_Project/Ironhack_Capstone_Project/giz"
    
    # Or if running from the Final_Project directory:
    # documents_path = "Ironhack_Capstone_Project/giz"
    
    # Initialize agent
    agent = DeepResearchAgent(
        documents_path=documents_path,
        persist_directory="./chroma_db_giz"
    )
    
    # Conduct research
    question = "What are the main learnings and recommendations from these documents in terms of working with partner organisations in development cooperation?"
    
    print(f"\n{'='*80}")
    print(f"RESEARCH QUESTION: {question}")
    print(f"{'='*80}\n")
    
    result = agent.research(question)
    
    print(f"\n{'='*80}")
    print("FINAL ANSWER:")
    print(f"{'='*80}\n")
    print(result['output'])



LOADING PDF DOCUMENTS
Path: C:\Users\yannik_sassmann\Documents\YASA\Fortbildungen\Data_Science_Bootcamp\Final_Project\Ironhack_Capstone_Project\pdfs\giz
Loading PDFs from directory...
Found 1059 PDF files

[1/1059] Loading 2014-03_1996.2177.2_de.pdf... ✓ (11 pages)
[2/1059] Loading 2014-03_1996.2177.2_po.pdf... ✓ (11 pages)
[3/1059] Loading 2014-05_2006.2051.8_de.pdf... ✓ (15 pages)
[4/1059] Loading 2014-06_2007.2071.4_de.pdf... ✓ (17 pages)
[5/1059] Loading 2014-06_2012.6253.4_de.pdf... ✓ (7 pages)
[6/1059] Loading 2014-07_unknown_de.pdf... ✓ (13 pages)
[7/1059] Loading 2014-08_2004.2115.6_de.pdf... ✓ (27 pages)
[8/1059] Loading 2014-08_2004.2115.6_sp.pdf... ✓ (25 pages)
[9/1059] Loading 2014-09_2011.2074.0_de.pdf... ✓ (6 pages)
[10/1059] Loading 2014-09_2011.2074.0_en.pdf... ✓ (6 pages)
[11/1059] Loading 2014-11_2011.2111.0_de.pdf... ✓ (6 pages)
[12/1059] Loading 2014-11_2011.2111.0_en.pdf... ✓ (6 pages)
[13/1059] Loading 2014-12_2011.2112.8_en.pdf... ✓ (8 pages)
[14/1059] Loading 2

Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 28 0 (offset 0)


✓ (7 pages)
[415/1059] Loading 2017-09_2012.2133.2_de.pdf... 

Ignoring wrong pointing object 11 0 (offset 0)
Ignoring wrong pointing object 13 0 (offset 0)
Ignoring wrong pointing object 15 0 (offset 0)


✓ (8 pages)
[416/1059] Loading 2017-09_2012.2133.2_en.pdf... ✓ (8 pages)
[417/1059] Loading 2017-09_2013.2236.1_de.pdf... ✓ (9 pages)
[418/1059] Loading 2017-09_2013.2236.1_en.pdf... 

Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)


✓ (11 pages)
[419/1059] Loading 2017-09_2013.6257.3_de.pdf... 

Ignoring wrong pointing object 12 0 (offset 0)


✓ (6 pages)
[420/1059] Loading 2017-09_2013.6257.3_en.pdf... ✓ (5 pages)
[421/1059] Loading 2017-10_2011.2129.2_de.pdf... ✓ (7 pages)
[422/1059] Loading 2017-10_2011.2129.2_en.pdf... ✓ (7 pages)
[423/1059] Loading 2017-10_2012.9756.3_de.pdf... ✓ (7 pages)
[424/1059] Loading 2017-10_2012.9756.3_en.pdf... ✓ (6 pages)
[425/1059] Loading 2017-12_2009.2247.6_de.pdf... ✓ (6 pages)
[426/1059] Loading 2017-12_2009.2247.6_en.pdf... ✓ (6 pages)
[427/1059] Loading 2017-12_2010.2009.8_de.pdf... ✓ (7 pages)
[428/1059] Loading 2017-12_2010.2009.8_en.pdf... ✓ (6 pages)
[429/1059] Loading 2017-12_2010.2074.2_de.pdf... ✓ (6 pages)
[430/1059] Loading 2017-12_2011.9787.0_de.pdf... ✓ (5 pages)
[431/1059] Loading 2017-12_2011.9787.0_en.pdf... ✓ (5 pages)
[432/1059] Loading 2017-12_2012.2174.6_de.pdf... ✓ (7 pages)
[433/1059] Loading 2017-12_2012.2174.6_en.pdf... ✓ (7 pages)
[434/1059] Loading 2017-12_2012.2175.3_de.pdf... ✓ (5 pages)
[435/1059] Loading 2017-12_2012.2175.3_en.pdf... ✓ (8 pages)
[436/1059] L

Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 15 0 (offset 0)
Ignoring wrong pointing object 17 0 (offset 0)


✓ (8 pages)
[544/1059] Loading 2018-06_2013.9769.4_de.pdf... 

Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 16 0 (offset 0)


✓ (6 pages)
[545/1059] Loading 2018-06_2013.9769.4_en.pdf... ✓ (6 pages)
[546/1059] Loading 2018-07_2012.2078.9_de.pdf... ✓ (8 pages)
[547/1059] Loading 2018-07_2012.2078.9_en.pdf... ✓ (8 pages)
[548/1059] Loading 2018-07_2014.2098.3_de.pdf... ✓ (8 pages)
[549/1059] Loading 2018-07_2014.2098.3_en.pdf... ✓ (9 pages)
[550/1059] Loading 2018-08_2013.2099.3_de.pdf... ✓ (8 pages)
[551/1059] Loading 2018-08_2013.2099.3_en.pdf... ✓ (7 pages)
[552/1059] Loading 2018-08_2015.2026.1_en.pdf... ✓ (7 pages)
[553/1059] Loading 2018-10_2011.2105.2_de.pdf... ✓ (7 pages)
[554/1059] Loading 2018-10_2011.2105.2_en.pdf... ✓ (6 pages)
[555/1059] Loading 2018-10_2015.2048.5_de.pdf... ✓ (6 pages)
[556/1059] Loading 2018-10_2015.2048.5_en.pdf... ✓ (6 pages)
[557/1059] Loading 2018-12_2010.2192.2_de.pdf... ✓ (6 pages)
[558/1059] Loading 2018-12_2010.2192.2_en.pdf... ✓ (6 pages)
[559/1059] Loading 2018_2011.2130.0_de.pdf... ✓ (7 pages)
[560/1059] Loading 2018_2011.2130.0_en.pdf... ✓ (7 pages)
[561/1059] Loading

  vectorstore = Chroma(


✓ Loaded existing vector store
✓ Vector store created successfully!


RESEARCH QUESTION: What are the main learnings and recommendations from these documents in terms of working with partner organisations in development cooperation?



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `deep_search` with `partner organisations development cooperation partnership lessons learned recommendations working with partners NGOs CSOs local partners consortia aid effectiveness risk management capacity strengthening localization mutual accountability MEL funding modalities`


[0m[33;1m[1;3mResult 1 (Source: 2025-12_2020.2161.6_en.pdf, Page: 100):
partner/executing agency? 
Capacity Works considerations: 
- Results-oriented monitoring (RoM / WoM) is established and used, e.g., for 
evidence-based decisions, risk management. Data are disaggregated by gen-
der and marginalized groups. unintended positive and negative results are mon-
itored. Conflict-sensitive monitoring and ex