In [7]:
# Load all necessary packages

import glob
import json
import os
import pickle
import time
from pathlib import Path
from typing import List, Dict

import pandas as pd
from dotenv import load_dotenv
from langchain_classic.agents import AgentExecutor, create_tool_calling_agent, create_react_agent, create_openai_tools_agent
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader, TextLoader, PyPDFDirectoryLoader
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.tools import Tool
from langchain_openai import ChatOpenAI, OpenAIEmbeddings, AzureOpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from openai import AzureOpenAI
from tqdm import tqdm

In [4]:
# Load environment variables (python-dotenv) before any code reads os.environ
load_dotenv()

True

In [3]:
# Directory that contains the PDF files to index.
# Set the GIZ_PDF_DIR environment variable (for example in the .env file) to
# point at the PDFs on another machine; the fallback is the original local path.
documents_path = os.environ.get(
    "GIZ_PDF_DIR",
    r"C:\Users\yannik_sassmann\Documents\YASA\Fortbildungen\Data_Science_Bootcamp\Final_Project\Ironhack_Capstone_Project\pdfs\giz",
)
# Alias kept for backward compatibility: both names referred to the same directory.
file_path = documents_path

In [5]:
# Throttle embedding requests to stay within the Azure OpenAI rate limits
class RateLimitedAzureOpenAIEmbeddings(AzureOpenAIEmbeddings):
    """Azure OpenAI embeddings client that spaces out calls to respect a rate limit.

    Before every ``embed_documents`` / ``embed_query`` call the instance sleeps
    just long enough that consecutive calls are at least
    ``60 / requests_per_minute`` seconds apart.

    NOTE(review): throttling is applied once per call of these two methods.
    If the parent class issues several HTTP requests for one call, those are
    not individually throttled — confirm against the installed langchain_openai.

    Args:
        requests_per_minute: Maximum number of calls per minute (must be > 0).
        **kwargs: Forwarded unchanged to ``AzureOpenAIEmbeddings``.

    Raises:
        ValueError: If ``requests_per_minute`` is not positive.
    """

    # Declared as class-level fields rather than assigned ad hoc in __init__:
    # the parent class is a pydantic model, and pydantic models reject
    # assignment to attributes that are not declared fields.
    requests_per_minute: int = 400
    # Timestamp (time.monotonic) of the most recent call; 0.0 means "never".
    last_request_time: float = 0.0

    def __init__(self, requests_per_minute: int = 400, **kwargs):
        # Guard against division by zero / negative intervals.
        if requests_per_minute <= 0:
            raise ValueError("requests_per_minute must be a positive integer")
        super().__init__(requests_per_minute=requests_per_minute, **kwargs)

    @property
    def min_seconds_between_requests(self) -> float:
        """Minimum spacing between two calls, derived from ``requests_per_minute``."""
        return 60.0 / self.requests_per_minute

    def _rate_limit(self):
        """Sleep until at least ``min_seconds_between_requests`` have passed since the last call."""
        # A monotonic clock is used so that wall-clock adjustments cannot
        # produce negative or very large waits.
        elapsed = time.monotonic() - self.last_request_time
        remaining = self.min_seconds_between_requests - elapsed
        if remaining > 0:
            time.sleep(remaining)

        self.last_request_time = time.monotonic()

    def embed_documents(self, texts: List[str], **kwargs) -> List[List[float]]:
        """Embed a list of texts after waiting for the rate limit.

        Extra keyword arguments are forwarded to the parent implementation.
        """
        self._rate_limit()
        return super().embed_documents(texts, **kwargs)

    def embed_query(self, text: str, **kwargs) -> List[float]:
        """Embed a single query string after waiting for the rate limit.

        Extra keyword arguments are forwarded to the parent implementation.
        """
        self._rate_limit()
        return super().embed_query(text, **kwargs)

In [9]:
class VectorStoreCreator:
    """Creates and persists a Chroma vector store from PDF documents.

    Constructing an instance runs the whole pipeline:

    1. Load the PDFs page by page and split them into overlapping chunks,
       or reuse the pickled chunk cache (``CHUNKS_CACHE``) if it exists.
    2. Prepend catalog metadata from ``metadata_json_path`` to every chunk so
       that it becomes part of the embedded text.
    3. Embed the chunks in batches and store them in ``persist_directory``,
       resuming from an existing store when one is found.

    Args:
        documents_path: A directory of PDFs, a single ``.pdf`` file, or a glob
            pattern matching PDF files.
        persist_directory: Directory in which Chroma persists the store.
        metadata_json_path: JSON file holding a list of records; records are
            matched to PDFs through their ``"filename"`` entry.

    Raises:
        KeyError: If the ``OPENAI_AZURE_API_KEY`` environment variable is unset.
        ValueError: If no PDF pages could be loaded, or there is nothing to embed.
    """

    # Pickled list of enriched chunks. The cache is looked up by this path
    # only: it is reused even when documents_path or the metadata JSON have
    # changed, so delete the file to force the PDFs to be re-read.
    CHUNKS_CACHE = "./chunks_cache.pkl"

    # Substrings (lower-case) that mark an exception as a rate-limit error.
    _RATE_LIMIT_MARKERS = ("ratelimit", "rate limit", "429", "too many requests", "quota")

    def __init__(self, documents_path: str, persist_directory: str = "./chroma_db",
                 metadata_json_path: str = "results_giz.json"):
        self.embeddings = OpenAIEmbeddings(
            model="text-embedding-3-large",
            base_url="https://bootcampai.openai.azure.com/openai/v1/",
            api_key=os.environ["OPENAI_AZURE_API_KEY"]
        )
        self.persist_directory = persist_directory
        self.metadata_json_path = metadata_json_path
        self.vectorstore = self._setup_vectorstore(documents_path)

    # ─────────────────────────────────────────────────────────────────────
    # SETUP
    # ─────────────────────────────────────────────────────────────────────
    def _setup_vectorstore(self, documents_path: str):
        """Load (or rebuild) the chunks, then embed them into the vector store.

        Returns:
            The populated Chroma vector store.

        Raises:
            ValueError: If the PDFs have to be read and no page could be loaded.
        """

        # ── STEP 1: Load or create chunks ────────────────────────────────
        if os.path.exists(self.CHUNKS_CACHE):
            print(f"✓ Loading chunks from cache: {self.CHUNKS_CACHE} (skipping PDF reload)")
            # SECURITY: pickle.load executes arbitrary code from the file.
            # Only load a cache that this notebook wrote itself.
            with open(self.CHUNKS_CACHE, "rb") as f:
                chunks = pickle.load(f)
            print(f"✓ {len(chunks)} chunks loaded from cache")
        else:
            print(f"\n{'='*80}")
            print("LOADING PDF DOCUMENTS")
            print(f"{'='*80}")

            path = Path(documents_path)
            if path.is_dir():
                documents = self._load_pdfs_from_directory(documents_path)
            elif path.is_file() and str(path).lower().endswith('.pdf'):
                documents = self._load_single_pdf(documents_path)
            else:
                # Neither a directory nor a PDF file: treat it as a glob pattern.
                documents = self._load_pdfs_from_pattern(documents_path)

            print(f"\n✓ Total pages loaded: {len(documents)}")
            if len(documents) == 0:
                raise ValueError("No PDF documents loaded!")

            # Split into chunks
            print("\nSplitting into chunks...")
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1500,
                chunk_overlap=300,
                length_function=len,
            )
            chunks = text_splitter.split_documents(documents)
            print(f"✓ Created {len(chunks)} chunks")

            # Enrich chunks with metadata BEFORE embedding
            chunks = self._enrich_chunks_with_metadata(chunks)

            # Save cache
            print(f"\nSaving chunk cache to {self.CHUNKS_CACHE}...")
            with open(self.CHUNKS_CACHE, "wb") as f:
                pickle.dump(chunks, f)
            print(f"✓ Cache saved ({len(chunks)} chunks)")

        # ── STEP 2: Embed ─────────────────────────────────────────────────
        # 370 is the chunks-per-minute figure the original estimate was based on.
        print(f"\nProcessing {len(chunks)} chunks — estimated time: "
              f"{len(chunks) / 370 + 1:.1f} minutes")
        vectorstore = self._create_vectorstore_with_batching(chunks)
        print("✓ Vector store ready!\n")
        return vectorstore

    # ─────────────────────────────────────────────────────────────────────
    # METADATA ENRICHMENT
    # ─────────────────────────────────────────────────────────────────────
    @staticmethod
    def _metadata_value(value):
        """Return ``value`` in a form that can be stored as a metadata entry.

        Strings, numbers and booleans are returned unchanged. ``None`` becomes
        an empty string, lists/tuples/sets are joined with ``", "`` and any
        other object is converted with ``str``.
        """
        if value is None:
            return ""
        if isinstance(value, (str, int, float, bool)):
            return value
        if isinstance(value, (list, tuple, set)):
            return ", ".join(str(item) for item in value)
        return str(value)

    def _enrich_chunks_with_metadata(self, chunks: List[Document]) -> List[Document]:
        """Prepend catalog metadata to chunk text so it becomes searchable.

        Chunks are matched to catalog records by the base name of their
        ``source`` metadata entry. Matched chunks get a text header followed by
        a ``---`` separator line, plus extra metadata fields for filtering.
        Every chunk receives a ``metadata_enriched`` flag. The chunks are
        modified in place and the same list is returned.
        """

        if not os.path.exists(self.metadata_json_path):
            print(f"⚠ Metadata JSON not found at {self.metadata_json_path} — skipping enrichment")
            return chunks

        with open(self.metadata_json_path, "r", encoding="utf-8") as f:
            records = json.load(f)
        # Index the records by file name; entries without a file name are ignored.
        lookup = {
            str(r["filename"]).strip(): r
            for r in records
            if isinstance(r, dict) and r.get("filename")
        }
        print(f"✓ Loaded metadata for {len(lookup)} documents")

        header_fields = [
            ("Title",          "Titel"),
            ("Subtitle",       "Titelzusatz"),
            ("Keywords",       "Schlagworte"),
            ("Classification", "Klassifikation"),
            ("Year",           "Erscheinungsdatum"),
            ("Language",       "Sprache"),
            ("Authors",        "Autor"),
        ]

        matched, unmatched = 0, 0
        for chunk in chunks:
            fname = os.path.basename(chunk.metadata.get("source", ""))
            record = lookup.get(fname)

            if record:
                # Build text prefix — this gets baked into the embedding
                lines = []
                for label, key in header_fields:
                    # Values may be null, numeric or lists in the JSON, so they
                    # are normalised to text before stripping.
                    val = str(self._metadata_value(record.get(key))).strip()
                    if val:
                        lines.append(f"{label}: {val}")

                chunk.page_content = "\n".join(lines) + "\n---\n" + chunk.page_content

                # Also store metadata fields for filtering
                chunk.metadata.update({
                    "titel":             self._metadata_value(record.get("Titel", "")),
                    "schlagworte":       self._metadata_value(record.get("Schlagworte", "")),
                    "klassifikation":    self._metadata_value(record.get("Klassifikation", "")),
                    "erscheinungsdatum": self._metadata_value(record.get("Erscheinungsdatum", "")),
                    "sprache":           self._metadata_value(record.get("Sprache", "")),
                    "projektnummer":     self._metadata_value(record.get("id", "")),
                    "metadata_enriched": True,
                })
                matched += 1
            else:
                chunk.metadata["metadata_enriched"] = False
                unmatched += 1

        print(f"✓ Enriched {matched} chunks | {unmatched} chunks had no metadata match")
        return chunks

    # ─────────────────────────────────────────────────────────────────────
    # EMBEDDING WITH RESUME
    # ─────────────────────────────────────────────────────────────────────
    @classmethod
    def _is_rate_limit_error(cls, error: Exception) -> bool:
        """Return True if the error message looks like a rate-limit / quota error."""
        message = str(error).lower()
        return any(marker in message for marker in cls._RATE_LIMIT_MARKERS)

    def _create_vectorstore_with_batching(self, chunks: List[Document], batch_size: int = 100):
        """Embed chunks in batches — resumes automatically on interruption.

        If ``persist_directory`` already exists, the store is opened and
        embedding continues after the chunks it already holds. Rate-limit
        errors are retried up to three times with a growing wait; any other
        error is re-raised immediately.

        Args:
            chunks: All chunks, in the same order as in previous runs.
            batch_size: Number of chunks added per call (must be >= 1).

        Returns:
            The Chroma vector store.

        Raises:
            ValueError: If ``batch_size`` is below 1, or if there are no chunks
                and no existing store.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        if os.path.exists(self.persist_directory):
            vectorstore = Chroma(
                persist_directory=self.persist_directory,
                embedding_function=self.embeddings
            )
            existing_count = vectorstore._collection.count()
            print(f"Found existing store with {existing_count}/{len(chunks)} chunks")

            if existing_count >= len(chunks):
                print("✓ Store is complete. Skipping embedding.")
                return vectorstore

            # Continue right after the last stored chunk. Rounding down to a
            # batch boundary would add already-stored chunks a second time.
            # Assumes the store holds exactly the first `existing_count`
            # chunks of `chunks`, in order.
            start_idx = existing_count
            print(f"⚠ Resuming from chunk {start_idx}...")
            chunks_to_process = chunks[start_idx:]
        else:
            if not chunks:
                # Without this guard the final count below fails on None.
                raise ValueError("No chunks to embed and no existing vector store found.")
            vectorstore = None
            chunks_to_process = chunks

        total_batches = (len(chunks_to_process) + batch_size - 1) // batch_size
        print(f"Processing {total_batches} remaining batches...")

        max_retries = 3
        for i in tqdm(range(0, len(chunks_to_process), batch_size), desc="Embedding"):
            batch = chunks_to_process[i:i + batch_size]

            for attempt in range(1, max_retries + 1):
                try:
                    if vectorstore is None:
                        # The first batch creates the persistent store.
                        vectorstore = Chroma.from_documents(
                            documents=batch,
                            embedding=self.embeddings,
                            persist_directory=self.persist_directory
                        )
                    else:
                        vectorstore.add_documents(batch)
                except Exception as e:
                    # Give up at once on other errors, and on the last attempt
                    # without sleeping first.
                    if attempt == max_retries or not self._is_rate_limit_error(e):
                        raise
                    wait = 60 * attempt
                    print(f"\n  Rate limit (attempt {attempt}). Waiting {wait}s...")
                    time.sleep(wait)
                else:
                    # Short pause between successful batches.
                    time.sleep(1)
                    break  # success

        print(f"\n✓ Complete! Total chunks: {vectorstore._collection.count()}")
        return vectorstore

    # ─────────────────────────────────────────────────────────────────────
    # PDF LOADERS
    # ─────────────────────────────────────────────────────────────────────
    def _load_single_pdf(self, pdf_path: str) -> List[Document]:
        """Load one PDF; returns its pages, or an empty list if loading fails."""
        try:
            loader = PyPDFLoader(pdf_path)
            documents = loader.load()
            print(f"  ✓ Loaded: {os.path.basename(pdf_path)} ({len(documents)} pages)")
            return documents
        except Exception as e:
            # Best effort: a broken PDF is reported and skipped.
            print(f"  ✗ Failed to load {pdf_path}: {e}")
            return []

    def _load_pdfs_from_directory(self, directory_path: str) -> List[Document]:
        """Load every PDF directly inside ``directory_path`` (not recursive)."""
        return self._load_pdfs_manually(directory_path)

    def _load_pdfs_manually(self, directory_path: str) -> List[Document]:
        """Load all ``.pdf`` files of a directory one by one, printing progress.

        Files that fail to load are reported and skipped.
        """
        documents = []
        # os.listdir returns names in arbitrary order; sorting keeps the chunk
        # order the same from run to run.
        pdf_files = [
            os.path.join(directory_path, f)
            for f in sorted(os.listdir(directory_path))
            if f.lower().endswith('.pdf')
        ]
        print(f"Found {len(pdf_files)} PDF files\n")
        for i, pdf_file in enumerate(pdf_files, 1):
            try:
                print(f"[{i}/{len(pdf_files)}] Loading {os.path.basename(pdf_file)}...", end=" ")
                docs = PyPDFLoader(pdf_file).load()
                documents.extend(docs)
                print(f"✓ ({len(docs)} pages)")
            except Exception as e:
                # Only the first 50 characters of the message are shown.
                print(f"✗ Error: {str(e)[:50]}")
        return documents

    def _load_pdfs_from_pattern(self, pattern: str) -> List[Document]:
        """Load all PDFs matching a glob pattern (``**`` matches recursively)."""
        documents = []
        # Sorted for a reproducible order, as in the directory loader.
        for pdf_file in sorted(glob.glob(pattern, recursive=True)):
            if pdf_file.lower().endswith('.pdf'):
                documents.extend(self._load_single_pdf(pdf_file))
        return documents


# ── RUN ───────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Instantiating the creator runs the complete pipeline
    # (load or read chunks, enrich them, embed and persist).
    creator_settings = dict(
        documents_path=documents_path,
        persist_directory="./chroma_db",
        metadata_json_path="results_giz.json",
    )
    vector_store_creator = VectorStoreCreator(**creator_settings)
    print("Done! Vector store is ready.")



LOADING PDF DOCUMENTS
Found 1059 PDF files

[1/1059] Loading 2014-03_1996.2177.2_de.pdf... ✓ (11 pages)
[2/1059] Loading 2014-03_1996.2177.2_po.pdf... ✓ (11 pages)
[3/1059] Loading 2014-05_2006.2051.8_de.pdf... ✓ (15 pages)
[4/1059] Loading 2014-06_2007.2071.4_de.pdf... ✓ (17 pages)
[5/1059] Loading 2014-06_2012.6253.4_de.pdf... ✓ (7 pages)
[6/1059] Loading 2014-07_unknown_de.pdf... ✓ (13 pages)
[7/1059] Loading 2014-08_2004.2115.6_de.pdf... ✓ (27 pages)
[8/1059] Loading 2014-08_2004.2115.6_sp.pdf... ✓ (25 pages)
[9/1059] Loading 2014-09_2011.2074.0_de.pdf... ✓ (6 pages)
[10/1059] Loading 2014-09_2011.2074.0_en.pdf... ✓ (6 pages)
[11/1059] Loading 2014-11_2011.2111.0_de.pdf... ✓ (6 pages)
[12/1059] Loading 2014-11_2011.2111.0_en.pdf... ✓ (6 pages)
[13/1059] Loading 2014-12_2011.2112.8_en.pdf... ✓ (8 pages)
[14/1059] Loading 2014_2006.2051.8_en.pdf... ✓ (14 pages)
[15/1059] Loading 2015-01_2010.2259.9_de.pdf... ✓ (7 pages)
[16/1059] Loading 2015-02_2010.2005.6_de.pdf... ✓ (6 pages)
[17

Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 28 0 (offset 0)


✓ (7 pages)
[415/1059] Loading 2017-09_2012.2133.2_de.pdf... 

Ignoring wrong pointing object 11 0 (offset 0)
Ignoring wrong pointing object 13 0 (offset 0)
Ignoring wrong pointing object 15 0 (offset 0)


✓ (8 pages)
[416/1059] Loading 2017-09_2012.2133.2_en.pdf... ✓ (8 pages)
[417/1059] Loading 2017-09_2013.2236.1_de.pdf... ✓ (9 pages)
[418/1059] Loading 2017-09_2013.2236.1_en.pdf... 

Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)


✓ (11 pages)
[419/1059] Loading 2017-09_2013.6257.3_de.pdf... 

Ignoring wrong pointing object 12 0 (offset 0)


✓ (6 pages)
[420/1059] Loading 2017-09_2013.6257.3_en.pdf... ✓ (5 pages)
[421/1059] Loading 2017-10_2011.2129.2_de.pdf... ✓ (7 pages)
[422/1059] Loading 2017-10_2011.2129.2_en.pdf... ✓ (7 pages)
[423/1059] Loading 2017-10_2012.9756.3_de.pdf... ✓ (7 pages)
[424/1059] Loading 2017-10_2012.9756.3_en.pdf... ✓ (6 pages)
[425/1059] Loading 2017-12_2009.2247.6_de.pdf... ✓ (6 pages)
[426/1059] Loading 2017-12_2009.2247.6_en.pdf... ✓ (6 pages)
[427/1059] Loading 2017-12_2010.2009.8_de.pdf... ✓ (7 pages)
[428/1059] Loading 2017-12_2010.2009.8_en.pdf... ✓ (6 pages)
[429/1059] Loading 2017-12_2010.2074.2_de.pdf... ✓ (6 pages)
[430/1059] Loading 2017-12_2011.9787.0_de.pdf... ✓ (5 pages)
[431/1059] Loading 2017-12_2011.9787.0_en.pdf... ✓ (5 pages)
[432/1059] Loading 2017-12_2012.2174.6_de.pdf... ✓ (7 pages)
[433/1059] Loading 2017-12_2012.2174.6_en.pdf... ✓ (7 pages)
[434/1059] Loading 2017-12_2012.2175.3_de.pdf... ✓ (5 pages)
[435/1059] Loading 2017-12_2012.2175.3_en.pdf... ✓ (8 pages)
[436/1059] L

Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 15 0 (offset 0)
Ignoring wrong pointing object 17 0 (offset 0)


✓ (8 pages)
[544/1059] Loading 2018-06_2013.9769.4_de.pdf... 

Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 16 0 (offset 0)


✓ (6 pages)
[545/1059] Loading 2018-06_2013.9769.4_en.pdf... ✓ (6 pages)
[546/1059] Loading 2018-07_2012.2078.9_de.pdf... ✓ (8 pages)
[547/1059] Loading 2018-07_2012.2078.9_en.pdf... ✓ (8 pages)
[548/1059] Loading 2018-07_2014.2098.3_de.pdf... ✓ (8 pages)
[549/1059] Loading 2018-07_2014.2098.3_en.pdf... ✓ (9 pages)
[550/1059] Loading 2018-08_2013.2099.3_de.pdf... ✓ (8 pages)
[551/1059] Loading 2018-08_2013.2099.3_en.pdf... ✓ (7 pages)
[552/1059] Loading 2018-08_2015.2026.1_en.pdf... ✓ (7 pages)
[553/1059] Loading 2018-10_2011.2105.2_de.pdf... ✓ (7 pages)
[554/1059] Loading 2018-10_2011.2105.2_en.pdf... ✓ (6 pages)
[555/1059] Loading 2018-10_2015.2048.5_de.pdf... ✓ (6 pages)
[556/1059] Loading 2018-10_2015.2048.5_en.pdf... ✓ (6 pages)
[557/1059] Loading 2018-12_2010.2192.2_de.pdf... ✓ (6 pages)
[558/1059] Loading 2018-12_2010.2192.2_en.pdf... ✓ (6 pages)
[559/1059] Loading 2018_2011.2130.0_de.pdf... ✓ (7 pages)
[560/1059] Loading 2018_2011.2130.0_en.pdf... ✓ (7 pages)
[561/1059] Loading

Embedding: 100%|██████████| 1158/1158 [2:20:10<00:00,  7.26s/it] 



✓ Complete! Total chunks: 115752
✓ Vector store ready!

Done! Vector store is ready.
