### Cell 1: Import Libraries and Define Paths


In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Locations of the input document, relative to the notebook's working directory.
DATA_DIR = "data"
PDF_NAME = "EU_AI_Act_latest.pdf"
PDF_PATH = os.path.join(DATA_DIR, PDF_NAME)

# Make sure the data folder is present; exist_ok=True keeps re-running this
# cell harmless when the folder is already there.
os.makedirs(DATA_DIR, exist_ok=True)

# Report the resolved absolute locations so a misplaced PDF is easy to spot.
print(
    "Project Setup:",
    "Data directory: " + os.path.abspath(DATA_DIR),
    "Expected PDF path: " + os.path.abspath(PDF_PATH),
    sep="\n",
)

Project Setup:
Data directory: C:\eu_ai_act_navigator\data
Expected PDF path: C:\eu_ai_act_navigator\data\EU_AI_Act_latest.pdf


### Cell 2: Load the PDF Document


In [2]:
# Load the PDF into `pages`. Both the missing-file branch and the load-failure
# branch leave `pages` as an empty list so later cells can test it for emptiness.
if os.path.exists(PDF_PATH):
    print(f"Loading PDF from: {PDF_PATH}")
    loader = PyPDFLoader(PDF_PATH)
    try:
        # load() yields a list of Document objects, one per page.
        pages = loader.load()
        print(f"Successfully loaded {len(pages)} pages from the PDF.")

        # Optional: peek at the first few pages (uncomment to run)
        # for i in range(min(3, len(pages))):
        #     print(f"\n--- Content of Page {i+1} (first 300 chars) ---")
        #     print(pages[i].page_content[:300])
        #     print(f"--- Metadata of Page {i+1} ---")
        #     print(pages[i].metadata)

    except Exception as load_error:
        # Best-effort: report the problem and continue with no pages rather
        # than stopping the notebook here.
        print(f"An error occurred while loading the PDF: {load_error}")
        pages = []
else:
    # Nothing at the expected location: tell the user where to put the file.
    print(
        "---",
        f"ERROR: PDF not found at {PDF_PATH}.",
        f"Please download the EU AI Act PDF, name it '{PDF_NAME}', and place it in the '{DATA_DIR}' directory.",
        "---",
        sep="\n",
    )
    pages = []

Loading PDF from: data\EU_AI_Act_latest.pdf
Successfully loaded 144 pages from the PDF.


### Cell 3: Split Documents into Chunks


In [3]:
# Cut the loaded pages into overlapping character chunks. If `pages` is missing
# or empty, fall back to an empty `docs_chunks` so later cells can detect it.
if not locals().get('pages'):
    print("Variable 'pages' not defined or is empty. Please ensure the PDF was loaded successfully in the previous cell.")
    docs_chunks = []
else:
    print(f"\nSplitting the {len(pages)} pages into smaller chunks...")
    text_splitter = RecursiveCharacterTextSplitter(
        is_separator_regex=False,  # separators are plain strings, not regexes
        length_function=len,       # chunk length is measured with len()
        chunk_overlap=150,         # characters shared between neighbouring chunks
        chunk_size=1000,           # upper bound on characters per chunk
    )
    docs_chunks = text_splitter.split_documents(pages)
    print(f"Split the document into {len(docs_chunks)} chunks.")

    # Optional: peek at the first few chunks (uncomment to run)
    # for i, chunk in enumerate(docs_chunks[:3], start=1):
    #     print(f"\n--- Chunk {i} (first 200 chars) ---")
    #     print(chunk.page_content[:200])
    #     print(f"--- Metadata of Chunk {i} ---")
    #     print(chunk.metadata)  # usually carries over the source page's metadata
    #     print(f"Length of Chunk {i}: {len(chunk.page_content)}")


Splitting the 144 pages into smaller chunks...
Split the document into 792 chunks.


### CUDA Check

In [8]:
import torch

# Ask PyTorch whether a CUDA device is usable and report what it finds.
cuda_available = torch.cuda.is_available()
print("CUDA Available: {}".format(cuda_available))

if not cuda_available:
    # No GPU visible: explain the likely cause and the consequence.
    print(
        "CUDA is not available. Ensure NVIDIA drivers and CUDA toolkit are installed correctly, and PyTorch was installed with CUDA support.",
        "Falling back to CPU for embeddings. This will be slower.",
        sep="\n",
    )
else:
    # Printed one query at a time so earlier lines still appear if a later
    # query fails.
    print(f"Number of GPUs: {torch.cuda.device_count()}")
    print(f"Current CUDA device: {torch.cuda.current_device()}")
    print(f"Device Name: {torch.cuda.get_device_name(torch.cuda.current_device())}")

CUDA Available: True
Number of GPUs: 1
Current CUDA device: 0
Device Name: NVIDIA GeForce GTX 1650


### Cell 4: Initialize Embedding Model


In [5]:
import torch
from langchain_huggingface import HuggingFaceEmbeddings # Updated import for newer Langchain

# Run the embedding model on the GPU when PyTorch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device} for HuggingFaceEmbeddings.")

# Embedding model choice (author's notes):
#   'all-MiniLM-L6-v2'  - small and fast, suited to CPU/prototyping.
#   'all-mpnet-base-v2' - larger and a bit slower, generally better quality;
#                         chosen here because CUDA is available.
model_name = "sentence-transformers/all-mpnet-base-v2"
# model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1"  # alternative aimed at QA tasks

# Passed through to the model so it is placed on the selected device.
model_kwargs = dict(device=device)

# Passed through to the encode step: request normalized embeddings
# (noted by the author as often improving performance).
encode_kwargs = dict(normalize_embeddings=True)

if not locals().get('docs_chunks'):
    # Nothing to embed yet: leave a None placeholder so later cells can check for it.
    print("Variable 'docs_chunks' not defined or empty. Please ensure the document splitting cell ran successfully.")
    embeddings = None
else:
    print(f"\nInitializing HuggingFaceEmbeddings model: {model_name}")
    embeddings = HuggingFaceEmbeddings(
        encode_kwargs=encode_kwargs,
        model_kwargs=model_kwargs,
        model_name=model_name,
    )
    print("HuggingFaceEmbeddings model initialized successfully.")

    # Optional: embed a single sentence as a smoke test (uncomment to run)
    # text_to_embed = "What are the requirements for high-risk AI systems?"
    # query_result = embeddings.embed_query(text_to_embed)
    # print(f"\nTest embedding for '{text_to_embed}':")
    # print(f"  First 5 dimensions: {query_result[:5]}")
    # print(f"  Embedding length: {len(query_result)}")

Using device: cuda for HuggingFaceEmbeddings.

Initializing HuggingFaceEmbeddings model: sentence-transformers/all-mpnet-base-v2


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


HuggingFaceEmbeddings model initialized successfully.


### Cell 5: Create and Save FAISS Vector Store


In [6]:
from langchain_community.vectorstores import FAISS

# Where the index is written; kept in its own subfolder.
DB_FAISS_PATH = "vectorstore/db_faiss_eu_ai_act"

# Check prerequisites first (chunks, then the embedding model) and only build
# the vector store when both are present.
if not locals().get('docs_chunks'):
    print("Variable 'docs_chunks' not defined or empty. Run the document processing and chunking cells first.")
elif not locals().get('embeddings'):
    print("Variable 'embeddings' not initialized. Run the embedding model initialization cell first.")
else:
    print(f"\nCreating FAISS vector store with {len(docs_chunks)} chunks using '{model_name}' embeddings...")
    print("This may take a few minutes depending on the number of chunks and GPU speed...")

    # The parent folder of the index path must exist before saving.
    os.makedirs(os.path.dirname(DB_FAISS_PATH), exist_ok=True)

    # Embeds every chunk and builds the in-memory store from the results.
    db = FAISS.from_documents(docs_chunks, embeddings)
    print("FAISS vector store created successfully in memory.")

    # Write the index to disk so it can be reused without rebuilding.
    db.save_local(DB_FAISS_PATH)
    print(f"FAISS index saved locally to: {DB_FAISS_PATH}")

    # Optional: try a similarity search on the in-memory store (uncomment to run)
    # print("\nTesting similarity search in the newly created DB...")
    # test_query = "What are the obligations for AI providers?"
    # search_results = db.similarity_search_with_score(test_query, k=2)  # top 2 results
    # print(f"Search results for query: '{test_query}'")
    # for i, (doc, score) in enumerate(search_results):
    #     print(f"\nResult {i+1} (Score: {score:.4f}):")  # author's note: lower is better for FAISS L2 distance
    #     print(f"  Source Page: {doc.metadata.get('page', 'N/A')}")
    #     print(f"  Content (first 150 chars): {doc.page_content[:150]}...")


Creating FAISS vector store with 792 chunks using 'sentence-transformers/all-mpnet-base-v2' embeddings...
This may take a few minutes depending on the number of chunks and GPU speed...
FAISS vector store created successfully in memory.
FAISS index saved locally to: vectorstore/db_faiss_eu_ai_act
