# Notebook 1: Data Preprocessing and Embedding Creation

**Objective:** This notebook walks through the steps of loading documents, splitting them into chunks, generating embeddings, and creating/saving a vector store (FAISS index).

**Steps:**
0. Setup: Import necessary libraries and add the project root to `sys.path`.
1. Load Configuration: Access settings from `src/config.py`.
2. Load Documents: Use `data_processor.load_documents_from_directory`.
3. Split Documents: Use `data_processor.split_documents_into_chunks`.
4. Initialize Embedding Model: Use `vector_store.get_embedding_model`.
5. Create and Save Vector Store: Use `vector_store.create_and_save_vector_store`.
6. (Optional) Test Loading Vector Store: Verify the saved store can be loaded.

In [None]:
import sys
import os
from pathlib import Path

def find_project_root(start: Path, marker: str = "src") -> Path:
    """Locate the project root by searching upward from ``start``.

    The project root is taken to be the nearest directory, at or above
    ``start``, that contains a sub-directory named ``marker``. This keeps the
    notebook working whether the kernel's working directory is the
    'notebooks/' folder, the project root itself, or a nested folder.

    Args:
        start: Directory to begin the search from.
        marker: Name of the sub-directory that identifies the project root.

    Returns:
        The first matching directory, or ``start.parent`` when no ancestor
        contains ``marker`` (the layout where the notebook sits one level
        below the project root).
    """
    for candidate in (start, *start.parents):
        if (candidate / marker).is_dir():
            return candidate
    return start.parent


# Add the project root to sys.path so that `from src import ...` resolves.
project_root = find_project_root(Path(os.getcwd()))
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Now we can import from src
from src import config
from src.data_processor import load_documents_from_directory, split_documents_into_chunks
from src.vector_store import get_embedding_model, create_and_save_vector_store, load_vector_store

# Report the resolved project root, then whether an API key was picked up.
# Only a yes/no status is shown; the key itself is never printed.
print(f"Project Root: {project_root}")
if config.OPENAI_API_KEY:
    api_key_status = 'Yes'
else:
    api_key_status = 'No (Please check .env file!)'
print(f"OpenAI API Key Loaded: {api_key_status}")

## 1. Load Configuration

In [None]:
# Echo the settings this notebook reads from `config`, one per line.
# Each attribute is looked up just before it is printed.
config_fields = (
    ("Data Path", "DATA_PATH"),
    ("Vector Store Path", "VECTOR_STORE_PATH"),
    ("Chunk Size", "CHUNK_SIZE"),
    ("Chunk Overlap", "CHUNK_OVERLAP"),
    ("Embedding Model", "EMBEDDING_MODEL_NAME"),
)
for label, attr_name in config_fields:
    print(f"{label}: {getattr(config, attr_name)}")

## 2. Load Documents

In [None]:
# Upper bound on how many documents are previewed below; keeps the cell
# output readable when the data directory holds many files.
MAX_PREVIEW_DOCS = 5

# Load everything found under the configured data path.
raw_documents = load_documents_from_directory(config.DATA_PATH)
if raw_documents:
    print(f"Successfully loaded {len(raw_documents)} documents.")
    for i, doc in enumerate(raw_documents):
        if i >= MAX_PREVIEW_DOCS:
            # Summarise the remainder instead of printing every document.
            print(f"... and {len(raw_documents) - MAX_PREVIEW_DOCS} more document(s) not shown.")
            break
        print(f"--- Document {i+1} ---")
        print(f"Source: {doc.metadata.get('source', 'N/A')}")
        print(f"Content (first 100 chars): {doc.page_content[:100].strip()}...")
else:
    print("No documents found or loaded. Please check the 'data/' directory.")

## 3. Split Documents into Chunks

In [None]:
# `document_chunks` is always defined after this cell, so later cells can
# test it without caring whether any documents were loaded.
if not raw_documents:
    document_chunks = []
    print("Skipping chunking as no documents were loaded.")
else:
    # Chunk size and overlap come from the project configuration.
    document_chunks = split_documents_into_chunks(
        raw_documents,
        chunk_size=config.CHUNK_SIZE,
        chunk_overlap=config.CHUNK_OVERLAP,
    )
    print(f"Split into {len(document_chunks)} chunks.")
    if document_chunks:
        # Show one chunk as a sanity check on the splitter output.
        print("\n--- First Chunk Example ---")
        first_chunk = document_chunks[0]
        print(f"Content: {first_chunk.page_content[:300]}...")
        print(f"Metadata: {first_chunk.metadata}")

## 4. Initialize Embedding Model

In [None]:
# Build the embedding model; a falsy result is treated as a failed setup.
embeddings_model = get_embedding_model()
if not embeddings_model:
    print(f"Failed to initialize embedding model. Check API key or model name.")
else:
    print(f"Embedding model ({config.EMBEDDING_MODEL_NAME}) initialized successfully.")

## 5. Create and Save Vector Store

In [None]:
# Stays None unless the store is actually built below.
vector_store_instance = None

# Check the two prerequisites first; only build when both are available.
if not document_chunks:
    print("Cannot create vector store: No document chunks available.")
elif not embeddings_model:
    print("Cannot create vector store: Embedding model not initialized.")
else:
    # Make sure the directory that will hold the index exists.
    index_parent_dir = config.VECTOR_STORE_PATH.parent
    index_parent_dir.mkdir(parents=True, exist_ok=True)

    vector_store_instance = create_and_save_vector_store(
        chunks=document_chunks,
        embeddings_model=embeddings_model,
        index_path=str(config.VECTOR_STORE_PATH),
    )
    if not vector_store_instance:
        print("Failed to create or save vector store.")
    else:
        print(f"Vector store created and saved to {config.VECTOR_STORE_PATH}")

## 6. (Optional) Test Loading the Saved Vector Store

In [None]:
# The load test needs both a saved store on disk and a usable embedding model.
can_run_load_test = config.VECTOR_STORE_PATH.exists() and embeddings_model

if not can_run_load_test:
    print("Skipping load test: Vector store file does not exist or embedding model not ready.")
else:
    print(f"\nAttempting to load vector store from: {config.VECTOR_STORE_PATH}")
    loaded_vs = load_vector_store(
        index_path=str(config.VECTOR_STORE_PATH),
        embeddings_model=embeddings_model,
    )
    if not loaded_vs:
        print("Failed to load the saved vector store.")
    else:
        print("Vector store loaded successfully for testing.")
        # Optional follow-up: run a sample search against the loaded store, e.g.
        #   test_query = "What is RAG?"
        #   results = loaded_vs.similarity_search(test_query, k=1)
        #   if results:
        #       print(f"Test search for '{test_query}' found: {results[0].page_content[:100]}...")

--- End of Notebook 1 ---