In [129]:
!which python

4146.45s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


/Users/phuaweijie/self/configure-llama-stack/.venv/bin/python


In [130]:
# import os
# import asyncio
from pathlib import Path
from typing import List, Dict, Any
from datetime import datetime

# Llama Stack SDK
from llama_stack_client import LlamaStackClient

# Document processing libraries
# import PyPDF2
from docx import Document as DocxDocument
import mammoth  # For better DOCX to text conversion

In [142]:
llama_stack_port = "8321"
client = LlamaStackClient(base_url=f"http://localhost:{llama_stack_port}")

In [153]:
def extract_text_from_docx(file_path: str) -> str:
    """Extract text from DOCX files (modern Word format)"""
    try:
        # Using mammoth for better formatting preservation
        with open(file_path, "rb") as docx_file:
            result = mammoth.extract_raw_text(docx_file)
            return result.value
    except Exception as e:
        print(f"Error processing DOCX {file_path}: {e}")
        # Fallback to python-docx
        try:
            doc = DocxDocument(file_path)
            text = ""
            for paragraph in doc.paragraphs:
                text += paragraph.text + "\n"
            return text
        except:
            return ""

In [163]:
def extract_document_text(file_path: str) -> str:
    """Extract text based on file extension"""
    file_ext = Path(file_path).suffix.lower()

    if file_ext == ".docx":
        return extract_text_from_docx(file_path)
    else:
        print(f"Unsupported file type: {file_ext}")
        return ""

In [164]:
def chunk_text(text: str, metadata: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Split text into chunks for better retrieval"""
    chunks = []
    words = text.split()
    chunk_size = 1000
    chunk_overlap = 200

    for i in range(0, len(words), chunk_size):
        chunk_words = words[i : i + chunk_size + chunk_overlap]
        chunk_text = " ".join(chunk_words)

        chunk_metadata = metadata.copy()
        chunk_metadata.update(
            {
                "chunk_index": len(chunks),
                "chunk_start": i,
                "chunk_end": min(i + chunk_size, len(words)),
            }
        )

        chunks.append({"text": chunk_text, "metadata": chunk_metadata})

    return chunks

In [165]:
def extract_metadata(file_path: str) -> Dict[str, Any]:
    """Extract metadata from file path and content"""
    path = Path(file_path)

    # Basic metadata
    metadata = {
        "filename": path.name,
        "file_type": path.suffix.lower(),
        "file_path": str(path),
        "category": path.parent.name,  # specs, requirements, apis, etc.
        "processed_date": datetime.now().isoformat(),
        "file_size": path.stat().st_size if path.exists() else 0,
    }

    # Try to extract additional metadata from filename
    filename_lower = path.stem.lower()

    # Identify service type from filename
    if "user" in filename_lower or "auth" in filename_lower:
        metadata["service_type"] = "authentication"
    elif "payment" in filename_lower or "billing" in filename_lower:
        metadata["service_type"] = "payment"
    elif "report" in filename_lower or "analytics" in filename_lower:
        metadata["service_type"] = "reporting"
    elif "api" in filename_lower:
        metadata["service_type"] = "api"
    else:
        metadata["service_type"] = "general"

    # Identify document type
    if "spec" in filename_lower or "specification" in filename_lower:
        metadata["document_type"] = "specification"
    elif "requirement" in filename_lower:
        metadata["document_type"] = "requirements"
    elif "api" in filename_lower:
        metadata["document_type"] = "api_documentation"
    else:
        metadata["document_type"] = "general"

    return metadata

In [172]:
def register_vector_store(vector_db_id: str = "service_requests_db"):
    """Register a vector database using the SDK"""
    try:
        response = client.vector_dbs.register(
            vector_db_id=vector_db_id,
            embedding_model="all-MiniLM-L6-v2",
            embedding_dimension=384,
            provider_id="faiss",
        )
        print(f"✅ Registered vector database: {response}")
        print(f"✅ Registered vector database: {vector_db_id}")
        print(f"✅ All vector db {client.vector_dbs.list()}")
        return True
    except Exception as e:
        # Check if it already exists
        if "already exists" in str(e).lower():
            print(f"✅ Vector database '{vector_db_id}' already exists")
            return True
        else:
            print(f"❌ Failed to register vector database: {e}")
            return False

In [178]:
def list_vector_stores():
    """List all available vector databases"""
    try:
        response = client.vector_dbs.list()
        print("📋 Available vector databases:")
        for db in response:
            print(f"  - {db.vector_db_id} (model: {db.embedding_model})")
        return response
    except Exception as e:
        print(f"❌ Failed to list vector databases: {e}")
        return []

In [183]:
def ingest_document_chunks(
    chunks: List[Dict[str, Any]], vector_db_id: str = "service_requests_db"
):
    """Ingest document chunks into the vector database using SDK"""
    print("starting ingestion")
    # Prepare documents for insertion
    documents = []
    for chunk in chunks:
        # print(f"==>> chunk: {chunk}")

        chunk["metadata"][
            "document_id"
        ] = f"{chunk['metadata']['filename']}_{chunk['metadata']['chunk_index']}"

        documents.append(
            {
                # "document_id": f"{chunk['metadata']['filename']}_{chunk['metadata']['chunk_index']}",
                "content": chunk["text"],
                "mime_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                "metadata": chunk["metadata"],
            }
        )
    print("🚀 ~ documents:", documents)

    try:
        providers = client.providers.list()
        provider = next((p for p in providers if p.api == "vector_io"), None)
        print("provider", provider)
        print("hello weijie")
        providers2 = client.vector_dbs.list()
        print(f"==>> providers2: {providers2}")

        resposne = client.vector_io.insert(
            vector_db_id=vector_db_id, chunks=documents
        )

        print(f"==>> resposne: {resposne}")
        print(f"✅ Ingested {len(documents)} chunks")
        return True
    except Exception as e:
        print(f"❌ Failed to ingest chunks: {e}")
        return False

In [192]:
def process_directory(directory_path: str, vector_db_id: str = "service_requests_db"):
    """Process all documents in a directory"""
    directory = Path(directory_path)
    print(f"==>> directory: {directory}")

    if not directory.exists():
        print(f"❌ Directory does not exist: {directory_path}")
        return

    # Register vector database
    if not register_vector_store(vector_db_id):
        print(f"❌ Failed to register vector database. Checking existing databases...")
        list_vector_stores()
        return

    # Supported file extensions
    supported_extensions = {".pdf", ".docx", ".doc", ".txt", ".md"}

    # Process all files
    processed_count = 0
    total_chunks = 0

    for file_path in directory.rglob("*"):
        if file_path.is_file() and file_path.suffix.lower() in supported_extensions:
            print(f"📄 Processing: {file_path.name}")

            # Extract text
            text = extract_document_text(str(file_path))
            if not text.strip():
                print(f"⚠️  No text extracted from {file_path.name}")
                continue

            # Extract metadata
            metadata = extract_metadata(str(file_path))

            # Create chunks
            chunks = chunk_text(text, metadata)
            print(f"==>> chunks: {chunks}")

            # Ingest chunks
            success = ingest_document_chunks(chunks, vector_db_id)
            if success:
                processed_count += 1
                total_chunks += len(chunks)
                print(f"✅ Processed {file_path.name} - {len(chunks)} chunks")
            else:
                print(f"❌ Failed to process {file_path.name}")

    print(f"Hello!")
    print(f"\n🎉 Processing complete!")
    print(f"📊 Files processed: {processed_count}")
    print(f"📊 Total chunks created: {total_chunks}")
    print(f"🗃️  Vector database: {vector_db_id}")

In [195]:
documents_path = "./service_requests"
process_directory(documents_path)

INFO:httpx:HTTP Request: POST http://localhost:8321/v1/vector-dbs "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://localhost:8321/v1/vector-dbs "HTTP/1.1 200 OK"


==>> directory: service_requests
✅ Registered vector database: VectorDBRegisterResponse(embedding_dimension=384, embedding_model='all-MiniLM-L6-v2', identifier='service_requests_db', provider_id='faiss', type='vector_db', provider_resource_id='service_requests_db', access_attributes=None)
✅ Registered vector database: service_requests_db
✅ All vector db [VectorDBListResponseItem(embedding_dimension=384, embedding_model='all-MiniLM-L6-v2', identifier='my_demo_vector_db', provider_id='faiss', type='vector_db', provider_resource_id='my_demo_vector_db'), VectorDBListResponseItem(embedding_dimension=384, embedding_model='all-MiniLM-L6-v2', identifier='v4a56e22261584bbca2e6ab6147b70700', provider_id='faiss', type='vector_db', provider_resource_id='v4a56e22261584bbca2e6ab6147b70700'), VectorDBListResponseItem(embedding_dimension=384, embedding_model='all-MiniLM-L6-v2', identifier='v562d5547d9d6422cae493d0d82eddff0', provider_id='faiss', type='vector_db', provider_resource_id='v562d5547d9d6422

INFO:httpx:HTTP Request: GET http://localhost:8321/v1/providers "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://localhost:8321/v1/vector-dbs "HTTP/1.1 200 OK"


==>> chunks: [{'text': 'The Government of The Republic of Singapore Ministry of Manpower Leading Edge cApability Program IWPS Page Definition: PIWPS430 PIWPSCR002- Cancel WP Version 2.4 06 July 2012 Version Control Document Information Prepared By Sonni Gunawan Date Prepared 30 August 2001 Reviewed By Aymedy Galias Date Reviewed 30 August 2001 Version History Date Version Number Author Description of Changes 12 June 2001 0.1 Sonni Gunawan Creatiion of document 21 June 2001 0.2 Sonni Gunawan Updated document 09 July 2001 0.3 Sonni Gunawan Updated document based on user’s input 27 July 2001 0.4 Sonni Gunawan Updated document based on user’s input 31 July 2001 1.0 Sonni Gunawan Added validation and Notification 30 August 2001 1.0 Yeoh Ai Leen Updated document 11 September 2001 1.0 Aymedy Galias Updated document based on DDO comments 24 September 2001 1.0 Victor Eng Updated document based on WPD comments 6 January 2003 2.0 Michelle Chau Addition of CR00194, CR00338, CR00385, CR00384 19 Nov

INFO:httpx:HTTP Request: POST http://localhost:8321/v1/vector-io/insert "HTTP/1.1 200 OK"


==>> resposne: None
✅ Ingested 9 chunks
✅ Processed PIWPSCR002 - Cancel WP.docx - 9 chunks
Hello!

🎉 Processing complete!
📊 Files processed: 1
📊 Total chunks created: 9
🗃️  Vector database: service_requests_db


In [197]:
#!/usr/bin/env python3
"""
FAISS Index Decoder - Decode and analyze FAISS index data
"""

import base64
import json
import sqlite3
import numpy as np
import struct
from typing import List, Dict, Any


def decode_faiss_index_from_db(db_path: str, vector_db_id: str = "service_requests_db"):
    """Decode FAISS index data from SQLite database"""
    try:
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()

        # Get the FAISS index data
        cursor.execute(
            """
            SELECT value FROM kvstore 
            WHERE key = ?
        """,
            (f"faiss_index:v3::{vector_db_id}",),
        )

        result = cursor.fetchone()
        conn.close()

        if not result:
            print(f"❌ No FAISS index found for {vector_db_id}")
            return None

        # Parse the JSON and get the base64 data
        data = json.loads(result[0])
        faiss_data_b64 = data.get("faiss_index", "")

        if not faiss_data_b64:
            print("❌ No faiss_index field found in data")
            return None

        # Decode base64
        faiss_binary = base64.b64decode(faiss_data_b64)

        print(f"✅ Successfully decoded FAISS index")
        print(
            f"📊 Binary size: {len(faiss_binary)} bytes ({len(faiss_binary)/1024:.2f} KB)"
        )

        return faiss_binary, data

    except Exception as e:
        print(f"❌ Error decoding FAISS index: {e}")
        return None, None


def analyze_faiss_binary(faiss_binary: bytes):
    """Analyze the binary FAISS data"""
    print(f"\n🔍 Analyzing FAISS Binary Data")
    print("=" * 40)

    # Check first few bytes (FAISS header)
    print(f"📋 First 20 bytes (hex): {faiss_binary[:20].hex()}")
    print(f"📋 First 20 bytes (ascii): {faiss_binary[:20]}")

    # Try to find patterns in the data
    # FAISS often stores float32 vectors
    try:
        # Convert some bytes to float32 to see if we can find vector data
        float_data = struct.unpack(
            f"{len(faiss_binary)//4}f", faiss_binary[: len(faiss_binary) // 4 * 4]
        )

        print(f"📊 Interpreted as float32 array: {len(float_data)} values")
        print(f"📊 Sample values: {float_data[:10]}")
        print(f"📊 Value range: {min(float_data):.6f} to {max(float_data):.6f}")

        # Check if this looks like embeddings (typically -1 to 1 range)
        valid_embeddings = [v for v in float_data if -2 <= v <= 2]
        print(
            f"📊 Values in embedding range (-2 to 2): {len(valid_embeddings)}/{len(float_data)}"
        )

    except Exception as e:
        print(f"❌ Error interpreting as float32: {e}")


def decode_scientific_notation_sample(sample_text: str):
    """Decode the scientific notation pattern you saw"""
    print(f"\n🔢 Decoding Scientific Notation Pattern")
    print("=" * 45)

    # Extract numbers from your sample
    sample_numbers = [
        "7.30000000000000000e+01",
        "1.20000000000000000e+02",
        "7.00000000000000000e+01",
        "5.00000000000000000e+01",
        "1.28000000000000000e+02",
    ]

    decoded_values = []
    for num_str in sample_numbers:
        try:
            value = float(num_str)
            decoded_values.append(value)
            print(f"📊 {num_str} = {value}")
        except:
            pass

    print(f"\n📊 Decoded values: {decoded_values}")
    print(f"📊 These could be part of a 384-dimensional embedding vector")

    # Check if these look like embedding values
    if all(-200 <= v <= 200 for v in decoded_values):
        print("✅ Values are in reasonable range for embeddings")
    else:
        print("⚠️  Values seem large for typical embeddings")


def analyze_document_chunks(db_path: str, vector_db_id: str = "service_requests_db"):
    """Analyze the document chunks stored with the FAISS index"""
    try:
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()

        # Get the FAISS index data
        cursor.execute(
            """
            SELECT value FROM kvstore 
            WHERE key = ?
        """,
            (f"faiss_index:v3::{vector_db_id}",),
        )

        result = cursor.fetchone()
        conn.close()

        if result:
            data = json.loads(result[0])

            print(f"\n📄 Document Chunks Analysis")
            print("=" * 35)

            # Look for document-related fields
            for key, value in data.items():
                if key != "faiss_index":  # Skip the binary data
                    print(f"📋 {key}: {type(value)}")

                    if isinstance(value, list) and len(value) > 0:
                        print(f"  📊 Array length: {len(value)}")
                        print(f"  📊 Sample items: {value[:3]}")
                    elif isinstance(value, dict):
                        print(f"  📊 Dict keys: {list(value.keys())[:5]}")
                    else:
                        print(f"  📊 Value: {str(value)[:100]}...")

    except Exception as e:
        print(f"❌ Error analyzing document chunks: {e}")


def main():
    print("🔍 FAISS Index Decoder")
    print("=" * 30)

    # Path to your database
    db_path = "~/.llama/distributions/ollama/faiss_store.db"
    db_path = (
        input(f"Enter path to faiss_store.db (or press Enter for {db_path}): ").strip()
        or db_path
    )
    db_path = db_path.replace("~", os.path.expanduser("~"))

    if not os.path.exists(db_path):
        print(f"❌ Database not found at {db_path}")
        return

    vector_db_id = "service_requests_db"

    # Decode the FAISS index
    faiss_binary, full_data = decode_faiss_index_from_db(db_path, vector_db_id)

    if faiss_binary:
        # Analyze the binary data
        analyze_faiss_binary(faiss_binary)

        # Analyze document chunks
        analyze_document_chunks(db_path, vector_db_id)

    # Decode the scientific notation pattern
    decode_scientific_notation_sample("")


if __name__ == "__main__":
    import os

    main()

🔍 FAISS Index Decoder
✅ Successfully decoded FAISS index
📊 Binary size: 346725 bytes (338.60 KB)

🔍 Analyzing FAISS Binary Data
📋 First 20 bytes (hex): 372e333030303030303030303030303030303030
📋 First 20 bytes (ascii): b'7.300000000000000000'
📊 Interpreted as float32 array: 86681 values
📊 Sample values: (6.518549588996336e-10, 6.409690556097303e-10, 6.409690556097303e-10, 6.409690556097303e-10, 6.409690556097303e-10, 2.563603773708678e-09, 1.0139283190824244e-08, 6.409690556097303e-10, 6.409690556097303e-10, 6.409690556097303e-10)
📊 Value range: 0.000000 to 52001587716569339789312.000000
📊 Values in embedding range (-2 to 2): 83214/86681

📄 Document Chunks Analysis
📋 chunk_by_index: <class 'dict'>
  📊 Dict keys: ['0', '1', '2', '3', '4']

🔢 Decoding Scientific Notation Pattern
📊 7.30000000000000000e+01 = 73.0
📊 1.20000000000000000e+02 = 120.0
📊 7.00000000000000000e+01 = 70.0
📊 5.00000000000000000e+01 = 50.0
📊 1.28000000000000000e+02 = 128.0

📊 Decoded values: [73.0, 120.0, 70.0, 50.0, 

In [198]:
# vector_db_id = "service_requests_db"
# client.vector_dbs.unregister(vector_db_id=vector_db_id)