In [1]:
import os
import pickle
import numpy as np
import faiss
from typing import Dict, List

In [2]:
class VectorStore:
    """FAISS-backed vector store with a side table of chunk metadata.

    Vectors are kept in a ``faiss.IndexFlatL2`` index. ``chunk_map`` maps the
    row id that FAISS reports from a search to the metadata dict for that
    row. Both are persisted together under ``output_dir``.
    """

    # File names used inside ``output_dir`` by save_index() / load_index().
    INDEX_FILENAME = 'faiss_index.idx'
    CHUNK_MAP_FILENAME = 'chunk_map.pkl'

    def __init__(self, dimension: int, output_dir: str = '../vector_store'):
        """Create an empty index and make sure ``output_dir`` exists.

        Args:
            dimension: Length of every vector that will be stored.
            output_dir: Directory the index and chunk map are written to and
                read from. It is created if it does not exist.
        """
        self.dimension = dimension
        self.index = faiss.IndexFlatL2(dimension)
        self.chunk_map = {}
        self.output_dir = output_dir

        # Ensure the output directory exists
        os.makedirs(self.output_dir, exist_ok=True)

    @property
    def _index_path(self) -> str:
        """Path of the serialized FAISS index inside ``output_dir``."""
        return os.path.join(self.output_dir, self.INDEX_FILENAME)

    @property
    def _chunk_map_path(self) -> str:
        """Path of the pickled chunk map inside ``output_dir``."""
        return os.path.join(self.output_dir, self.CHUNK_MAP_FILENAME)

    def add_vectors(self, vectors: Dict[int, np.ndarray], chunk_map: Dict[int, Dict]):
        """Append vectors to the index and record their chunk metadata.

        Vectors are added in the iteration order of ``vectors``. ``search``
        looks metadata up by the row id FAISS returns, so ``chunk_map`` must
        be keyed by that row id (the position of the vector in the index).
        Entries are merged into the existing map, so metadata registered by
        earlier calls is kept while the index keeps growing.

        Args:
            vectors: Mapping whose values are equal-length vectors.
            chunk_map: Mapping from index row id to the chunk's metadata dict.

        Raises:
            ValueError: If the vectors do not form an ``(n, dimension)`` array.
        """
        if not vectors:
            # An empty list converts to a 1-D array, which the index cannot
            # accept; there is nothing to add, so leave the store untouched.
            print(f"Added {len(vectors)} vectors to index")
            return

        # Stack into one contiguous float32 matrix, the layout FAISS expects.
        vector_array = np.ascontiguousarray(list(vectors.values()), dtype=np.float32)
        if vector_array.ndim != 2 or vector_array.shape[1] != self.dimension:
            raise ValueError(
                f"Expected vectors of shape (n, {self.dimension}), "
                f"got {vector_array.shape}"
            )

        # Add to FAISS index
        self.index.add(vector_array)

        # Merge rather than replace: the index never forgets earlier rows, so
        # their metadata must stay reachable too.
        self.chunk_map = {**self.chunk_map, **chunk_map}

        print(f"Added {len(vectors)} vectors to index")

    def save_index(self):
        """Write the FAISS index and the pickled chunk map to ``output_dir``."""
        index_path = self._index_path
        chunk_map_path = self._chunk_map_path

        # Save FAISS index
        faiss.write_index(self.index, index_path)

        # Save chunk mapping
        with open(chunk_map_path, 'wb') as f:
            pickle.dump(self.chunk_map, f)

        print(f"Index saved to {index_path}")
        print(f"Chunk map saved to {chunk_map_path}")

    def load_index(self):
        """Load the FAISS index and chunk map previously saved to ``output_dir``.

        Both files are read before any attribute is replaced, so a failure
        while reading either one leaves the current store unchanged.
        """
        index_path = self._index_path
        chunk_map_path = self._chunk_map_path

        # Load FAISS index
        loaded_index = faiss.read_index(index_path)

        # Load chunk mapping
        # NOTE(security): pickle.load can execute arbitrary code embedded in
        # the file; only load chunk maps written by a trusted process.
        with open(chunk_map_path, 'rb') as f:
            loaded_chunk_map = pickle.load(f)

        self.index = loaded_index
        self.chunk_map = loaded_chunk_map
        # Keep the advertised dimension in sync with the index actually loaded.
        self.dimension = loaded_index.d

        print(f"Index loaded from {index_path}")
        print(f"Chunk map loaded from {chunk_map_path}")

    def search(self, query_vector: np.ndarray, k: int = 5) -> List[Dict]:
        """Return metadata for the nearest stored vectors to ``query_vector``.

        Args:
            query_vector: A single query vector; it is converted to a
                ``(1, -1)`` float32 array before searching.
            k: Number of neighbours to request from the index.

        Returns:
            Copies of the matching ``chunk_map`` entries, each with an added
            ``'distance'`` key holding the distance reported by the index, in
            the order the index returned them. Rows without a ``chunk_map``
            entry are skipped, so fewer than ``k`` results may be returned.
        """
        # Ensure query vector is in correct format
        query = np.ascontiguousarray(query_vector, dtype=np.float32).reshape(1, -1)

        # Search
        distances, indices = self.index.search(query, k)

        # Get results
        results = []
        for distance, raw_idx in zip(distances[0], indices[0]):
            idx = int(raw_idx)
            if idx < 0:
                # FAISS pads the result with -1 when it finds fewer than k
                # neighbours; those slots carry no real row.
                continue
            entry = self.chunk_map.get(idx)
            if entry is None:
                # No metadata registered for this row id.
                continue
            result = entry.copy()
            result['distance'] = float(distance)
            results.append(result)

        return results




In [3]:

if __name__ == "__main__":
    input_dir = '../embeddings'
    output_dir = '../vector_store'

    # Load the pickled payload; it must provide 'vectors' and 'chunk_map'.
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file; only run this against a vectors.pkl from a trusted source.
    vectors_path = os.path.join(input_dir, 'vectors.pkl')
    with open(vectors_path, 'rb') as f:
        data = pickle.load(f)
    vectors = data['vectors']
    chunk_map = data['chunk_map']

    # The index dimension is inferred from the first vector, so an empty
    # payload would otherwise fail with an unhelpful StopIteration.
    if not vectors:
        raise ValueError(f"No vectors found in {vectors_path}; nothing to index")
    dimension = len(next(iter(vectors.values())))

    # Initialize vector store
    vector_store = VectorStore(dimension=dimension, output_dir=output_dir)

    # Add vectors to FAISS index
    print("Adding vectors to FAISS index...")
    vector_store.add_vectors(vectors, chunk_map)

    # Save index and chunk mapping
    print("Saving FAISS index and chunk map...")
    vector_store.save_index()

    # Report index stats
    print("\nVector Store Statistics:")
    print(f"Total vectors in index: {vector_store.index.ntotal}")

Adding vectors to FAISS index...
Added 1454 vectors to index
Saving FAISS index and chunk map...
Index saved to ../vector_store/faiss_index.idx
Chunk map saved to ../vector_store/chunk_map.pkl

Vector Store Statistics:
Total vectors in index: 1454
