In [None]:
import os
import json
from dotenv import load_dotenv
from langchain.schema import Document
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings # Using Ollama as an example
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Load environment variables via python-dotenv before anything else runs
# (good practice, even if not used by Ollama).
load_dotenv()

In [None]:
# --- Configuration ---
# Directory the FAISS index is written to; kept separate from any other store.
VECTOR_STORE_PATH = "feature_vector_store"
# JSON file containing the feature records to ingest.
FEATURE_DATABASE_FILE = "data/sample-feature-data.json"

In [None]:
def process_feature_record(feature_item, source_file):
    """Transform a single feature dictionary into a LangChain Document.

    Args:
        feature_item: Mapping describing one feature. The keys read are
            'feature_id', 'feature_name', 'feature_type',
            'feature_description' and 'relevant_labels'. Missing keys and
            null values are both rendered as 'N/A' in the text.
        source_file: Path of the file the record came from; stored verbatim
            in the document metadata.

    Returns:
        A Document whose page_content is a one-line textual summary of the
        feature and whose metadata holds feature_id, feature_name,
        source_file and a fixed source_type of "feature".
    """

    def text_or_na(key):
        # A JSON null arrives as None; treat it like a missing key instead of
        # embedding the literal string "None".
        value = feature_item.get(key)
        return 'N/A' if value is None else value

    # Normalise the labels so the join below cannot fail: null becomes an
    # empty list, and a bare scalar (e.g. a single string) becomes a
    # one-element list instead of being joined character by character.
    labels = feature_item.get('relevant_labels')
    if labels is None:
        labels = []
    elif not isinstance(labels, (list, tuple, set, frozenset)):
        labels = [labels]
    # str() guards against non-string entries such as numbers.
    labels_text = ', '.join(str(label) for label in labels)

    # Create a descriptive text string from the feature's JSON object
    content = (
        f"Feature Name: {text_or_na('feature_name')}. "
        f"Type: {text_or_na('feature_type')}. "
        f"Description: {text_or_na('feature_description')}. "
        f"Relevant labels include: {labels_text}."
    )

    # Create specific metadata for the feature
    metadata = {
        "feature_id": feature_item.get('feature_id'),
        "source_type": "feature",  # Critical for identifying this data type later
        "source_file": source_file,
        "feature_name": feature_item.get('feature_name')
    }
    return Document(page_content=content, metadata=metadata)

In [None]:
def main():
    """Ingest feature data and create a dedicated FAISS vector store.

    Reads the JSON file at FEATURE_DATABASE_FILE, converts each record into a
    Document with process_feature_record, splits the documents into chunks,
    embeds them with an Ollama embedding model and saves the resulting index
    to VECTOR_STORE_PATH.

    Problems with the input (file missing, invalid JSON, top-level value that
    is not a list, no usable records) are reported with print() and the
    function returns early without creating a store.

    Returns:
        None.
    """
    print(f"Starting feature ingestion from '{FEATURE_DATABASE_FILE}'...")

    # isfile rather than exists: a directory with this name is rejected here
    # instead of failing later inside open().
    if not os.path.isfile(FEATURE_DATABASE_FILE):
        print(f"Error: Feature database file not found at '{FEATURE_DATABASE_FILE}'")
        return

    # Load the list of features from the JSON file
    try:
        with open(FEATURE_DATABASE_FILE, 'r', encoding='utf-8') as f:
            feature_data_list = json.load(f)
    except json.JSONDecodeError as exc:
        print(f"Error: '{FEATURE_DATABASE_FILE}' is not valid JSON: {exc}")
        return

    # Iterating a dict would yield its keys (strings), which have no .get();
    # insist on a list so the failure is reported clearly.
    if not isinstance(feature_data_list, list):
        print(
            f"Error: expected a JSON list of feature records in "
            f"'{FEATURE_DATABASE_FILE}', got {type(feature_data_list).__name__}."
        )
        return

    # Process each feature record in the list, skipping malformed entries
    all_docs = []
    for index, feature_record in enumerate(feature_data_list):
        if not isinstance(feature_record, dict):
            print(
                f"Warning: skipping record {index}: expected an object, "
                f"got {type(feature_record).__name__}."
            )
            continue
        all_docs.append(process_feature_record(feature_record, FEATURE_DATABASE_FILE))

    if not all_docs:
        print("No feature documents found in the file.")
        return

    print(f"Loaded {len(all_docs)} feature documents.")

    # Split the documents into smaller chunks.
    # IMPORTANT: chunk_size / chunk_overlap are tuning knobs; adjust as needed.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    split_docs = text_splitter.split_documents(all_docs)
    print(f"Split feature documents into {len(split_docs)} chunks.")

    # Initialize the embedding model (e.g., Ollama)
    # IMPORTANT: Use the same embedding model as your law ingestion script!
    embeddings = OllamaEmbeddings(model="mxbai-embed-large")
    print("Initialized embedding model.")

    # Create the vector store from the feature chunks
    print(f"Creating feature vector store with FAISS at '{VECTOR_STORE_PATH}'...")
    vector_store = FAISS.from_documents(split_docs, embeddings)

    # Save the dedicated feature vector store
    vector_store.save_local(VECTOR_STORE_PATH)
    print("Feature vector store created and saved successfully.")
    print("Feature ingestion complete.")

In [None]:
# Run the ingestion only when this code is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()