In [1]:
import os
import json
import time
from dotenv import load_dotenv
from langchain.schema import Document
from langchain_community.vectorstores import FAISS
from langchain_google_genai  import GoogleGenerativeAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the API key that the Google
# embeddings client created in main() needs — confirm.
load_dotenv()

True

In [3]:
# Input: JSON Lines file holding one law/provision record per line.
LAW_DATABASE_FILE: str = "law_data/laws.jsonl"

# Output: directory where the FAISS index is persisted via save_local().
VECTOR_STORE_PATH: str = "law_test_vector_store"

In [4]:
def process_law_json(item, source_file):
    """Convert one parsed law record into a LangChain ``Document``.

    The document text is a single descriptive sentence block built from the
    record's provision fields; structured fields are copied into metadata so
    they can be inspected or filtered on after retrieval.

    Args:
        item: Mapping parsed from one line of the law database. Keys read:
            ``provision_title``, ``country``, ``region``, ``provision_body``,
            ``provision_code``, ``relevant_labels``, ``law_code``,
            ``law_title``. Missing text fields render as ``'N/A'`` in the
            content; missing metadata fields are stored as ``None``.
        source_file: Path of the file the record came from; stored verbatim
            in the metadata under ``source_file``.

    Returns:
        A ``Document`` whose ``page_content`` is the descriptive text and
        whose ``metadata`` holds the structured fields.

    NOTE(review): the text uses ``provision_title`` / ``provision_code`` while
    the metadata reads ``law_title`` / ``law_code``. Confirm both key families
    exist in the data; otherwise ``law_id`` / ``law_name`` will be ``None``.
    """
    # `.get(key, [])` only covers a *missing* key; a JSON null arrives as None
    # and would make ', '.join(...) raise TypeError, so normalise it here.
    labels = item.get('relevant_labels') or []
    if isinstance(labels, str):
        # A bare string is one label; iterating it would split it into characters.
        labels = [labels]
    # str() guards against non-string labels (e.g. numeric codes).
    labels_text = ', '.join(str(label) for label in labels)

    # Create a descriptive text string from the JSON object
    content = (
        f"Provision Title: {item.get('provision_title', 'N/A')}, applicable in {item.get('country', 'N/A')}, {item.get('region', 'N/A')}. "
        f"Provision Body: {item.get('provision_body', 'N/A')}. "
        f"Provision Code: {item.get('provision_code', 'N/A')}. "
        f"Relevant labels include: {labels_text}."
    )

    # Create metadata to store structured information
    metadata = {
        "law_id": item.get('law_code'),
        "source_type": "law",
        "source_file": source_file,
        "law_name": item.get('law_title'),
        "country": item.get('country'),
        "region": item.get('region')
    }
    return Document(page_content=content, metadata=metadata)

In [5]:
def main(delay_seconds=5):
    """Build and persist the FAISS vector store from the JSONL law database.

    Reads ``LAW_DATABASE_FILE`` line by line, converts every record with
    ``process_law_json``, then embeds the documents one at a time, saving the
    index to ``VECTOR_STORE_PATH`` after each addition so progress survives
    an interruption.

    Args:
        delay_seconds: Pause inserted between consecutive embedding requests.
            Defaults to the 5 seconds the notebook has always used
            (presumably to stay under the embedding API's rate limit —
            confirm against the current quota).

    Returns:
        None. Prints an error and returns early when the database file is
        missing or contains no records.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Check if the database file exists
    if not os.path.exists(LAW_DATABASE_FILE):
        print(f"Error: Law database file not found at '{LAW_DATABASE_FILE}'")
        return

    # Parse the JSON Lines file: one law record per line.
    all_docs = []
    with open(LAW_DATABASE_FILE, 'r', encoding='utf-8') as jsonl_file:
        for raw_line in jsonl_file:
            # Blank lines (typically a trailing newline) are not records and
            # would make json.loads raise.
            if not raw_line.strip():
                continue
            law_record = json.loads(raw_line)
            all_docs.append(process_law_json(law_record, LAW_DATABASE_FILE))

    # FAISS.from_documents cannot build an index from an empty list, so fail
    # with a clear message instead of an error from inside the library.
    if not all_docs:
        print(f"Error: No law records found in '{LAW_DATABASE_FILE}'")
        return

    embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")

    # Seed the vector store with the first document, then persist it.
    vector_store = FAISS.from_documents(all_docs[:1], embeddings)
    vector_store.save_local(VECTOR_STORE_PATH)

    # Add the remaining documents one request at a time, checkpointing after
    # each. The pause sits *between* requests, so nothing is waited on after
    # the final document.
    for i in range(1, len(all_docs)):
        time.sleep(delay_seconds)
        vector_store.add_documents(all_docs[i:i + 1])
        vector_store.save_local(VECTOR_STORE_PATH)
        print(f"Added document {i} to vector store.")


In [6]:
# Run the build when this file is executed as a script. Inside a notebook
# kernel `__name__` is also "__main__", so executing this cell triggers it too.
if __name__ == "__main__":
    main()

Added document 1 to vector store.
Added document 2 to vector store.
Added document 3 to vector store.
Added document 4 to vector store.
Added document 5 to vector store.
Added document 6 to vector store.
Added document 7 to vector store.
Added document 8 to vector store.
Added document 9 to vector store.
Added document 10 to vector store.
Added document 11 to vector store.
Added document 12 to vector store.
Added document 13 to vector store.
Added document 14 to vector store.
Added document 15 to vector store.
Added document 16 to vector store.
Added document 17 to vector store.
Added document 18 to vector store.
Added document 19 to vector store.
Added document 20 to vector store.
Added document 21 to vector store.
Added document 22 to vector store.
Added document 23 to vector store.
Added document 24 to vector store.
Added document 25 to vector store.
Added document 26 to vector store.
Added document 27 to vector store.
Added document 28 to vector store.
Added document 29 to vector s