In [None]:
# LangSmith settings.
#
# SECURITY: the API key must never be written into the notebook. It is read
# from the process environment instead (export LANGSMITH_API_KEY in the shell
# that starts the kernel, or keep it in an untracked .env file). The key that
# was previously committed in this cell should be treated as compromised and
# rotated.
#
# `os` is imported here because this cell runs before the notebook's import
# cell.
import os

# Non-secret settings keep their previous values as defaults but can be
# overridden from the environment.
LANGSMITH_TRACING = os.getenv("LANGSMITH_TRACING", "true")
LANGSMITH_ENDPOINT = os.getenv("LANGSMITH_ENDPOINT", "https://api.smith.langchain.com")
LANGSMITH_PROJECT = os.getenv("LANGSMITH_PROJECT", "TechJamPitre")

# Empty string when the key is not configured.
# NOTE(review): load_dotenv() is only called in a later cell, so a key stored
# in .env is not visible yet at this point — re-read it after that call if
# this variable is ever consumed.
LANGSMITH_API_KEY = os.getenv("LANGSMITH_API_KEY", "")

In [74]:
import os
import json
from dotenv import load_dotenv
from langchain.schema import Document
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [75]:
# Load environment variables from a .env file into the process environment.
# NOTE(review): load_dotenv() returns a bool; the False shown in this cell's
# saved output presumably means no .env file was found (or it set no
# variables) — confirm a .env file exists where the kernel is started.
load_dotenv()

False

In [None]:
# Path of the JSON file of law records that main() opens and parses.
# It is a relative path, so it resolves against the kernel's working directory.
LAW_DATABASE_FILE = "data/sample-file.json"
# Location handed to the vector store's save_local() call in main().
VECTOR_STORE_PATH = "law_vector_store"

In [None]:
def process_law_json(item, source_file):
    """Convert one law record into a Document ready for embedding.

    Args:
        item: Dict for a single law as parsed from JSON. The keys read are
            'law', 'country', 'region', 'law_desc', 'relevant_labels' and
            'law_id'; every one of them is optional.
        source_file: Path of the file the record came from. It is stored
            unchanged in the metadata.

    Returns:
        A Document whose page_content is a sentence-style summary of the law
        and whose metadata carries the structured fields (law_id, source_type,
        source_file, law_name, country, region) exactly as found in `item`.
    """

    def text_field(key):
        # A JSON null arrives as None; treat it like a missing key so the
        # embedded text reads 'N/A' instead of the literal string 'None'.
        value = item.get(key)
        return 'N/A' if value is None else value

    # `relevant_labels` may be absent, null, a single string, or a list that
    # contains non-string values. Normalise it so str.join() cannot raise and
    # a lone string is not joined character by character.
    labels = item.get('relevant_labels') or []
    if isinstance(labels, str):
        labels = [labels]
    label_text = ', '.join(str(label) for label in labels)

    # Create a descriptive text string from the JSON object
    content = (
        f"Law Name: {text_field('law')}, applicable in {text_field('country')}, {text_field('region')}. "
        f"Description: {text_field('law_desc')}. "
        f"Relevant labels include: {label_text}."
    )

    # Metadata keeps the raw values (None stays None) for later filtering.
    metadata = {
        "law_id": item.get('law_id'),
        "source_type": "law",
        "source_file": source_file,
        "law_name": item.get('law'),
        "country": item.get('country'),
        "region": item.get('region')
    }
    return Document(page_content=content, metadata=metadata)

In [None]:
def main():
    """Build and save a FAISS vector store from the law records in one JSON file.

    Reads the file named by the module-level LAW_DATABASE_FILE, converts each
    record to a Document with process_law_json, splits the documents into
    chunks, embeds them with the Ollama model "mxbai-embed-large", and saves
    the resulting FAISS store to VECTOR_STORE_PATH.

    The function prints progress messages and returns None. It prints an error
    and returns early, without raising, when the file does not exist, when its
    top-level JSON value is not a list, or when no documents are produced.
    """
    print(f"Starting data ingestion from '{LAW_DATABASE_FILE}'...")

    # Check if the database file exists
    if not os.path.exists(LAW_DATABASE_FILE):
        print(f"Error: Law database file not found at '{LAW_DATABASE_FILE}'")
        return

    # Load the entire list of laws from the single JSON file
    with open(LAW_DATABASE_FILE, 'r', encoding='utf-8') as f:
        law_data_list = json.load(f)

    # process_law_json calls .get() on every record, so the file must hold a
    # list of JSON objects. Reject other shapes here with a clear message
    # instead of failing with an AttributeError inside the loop.
    if not isinstance(law_data_list, list):
        print(
            f"Error: Expected a JSON list of law records in '{LAW_DATABASE_FILE}', "
            f"found {type(law_data_list).__name__}."
        )
        return

    all_docs = []
    for law_record in law_data_list:
        if not isinstance(law_record, dict):
            print(f"Warning: Skipping law record that is not a JSON object: {law_record!r}")
            continue
        all_docs.append(process_law_json(law_record, LAW_DATABASE_FILE))

    if not all_docs:
        print("No law documents found in the file.")
        return

    print(f"Loaded {len(all_docs)} law documents.")

    # Split documents into smaller chunks for better retrieval performance.
    # Adjust chunk size as needed, IMPORTANT
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    split_docs = text_splitter.split_documents(all_docs)
    print(f"Split documents into {len(split_docs)} chunks.")

    # Initialize the Ollama embedding model.
    # NOTE(review): presumably needs a reachable Ollama server with the
    # "mxbai-embed-large" model already pulled — confirm before running.
    embeddings = OllamaEmbeddings(
        model="mxbai-embed-large"
    )

    # Create the vector store from the document chunks
    print("Creating vector store with FAISS...")
    vector_store = FAISS.from_documents(split_docs, embeddings)

    # Save the vector store locally for later use
    vector_store.save_local(VECTOR_STORE_PATH)
    print(f"Vector store created and saved at '{VECTOR_STORE_PATH}'.")
    print("Ingestion complete.")

In [78]:
# Run the ingestion pipeline. Inside a notebook kernel __name__ is "__main__",
# so this cell always calls main(); the guard only matters if the code is
# exported to a module and imported.
if __name__ == "__main__":
    main()

Starting data ingestion from './sample-file.json'...
Loaded 20 law documents.
Split documents into 20 chunks.
Creating vector store with FAISS...
Vector store created and saved at 'vector_store'.
Ingestion complete.


In [79]:


# NOTE(review): the snippet below is disabled by wrapping it in a string
# literal, so none of it executes. It references `docs`, which is not defined
# in the cells shown here, and it overlaps with the splitting already done in
# main(). Because the string is the cell's last expression, its repr is echoed
# as the cell output. Consider deleting this cell.
'''
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # chunk size (characters)
    chunk_overlap=200,  # chunk overlap (characters)
    add_start_index=True,  # track index in original document
)
all_splits = text_splitter.split_documents(docs)

print(f"Split blog post into {len(all_splits)} sub-documents.")
'''

'\nfrom langchain_text_splitters import RecursiveCharacterTextSplitter\n\ntext_splitter = RecursiveCharacterTextSplitter(\n    chunk_size=1000,  # chunk size (characters)\n    chunk_overlap=200,  # chunk overlap (characters)\n    add_start_index=True,  # track index in original document\n)\nall_splits = text_splitter.split_documents(docs)\n\nprint(f"Split blog post into {len(all_splits)} sub-documents.")\n'