In [1]:
# CELL 1: Imports & Configuration

import os, json, time
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from pinecone import Pinecone, ServerlessSpec

load_dotenv()

# --- CONFIG ---
# Everything a reader might want to tune lives here.

# Location of the extracted legal data (JSON), relative to the notebook.
JSON_PATH = '../data/extracted_legal_data.json'

# Name of the Pinecone index that the upload cell creates and/or writes to.
INDEX_NAME = "legal-assistant"

# Vector size the Pinecone index is created with. It has to agree with the
# output size of the embedding model used for the upload (the notebook uses
# all-MiniLM-L6-v2); a mismatch makes upserts fail.
DIMENSIONS = 384

# Soft sanity check: report whether the Pinecone key was picked up from the
# environment / .env file. This only prints a message; it does not stop the run.
if os.getenv("PINECONE_API_KEY"):
    print("‚úÖ Ready.")
else:
    print("‚ùå Error: Missing Pinecone API Key")

  from .autonotebook import tqdm as notebook_tqdm


‚úÖ Ready.


In [None]:
# CELL 2: Process & Upload (The Workhorse)
#
# Pipeline: load the extracted JSON -> wrap each record as a LangChain
# Document -> split into overlapping chunks -> make sure the Pinecone index
# exists and reports ready -> embed the chunks and upload them.

# Bounds for waiting on the index to become ready (seconds).
INDEX_READY_TIMEOUT_S = 120
INDEX_POLL_INTERVAL_S = 2

# 1. Load Data & Create Docs
# Fail fast if the input file has not been produced yet.
if not os.path.exists(JSON_PATH):
    raise FileNotFoundError("JSON file missing. Run Phase 1 first.")

with open(JSON_PATH, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Convert to LangChain Documents. Records whose 'extracted_text' is missing or
# empty are skipped. Every other key of the record becomes metadata. The
# metadata dict is a copy, so the loaded `data` records are left untouched
# (re-inspecting `data` later in the kernel still shows the full records).
docs = [
    Document(
        page_content=record['extracted_text'],
        metadata={key: value for key, value in record.items() if key != 'extracted_text'},
    )
    for record in data
    if record.get('extracted_text')
]
print(f"üìÇ Loaded {len(docs)} docs.")

# 2. Chunking
# 1000-character chunks with a 200-character overlap between neighbours.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_documents(docs)
print(f"‚úÇÔ∏è Split into {len(chunks)} chunks.")

# 3. Setup Embeddings & Index
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

if INDEX_NAME not in [i.name for i in pc.list_indexes()]:
    print(f"‚öôÔ∏è Creating Index '{INDEX_NAME}'...")
    pc.create_index(
        name=INDEX_NAME,
        dimension=DIMENSIONS,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Wait until the index actually reports ready instead of sleeping for a fixed
# time: a fixed sleep is either too short (upload hits a not-ready index) or
# wastes time. Also covers an index left initializing by an interrupted run.
ready_deadline = time.monotonic() + INDEX_READY_TIMEOUT_S
while not pc.describe_index(INDEX_NAME).status["ready"]:
    if time.monotonic() >= ready_deadline:
        raise TimeoutError(
            f"Pinecone index '{INDEX_NAME}' was not ready after {INDEX_READY_TIMEOUT_S}s."
        )
    time.sleep(INDEX_POLL_INTERVAL_S)

# 4. Upload
# NOTE(review): no explicit ids are passed, so re-running this cell presumably
# inserts the same chunks again under fresh ids - confirm, and consider
# deterministic ids if the cell is expected to be re-runnable.
print("üöÄ Uploading to Pinecone...")
PineconeVectorStore.from_documents(chunks, embeddings, index_name=INDEX_NAME)
print("‚úÖ SUCCESS! Database updated.")

üìÇ Loaded 14 docs.
‚úÇÔ∏è  Split into 3241 chunks.


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


üöÄ Uploading to Pinecone...
‚úÖ SUCCESS! Database updated.
