In [None]:
# In a new Colab cell
import os
import torch
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

# --- Configuration ---
# Folder that DirectoryLoader scans for input PDFs. This is a Google Drive
# path, so Drive presumably has to be mounted before this cell runs — verify.
PDF_SOURCE_DIRECTORY = "/content/drive/MyDrive/judgments-data"
# Passed to Chroma as persist_directory. The path is relative, so it resolves
# against the kernel's current working directory.
PERSIST_DIRECTORY = "vector_db"
# Model name handed to HuggingFaceEmbeddings.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

def create_vector_database():
    """
    Build a Chroma vector database from the PDFs in PDF_SOURCE_DIRECTORY.

    Steps:
      1. Load every file matching ``*.pdf`` in PDF_SOURCE_DIRECTORY with PyPDFLoader.
      2. Split the loaded documents into 1000-character chunks with a
         100-character overlap.
      3. Load the Hugging Face embedding model named by EMBEDDING_MODEL_NAME,
         on the GPU when torch reports one, otherwise on the CPU.
      4. Embed the chunks and store them in a Chroma database located at
         PERSIST_DIRECTORY.

    Progress is reported with print(). The function returns early, without
    touching PERSIST_DIRECTORY, when no documents are found or when the
    documents yield no text chunks.

    Returns:
        None
    """
    print("--- 🚀 Starting Vector Database Creation ---")

    # 1. Load Documents
    print(f"Loading documents from '{PDF_SOURCE_DIRECTORY}'...")
    loader = DirectoryLoader(
        PDF_SOURCE_DIRECTORY,
        glob="*.pdf",
        loader_cls=PyPDFLoader,
        show_progress=True,
        use_multithreading=True
    )
    documents = loader.load()
    if not documents:
        print("No documents found. Please check you've uploaded PDFs to the correct folder.")
        return
    print(f"✅ Loaded {len(documents)} document(s).")

    # 2. Chunk Documents
    print("Splitting documents into chunks...")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    texts = text_splitter.split_documents(documents)
    if not texts:
        # PDFs without extractable text (e.g. scanned images) load successfully
        # but produce no chunks; stop here instead of handing Chroma an empty list.
        print("No text chunks were produced. The PDFs may not contain extractable text (e.g. scanned images).")
        return
    print(f"✅ Split documents into {len(texts)} chunks.")

    # 3. Initialize Embedding Model (with GPU support)
    print(f"Initializing embedding model: '{EMBEDDING_MODEL_NAME}'...")
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"Using device: {device}")

    embeddings = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL_NAME,
        model_kwargs={'device': device}
    )
    print("✅ Embedding model loaded.")

    # 4. Create and Persist Vector Database
    # Re-running against a populated directory adds the same chunks again, so
    # make that visible instead of silently duplicating entries.
    if os.path.isdir(PERSIST_DIRECTORY) and os.listdir(PERSIST_DIRECTORY):
        print(
            f"⚠️ '{PERSIST_DIRECTORY}' already contains data. New chunks will be added to it, "
            "which can create duplicates. Delete the folder first for a clean rebuild."
        )

    print(f"Creating and persisting vector database in '{PERSIST_DIRECTORY}'...")
    vectordb = Chroma.from_documents(
        documents=texts,
        embedding=embeddings,
        persist_directory=PERSIST_DIRECTORY
    )

    # Older Chroma releases only flush to disk on an explicit persist(); newer
    # ones persist automatically and may not expose the method at all.
    persist = getattr(vectordb, "persist", None)
    if callable(persist):
        persist()

    print("\n--- ✅ Vector Database Creation Complete! ---")
    print(f"Database stored in: {PERSIST_DIRECTORY}")

# Run the pipeline as soon as this cell executes.
# NOTE(review): every re-run calls Chroma.from_documents against the same
# PERSIST_DIRECTORY again; this presumably adds the chunks a second time — verify.
create_vector_database()

In [None]:
# In a new Colab cell
import os

import torch
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

# --- Configuration ---
# These two values must match the ones used when the database was built;
# querying with a different embedding model gives meaningless similarities.
PERSIST_DIRECTORY = "vector_db"
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
TOP_K = 3            # number of most-similar chunks to retrieve
PREVIEW_CHARS = 500  # maximum characters of each chunk to print

# 1. Initialize Embeddings and Load Database
# Fail loudly when the database is missing: Chroma would otherwise create a
# new, empty store at this path and the search below would just return nothing.
if not os.path.isdir(PERSIST_DIRECTORY):
    raise FileNotFoundError(
        f"Vector database directory '{PERSIST_DIRECTORY}' not found. "
        "Run the database creation cell first."
    )

device = 'cuda' if torch.cuda.is_available() else 'cpu'
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME, model_kwargs={'device': device})
vectordb = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embeddings)

# 2. Define your query and search
query = "what are the legal principles for granting injunctions"
results = vectordb.similarity_search(query, k=TOP_K)

# 3. Display results
print(f"\nTop results for query: '{query}'\n")
if not results:
    print("No results found. The database may be empty.")

for rank, doc in enumerate(results, start=1):
    content = doc.page_content
    # Only show an ellipsis when the chunk was actually truncated.
    suffix = "..." if len(content) > PREVIEW_CHARS else ""
    print(f"--- Result {rank} ---")
    print(f"Source: {doc.metadata.get('source', 'N/A')}")
    print(f"Content: {content[:PREVIEW_CHARS]}{suffix}")
    print("-" * 20 + "\n")

In [None]:
# In a new Colab cell
!zip -r vector_db.zip /content/vector_db