In [14]:
# In a new Colab cell
import os
import torch
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

# --- Configuration ---
# Folder handed to DirectoryLoader below; only files matching "*.pdf" in it are loaded.
PDF_SOURCE_DIRECTORY = "/content/drive/MyDrive/judgments-data"
# Passed to Chroma as persist_directory. This is a relative path, so it is
# resolved against the notebook's current working directory.
PERSIST_DIRECTORY = "vector_db"
# Model name given to HuggingFaceEmbeddings. The query cell must use the same
# model, otherwise query vectors will not be comparable with the stored ones.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

def create_vector_database():
    """
    Creates and persists a vector database from PDF documents in Google Colab.

    Steps:
      1. Load every ``*.pdf`` directly inside ``PDF_SOURCE_DIRECTORY`` with
         ``PyPDFLoader``.
      2. Split the loaded documents into chunks of up to 1000 characters with
         100 characters of overlap.
      3. Load the ``EMBEDDING_MODEL_NAME`` embedding model, on CUDA when
         ``torch`` reports a GPU and on CPU otherwise.
      4. Embed the chunks and store them in a Chroma database under
         ``PERSIST_DIRECTORY``.

    Each chunk is stored under a deterministic ID built from its source path,
    page number and position on that page, so running this function again
    over the same PDFs overwrites the existing entries instead of appending
    duplicates. Chunks belonging to PDFs that have since been removed from the
    source folder are NOT deleted; remove ``PERSIST_DIRECTORY`` first if a
    clean rebuild is required.

    Returns:
        None. Progress is reported with ``print``. The function returns early
        (after printing a message) when no documents are found or when the
        documents yield no text chunks.
    """
    print("--- 🚀 Starting Vector Database Creation ---")

    # 1. Load Documents
    print(f"Loading documents from '{PDF_SOURCE_DIRECTORY}'...")
    loader = DirectoryLoader(
        PDF_SOURCE_DIRECTORY,
        glob="*.pdf",
        loader_cls=PyPDFLoader,
        show_progress=True,
        use_multithreading=True
    )
    documents = loader.load()
    if not documents:
        print("No documents found. Please check you've uploaded PDFs to the correct folder.")
        return
    print(f"✅ Loaded {len(documents)} document(s).")

    # 2. Chunk Documents
    print("Splitting documents into chunks...")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    texts = text_splitter.split_documents(documents)
    if not texts:
        # PDFs without extractable text (e.g. scanned images) load fine but
        # yield nothing to embed; stop here rather than hand Chroma an empty batch.
        print("No text chunks were produced. The PDFs may not contain extractable text.")
        return
    print(f"✅ Split documents into {len(texts)} chunks.")

    # Build a stable, unique ID per chunk: "<source>::page-<n>::chunk-<k>",
    # where k counts chunks that share the same (source, page). The counter
    # keeps IDs unique even if metadata is missing, and the result does not
    # depend on the order in which the multithreaded loader returned files.
    chunks_seen = {}
    chunk_ids = []
    for chunk in texts:
        origin = (chunk.metadata.get("source", ""), chunk.metadata.get("page", ""))
        position = chunks_seen.get(origin, 0)
        chunks_seen[origin] = position + 1
        chunk_ids.append(f"{origin[0]}::page-{origin[1]}::chunk-{position}")

    # 3. Initialize Embedding Model (with GPU support)
    print(f"Initializing embedding model: '{EMBEDDING_MODEL_NAME}'...")
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"Using device: {device}")

    embeddings = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL_NAME,
        model_kwargs={'device': device}
    )
    print("✅ Embedding model loaded.")

    # 4. Create and Persist Vector Database
    print(f"Creating and persisting vector database in '{PERSIST_DIRECTORY}'...")
    # Passing explicit ids makes a re-run replace the stored chunks; without
    # them Chroma assigns fresh random IDs and every run adds another copy.
    Chroma.from_documents(
        documents=texts,
        embedding=embeddings,
        ids=chunk_ids,
        persist_directory=PERSIST_DIRECTORY
    )

    print("\n--- ✅ Vector Database Creation Complete! ---")
    print(f"Database stored in: {PERSIST_DIRECTORY}")

# Build the database now; the function reports its progress via print().
create_vector_database()

--- 🚀 Starting Vector Database Creation ---
Loading documents from '/content/drive/MyDrive/judgments-data'...


100%|██████████| 5/5 [00:02<00:00,  2.34it/s]
  embeddings = HuggingFaceEmbeddings(


✅ Loaded 65 document(s).
Splitting documents into chunks...
✅ Split documents into 135 chunks.
Initializing embedding model: 'all-MiniLM-L6-v2'...
Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Embedding model loaded.
Creating and persisting vector database in 'vector_db'...

--- ✅ Vector Database Creation Complete! ---
Database stored in: vector_db


In [15]:
# In a new Colab cell
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
import torch

# --- Configuration ---
# Must match the values used when the database was created.
PERSIST_DIRECTORY = "vector_db"
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

# 1. Pick the device, rebuild the embedding function and reopen the stored database
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
embeddings = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    model_kwargs={'device': device},
)
vectordb = Chroma(
    persist_directory=PERSIST_DIRECTORY,
    embedding_function=embeddings,
)

# 2. Run a similarity search and keep the 3 closest chunks
query = "what are the legal principles for granting injunctions"
results = vectordb.similarity_search(query, k=3)

# 3. Show each hit: where it came from and the first 500 characters of its text
print(f"\nTop results for query: '{query}'\n")
for rank, hit in enumerate(results, start=1):
    print(
        f"--- Result {rank} ---",
        f"Source: {hit.metadata.get('source', 'N/A')}",
        f"Content: {hit.page_content[:500]}...",
        "-" * 20 + "\n",
        sep="\n",
    )


Top results for query: 'what are the legal principles for granting injunctions'

--- Result 1 ---
Source: /content/drive/MyDrive/judgments-data/j1.pdf
Content: decision of the Special Secretary (Revenue 
Department) and remanded the matter to the Deputy 
Collector for reconsideration. It was contended that, 
despite setting aside the Deputy Collector’s order for 
violating natural justice, the appellant failed to 
remand the matter back for fresh adjudication and 
instead directly issued directions for handing over 
possession, thereby exceeding his jurisdiction....
--------------------

--- Result 2 ---
Source: /content/drive/MyDrive/judgments-data/j2.pdf
Content: interference and taking a different view, this appeal appears to be devoid 
of merits and accordingly dismissed.   
31. The appellant is enjoying the benefit of suspension of sentence 
vide order dated 05.07.2000 which is hereby cancelled and the appellant 
is directed to surrender  forthwith before the concerned trial Cour

  vectordb = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embeddings)


In [16]:
# In a new Colab cell
!zip -r vector_db.zip /content/vector_db

  adding: content/vector_db/ (stored 0%)
  adding: content/vector_db/2145bde2-d5be-4ae5-9fd0-e42f660e74b8/ (stored 0%)
  adding: content/vector_db/2145bde2-d5be-4ae5-9fd0-e42f660e74b8/link_lists.bin (stored 0%)
  adding: content/vector_db/2145bde2-d5be-4ae5-9fd0-e42f660e74b8/data_level0.bin (deflated 100%)
  adding: content/vector_db/2145bde2-d5be-4ae5-9fd0-e42f660e74b8/length.bin (deflated 94%)
  adding: content/vector_db/2145bde2-d5be-4ae5-9fd0-e42f660e74b8/header.bin (deflated 61%)
  adding: content/vector_db/chroma.sqlite3 (deflated 58%)
