In [3]:
# Install the retrieval dependencies into the environment of the running kernel.
# The explicit %pip magic is used so the install does not rely on automagic.
%pip install sentence-transformers faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_6

In [1]:
import os
import json
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss

In [2]:
import zipfile
import os

# Archive to unpack, and the folder (relative to the working directory)
# that receives its contents.
zip_path = "/content/constitution_parts.zip"
extract_to = "unzipped_files"

# Extract every member; the context manager closes the archive afterwards.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_to)

print(f"✅ Unzipped to folder: {extract_to}")


✅ Unzipped to folder: unzipped_files


In [2]:
# Directory holding the 12 per-part JSON files.
# NOTE: the archive is extracted into the *relative* folder "unzipped_files",
# so this absolute path only matches when the working directory is /content
# (as it is on Colab, for example).
json_dir = os.path.join("/content/unzipped_files", "constitution_parts")

In [3]:
# Step 1: Load all JSON files
# Keep only *.json entries, in sorted filename order so the load order is stable.
json_filenames = [name for name in sorted(os.listdir(json_dir)) if name.endswith(".json")]

all_parts_data = []
for json_name in json_filenames:
    json_path = os.path.join(json_dir, json_name)
    with open(json_path, "r", encoding="utf-8") as handle:
        all_parts_data.append(json.load(handle))

In [4]:
# Step 2: Flatten articles and clauses
# `chunks` holds the text that gets embedded; `sources` holds the matching
# human-readable citation at the same position.
chunks, sources = [], []

for part_doc in all_parts_data:
    for page_entry in part_doc.get("pages", []):
        page_content = page_entry.get("content", {})
        part_label = page_content.get("part", "UNKNOWN PART")
        for art in page_content.get("articles", []):
            number, title = art["number"], art["title"]
            article_ref = f"{part_label} - Article {number}"
            if "clauses" in art:
                # One chunk per clause, each prefixed with its part/article context.
                for cl in art["clauses"]:
                    clause_no, clause_body = cl["number"], cl["text"]
                    chunks.append(f"{article_ref} - {title} - Clause {clause_no}: {clause_body}")
                    sources.append(f"{article_ref}, Clause {clause_no}")
            elif "text" in art:
                # Article without a "clauses" key: index its text as a single chunk.
                chunks.append(f"{article_ref} - {title}: {art['text']}")
                sources.append(article_ref)

In [5]:
# Step 3: Encode text and build FAISS index
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed every chunk; convert_to_numpy=True requests a NumPy array result.
embeddings = model.encode(chunks, convert_to_numpy=True)

# Flat L2-distance index whose dimensionality is taken from the
# embeddings' second axis.
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
# Step 4: Ask a question
def ask(query, top_k=1):
    """Return the indexed chunks closest to ``query``.

    The query is embedded with the module-level ``model`` and looked up in
    the module-level FAISS ``index``.

    Args:
        query: Question text to search for.
        top_k: Maximum number of results to return.

    Returns:
        A list of ``(chunk_text, source_label)`` tuples in the order the
        index returned them. The list may be shorter than ``top_k`` when the
        index holds fewer vectors than requested.
    """
    # FAISS expects a 2-D float32 array of query vectors.
    query_vec = np.asarray(model.encode([query]), dtype=np.float32)
    _distances, indices = index.search(query_vec, top_k)
    # FAISS pads missing neighbours with label -1; without this filter,
    # chunks[-1] would silently return the last chunk as a bogus hit.
    return [(chunks[i], sources[i]) for i in indices[0] if i >= 0]



In [7]:
# Example: retrieve the best-matching passage for a question about high treason.
query = "What is the punishment for high treason?"
answers = ask(query)

# Each result is a (chunk text, source label) pair.
for passage, citation in answers:
    print(f"\n🔎 Found in: {citation}\n📘 Answer: {passage}")


🔎 Found in: PART I - Article 6, Clause (1)
📘 Answer: PART I - Article 6 - High treason - Clause (1): Any person who abrogates or subverts or suspends or holds in abeyance, or attempts or conspires to abrogate or subvert or suspend or hold in abeyance, the Constitution by use of force or show of force or by any other unconstitutional means shall be guilty of high treason.


In [15]:
# Example: retrieve the best-matching passage for a question about the President.
query = "Who is president of Pakistan?"
answers = ask(query)

# Each result is a (chunk text, source label) pair.
for passage, citation in answers:
    print(f"\n🔎 Found in: {citation}\n📘 Answer: {passage}")


🔎 Found in: PART VI - Article 168, Clause (1)
📘 Answer: PART VI - Article 168 - Auditor-General of Pakistan - Clause (1): President appoints Auditor-General.


In [10]:
# Example: retrieve the best-matching passage for a question about the official religion.
query = "Official religion of Pakistan?"
answers = ask(query)

# Each result is a (chunk text, source label) pair.
for passage, citation in answers:
    print(f"\n🔎 Found in: {citation}\n📘 Answer: {passage}")


🔎 Found in: PART I - Article 1, Clause (1)
📘 Answer: PART I - Article 1 - The Republic and its territories - Clause (1): Pakistan shall be Federal Republic to be known as the Islamic Republic of Pakistan, hereinafter referred to as Pakistan.


In [14]:
# Example: retrieve the best-matching passage for a question about the official name.
query = "official name of Pakistan?"
answers = ask(query)

# Each result is a (chunk text, source label) pair.
for passage, citation in answers:
    print(f"\n🔎 Found in: {citation}\n📘 Answer: {passage}")


🔎 Found in: PART I - Article 1, Clause (1)
📘 Answer: PART I - Article 1 - The Republic and its territories - Clause (1): Pakistan shall be Federal Republic to be known as the Islamic Republic of Pakistan, hereinafter referred to as Pakistan.


In [16]:
# Example: retrieve the best-matching passage for a question about Provincial Assembly elections.
query = "Election time of Provincial Assembly?"
answers = ask(query)

# Each result is a (chunk text, source label) pair.
for passage, citation in answers:
    print(f"\n🔎 Found in: {citation}\n📘 Answer: {passage}")


🔎 Found in: PART VIII - Article 224, Clause (2)
📘 Answer: PART VIII - Article 224 - Time of Election and Bye-Election - Clause (2): General election within 90 days of Assembly dissolution; results within 14 days post-polls.
