In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [18]:
from google.colab import auth
from googleapiclient.discovery import build

# Authenticate to Google Drive
auth.authenticate_user()

service = build('drive', 'v3')

FOLDER_ID = "1J8oI8RJEgmbcRdyHzoMzyz519FaJkuSA"
PDF_BASE_URL = "https://drive.google.com/file/d/{id}/view?usp=sharing"

# --- Fetch all PDFs using pagination ---
all_files = []
page_token = None

while True:
    response = service.files().list(
        q=f"'{FOLDER_ID}' in parents and mimeType='application/pdf' and trashed=false",
        fields="nextPageToken, files(id, name)",
        pageToken=page_token
    ).execute()

    all_files.extend(response.get('files', []))
    page_token = response.get('nextPageToken', None)
    if not page_token:
        break

pdf_links = {f["name"]: PDF_BASE_URL.format(id=f["id"]) for f in all_files}

print(f"‚úÖ Found {len(pdf_links)} PDFs in Drive folder (after pagination).")
print("üîó Example mappings:")
list(pdf_links.items())[:5]


‚úÖ Found 856 PDFs in Drive folder (after pagination).
üîó Example mappings:


[('the_code_of_criminal_procedure,_1973.pdf',
  'https://drive.google.com/file/d/1N7mmxnKB6qREkm3XBqCVAECFmCpzbXAt/view?usp=sharing'),
 ('the_code_of_civil_procedure,_1908.pdf',
  'https://drive.google.com/file/d/1CtvoN7pM7Of7Rg-8zru01TXKcUjuQCng/view?usp=sharing'),
 ('2023_11_297_312_EN.pdf',
  'https://drive.google.com/file/d/1hB2H7sT3dncaPhVUr4Uj_LPBNGETLu-x/view?usp=sharing'),
 ('2023_7_887_898_EN.pdf',
  'https://drive.google.com/file/d/1uH9NHeNaaKxbvNi0bNUNWHEWGoVS_l4f/view?usp=sharing'),
 ('2023_16_484_524_EN.pdf',
  'https://drive.google.com/file/d/1xJizftlTiEJe68KL5pdYmLSUCRIB44Ux/view?usp=sharing')]

In [17]:
# Step 2 ‚Äî Check case and name match between local folder and Drive API list

drive_names = [f['name'] for f in all_files]

print("üîç Checking case-insensitive match between local and Drive filenames:\n")
for f in [
    "2023_16_1209_1524_EN.pdf",
    "2023_14_871_889_EN.pdf",
    "2023_13_832_846_EN.pdf",
    "2023_5_165_214_EN.pdf",
    "2023_14_1073_1082_EN.pdf",
    "2023_3_552_563_EN.pdf"
]:
    match = any(f.lower() == dn.lower() for dn in drive_names)
    if match:
        print(f"‚úÖ {f} ‚Äî Found in Drive listing")
    else:
        print(f"‚ùå {f} ‚Äî Not returned by Drive API (possible rename/mime mismatch)")


üîç Checking case-insensitive match between local and Drive filenames:

‚úÖ 2023_16_1209_1524_EN.pdf ‚Äî Found in Drive listing
‚úÖ 2023_14_871_889_EN.pdf ‚Äî Found in Drive listing
‚úÖ 2023_13_832_846_EN.pdf ‚Äî Found in Drive listing
‚úÖ 2023_5_165_214_EN.pdf ‚Äî Found in Drive listing
‚úÖ 2023_14_1073_1082_EN.pdf ‚Äî Found in Drive listing
‚úÖ 2023_3_552_563_EN.pdf ‚Äî Found in Drive listing


In [3]:
!pip uninstall -y langchain langchain-core langchain-community langsmith faiss-cpu
!pip install -U pip setuptools wheel

!pip install \
  "langchain==0.1.9" \
  "langchain-core==0.1.53" \
  "langchain-community==0.0.38" \
  "langchain-huggingface==0.0.3" \
  "sentence-transformers==2.7.0" \
  "faiss-cpu==1.8.0.post1" \
  "tqdm" \
  "pymupdf==1.24.10" \
  "python-dotenv"

print("‚úÖ Clean compatible versions installed successfully. Please RESTART RUNTIME after this cell.")

Found existing installation: langchain 0.3.27
Uninstalling langchain-0.3.27:
  Successfully uninstalled langchain-0.3.27
Found existing installation: langchain-core 0.3.79
Uninstalling langchain-core-0.3.79:
  Successfully uninstalled langchain-core-0.3.79
[0mFound existing installation: langsmith 0.4.40
Uninstalling langsmith-0.4.40:
  Successfully uninstalled langsmith-0.4.40
Collecting pip
  Downloading pip-25.3-py3-none-any.whl.metadata (4.7 kB)
Collecting setuptools
  Downloading setuptools-80.9.0-py3-none-any.whl.metadata (6.6 kB)
Downloading pip-25.3-py3-none-any.whl (1.8 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.8/1.8 MB[0m [31m79.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading setuptools-80.9.0-py3-none-any.whl (1.2 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.2/1.2 

Collecting langchain==0.1.9
  Downloading langchain-0.1.9-py3-none-any.whl.metadata (13 kB)
Collecting langchain-core==0.1.53
  Downloading langchain_core-0.1.53-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain-community==0.0.38
  Downloading langchain_community-0.0.38-py3-none-any.whl.metadata (8.7 kB)
Collecting langchain-huggingface==0.0.3
  Downloading langchain_huggingface-0.0.3-py3-none-any.whl.metadata (1.2 kB)
Collecting sentence-transformers==2.7.0
  Downloading sentence_transformers-2.7.0-py3-none-any.whl.metadata (11 kB)
Collecting faiss-cpu==1.8.0.post1
  Downloading faiss_cpu-1.8.0.post1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Collecting pymupdf==1.24.10
  Downloading PyMuPDF-1.24.10-cp312-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain==0.1.9)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting langsmith<0.2.0,>=0.1.0 (from langchain==0.1.9)
  

‚úÖ Clean compatible versions installed successfully. Please RESTART RUNTIME after this cell.


In [20]:
import langchain, faiss, sentence_transformers, pymupdf
import langchain_core, langchain_community, langchain_huggingface

print("‚úÖ Environment check successful")
print("LangChain:", langchain.__version__)
print("FAISS:", faiss.__version__)
print("SentenceTransformers:", sentence_transformers.__version__)
print("PyMuPDF:", pymupdf.version)


‚úÖ Environment check successful
LangChain: 0.1.9
FAISS: 1.8.0
SentenceTransformers: 2.7.0
PyMuPDF: ('1.24.10', '1.24.9', '20240902000001')


In [21]:
import os
import glob
import zipfile
import gc
from tqdm import tqdm
import fitz  # PyMuPDF

# LangChain + FAISS imports
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

print("‚úÖ Libraries imported successfully.")

# --- CONFIG ---
BASE_DRIVE_PATH = "/content/drive/My Drive/Nyay-Sahayak-Project"  # adjust to your project folder
CORPUS_BASE_PATH = f"{BASE_DRIVE_PATH}/corpus"
PDF_PATH = f"{CORPUS_BASE_PATH}/case_law/supreme_court_2023"
DB_SAVE_PATH = f"{BASE_DRIVE_PATH}/database"

os.makedirs(DB_SAVE_PATH, exist_ok=True)
print("‚úÖ Paths configured.")

‚úÖ Libraries imported successfully.
‚úÖ Paths configured.


In [22]:
# ‚úÖ SECTION EXTRACTION CELL

def extract_text_from_pdf(pdf_path):
    """Extract text safely from PDFs using PyMuPDF."""
    try:
        doc = fitz.open(pdf_path)
        text = ""
        for page in doc:
            text += page.get_text("text")
        doc.close()
        return text
    except Exception as e:
        print(f"‚ö†Ô∏è Error reading {pdf_path}: {e}")
        return ""


import re

def parse_statute_sections(doc_name, text):
    """
    Extract full statutory sections (number, title + body) from CPC/CrPC PDFs.
    Avoids counting 'Order/Rule' numbering but still catches all true sections.
    """
    # Normalize text
    text = re.sub(r'\s+', ' ', text.replace('\n', ' ')).strip()

    # --- Improved pattern ---
    section_pattern = re.compile(
        r'(?<!ORDER\s)(?<!RULE\s)'                  # don‚Äôt match inside Orders/Rules
        r'\b(?P<section_no>\d{1,3}[A-Z]?)\.\s*'     # section number
        r'(?P<section_text>'                        # capture title + body
        r'(?:[A-Z][^.]{3,}?\.)'                     # title ends at first period
        r'(?:.*?))'                                 # body text (lazy)
        r'(?=(?:\b\d{1,3}[A-Z]?\.\s)|CHAPTER|PART|\Z)',  # until next section/chapter/part/end
        flags=re.DOTALL
    )

    sections = []
    for m in section_pattern.finditer(text):
        no = m.group("section_no").strip()
        content = re.sub(r'\s+', ' ', m.group("section_text")).strip()
        sections.append({
            "doc_name": doc_name,
            "section_no": no,
            "title_and_text": content
        })

    print(f"‚úÖ Extracted {len(sections)} full sections from {doc_name}")
    return sections



In [23]:
import os
import re
import gc
import glob
import fitz  # PyMuPDF
import zipfile
from tqdm import tqdm

from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

In [24]:
import os

PDF_DIR = "/content/drive/MyDrive/Nyay-Sahayak-Project/corpus/case_law/supreme_court_2023"
documents = []

# --- Load Judgment PDFs ---
judgment_dir = f"{CORPUS_BASE_PATH}/case_law/supreme_court_2023"
documents = []

for file in os.listdir(judgment_dir):
    if file.endswith(".pdf"):
        full_path = os.path.join(judgment_dir, file)
        text = extract_text_from_pdf(full_path)
        if len(text.strip()) > 100:
            documents.append({
                "doc_name": file,
                "text": text,
                "type": "judgment"
            })

print(f"‚úÖ Loaded {len(documents)} judgment documents")


# --- Statutes ---
for law_pdf in [
    "the_code_of_civil_procedure,_1908.pdf",
    "the_code_of_criminal_procedure,_1973.pdf"
]:
    pdf_path = os.path.join(PDF_DIR, law_pdf)
    text = extract_text_from_pdf(pdf_path)
    sections = parse_statute_sections(law_pdf, text)
    for section in sections:
        documents.append({
            "doc_name": law_pdf,
            "section_no": section["section_no"],
            "title_and_text": section["title_and_text"],
            "type": "statute"
        })

print(f"üìö Total documents collected: {len(documents)}")


‚úÖ Loaded 856 judgment documents
‚úÖ Extracted 2584 full sections from the_code_of_civil_procedure,_1908.pdf
‚úÖ Extracted 1213 full sections from the_code_of_criminal_procedure,_1973.pdf
üìö Total documents collected: 4653


In [25]:
from google.colab import auth
from googleapiclient.discovery import build

# 1Ô∏è‚É£ Authenticate to Google Drive
auth.authenticate_user()
service = build('drive', 'v3')

# 2Ô∏è‚É£ Your public Drive folder ID
FOLDER_ID = "1J8oI8RJEgmbcRdyHzoMzyz519FaJkuSA"

# 3Ô∏è‚É£ Get all PDFs inside the folder
query = f"'{FOLDER_ID}' in parents and mimeType='application/pdf' and trashed=false"
results = service.files().list(q=query, fields="files(id, name)").execute()
files = results.get('files', [])

# 4Ô∏è‚É£ Build filename ‚Üí link mapping
# --- Build robust Drive link mapping (case-insensitive + trimmed) ---
PDF_BASE_URL = "https://drive.google.com/file/d/{id}/view?usp=sharing"

pdf_links = {
    f["name"].strip().lower(): PDF_BASE_URL.format(id=f["id"])
    for f in all_files
}

print(f"‚úÖ Found {len(pdf_links)} PDFs in Drive folder (after normalization).")
example_items = list(pdf_links.items())[:5]
print("üîó Example mappings:")
for k, v in example_items:
    print(f"  {k} -> {v}")


print(f"‚úÖ Found {len(pdf_links)} PDFs in Drive folder")
list(pdf_links.items())[:5]  # preview first 5

missing_links = [d["doc_name"] for d in documents if d["doc_name"] not in pdf_links]
print("‚ùå Missing Google Drive links for:", missing_links[:10])



‚úÖ Found 856 PDFs in Drive folder (after normalization).
üîó Example mappings:
  the_code_of_criminal_procedure,_1973.pdf -> https://drive.google.com/file/d/1N7mmxnKB6qREkm3XBqCVAECFmCpzbXAt/view?usp=sharing
  the_code_of_civil_procedure,_1908.pdf -> https://drive.google.com/file/d/1CtvoN7pM7Of7Rg-8zru01TXKcUjuQCng/view?usp=sharing
  2023_11_297_312_en.pdf -> https://drive.google.com/file/d/1hB2H7sT3dncaPhVUr4Uj_LPBNGETLu-x/view?usp=sharing
  2023_7_887_898_en.pdf -> https://drive.google.com/file/d/1uH9NHeNaaKxbvNi0bNUNWHEWGoVS_l4f/view?usp=sharing
  2023_16_484_524_en.pdf -> https://drive.google.com/file/d/1xJizftlTiEJe68KL5pdYmLSUCRIB44Ux/view?usp=sharing
‚úÖ Found 856 PDFs in Drive folder
‚ùå Missing Google Drive links for: ['2023_2_312_325_EN.pdf', '2023_16_1535_1546_EN.pdf', '2023_15_1067_1073_EN.pdf', '2023_16_888_916_EN.pdf', '2023_7_322_346_EN.pdf', '2023_11_623_642_EN.pdf', '2023_6_419_424_EN.pdf', '2023_2_326_338_EN.pdf', '2023_16_377_434_EN.pdf', '2023_8_152_182_EN.pdf']


In [26]:
docs_final = []
for d in documents:
    filename = d.get("doc_name", "").strip()
    pdf_url = pdf_links.get(filename.lower(), None)


    if "section_no" in d:  # Statutory law
        text = d.get("title_and_text", "")
        meta = {
            "source": filename,
            "type": "statute",
            "section_no": d.get("section_no", ""),
            "section_heading": f"Section {d.get('section_no','')}",
            "pdf_url": pdf_url
        }
    else:  # Judgment
        text = d.get("text", "")
        meta = {
            "source": filename,
            "type": "judgment",
            "case_title": d.get("case_title", ""),
            "pdf_url": pdf_url
        }

    if text and len(text.strip()) > 50:
        docs_final.append(Document(page_content=text.strip(), metadata=meta))

print(f"‚úÖ Final document count with links: {len(docs_final)}")

linked_count = sum(1 for d in docs_final if d.metadata.get("pdf_url"))
print(f"üîó {linked_count}/{len(docs_final)} documents have valid Drive links.")


‚úÖ Final document count with links: 3757
üîó 3757/3757 documents have valid Drive links.


In [27]:
for f in [
    "2023_3_552_563_EN.pdf",
    "2023_5_165_214_EN.pdf",
    "2023_14_871_889_EN.pdf"
]:
    print(f, "->", pdf_links.get(f.strip().lower()))


2023_3_552_563_EN.pdf -> https://drive.google.com/file/d/1gydnf0_umK8rB3bfal6PR_0JdttFzCK6/view?usp=sharing
2023_5_165_214_EN.pdf -> https://drive.google.com/file/d/1uWmN4Xpor97OKZQWEwF1JTnm9aTJoNwZ/view?usp=sharing
2023_14_871_889_EN.pdf -> https://drive.google.com/file/d/1vwRzGiYcZqvnM7_M0jJneULg3bHP_vbv/view?usp=sharing


In [9]:
print(docs_final[0])


page_content='A\nB\nC\nD\nE\nF\nG\nH\n312\nSUPREME COURT REPORTS\n[2023] 2 S.C.R.\n   [2023] 2 S.C.R. 312\n312\nNATIONAL INSURANCE COMPANY LTD.\nv.\n THE CHIEF ELECTORAL OFFICER & ORS.\n(Civil Appeal No.4769 of 2022)\nFEBRUARY 08, 2023\n[SANJAY KISHAN KAUL AND ABHAY S. OKA, JJ.]\nInsurance ‚Äì MOU entered into between Appellant-Insurance\ncompany and Respondent No.1 to provide insurance cover to the\npersons deployed for election related work for Bihar Legislative\nAssembly Elections in 2000 ‚Äì Respondent No.2‚Äôs husband, a\nConstable died due to sun stroke while performing election duty ‚Äì\nRespondent No.2 sought compensation in 2008 ‚Äì Respondent No.1\nacknowledged the eligibility for payment, Single Judge assigned\nthe liability to pay the amount on the Respondent No.1 and the DM\n‚Äì Appeal filed by Respondent No.1, Division Bench fastened the\nliability on the appellant ‚Äì Held: Respondent No.2‚Äôs claim was\nbeyond reasonable time period ‚Äì It was negligence of Respondent\n

In [28]:
# ‚úÖ FAISS CREATION CELL

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

def build_and_save_faiss_db(documents, db_name):
    """Creates FAISS DB from LangChain documents and saves to Drive."""
    if not documents:
        print(f"‚ö†Ô∏è No documents to process for {db_name}.")
        return

    local_db_path = f"/content/faiss_db_{db_name}"
    zip_save_path = os.path.join(DB_SAVE_PATH, f"faiss_db_{db_name}.zip")

    if os.path.exists(local_db_path):
        !rm -rf {local_db_path}

    print(f"üì¶ Splitting {len(documents)} docs into chunks...")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)
    chunks = text_splitter.split_documents(documents)
    print(f"‚úÖ Total chunks: {len(chunks)}")

    print("üîπ Building FAISS index...")
    vectorstore = FAISS.from_documents(chunks, embeddings)
    vectorstore.save_local(local_db_path)

    print("üì¶ Zipping database for Drive storage...")
    with zipfile.ZipFile(zip_save_path, "w", zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(local_db_path):
            for f in files:
                file_path = os.path.join(root, f)
                zipf.write(file_path, os.path.relpath(file_path, local_db_path))

    print(f"‚úÖ FAISS DB saved and zipped ‚Üí {zip_save_path}")

    del vectorstore, chunks, documents
    gc.collect()


In [29]:
# üß™ Test for both PDFs before embedding
# from PyPDF2 import PdfReader # Removed PyPDF2 import

def extract_text_from_pdf_test(pdf_path): # Renamed function to avoid conflict
    """Extract text safely from PDFs using PyMuPDF."""
    try:
        doc = fitz.open(pdf_path)
        text = ""
        for page in doc:
            text += page.get_text("text")
        doc.close()
        return text
    except Exception as e:
        print(f"‚ö†Ô∏è Error reading {pdf_path}: {e}")
        return ""

pdf_crpc = "/content/drive/My Drive/Nyay-Sahayak-Project/corpus/case_law/supreme_court_2023/the_code_of_criminal_procedure,_1973.pdf"
pdf_cpc = "/content/drive/My Drive/Nyay-Sahayak-Project/corpus/case_law/supreme_court_2023/the_code_of_civil_procedure,_1908.pdf"

text_crpc = extract_text_from_pdf_test(pdf_crpc)
docs_crpc = parse_statute_sections(os.path.basename(pdf_crpc), text_crpc)
print(f"CrPC sections: {len(docs_crpc)}")

text_cpc = extract_text_from_pdf_test(pdf_cpc)
docs_cpc = parse_statute_sections(os.path.basename(pdf_cpc), text_cpc)
print(f"CPC sections: {len(docs_cpc)}")


‚úÖ Extracted 1213 full sections from the_code_of_criminal_procedure,_1973.pdf
CrPC sections: 1213
‚úÖ Extracted 2584 full sections from the_code_of_civil_procedure,_1908.pdf
CPC sections: 2584


In [30]:
# ‚úÖ MAIN EXTRACTION LOOP

pdf_files = glob.glob(f"{PDF_PATH}/*.pdf")
print(f"Found {len(pdf_files)} PDF files in {PDF_PATH}")

documents = []

for pdf_path in tqdm(pdf_files, desc="Extracting PDFs"):
    pdf_name = os.path.basename(pdf_path)
    text = extract_text_from_pdf(pdf_path)

    if "code_of_criminal_procedure" in pdf_name.lower() or "code_of_civil_procedure" in pdf_name.lower():
        print(f"üìë Parsing statute: {pdf_name}")
        documents.extend(parse_statute_sections(pdf_name, text))
    elif len(text.strip()) > 200:
        documents.append(Document(page_content=text, metadata={"source": pdf_name}))

print(f"üìö Total documents collected: {len(documents)}")

build_and_save_faiss_db(docs_final, "FULL_FINAL_FINALL_FR")

print("üéâ Embeddings created and FAISS DB ready!")


Found 856 PDF files in /content/drive/My Drive/Nyay-Sahayak-Project/corpus/case_law/supreme_court_2023


Extracting PDFs: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 855/856 [01:10<00:00,  4.30it/s]

üìë Parsing statute: the_code_of_civil_procedure,_1908.pdf
‚úÖ Extracted 2584 full sections from the_code_of_civil_procedure,_1908.pdf


Extracting PDFs: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 856/856 [01:11<00:00, 11.95it/s]

üìë Parsing statute: the_code_of_criminal_procedure,_1973.pdf
‚úÖ Extracted 1213 full sections from the_code_of_criminal_procedure,_1973.pdf
üìö Total documents collected: 4651
üì¶ Splitting 3757 docs into chunks...





‚úÖ Total chunks: 36664
üîπ Building FAISS index...
üì¶ Zipping database for Drive storage...
‚úÖ FAISS DB saved and zipped ‚Üí /content/drive/My Drive/Nyay-Sahayak-Project/database/faiss_db_FULL_FINAL_FINALL_FR.zip
üéâ Embeddings created and FAISS DB ready!


In [31]:
problematic_files = set([
    "2023_16_1209_1524_EN.pdf",
    "2023_14_871_889_EN.pdf",
    "2023_13_832_846_EN.pdf",
    "2023_5_165_214_EN.pdf",
    "2023_14_1073_1082_EN.pdf",
    "2023_3_552_563_EN.pdf"
])

for f in problematic_files:
    print(f"{f} ->", pdf_links.get(f))


2023_14_1073_1082_EN.pdf -> None
2023_16_1209_1524_EN.pdf -> None
2023_14_871_889_EN.pdf -> None
2023_13_832_846_EN.pdf -> None
2023_5_165_214_EN.pdf -> None
2023_3_552_563_EN.pdf -> None


In [14]:
import os

local_folder = "/content/drive/MyDrive/Nyay-Sahayak-Project/corpus/case_law/supreme_court_2023"
problematic_files = [
    "2023_16_1209_1524_EN.pdf",
    "2023_14_871_889_EN.pdf",
    "2023_13_832_846_EN.pdf",
    "2023_5_165_214_EN.pdf",
    "2023_14_1073_1082_EN.pdf",
    "2023_3_552_563_EN.pdf"
]

print("üîç Checking if these files actually exist in local Drive mount:\n")
for f in problematic_files:
    exists = os.path.exists(os.path.join(local_folder, f))
    print(f"{f}: {'‚úÖ Found' if exists else '‚ùå Missing'}")


üîç Checking if these files actually exist in local Drive mount:

2023_16_1209_1524_EN.pdf: ‚úÖ Found
2023_14_871_889_EN.pdf: ‚úÖ Found
2023_13_832_846_EN.pdf: ‚úÖ Found
2023_5_165_214_EN.pdf: ‚úÖ Found
2023_14_1073_1082_EN.pdf: ‚úÖ Found
2023_3_552_563_EN.pdf: ‚úÖ Found
