In [1]:
import os
import time
import csv
import sqlite3
import fitz  # PyMuPDF
import re
import pandas as pd
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# ==== CONFIG ====
# Every path the notebook reads from or writes to is defined in this cell.
BASE_DIR = os.getcwd()

# Directory Chrome downloads PDFs into. Set the KANOON_DOWNLOAD_DIR environment
# variable to relocate it on another machine; when the variable is unset or
# empty the original location is used, so existing runs behave as before.
DOWNLOAD_DIR = os.environ.get("KANOON_DOWNLOAD_DIR") or r"C:\Users\ajits\Downloads\judgments_data"

# Generated artefacts live in an "output" folder under the working directory.
OUTPUT_DIR = os.path.join(BASE_DIR, "output")
DB_PATH = os.path.join(OUTPUT_DIR, "kanoon_cases.db")
CSV_PATH = os.path.join(OUTPUT_DIR, "audit_log.csv")
EXCEL_PATH = os.path.join(OUTPUT_DIR, "audit_log.xlsx")

# Create directories if they don't exist
os.makedirs(DOWNLOAD_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("✅ Configuration loaded and directories are ready.")

✅ Configuration loaded and directories are ready.


In [2]:
# ===============================================================
# CELL 2: HELPER FUNCTIONS (FINAL COMPREHENSIVE VERSION)
# ===============================================================

# ==== SELENIUM HELPERS ====
def init_driver():
    """Build a Chrome WebDriver configured to save PDFs without prompting.

    Returns:
        A ``webdriver.Chrome`` instance whose downloads go to ``DOWNLOAD_DIR``
        and whose page-load timeout is 60 seconds.
    """
    chrome_options = Options()
    # chrome_options.add_argument("--headless=new")  # enable to run without a browser window

    download_prefs = {
        "download.default_directory": DOWNLOAD_DIR,
        "plugins.always_open_pdf_externally": True,
        "download.prompt_for_download": False,
    }
    chrome_options.add_experimental_option("prefs", download_prefs)

    browser = webdriver.Chrome(options=chrome_options)
    browser.set_page_load_timeout(60)
    return browser

def wait_for_downloads_to_complete(directory, timeout=300, poll_interval=1.0):
    """Block until no ``.crdownload`` files remain in ``directory``.

    Args:
        directory: Folder to watch for in-progress download files.
        timeout: Maximum number of seconds to wait, measured on a monotonic
            wall clock (time spent listing the directory counts too).
        poll_interval: Seconds to sleep between directory checks; also used as
            the short settle pause before reporting success.

    Returns:
        True once a check finds no ``.crdownload`` file, False if ``timeout``
        elapses first (a non-positive ``timeout`` returns False without checking).

    NOTE(review): a check that runs before the browser has created its
    ``.crdownload`` file also sees "no downloads" and returns True; callers
    that need a stronger guarantee should verify the expected file exists.
    """
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        in_progress = any(
            entry.endswith('.crdownload') for entry in os.listdir(directory)
        )
        if not in_progress:
            # Brief pause so the finished file is fully in place before the caller continues.
            time.sleep(poll_interval)
            return True
        time.sleep(poll_interval)
    return False

# ==== PDF PARSING HELPERS ====
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined without a separator, or an empty string
        if the file cannot be opened or parsed (the error is printed).
    """
    try:
        # The context manager closes the document even if a page fails to
        # parse, so file handles are not leaked across many PDFs.
        with fitz.open(pdf_path) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        print(f"❌ Failed to parse {pdf_path}: {e}")
        return ""

def clean_name(name):
    """Trim a party name and drop anything from the first " on " onwards.

    Falsy input (``None`` or an empty string) yields an empty string.
    """
    if name:
        leading_part, _, _ = name.strip().partition(" on ")
        return leading_part.strip()
    return ""

def extract_judgment_date(text):
    """Find dates in the text and return the latest one as ``DD-MM-YYYY``.

    Two layouts are recognised: numeric day-month-year with ``.``, ``/`` or
    ``-`` separators (e.g. "04-11-2021", "06.09.2025") and month-name dates
    such as "AUGUST 04, 2021" (case-insensitive).

    Args:
        text: Full text of a judgment.

    Returns:
        The most recent valid date formatted as ``%d-%m-%Y``, or the string
        "Not Found" when the text is empty or contains no valid date.
    """
    if not text:
        return "Not Found"

    # Regex for formats like "04-11-2021" or "06.09.2025"
    numeric_hits = re.findall(r"\b(\d{1,2}[./-]\d{1,2}[./-]\d{4})\b", text)
    # Regex for formats like "AUGUST 04, 2021"
    worded_hits = re.findall(r"\b((?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4})\b", text, re.IGNORECASE)

    candidates = [
        (hit.replace('.', '/').replace('-', '/'), "%d/%m/%Y") for hit in numeric_hits
    ]
    candidates.extend((hit, "%B %d, %Y") for hit in worded_hits)

    dates = []
    for raw, fmt in candidates:
        # Parse each candidate on its own: a regex hit that is not a real
        # calendar date (e.g. "31-02-2021") must not discard the valid ones.
        try:
            dates.append(datetime.strptime(raw, fmt))
        except ValueError:
            continue

    return max(dates).strftime("%d-%m-%Y") if dates else "Not Found"

# ==== CORE PDF PARSING FUNCTION ====
def parse_case_info(text, file):
    """Parse judgment text into a record of case details using regular expressions.

    Args:
        text: Full text extracted from a judgment PDF.
        file: File name stored in the record under "File Name".

    Returns:
        A dict with the keys "File Name", "Court Name", "Case Number",
        "Petitioner", "Respondent", "Judgment Date", "Judges" and "Timestamp".
        Fields that cannot be located hold the string "Not Found";
        "Timestamp" is the ISO-format time at which parsing ran.
    """

    # --- Extract Court Name ---
    # Only these two courts are recognised, by exact (case-sensitive) phrase.
    court_name = "Not Found"
    if "Supreme Court of India" in text:
        court_name = "Supreme Court of India"
    elif "High Court of Judicature at Madras" in text:
        court_name = "High Court of Judicature at Madras"

    # --- Extract Case Number (tries multiple patterns) ---
    # Patterns are tried in order and the first one that matches wins.
    case_number = "Not Found"
    patterns = [
        r"W\.P\.\(MD\)No\.\s*([\d\s&,]+\s+of\s+\d{4})",
        r"C\.M\.A\.No\.\s*([\d\s&,]+\s+of\s+\d{4})",
        r"CIVIL\s+APPEAL\s+NO\.\s*(\d+\s+OF\s+\d{4})",
        r"Special\s+Leave\s+to\s+Appeal\s+\(C\)\s+No\(s\)\.\s*(\d+/\d{4})"
    ]
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            case_number = match.group(1).strip()
            break

    # --- Extract Petitioner and Respondent ---
    # NOTE(review): with re.DOTALL the lazy group starts at the first newline
    # in the text, so the petitioner capture can include everything between
    # that newline and the VERSUS marker - confirm against real judgments.
    petitioner_match = re.search(r"\n(.*?)\s+(?:VERSUS|\.\.\.PETITIONER\(S\)|vs|\.\.APPELLANT\(S\))\s*\n", text, re.DOTALL | re.IGNORECASE)
    respondent_match = re.search(r"\n.*?\s+(?:VERSUS|vs)\s*\n(.*?)\s+(?:\.\.\.RESPONDENT\(S\)|CORAM)", text, re.DOTALL | re.IGNORECASE)

    petitioner = clean_name(petitioner_match.group(1).strip()) if petitioner_match else "Not Found"
    respondent = clean_name(respondent_match.group(1).strip()) if respondent_match else "Not Found"

    # --- Extract Judges (Coram) ---
    judges = "Not Found"
    coram_match = re.search(r"(?:Coram|CORAM)\s*:?\s*\n(.*?)(?=\n\w|\n\n)", text, re.DOTALL)
    if coram_match:
        # Collapse internal whitespace/newlines into single spaces.
        judges = ' '.join(coram_match.group(1).strip().split())
    else:
        # Fallback for Supreme Court format.
        # The name may only contain spaces/tabs between words, so a capture
        # stops at the end of its line instead of running into the next
        # "HON'BLE ..." line. MRS/MS/DR honorifics are accepted besides MR.
        judge_list = re.findall(
            r"HON'BLE\s+(?:MRS?|MS|DR)\.?\s*JUSTICE\s+([\w.]+(?:[ \t]+[\w.]+)*)",
            text,
        )
        if judge_list:
            judges = " & ".join(' '.join(j.split()) for j in judge_list)

    return {
        "File Name": file,
        "Court Name": court_name,
        "Case Number": case_number,
        "Petitioner": petitioner,
        "Respondent": respondent,
        "Judgment Date": extract_judgment_date(text),
        "Judges": judges,
        "Timestamp": datetime.now().isoformat()
    }

# ==== DATABASE AND LOGGING FUNCTIONS ====
def init_db():
    """Open the SQLite database at ``DB_PATH`` and ensure the ``cases`` table exists.

    Returns:
        The open ``sqlite3`` connection; the table creation is already committed.
    """
    connection = sqlite3.connect(DB_PATH)
    create_cases_table = '''
        CREATE TABLE IF NOT EXISTS cases (
            id INTEGER PRIMARY KEY AUTOINCREMENT, file_name TEXT, court_name TEXT, 
            case_number TEXT, petitioner TEXT, respondent TEXT, judgment_date TEXT, 
            judges TEXT, timestamp TEXT
        )'''
    connection.cursor().execute(create_cases_table)
    connection.commit()
    return connection

def save_to_db(cursor, data):
    """Insert one parsed case record into the ``cases`table``-backed store.

    Args:
        cursor: Database cursor used to run the INSERT (no commit is issued here).
        data: Mapping holding the keys "File Name", "Court Name", "Case Number",
            "Petitioner", "Respondent", "Judgment Date", "Judges" and "Timestamp".

    Raises:
        KeyError: If ``data`` lacks any of the keys listed above.
    """
    # Keys are listed in the same order as the columns named in the INSERT.
    field_keys = (
        "File Name", "Court Name", "Case Number", "Petitioner",
        "Respondent", "Judgment Date", "Judges", "Timestamp",
    )
    row_values = tuple(data[key] for key in field_keys)
    insert_sql = '''
        INSERT INTO cases (file_name, court_name, case_number, petitioner, respondent, judgment_date, judges, timestamp)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    '''
    cursor.execute(insert_sql, row_values)

def export_logs(records):
    """Export a list of record dictionaries to CSV and Excel files.

    The CSV is written to ``CSV_PATH`` and the workbook to ``EXCEL_PATH``,
    both without the DataFrame index column.

    Args:
        records: List of dictionaries, one per case. When empty or ``None``
            nothing is written and a notice is printed instead.
    """
    if not records:
        print("No records to export.")
        return
    df = pd.DataFrame(records)
    # Write both formats promised by the docstring; previously only the
    # Excel file was produced and CSV_PATH was never used.
    df.to_csv(CSV_PATH, index=False)
    df.to_excel(EXCEL_PATH, index=False)
    print(f"📁 Logs saved to: {CSV_PATH}")
    print(f"📁 Logs saved to: {EXCEL_PATH}")

# Printed when the cell finishes executing, i.e. after every helper above has been defined.
print("✅ All helper functions are defined and updated.")

✅ All helper functions are defined and updated.


In [13]:
from selenium.common.exceptions import TimeoutException

# ==== CRAWLER ====
# Stage 1: collect case links from the search result pages, then visit each
# link and trigger its PDF download into DOWNLOAD_DIR.
user_url = input("🔗 Enter Indian Kanoon search URL: ").strip()
max_pages = int(input("🔢 Enter number of pages to scrape (default 5): ") or "5")

driver = init_driver()
all_case_links = set()
downloaded_count = 0  # downloads that finished before the wait timed out

# try/finally guarantees the browser is closed even if a page load fails
# outside the per-link handler or the run is interrupted from the keyboard.
try:
    driver.get(user_url)

    print("\n--- Starting Stage 1: Crawling & Downloading ---")
    for page in range(1, max_pages + 1):
        print(f"📄 Scraping page {page}...")
        time.sleep(2)  # fixed pause to let the result list render
        links_on_page = driver.find_elements(By.CSS_SELECTOR, 'a[href*="/doc/"]')
        hrefs = (a.get_attribute("href") for a in links_on_page)
        # Skip empty hrefs so the download loop never visits a missing URL.
        all_case_links.update(href for href in hrefs if href)

        try:
            # The pager link for the following page is located by its visible number.
            next_button = driver.find_element(By.LINK_TEXT, str(page + 1))
            next_button.click()
        except Exception:
            print("⚠ No more pages found.")
            break

    print(f"\n🔗 Found {len(all_case_links)} unique case links. Starting downloads...")
    for idx, link in enumerate(all_case_links, start=1):
        print(f"  -> Visiting PDF {idx}/{len(all_case_links)}")

        try:
            driver.get(link)

            # Click the download button
            download_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, 'pdfdoc')))
            download_button.click()

            # Wait reliably for the download to finish
            print("     Waiting for download to complete...")
            if wait_for_downloads_to_complete(DOWNLOAD_DIR):
                downloaded_count += 1
                print(f"     ✅ Download complete for: {link}")
            else:
                print(f"     ⚠️ Download timed out for: {link}")

        except TimeoutException:
            print(f"     ❌ Page timed out and was unresponsive: {link}. Skipping.")
        except Exception as e:
            print(f"     ❌ An unexpected error occurred on page {link}: {e}. Skipping.")
finally:
    driver.quit()

# Report real counts: timeouts and skipped pages mean not every link yields a PDF.
print(f"\n✅ Stage 1 Complete: {downloaded_count}/{len(all_case_links)} PDFs downloaded.")

🔗 Enter Indian Kanoon search URL:  https://indiankanoon.org/search/?formInput=criminal%202000%20to%202010&pagenum=4
🔢 Enter number of pages to scrape (default 5):  5



--- Starting Stage 1: Crawling & Downloading ---
📄 Scraping page 1...
📄 Scraping page 2...
📄 Scraping page 3...
📄 Scraping page 4...
📄 Scraping page 5...

🔗 Found 40 unique case links. Starting downloads...
  -> Visiting PDF 1/40
     Waiting for download to complete...
     ⚠️ Download timed out for: https://indiankanoon.org/doc/68100787/
  -> Visiting PDF 2/40
     Waiting for download to complete...
     ✅ Download complete for: https://indiankanoon.org/doc/1176861/
  -> Visiting PDF 3/40
     Waiting for download to complete...
     ✅ Download complete for: https://indiankanoon.org/doc/493882/
  -> Visiting PDF 4/40
     Waiting for download to complete...
     ✅ Download complete for: https://indiankanoon.org/doc/108308826/
  -> Visiting PDF 5/40
     Waiting for download to complete...
     ✅ Download complete for: https://indiankanoon.org/doc/80364506/
  -> Visiting PDF 6/40
     Waiting for download to complete...
     ✅ Download complete for: https://indiankanoon.org/doc/1452