In [None]:
# Cell 0: Run this first to install necessary libraries
!pip install PyMuPDF webdriver-manager selenium

Collecting PyMuPDF
  Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting selenium
  Downloading selenium-4.35.0-py3-none-any.whl.metadata (7.4 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting typing_extensions~=4.14.0 (from selenium)
  Downloading typing_extensions-4.14.1-py3-none-any.whl.metadata (3.0 kB)
Collecting outcome (from trio~=0.30.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.12.2->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Cell 0.5: Mount your Google Drive
# Exposes Drive under /content/drive; the project paths used by the rest of
# the notebook all live below that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Step 2: Import libraries and configure the project paths on Google Drive
import os
import time
import re
import sqlite3
import pandas as pd
import fitz  # PyMuPDF
from datetime import datetime
from google.colab import drive
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException



# ==== CONFIG ====
# --- IMPORTANT: DRIVE_PROJECT_PATH must match the folder you created in your Google Drive ---
DRIVE_PROJECT_PATH = "/content/drive/MyDrive/judgments_data"

# Input and output folders sit directly under the project folder.
DOWNLOAD_DIR, OUTPUT_DIR = (
    os.path.join(DRIVE_PROJECT_PATH, subfolder) for subfolder in ("input_pdfs", "output")
)

# Both generated artifacts are written into the output folder.
DB_PATH, EXCEL_PATH = (
    os.path.join(OUTPUT_DIR, artifact) for artifact in ("kanoon_cases.db", "audit_log.xlsx")
)

# Create the folders on first run; exist_ok makes re-running the cell harmless.
os.makedirs(DOWNLOAD_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("✅ Libraries installed, Google Drive mounted, and all paths are configured.")

✅ Libraries installed, Google Drive mounted, and all paths are configured.


In [None]:
# ===============================================================
# CELL 2: HELPER FUNCTIONS (COLAB VERSION)
# ===============================================================

# ==== SELENIUM HELPERS (CONFIGURED FOR COLAB) ====
def init_driver():
    """Create a headless Chrome WebDriver set up for Google Colab.

    Downloads are directed to DOWNLOAD_DIR and the page-load timeout is
    set to 60 seconds.

    Returns:
        The configured ``webdriver.Chrome`` instance.
    """
    chrome_options = webdriver.ChromeOptions()

    # Command-line flags required to run Chrome inside Colab.
    for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
        chrome_options.add_argument(flag)

    # Download preferences: target folder, PDF plugin flag, and no save prompt.
    download_prefs = {
        "download.default_directory": DOWNLOAD_DIR,
        "plugins.always_open_pdf_externally": True,
        "download.prompt_for_download": False,
    }
    chrome_options.add_experimental_option("prefs", download_prefs)

    browser = webdriver.Chrome(options=chrome_options)
    browser.set_page_load_timeout(60)
    return browser

def wait_for_downloads_to_complete(directory, timeout=300):
    """Poll ``directory`` once per second until no ``.crdownload`` file is left.

    Args:
        directory: Folder to inspect.
        timeout: Maximum number of one-second polls.

    Returns:
        True once a poll finds no ``.crdownload`` file, False if the poll
        budget runs out first.
    """
    polls_done = 0
    while polls_done < timeout:
        still_downloading = any(
            entry.endswith('.crdownload') for entry in os.listdir(directory)
        )
        # Every poll is followed by a one-second pause, whether or not it succeeded.
        time.sleep(1)
        if not still_downloading:
            return True
        polls_done += 1
    return False

# ==== PDF PARSING HELPERS ====
def extract_text_from_pdf(pdf_path):
    """Extracts all text from a given PDF file.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        The text of every page concatenated in page order, or "" if the file
        cannot be opened or parsed (the error is printed, not raised).
    """
    try:
        # The context manager closes the document even when a page fails to
        # extract, so file handles are not leaked over a large batch of PDFs.
        with fitz.open(pdf_path) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        print(f"❌ Failed to parse {pdf_path}: {e}")
        return ""

def clean_name(name):
    """Trim a party name and drop everything from the first " on " onwards.

    Returns "" for an empty or missing name.
    """
    if not name:
        return ""
    before_on, _sep, _rest = name.strip().partition(" on ")
    return before_on.strip()

def extract_judgment_date(text):
    """Finds dates in various formats and returns the most likely judgment date.

    Two formats are recognised: numeric day/month/year with ".", "/" or "-"
    separators, and "Month D, YYYY". The latest valid date found is taken as
    the judgment date.

    Args:
        text: Full text of the judgment.

    Returns:
        The latest date as "DD-MM-YYYY", or "Not Found" if no valid date
        appears in the text.
    """
    numeric_hits = re.findall(r"\b(\d{1,2}[./-]\d{1,2}[./-]\d{4})\b", text)
    worded_hits = re.findall(r"\b((?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4})\b", text, re.IGNORECASE)

    candidates = [(d.replace('.', '/').replace('-', '/'), "%d/%m/%Y") for d in numeric_hits]
    candidates += [(d, "%B %d, %Y") for d in worded_hits]

    dates = []
    for raw, fmt in candidates:
        # Parse each candidate separately: a single impossible date such as
        # 31/02/2020 must not discard the valid dates found alongside it.
        try:
            dates.append(datetime.strptime(raw, fmt))
        except ValueError:
            continue

    return max(dates).strftime("%d-%m-%Y") if dates else "Not Found"

# ==== CORE PDF PARSING FUNCTION ====
def parse_case_info(text, file):
    """Pull the headline case details out of a judgment's text with regexes.

    Args:
        text: Full text of the judgment.
        file: File name the text came from; stored unchanged in the result.

    Returns:
        A dict with the keys "File Name", "Court Name", "Case Number",
        "Petitioner", "Respondent", "Judgment Date", "Judges" and "Timestamp".
        Fields that cannot be extracted hold "Not Found".
    """
    # Court: the first known court name that appears verbatim in the text.
    known_courts = ("Supreme Court of India", "High Court of Judicature at Madras")
    court_name = next((court for court in known_courts if court in text), "Not Found")

    # Case number: patterns are tried in order and the first hit wins.
    case_number_patterns = (
        r"W\.P\.\(MD\)No\.\s*([\d\s&,]+\s+of\s+\d{4})",
        r"C\.M\.A\.No\.\s*([\d\s&,]+\s+of\s+\d{4})",
        r"CIVIL\s+APPEAL\s+NO\.\s*(\d+\s+OF\s+\d{4})",
        r"Special\s+Leave\s+to\s+Appeal\s+\(C\)\s+No\(s\)\.\s*(\d+/\d{4})",
    )
    number_hits = (re.search(p, text, re.IGNORECASE) for p in case_number_patterns)
    first_number_hit = next((hit for hit in number_hits if hit), None)
    case_number = first_number_hit.group(1).strip() if first_number_hit else "Not Found"

    # Parties: text before and after the VERSUS / vs separator line.
    party_flags = re.DOTALL | re.IGNORECASE
    petitioner_match = re.search(r"\n(.*?)\s+(?:VERSUS|\.\.\.PETITIONER\(S\)|vs|\.\.APPELLANT\(S\))\s*\n", text, party_flags)
    respondent_match = re.search(r"\n.*?\s+(?:VERSUS|vs)\s*\n(.*?)\s+(?:\.\.\.RESPONDENT\(S\)|CORAM)", text, party_flags)

    petitioner = "Not Found"
    if petitioner_match:
        petitioner = clean_name(petitioner_match.group(1).strip())
    respondent = "Not Found"
    if respondent_match:
        respondent = clean_name(respondent_match.group(1).strip())

    # Judges: prefer the block after a Coram heading, else collect HON'BLE MR. JUSTICE names.
    coram_match = re.search(r"(?:Coram|CORAM)\s*:?\s*\n(.*?)(?=\n\w|\n\n)", text, re.DOTALL)
    if coram_match:
        judges = ' '.join(coram_match.group(1).strip().split())
    else:
        judge_list = re.findall(r"HON'BLE\s+MR\.?\s*JUSTICE\s+([A-Z\.\s\w]+)", text)
        judges = " & ".join(j.strip() for j in judge_list) if judge_list else "Not Found"

    return {
        "File Name": file,
        "Court Name": court_name,
        "Case Number": case_number,
        "Petitioner": petitioner,
        "Respondent": respondent,
        "Judgment Date": extract_judgment_date(text),
        "Judges": judges,
        "Timestamp": datetime.now().isoformat(),
    }

# ==== DATABASE AND LOGGING FUNCTIONS ====
def init_db():
    """Open the SQLite database at DB_PATH and make sure the ``cases`` table exists.

    Returns:
        The open ``sqlite3`` connection, with the table creation committed.
    """
    connection = sqlite3.connect(DB_PATH)
    create_cases_table = '''
        CREATE TABLE IF NOT EXISTS cases (
            id INTEGER PRIMARY KEY AUTOINCREMENT, file_name TEXT, court_name TEXT,
            case_number TEXT, petitioner TEXT, respondent TEXT, judgment_date TEXT,
            judges TEXT, timestamp TEXT
        )'''
    connection.cursor().execute(create_cases_table)
    connection.commit()
    return connection

def save_to_db(cursor, data):
    """Insert one parsed case into the ``cases`` table.

    Args:
        cursor: Cursor used to execute the INSERT. Nothing is committed here.
        data: Dict with the keys "File Name", "Court Name", "Case Number",
            "Petitioner", "Respondent", "Judgment Date", "Judges" and
            "Timestamp".
    """
    insert_sql = '''
        INSERT INTO cases (file_name, court_name, case_number, petitioner, respondent, judgment_date, judges, timestamp)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    '''
    # Dict keys listed in the same order as the columns in the INSERT above.
    field_order = (
        "File Name", "Court Name", "Case Number", "Petitioner",
        "Respondent", "Judgment Date", "Judges", "Timestamp",
    )
    cursor.execute(insert_sql, tuple(data[field] for field in field_order))

def export_logs(records):
    """Write the collected records to the Excel audit log at EXCEL_PATH.

    Args:
        records: List of dicts, one per row. When it is empty or None,
            nothing is written and a notice is printed instead.
    """
    if records:
        pd.DataFrame(records).to_excel(EXCEL_PATH, index=False)
        print(f"📁 Logs saved to: {EXCEL_PATH}")
    else:
        print("No records to export.")

# Printed once every helper in this cell has been defined.
print("✅ All helper functions are defined.")

✅ All helper functions are defined.


In [None]:
# ===============================================================
# SCRIPT: FINAL INTELLIGENT PDF PARSER
# ===============================================================
import os
import re
import json
import pandas as pd
import fitz  # PyMuPDF
from datetime import datetime

# --- CONFIGURATION ---
# Point this to the folder containing your downloaded PDF files.
PDF_FOLDER_PATH = "/content/drive/MyDrive/judgments_data/input_pdfs" #<-- UPDATE IF NEEDED (for Colab)
# PDF_FOLDER_PATH = r"C:\path\to\judgments_data" #<-- example for a local Windows run

# --- KEYWORD DICTIONARY: YOU CAN EASILY ADD NEW KEYWORDS HERE ---
# Maps each output section name to the heading phrases that may introduce it.
# parse_pdf_text matches the phrases case-insensitively as whole words and
# tries each list in order, so put the preferred phrase first.
SECTION_KEYWORDS = {
    # Narrative of what happened.
    "facts": [
        "Facts of the case", "Factual Background", "The Prosecution Story",
        "The facts", "Factual Matrix"
    ],
    # Questions the court was asked to decide.
    "issues": [
        "Issues", "Issues for consideration"
    ],
    # Submissions made for the petitioner / appellant.
    "petitioner_arguments": [
        "Arguments of the Petitioner", "Petitioner's Arguments", "Arguments on behalf of the appellant"
    ],
    # Submissions made for the respondent.
    "respondent_arguments": [
        "Arguments of the Respondent", "Respondent's Arguments", "Arguments on behalf of the respondent"
    ],
    # The court's own reasoning.
    "analysis": [
        "Analysis by the Court", "Court's Analysis", "Reasoning"
    ],
    # The final holding or order.
    "conclusion": [
        "Conclusion", "Held", "Final Order"
    ]
}
# ----------------------------------------------------------------

def extract_text_from_pdf(pdf_path):
    """Extracts all text from a given PDF file.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        The text of every page concatenated in page order, or "" if the file
        cannot be opened or parsed (the error is printed, not raised).
    """
    try:
        # The context manager closes the document even when a page fails to
        # extract, so file handles are not leaked over a large batch of PDFs.
        with fitz.open(pdf_path) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        print(f"  -> ❌ Error reading {os.path.basename(pdf_path)}: {e}")
        return ""

def parse_pdf_text(text, section_keywords=None):
    """
    Finds all section headings, sorts them, and slices the text between them.

    For every section type the keywords are tried in list order, and the first
    keyword that occurs in the text (case-insensitive, whole words, optional
    trailing ":") marks where the section starts. If a longer keyword of the
    same section matches at that same position, the heading is extended to it.
    This stops "Issues" from cutting "Issues for consideration" short and
    leaving "for consideration" in the section body.

    Args:
        text: Full text of the judgment.
        section_keywords: Optional mapping of section name to a list of heading
            phrases. Defaults to the module-level SECTION_KEYWORDS.

    Returns:
        A dict with one entry per section name. The value is the text between
        that section's heading and the next heading found (or the end of the
        text), or "Not Found" when no heading for the section occurs.
    """
    if section_keywords is None:
        section_keywords = SECTION_KEYWORDS

    def _heading_regex(keyword):
        # Keyword as whole words, ignoring case, optionally followed by ":".
        return re.compile(r'\b' + re.escape(keyword) + r'\b\s*:?', re.IGNORECASE)

    found_sections = []
    # Find the starting position of each section type
    for section_name, keywords in section_keywords.items():
        for keyword in keywords:
            match = _heading_regex(keyword).search(text)
            if not match:
                continue
            start, end = match.start(), match.end()
            # Prefer the longest keyword of this section that matches right here.
            for alternative in keywords:
                longer = _heading_regex(alternative).match(text, start)
                if longer and longer.end() > end:
                    end = longer.end()
            found_sections.append({'name': section_name, 'start': start, 'end': end})
            # Only the first matching keyword is used for each section type
            break

    # If no sections were found, return empty data
    if not found_sections:
        return {key: "Not Found" for key in section_keywords}

    # Sort the found sections by their starting position in the text
    found_sections.sort(key=lambda x: x['start'])

    extracted_data = {}
    # Slice the text between each section heading and the next one
    for i, section in enumerate(found_sections):
        if i + 1 < len(found_sections):
            # The section ends where the next section begins
            end_index = found_sections[i + 1]['start']
        else:
            # The last section runs to the end of the text
            end_index = len(text)
        extracted_data[section['name']] = text[section['end']:end_index].strip()

    # Fill in any missing sections with "Not Found"
    for section_name in section_keywords:
        extracted_data.setdefault(section_name, "Not Found")

    return extracted_data


# --- SCRIPT EXECUTION ---
# One record per readable PDF ends up in this list.
all_cases_data = []
pdf_files = list(filter(lambda name: name.lower().endswith(".pdf"), os.listdir(PDF_FOLDER_PATH)))
print(f"📑 Found {len(pdf_files)} PDFs to process.")

for filename in pdf_files:
    print(f"-> Processing: {filename}", flush=True)
    file_path = os.path.join(PDF_FOLDER_PATH, filename)
    full_text = extract_text_from_pdf(file_path)

    # PDFs that produced no text are skipped.
    if full_text:
        # Parsed sections plus the file name they came from.
        parsed_sections = parse_pdf_text(full_text)
        parsed_sections['source_file'] = filename
        all_cases_data.append(parsed_sections)

if not all_cases_data:
    print("\nNo data was processed.")
else:
    # JSON output
    with open('final_parsed_cases.json', 'w', encoding='utf-8') as f:
        json.dump(all_cases_data, f, indent=4, ensure_ascii=False)
    print(f"\n✅ Successfully created 'final_parsed_cases.json'")

    # CSV output
    df = pd.DataFrame(all_cases_data)
    df.to_csv('final_parsed_cases.csv', index=False, encoding='utf-8-sig')
    print(f"✅ Successfully created 'final_parsed_cases.csv'")

📑 Found 84 PDFs to process.
-> Processing: _vs_T_on_29_July_2024.PDF
-> Processing: M_S_Jai_Prakash_Associates_Ltd_vs_State_Of_U_P_And_Another_on_10_March_2025.PDF
-> Processing: _vs_T_on_29_July_2024 (1).PDF
-> Processing: Union_Of_India_vs_P_Radhamma_on_6_January_2025.PDF
-> Processing: Dcit_Circle_13_1_Hyderabad_vs_The_Singareni_Collieries_Company_on_12_June_2025.PDF
-> Processing: Vemuri_Ram_Prasad_vs_The_Income_Tax_Officer_on_27_August_2024.PDF
-> Processing: Manovarkhan_Manukhan_vs_State_Of_Gujarat_on_27_February_2025.PDF
-> Processing: The_Acit_Central_Circle_1_3_vs_M_S_Benifit_Tradelinks_Ltd_on_25_April_2025.PDF
-> Processing: Singareni_Collieries_Company_vs_Acit_Circle_1_Khammam_on_12_June_2025.PDF
-> Processing: Union_Of_India_vs_K_D_Nishad_on_6_January_2025.PDF
-> Processing: Raval_Pankeshbhai_Manilal_vs_State_Of_Gujarat_on_12_June_2025.PDF
-> Processing: Union_Of_India_vs_Future_Gaming_Solutions_P_Ltd_And_on_11_February_2025.PDF
-> Processing: Akzo_Nobel_India_Ltd_Gurgaon_v

In [None]:
import os
import re
import json
import pandas as pd
import fitz  # PyMuPDF
import time
from datetime import datetime
import uuid

# --- CONFIGURATION ---
# Folder that is scanned for judgment PDFs.
PDF_FOLDER_PATH = "/content/drive/MyDrive/judgments_data/input_pdfs"

# --- COMPREHENSIVE KEYWORD DICTIONARY ---
# Maps each section name to the heading phrases that may introduce it.
# parse_judgment_sections_optimized joins every phrase below into a single
# case-insensitive pattern that is only tried at the start of a line
# (optionally after a paragraph number).
# NOTE: main() copies only "facts_of_case", "petitioner_arguments" and
# "respondent_arguments" from the parsed sections into the final record.
SECTION_KEYWORDS = {
    # Narrative of what happened.
    "facts_of_case": [
        "Facts of the case", "Factual Background", "The Prosecution Story",
        "The facts", "Factual Matrix", "Brief facts", "Case of the Prosecution",
        "Background", "Factual background", "Facts in brief"
    ],
    # Questions the court was asked to decide.
    "legal_issues": [
        "Issues", "Issues for consideration", "Points for determination",
        "Legal issues", "Questions of law", "Issues involved", "Points involved"
    ],
    # Submissions made for the petitioner / appellant.
    "petitioner_arguments": [
        "Arguments of the Petitioner", "Petitioner's Arguments", "Arguments on behalf of the appellant",
        "Submissions of the learned counsel for the petitioner", "contentions of the petitioner",
        "Case for the petitioner", "Petitioner's case", "Appellant's arguments"
    ],
    # Submissions made for the respondent.
    "respondent_arguments": [
        "Arguments of the Respondent", "Respondent's Arguments", "Arguments on behalf of the respondent",
        "Submissions of the learned counsel for the respondent", "contentions of the respondent",
        "Case for the respondent", "Respondent's case"
    ],
    # The court's own reasoning.
    "judgment_analysis": [
        "Analysis by the Court", "Court's Analysis", "Reasoning", "Court's findings",
        "Discussion", "Consideration of the Court", "Reasons for the decision",
        "Analysis and decision", "Court's reasoning"
    ],
    # The final holding or order. Several entries are single common words.
    "outcome": [
        "Conclusion", "Held", "Final Order", "Order", "For the aforesaid reasons",
        "In the result", "Judgment", "Decision", "Disposed of", "Allowed", "Dismissed"
    ],
    # Headings that introduce lists of precedents.
    "citations": [
        "Relied upon", "Cases cited", "Authorities relied upon", "Citations",
        "Case law", "Precedents", "Legal authorities"
    ],
    # Headings that introduce statutory references. Several entries are single common words.
    "sections_acts_cited": [
        "Provisions of law", "Legal provisions", "Statutory provisions",
        "Under section", "Section", "Act", "Rule", "Regulation"
    ]
}

# --- ENHANCED REGEX PATTERNS ---
# Precompiled patterns used by extract_comprehensive_metadata, which applies
# them to the first 4000 characters of the judgment text.
METADATA_PATTERNS = {
    # Captures only the text after "OF" up to the end of the line; the court
    # type (HIGH / SUPREME / DISTRICT) is in a non-capturing group.
    "court": re.compile(r"IN THE (?:HIGH COURT|SUPREME COURT|DISTRICT COURT) OF (.*?)\n", re.IGNORECASE),
    # Group 1 = text before the VERSUS/VS/V separator, group 2 = text after it
    # up to a line starting with Coram, JUDGMENT or BEFORE. DOTALL lets both
    # groups span several lines.
    "parties": re.compile(r"(.+?)\s+(?:VERSUS|VS\.?|V\.?)\s+(.+?)(?:\n\s*Coram|\n\s*JUDGMENT|\n\s*BEFORE)", re.IGNORECASE | re.DOTALL),
    # Rest of the line after "CORAM:" / "BEFORE:", with optional honorifics skipped.
    "judge_name": re.compile(r"(?:CORAM|BEFORE)\s*:\s*(?:THE\s+)?(?:HON'BLE\s+)?(?:MR\.?\s+|MRS\.?\s+|MS\.?\s+)?(?:JUSTICE\s+)?(.*?)\n", re.IGNORECASE),
    # e.g. "CRIMINAL APPEAL NO. 123 OF 2020" or "WRIT PETITION NO 45/2019 OF 2019".
    "case_number": re.compile(r"((?:CRIMINAL|CIVIL|WRIT|MISCELLANEOUS|SPECIAL)\s+(?:APPEAL|PETITION|APPLICATION|CASE)\s+NO\.?\s+\d+(?:\/\d+)?\s+OF\s+\d{4})", re.IGNORECASE),
    # NOTE: extract_comprehensive_metadata has no branch for this key; the
    # judgment date is taken from extract_date() instead.
    "date": re.compile(r"(?:DATED|DECIDED ON|PRONOUNCED ON)\s*:?\s*(\d{1,2}[-/\.]\d{1,2}[-/\.]\d{4}|\d{1,2}\s+\w+\s+\d{4})", re.IGNORECASE),
    # First occurrence of any of these words; there are no word boundaries,
    # so a hit inside a longer word also counts.
    "case_type": re.compile(r"(CRIMINAL|CIVIL|WRIT|MISCELLANEOUS|SPECIAL|TAX|CONSTITUTIONAL|MATRIMONIAL)", re.IGNORECASE)
}

# --- OUTCOME PATTERNS ---
# Outcome label -> phrase pattern. determine_outcome checks these in the order
# written here against the last 2000 characters of the judgment and returns
# the first label whose pattern matches.
OUTCOME_PATTERNS = {
    "allowed": re.compile(r"\b(?:petition|appeal|application)\s+(?:is\s+)?(?:allowed|granted)\b", re.IGNORECASE),
    "dismissed": re.compile(r"\b(?:petition|appeal|application)\s+(?:is\s+)?dismissed\b", re.IGNORECASE),
    "acquitted": re.compile(r"\b(?:accused|defendant)\s+(?:is\s+)?acquitted\b", re.IGNORECASE),
    "convicted": re.compile(r"\b(?:accused|defendant)\s+(?:is\s+)?(?:convicted|found guilty)\b", re.IGNORECASE),
    "partly_allowed": re.compile(r"\b(?:petition|appeal)\s+(?:is\s+)?(?:partly|partially)\s+allowed\b", re.IGNORECASE)
}

def generate_case_id():
    """Return a random 8-character upper-case hexadecimal case ID.

    The ID is the first eight hex digits of a freshly generated UUID4.
    """
    random_hex = uuid.uuid4().hex
    return random_hex[:8].upper()

def extract_text_from_pdf(pdf_path):
    """Extracts all text from a given PDF file.

    Pages are read with ``get_text("text", sort=True)``.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        The text of every page concatenated in page order, or "" if the file
        cannot be opened or parsed (the error is printed, not raised).
    """
    try:
        # The context manager closes the document even when a page fails to
        # extract, so file handles are not leaked over a large batch of PDFs.
        with fitz.open(pdf_path) as doc:
            return "".join(page.get_text("text", sort=True) for page in doc)
    except Exception as e:
        print(f"  -> ❌ Error reading {os.path.basename(pdf_path)}: {e}")
        return ""

def extract_date(text):
    """Return the first date found in the opening 3000 characters of ``text``.

    Patterns are tried from most to least specific: dates introduced by
    DATED / DECIDED ON / PRONOUNCED ON (numeric, then worded), then any
    numeric date, then any "D Month YYYY" date.

    Returns:
        The matched date string as it appears in the text, or "Not Found".
    """
    head = text[:3000]

    # Order matters: the first pattern that matches decides the result.
    ordered_patterns = (
        r"(?:DATED|DECIDED ON|PRONOUNCED ON)\s*:?\s*(\d{1,2}[-/\.]\d{1,2}[-/\.]\d{4})",
        r"(?:DATED|DECIDED ON|PRONOUNCED ON)\s*:?\s*(\d{1,2}\s+\w+\s+\d{4})",
        r"(\d{1,2}[-/\.]\d{1,2}[-/\.]\d{4})",
        r"(\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4})",
    )

    attempts = (re.search(p, head, re.IGNORECASE) for p in ordered_patterns)
    first_hit = next((hit for hit in attempts if hit), None)
    if first_hit is None:
        return "Not Found"
    return first_hit.group(1).strip()

def extract_sections_acts(text):
    """Extract sections and acts cited in the judgment.

    Recognises "Section N of (the) <... Act|Code|Rule(s)>", bare
    "Section N" / "under Section N", and "Article N". N may carry a
    bracketed sub-clause such as "302(a)".

    Args:
        text: Full text of the judgment.

    Returns:
        Up to 10 distinct references in first-seen order (pattern by pattern),
        or ["Not Found"] when nothing matches.
    """
    # (label, pattern): the label keeps Article hits from being reported as Sections.
    labelled_patterns = [
        ("Section", r"Section\s+(\d+(?:\([a-z0-9]+\))?)\s+of\s+(?:the\s+)?([^.]+(?:Act|Code|Rules?))"),
        ("Section", r"(?:under\s+)?Section\s+(\d+(?:\([a-z0-9]+\))?)"),
        ("Article", r"Article\s+(\d+(?:\([a-z0-9]+\))?)"),
    ]

    # A dict is used as an insertion-ordered set so that the result, and the
    # cut to 10 entries, is the same on every run (a plain set is not).
    found = {}
    for label, pattern in labelled_patterns:
        for match in re.finditer(pattern, text, re.IGNORECASE):
            if match.re.groups > 1:
                # Collapse line breaks that PDF extraction leaves inside statute names.
                statute = ' '.join(match.group(2).split())
                entry = f"{label} {match.group(1)} of {statute}"
            else:
                entry = f"{label} {match.group(1)}"
            found.setdefault(entry, None)

    return list(found)[:10] if found else ["Not Found"]

def extract_citations(text):
    """Extract case citations from the judgment.

    Recognises "<party> v. <party> (YYYY)", "YYYY (N) REPORTER N" and
    "AIR YYYY COURT N" style citations.

    Args:
        text: Full text of the judgment.

    Returns:
        Up to 15 distinct citations in first-seen order (pattern by pattern),
        or ["Not Found"] when nothing matches.
    """
    # Patterns for case citations
    citation_patterns = [
        r"(\w+(?:\s+\w+)*)\s+v\.?\s+(\w+(?:\s+\w+)*)\s+\((\d{4})\)",
        r"(\d{4})\s+\((\d+)\)\s+([A-Z]+)\s+(\d+)",
        r"AIR\s+(\d{4})\s+([A-Z]+)\s+(\d+)"
    ]

    # A dict is used as an insertion-ordered set so that the result, and the
    # cut to 15 entries, is the same on every run (a plain set is not).
    found = {}
    for pattern in citation_patterns:
        for match in re.finditer(pattern, text):
            found.setdefault(match.group().strip(), None)

    return list(found)[:15] if found else ["Not Found"]

def determine_outcome(text):
    """Classify the case outcome from the closing part of the judgment.

    The last 2000 characters (lower-cased) are checked against each entry of
    OUTCOME_PATTERNS in dictionary order.

    Returns:
        The first matching label in title case with underscores replaced by
        spaces (e.g. "Partly Allowed"), or "Not determined".
    """
    closing_text = text[-2000:].lower()

    matched_label = next(
        (label for label, regex in OUTCOME_PATTERNS.items() if regex.search(closing_text)),
        None,
    )
    if matched_label is None:
        return "Not determined"
    return matched_label.replace("_", " ").title()

def extract_appeal_history(text):
    """Report whether the text contains any phrase that suggests an appeal or review.

    Returns:
        "Yes" if at least one indicator phrase occurs (case-insensitive),
        otherwise "No".
    """
    appeal_indicators = (
        "appeal from", "revision petition", "writ petition", "special leave petition",
        "appealed against", "challenged the order", "impugned order",
    )
    mentions_appeal = any(
        re.search(phrase, text, re.IGNORECASE) for phrase in appeal_indicators
    )
    return "Yes" if mentions_appeal else "No"

def _court_label(court_match):
    """Build the court name (e.g. 'Supreme Court of INDIA') from a METADATA_PATTERNS['court'] match."""
    # The court type sits in a non-capturing group, so recover it from the full match.
    kind = re.search(r"(HIGH|SUPREME|DISTRICT)\s+COURT", court_match.group(0), re.IGNORECASE)
    court_kind = f"{kind.group(1).title()} Court" if kind else "High Court"
    return f"{court_kind} of {court_match.group(1).strip()}"


def extract_comprehensive_metadata(text, filename):
    """Extract comprehensive metadata from the judgment text.

    Header fields (court, parties, judge, case number, case type) are looked
    up in the first 4000 characters. The date, cited sections, citations,
    outcome and appeal history come from the dedicated extract_* /
    determine_outcome helpers.

    Args:
        text: Full text of the judgment.
        filename: Name of the source PDF; stored in "source_url".

    Returns:
        A dict of metadata. Fields that cannot be extracted keep their
        "Not Found" / empty-list defaults. "case_id" is a random ID unless a
        case number is found in the header, in which case it is that number.
    """
    metadata = {
        "case_id": generate_case_id(),
        "court": "Not Found",
        "date_judgment": "Not Found",
        "parties": {
            "petitioner": "Not Found",
            "respondent": "Not Found"
        },
        "case_type": "Not Found",
        "sections_acts_cited": [],
        "summary": "Not Found",
        "outcome": "Not Found",
        "judges": [],
        "source_url": "Not Found",
        "language": "English",
        "legal_issues": [],
        "citations": [],
        "appeal_history": "Not Found",
        "publication_reporter": "Not Found"
    }

    header_text = text[:4000]

    # Extract parties; whitespace (including line breaks) is collapsed to single spaces
    parties_match = METADATA_PATTERNS["parties"].search(header_text)
    if parties_match:
        petitioner = ' '.join(parties_match.group(1).replace('\n', ' ').split())
        respondent = ' '.join(parties_match.group(2).replace('\n', ' ').split())
        metadata["parties"]["petitioner"] = petitioner
        metadata["parties"]["respondent"] = respondent

    # Extract other metadata
    for key, pattern in METADATA_PATTERNS.items():
        if key == "parties":
            continue
        match = pattern.search(header_text)
        if match:
            if key == "court":
                # Use the court type actually named in the heading instead of
                # always labelling the court a High Court.
                metadata["court"] = _court_label(match)
            elif key == "judge_name":
                metadata["judges"] = [match.group(1).strip()]
            elif key == "case_number":
                metadata["case_id"] = match.group(1).strip()
            elif key == "case_type":
                metadata["case_type"] = match.group(1).strip().title()

    # Extract date
    metadata["date_judgment"] = extract_date(text)

    # Extract sections and acts
    metadata["sections_acts_cited"] = extract_sections_acts(text)

    # Extract citations
    metadata["citations"] = extract_citations(text)

    # Determine outcome
    metadata["outcome"] = determine_outcome(text)

    # Extract appeal history
    metadata["appeal_history"] = extract_appeal_history(text)

    # Generate summary: characters 1000-1500 of the text, whitespace-collapsed.
    # Texts of 1000 characters or fewer keep the "Not Found" default.
    if len(text) > 1000:
        summary_text = text[1000:1500].strip()
        metadata["summary"] = ' '.join(summary_text.split())[:500] + "..."

    # Set source identifier as filename
    metadata["source_url"] = filename

    return metadata

def parse_judgment_sections_optimized(text, section_keywords=None):
    """Enhanced section parsing with better keyword matching.

    A heading is a keyword at the start of a line, optionally preceded by a
    paragraph number ("12." / "12") and optionally followed by ":", "." or
    "-". Matching is case-insensitive. The text between one heading and the
    next becomes that section's content, whitespace-collapsed and capped at
    5000 characters. If a section's heading occurs more than once, the last
    occurrence is kept.

    Args:
        text: Full text of the judgment.
        section_keywords: Optional mapping of section name to a list of heading
            phrases. Defaults to the module-level SECTION_KEYWORDS.

    Returns:
        A dict with one entry per section name; sections whose heading never
        occurs hold "Not Found".
    """
    if section_keywords is None:
        section_keywords = SECTION_KEYWORDS

    keyword_to_section_map = {}
    for section_name, keywords in section_keywords.items():
        for kw in keywords:
            keyword_to_section_map[kw.lower()] = section_name

    # Without keywords the alternation would be empty and match everywhere.
    if not keyword_to_section_map:
        return {key: "Not Found" for key in section_keywords}

    # Longest keyword first, so "Issues for consideration" is preferred over
    # its prefix "Issues" when both could match at the same place.
    master_keyword_regex = "|".join(
        re.escape(kw) for kw in sorted(keyword_to_section_map, key=len, reverse=True)
    )
    # (?!\w) requires the keyword to end at a word boundary, so "Act" no
    # longer matches "Actually" and "Order" no longer matches "Ordered".
    pattern = re.compile(
        r"^[ \t]*(?:\d+\.?\s*)?(" + master_keyword_regex + r")(?!\w)[ \t]*[:.-]?",
        re.IGNORECASE | re.MULTILINE,
    )

    # finditer yields matches left to right without overlap, so the list is
    # already sorted by position and free of overlapping headings.
    found_sections = []
    for match in pattern.finditer(text):
        section_name = keyword_to_section_map.get(match.group(1).lower())
        if section_name is None:
            # Case-insensitive match whose lower-cased text is not a known keyword.
            continue
        found_sections.append({'name': section_name, 'start': match.start(), 'end': match.end()})

    if not found_sections:
        return {key: "Not Found" for key in section_keywords}

    extracted_data = {}
    for i, section in enumerate(found_sections):
        end_index = found_sections[i + 1]['start'] if i + 1 < len(found_sections) else len(text)
        content = re.sub(r'\s+', ' ', text[section['end']:end_index].strip()).strip()

        # Limit content length
        if len(content) > 5000:
            content = content[:5000] + "..."

        extracted_data[section['name']] = content

    # Ensure all sections are present
    for section_name in section_keywords:
        extracted_data.setdefault(section_name, "Not Found")

    return extracted_data

# --- MAIN EXECUTION ---
def main():
    """Parse every PDF in PDF_FOLDER_PATH and write the results to JSON.

    For each PDF the text is extracted, metadata and sections are parsed, and
    the combined record is stored under "<petitioner> vs <respondent>" (or the
    file name when the parties were not found). All records are written to
    'comprehensive_legal_cases.json' in the current working directory,
    followed by a printed summary of case types and outcomes.
    """
    all_cases_data = {}

    if not os.path.exists(PDF_FOLDER_PATH):
        print(f"❌ ERROR: The folder '{PDF_FOLDER_PATH}' does not exist.")
        return

    # Sorted so that runs are reproducible; os.listdir order is arbitrary.
    pdf_files = sorted(f for f in os.listdir(PDF_FOLDER_PATH) if f.lower().endswith(".pdf"))
    print(f"📑 Found {len(pdf_files)} PDFs to process in '{PDF_FOLDER_PATH}'.")

    for filename in pdf_files:
        print(f"\n-> Processing: {filename}", flush=True)
        file_path = os.path.join(PDF_FOLDER_PATH, filename)

        # Extract text; unreadable or empty PDFs are skipped
        full_text = extract_text_from_pdf(file_path)
        if not full_text:
            continue

        # Extract comprehensive metadata
        case_metadata = extract_comprehensive_metadata(full_text, filename)

        # Parse judgment sections
        parsed_sections = parse_judgment_sections_optimized(full_text)

        # Keep at most the first 10000 characters of the judgment text
        judgment_text = full_text[:10000] + "..." if len(full_text) > 10000 else full_text

        # Combine all data
        case_data = {
            **case_metadata,
            "facts_of_case": parsed_sections.get("facts_of_case", "Not Found"),
            "judgment_text": judgment_text,
            "petitioners_arguments": parsed_sections.get("petitioner_arguments", "Not Found"),
            "respondents_arguments": parsed_sections.get("respondent_arguments", "Not Found")
        }

        # Use case title as key
        petitioner = case_data["parties"]["petitioner"]
        respondent = case_data["parties"]["respondent"]
        if petitioner != "Not Found" and respondent != "Not Found":
            case_key = f"{petitioner} vs {respondent}"
        else:
            case_key = filename

        # Two judgments between the same parties would otherwise overwrite
        # each other; the file name is unique, so use it to disambiguate.
        if case_key in all_cases_data:
            case_key = f"{case_key} [{filename}]"

        all_cases_data[case_key] = case_data
        print(f"  -> ✅ Completed: '{case_key}'")

    if all_cases_data:
        # Save to JSON with proper formatting
        output_json_path = 'comprehensive_legal_cases.json'
        with open(output_json_path, 'w', encoding='utf-8') as f:
            json.dump(all_cases_data, f, indent=2, ensure_ascii=False)
        print(f"\n✅ Successfully created '{output_json_path}' with {len(all_cases_data)} cases.")

        # Create summary report
        print("\n📊 PROCESSING SUMMARY:")
        print(f"Total cases processed: {len(all_cases_data)}")

        # Tally how many cases fall under each case type and outcome
        case_types = {}
        outcomes = {}
        for case_data in all_cases_data.values():
            case_type = case_data.get("case_type", "Unknown")
            outcome = case_data.get("outcome", "Unknown")
            case_types[case_type] = case_types.get(case_type, 0) + 1
            outcomes[outcome] = outcomes.get(outcome, 0) + 1

        print(f"Case types: {dict(case_types)}")
        print(f"Outcomes: {dict(outcomes)}")

    else:
        print("\n⚠️ No data was processed.")

if __name__ == "__main__":
    main()

📑 Found 84 PDFs to process in '/content/drive/MyDrive/judgments_data/input_pdfs'.

-> Processing: _vs_T_on_29_July_2024.PDF
  -> ✅ Completed: '_vs_T_on_29_July_2024.PDF'

-> Processing: M_S_Jai_Prakash_Associates_Ltd_vs_State_Of_U_P_And_Another_on_10_March_2025.PDF
  -> ✅ Completed: 'M_S_Jai_Prakash_Associates_Ltd_vs_State_Of_U_P_And_Another_on_10_March_2025.PDF'

-> Processing: _vs_T_on_29_July_2024 (1).PDF
  -> ✅ Completed: '_vs_T_on_29_July_2024 (1).PDF'

-> Processing: Union_Of_India_vs_P_Radhamma_on_6_January_2025.PDF
  -> ✅ Completed: 'Union_Of_India_vs_P_Radhamma_on_6_January_2025.PDF'

-> Processing: Dcit_Circle_13_1_Hyderabad_vs_The_Singareni_Collieries_Company_on_12_June_2025.PDF
  -> ✅ Completed: 'Dcit, Circle-13(1), Hyderabad vs The Singareni Collieries Company ... on 12 June, 2025 Dcit, Circle-13(1), Hyderabad vs The Singareni Collieries Company ... on 12 June, 2025 ITA Nos 283 284 286 and 300 301 and 308 of 2024 Singareni Collieries Company Ltd      ,  