In [2]:
import os
import fitz  # PyMuPDF
import re
import json

# --- CONFIGURATION ---
INPUT_FOLDER = 'judgments'
OUTPUT_FOLDER = 'structured_output'

print("Libraries and configuration loaded.")

Libraries and configuration loaded.


In [10]:
# --- HELPER FUNCTIONS FOR EXTRACTION ---

def extract_metadata(full_text):
   
    metadata = {
        'court': 'Not Found', 'case_number': 'Not Found',
        'petitioner': 'Not Found', 'respondent': 'Not Found',
        'judgment_date': 'Not Found'
    }
    # (The rest of the function code is the same as before)
    court_match = re.search(r"IN THE (SUPREME COURT OF INDIA|HIGH COURT OF [A-Z ]+)", full_text, re.IGNORECASE)
    if court_match: metadata['court'] = court_match.group(0).strip()
    case_no_match = re.search(r"([A-Z\.\s]+\s?(?:NO|Number)\.\s\d+\sOF\s\d{4})", full_text, re.IGNORECASE)
    if case_no_match: metadata['case_number'] = case_no_match.group(1).strip()
    parties_match = re.search(r"([A-Za-z\s\.,&]+)\s*\.{3,}\s*(?:Petitioner|Appellant)\(s\)\s*VERSUS\s*([A-Za-z\s\.,&]+)\s*\.{3,}\s*(?:Respondent)\(s\)", full_text, re.DOTALL | re.IGNORECASE)
    if parties_match:
        metadata['petitioner'] = parties_match.group(1).strip().replace('\n', ' ')
        metadata['respondent'] = parties_match.group(2).strip().replace('\n', ' ')
    date_match = re.search(r"(?:Date of judgment|Dated|Decided on):?\s*(\d{1,2}[\.\-/]\d{1,2}[\.\-/]\d{4})", full_text, re.IGNORECASE)
    if date_match: metadata['judgment_date'] = date_match.group(1).strip()
    return metadata

def extract_sections(full_text):
    """
    Extracts key sections like facts, arguments, and ratio.
    """
    # (The function code is the same as before)
    extracted_sections = {}
    section_markers = {
        'facts': [r"FACTS OF THE CASE", r"BRIEF FACTS", r"THE FACTUAL MATRIX", r"BACKGROUND"],
        'arguments_petitioner': [r"ARGUMENTS OF THE PETITIONER", r"SUBMISSIONS ON BEHALF OF THE PETITIONER", r"CONTENTIONS OF THE APPELLANT"],
        'arguments_respondent': [r"ARGUMENTS OF THE RESPONDENT", r"SUBMISSIONS ON BEHALF OF THE RESPONDENT", r"CONTENTIONS OF THE RESPONDENT"],
        'ratio': [r"RATIO DECIDENDI", r"REASONING OF THE COURT", r"ANALYSIS AND FINDINGS", r"COURT'S ANALYSIS"],
        'conclusion': [r"CONCLUSION", r"ORDER", r"IN THE RESULT"]
    }
    all_markers_pattern = '|'.join([item for sublist in section_markers.values() for item in sublist])
    matches = list(re.finditer(all_markers_pattern, full_text, re.IGNORECASE))
    if not matches:
        extracted_sections['full_text_no_sections_found'] = full_text
        return extracted_sections
    for i, current_match in enumerate(matches):
        section_start_pos = current_match.start()
        current_heading = current_match.group(0).upper()
        current_section_name = "unknown_section"
        for name, patterns in section_markers.items():
            if any(pattern in current_heading for pattern in patterns):
                current_section_name = name
                break
        section_end_pos = matches[i + 1].start() if i + 1 < len(matches) else len(full_text)
        section_text = full_text[section_start_pos:section_end_pos].strip()
        extracted_sections[current_section_name] = section_text
    return extracted_sections

print("Helper functions defined.")

Helper functions defined.


In [11]:
# --- INTERACTIVE TESTING CELL ---
# Pick one file from your 'judgments' folder to test with
test_file_name = 'j1.pdf' # <--- CHANGE THIS to a real file name from your folder

# Extract text from this single file
test_file_path = os.path.join(INPUT_FOLDER, test_file_name)
full_text = ""
with fitz.open(test_file_path) as doc:
    for page in doc:
        full_text += page.get_text()

# Test the metadata function
metadata_result = extract_metadata(full_text)
print("--- METADATA FOUND ---")
print(json.dumps(metadata_result, indent=2))

# Test the section extraction function
sections_result = extract_sections(full_text)
print("\n--- SECTIONS FOUND ---")
print(json.dumps(sections_result, indent=2))

--- METADATA FOUND ---
{
  "court": "IN THE SUPREME COURT OF INDIA",
  "case_number": "NO. 354 OF 2019",
  "petitioner": "Not Found",
  "respondent": "Not Found",
  "judgment_date": "12.12.2018"
}

--- SECTIONS FOUND ---
{
  "conclusion": "orders, \nand \nthe \nsaid \napplication \nwill \nbe \nconsidered/decided on its own merits without \nbeing influenced by any of the observations \nmade by us. \n21. Pending application(s), if any, shall stand disposed of. \n \n \n \n\u2026\u2026..\u2026\u2026\u2026\u2026\u2026\u2026\u2026\u2026\u2026J. \n(VIKRAM NATH) \n \n \n \n\u2026\u2026..\u2026\u2026\u2026\u2026\u2026\u2026\u2026\u2026\u2026J. \n(PRASANNA B. VARALE) \n \n \nNEW DELHI \nFEBRUARY  28, 2025",
  "facts": "the factual matrix, including \nthe allotment of government land and its subsequent \nuse by the allottees. The appellant contended that the \nalleged acts of omission or commission were done in \ndischarge of his duties in the quasi-judicial \nproceedings and did not amount to cr

In [12]:
# --- MAIN SCRIPT LOGIC ---

print("Starting judgment processing pipeline...")

# Create the output directory if it doesn't exist
if not os.path.exists(OUTPUT_FOLDER):
    os.makedirs(OUTPUT_FOLDER)
    print(f"Created output folder: {OUTPUT_FOLDER}")

pdf_files = [f for f in os.listdir(INPUT_FOLDER) if f.endswith('.pdf')]

if not pdf_files:
    print(f"No PDF files found in '{INPUT_FOLDER}'.")
else:
    for pdf_file_name in pdf_files:
        try:
            input_path = os.path.join(INPUT_FOLDER, pdf_file_name)
            print(f"\nProcessing file: {pdf_file_name}...")
            full_text = ""
            with fitz.open(input_path) as doc:
                for page in doc:
                    full_text += page.get_text()
            if not full_text.strip():
                print(f"  -> WARNING: No text extracted. Might be a scanned image.")
                continue
            
            metadata = extract_metadata(full_text)
            sections = extract_sections(full_text)
            
            final_output = {
                "source_file": pdf_file_name,
                "metadata": metadata,
                "extracted_sections": sections
            }
            
            output_file_name = pdf_file_name.replace('.pdf', '.json')
            output_path = os.path.join(OUTPUT_FOLDER, output_file_name)
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(final_output, f, indent=4, ensure_ascii=False)
            
            print(f"  -> Successfully saved to {output_path}")

        except Exception as e:
            print(f"  -> ERROR processing {pdf_file_name}: {e}")

    print("\nPipeline finished.")

Starting judgment processing pipeline...
Created output folder: structured_output

Processing file: j1.pdf...
  -> Successfully saved to structured_output\j1.json

Processing file: j2.pdf...
  -> Successfully saved to structured_output\j2.json

Processing file: j3.pdf...
  -> Successfully saved to structured_output\j3.json

Processing file: j4.pdf...
  -> Successfully saved to structured_output\j4.json

Processing file: j5.pdf...
  -> Successfully saved to structured_output\j5.json

Pipeline finished.
