# 🔧 Data Preprocessing for Indian Legal Documents

This notebook covers:
- Text cleaning and normalization
- Citation standardization
- Entity extraction with regex
- PII anonymization
- Preparing data for NER/RRL training

In [None]:
# Setup
import sys
sys.path.insert(0, '..')

import json
import re
from pathlib import Path
from typing import Dict, List

import pandas as pd
from tqdm.auto import tqdm

from src.data import LegalDataLoader, LegalTextPreprocessor, SemanticChunker
from src.utils import IndianLegalPatterns, clean_legal_text

# Paths (resolved relative to the current working directory)
RAW_DATA_DIR = Path('../data/raw')
PROCESSED_DATA_DIR = Path('../data/processed')
# parents=True: on a fresh checkout '../data' may not exist yet, and a plain
# mkdir() would raise FileNotFoundError before the notebook can do anything.
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

## 1. Initialize Preprocessing Components

In [None]:
# Initialize the shared components used by every later cell:
#   preprocessor - text cleaning, legal normalization, sentence segmentation
#   patterns     - regex-based entity/section extraction and PII anonymization
#   chunker      - splits long documents into overlapping chunks
preprocessor = LegalTextPreprocessor(preserve_legal_terms=True)
patterns = IndianLegalPatterns()
chunker = SemanticChunker(chunk_size=512, chunk_overlap=50)

# Status lines use a real check mark; the previous strings were mis-encoded
# (UTF-8 bytes decoded as cp1252) and printed as garbage.
print("✓ Preprocessor initialized")
print("✓ Regex patterns loaded")
print("✓ Semantic chunker ready")

## 2. Text Cleaning Pipeline

In [None]:
def preprocess_legal_document(text: str, anonymize: bool = True) -> Dict:
    """
    Run a single legal document through the complete preprocessing chain.

    Stages, in order: basic cleaning, legal normalization, legal-term
    extraction, section/Act extraction, optional PII anonymization and
    sentence segmentation. Both extraction steps operate on the normalized
    text, i.e. before any anonymization is applied.

    Args:
        text: Raw document text.
        anonymize: If True (default), the normalized text is passed through
            ``patterns.anonymize_pii``; otherwise it is used as-is.

    Returns:
        Dict with the keys 'text', 'sentences', 'entities',
        'sections_with_acts', 'word_count' and 'sentence_count'.
    """
    # Cleaning followed by legal normalization
    normalized_text = preprocessor.normalize_legal_text(
        preprocessor.clean_text(text)
    )

    # Extraction runs on the normalized (not yet anonymized) text
    legal_terms = patterns.extract_legal_terms(normalized_text)
    act_sections = patterns.extract_sections(normalized_text)

    # Optional PII scrubbing
    output_text = (
        patterns.anonymize_pii(normalized_text) if anonymize else normalized_text
    )

    # Sentence segmentation of whatever text is going to be returned
    segmented = preprocessor.segment_sentences(output_text)

    summary = {}
    summary['text'] = output_text
    summary['sentences'] = segmented
    summary['entities'] = legal_terms
    summary['sections_with_acts'] = act_sections
    summary['word_count'] = len(output_text.split())
    summary['sentence_count'] = len(segmented)
    return summary

In [None]:
# Smoke-test the pipeline on a short, deliberately messy snippet
# (irregular spacing, abbreviated section references, a phone number).
# Written line by line so the trailing space after the phone number is visible.
sample_text = (
    "\n"
    "In the case of  Ram Kumar    vs.   State of UP,  FIR No. 123/2020 was filed.\n"
    "The accused was charged u/s 302 of IPC and sec. 34 of Indian Penal Code.\n"
    "Contact: 9876543210. \n"
    "Citing AIR 2019 SC 456, the Hon'ble Court held...\n"
)

result = preprocess_legal_document(sample_text)

# Cleaned text first, then the summary figures
print("CLEANED TEXT:", result['text'], sep="\n")
report_lines = [
    f"\nWord count: {result['word_count']}",
    f"Sentence count: {result['sentence_count']}",
    f"\nSections with Acts: {result['sections_with_acts']}",
]
print("\n".join(report_lines))

## 3. Semantic Chunking for Long Documents

In [None]:
# Example of a longer judgment with the usual section headings
# (JUDGMENT / FACTS / ISSUE / ANALYSIS / ORDER). The text is spelled out as
# adjacent literals so that line-wrapping whitespace (several lines end in a
# space before the newline) is explicit rather than invisible.
long_document = (
    "\n"
    "JUDGMENT\n"
    "\n"
    "1. This appeal challenges the judgment dated 15.01.2023 passed by the \n"
    "High Court of Delhi in WP(C) No. 1234/2022.\n"
    "\n"
    "2. The brief facts of the case are as follows:\n"
    "\n"
    "FACTS\n"
    "\n"
    "3. The petitioner is a company registered under the Companies Act, 2013. \n"
    "The petitioner entered into a contract with the respondent for supply of \n"
    "goods worth Rs. 50,00,000/-.\n"
    "\n"
    "4. The respondent failed to make payment despite repeated reminders. \n"
    "A legal notice was sent on 01.03.2022 demanding payment within 15 days.\n"
    "\n"
    "5. Subsequently, the petitioner filed a suit for recovery before the \n"
    "District Court, which was dismissed on technical grounds.\n"
    "\n"
    "ISSUE\n"
    "\n"
    "6. The main issue for determination is whether the High Court was \n"
    "justified in dismissing the writ petition on grounds of alternate remedy.\n"
    "\n"
    "ANALYSIS\n"
    "\n"
    "7. We have heard the learned counsel for both parties and perused the \n"
    "material on record.\n"
    "\n"
    "8. In ABL International Ltd. v. Export Credit Guarantee Corporation, \n"
    "(2004) 3 SCC 553, this Court held that writ jurisdiction should not be \n"
    "exercised in contractual matters.\n"
    "\n"
    "9. However, in appropriate cases where there is violation of principles \n"
    "of natural justice, writ remedy is available.\n"
    "\n"
    "ORDER\n"
    "\n"
    "10. In view of the above, the appeal is allowed in part.\n"
)

# Split the judgment into chunks, asking the chunker to preserve sections
chunks = chunker.chunk_legal_document(long_document, preserve_sections=True)

print(f"Document split into {len(chunks)} chunks:\n")
for chunk in chunks:
    chunk_body = chunk.text
    # Rough size estimate: characters divided by four
    approx_tokens = len(chunk_body)//4
    # Show at most 150 characters; only truncated previews get an ellipsis
    if len(chunk_body) > 150:
        preview = chunk_body[:150] + "..."
    else:
        preview = chunk_body
    print(f"--- Chunk {chunk.chunk_id} (~{approx_tokens} tokens) ---")
    print(preview, end="\n\n")

## 4. Prepare NER Training Data Format

In [None]:
def create_ner_annotation(text: str, entities: List[Dict]) -> Dict:
    """
    Create NER annotation in spaCy/HuggingFace format.

    Format:
    {
        "text": "...",
        "entities": [[start, end, label], ...]
    }

    Args:
        text: The text that the entity character offsets refer to.
        entities: Dicts with optional 'start', 'end' and 'label' keys.
            Missing offsets default to 0, a missing label to 'UNKNOWN'.

    Returns:
        Dict holding the unchanged text and the list of valid
        ``[start, end, label]`` spans, in input order. A span is dropped
        when it is empty, starts before 0, or ends past ``len(text)``.
    """
    text_length = len(text)
    annotations = []

    for entity in entities:
        start = entity.get('start', 0)
        end = entity.get('end', 0)
        label = entity.get('label', 'UNKNOWN')

        # A span must be non-empty and lie fully inside the text; an end
        # offset beyond the text cannot correspond to real characters.
        if 0 <= start < end <= text_length:
            annotations.append([start, end, label])

    return {
        "text": text,
        "entities": annotations
    }

# Example: Manual annotation format.
# Offsets are 0-based character positions with an exclusive end, so that
# text[start:end] is exactly the surface form given in the trailing comment.
example_ner_data = [
    {
        "text": "In Kesavananda Bharati v. State of Kerala, the Supreme Court examined Article 368.",
        "entities": [
            [3, 22, "PETITIONER"],      # Kesavananda Bharati
            [26, 41, "RESPONDENT"],     # State of Kerala
            [47, 60, "COURT"],          # Supreme Court
            [70, 81, "PROVISION"]       # Article 368
        ]
    },
    {
        "text": "FIR No. 123/2020 was registered under Section 302 of the Indian Penal Code.",
        "entities": [
            [0, 16, "CASE_NUMBER"],     # FIR No. 123/2020
            [38, 49, "PROVISION"],      # Section 302
            [57, 74, "STATUTE"]         # Indian Penal Code
        ]
    }
]

# Sanity check: every span lies inside its text and carries no stray
# leading/trailing whitespace (a typical symptom of off-by-one offsets).
for _sample in example_ner_data:
    for _start, _end, _label in _sample["entities"]:
        _span = _sample["text"][_start:_end]
        assert 0 <= _start < _end <= len(_sample["text"]), (_start, _end, _label)
        assert _span == _span.strip(), (_span, _label)

print("Sample NER Training Data:")
print(json.dumps(example_ner_data, indent=2))

## 5. Prepare Rhetorical Role Labeling Data

In [None]:
# RRL Training Data Format: one record per sentence, each tagged with the
# rhetorical role it plays in the judgment.
example_rrl_data = [
    {
        "sentence": "This appeal arises from the judgment dated 15.01.2023 passed by the High Court of Delhi.",
        "label": "PREAMBLE"
    },
    {
        "sentence": "The petitioner is a company registered under the Companies Act, 2013.",
        "label": "FACTS"
    },
    {
        "sentence": "The main issue for determination is whether the High Court was justified in dismissing the petition.",
        "label": "ISSUE"
    },
    {
        "sentence": "Learned counsel for the petitioner submitted that the order is arbitrary.",
        "label": "ARGUMENT_PETITIONER"
    },
    {
        "sentence": "We have carefully considered the submissions and perused the record.",
        "label": "ANALYSIS"
    },
    {
        "sentence": "In ABL International v. ECGC (2004) 3 SCC 553, this Court held...",
        "label": "PRECEDENT_RELIED"
    },
    {
        "sentence": "The doctrine of proportionality requires that restrictions must be necessary.",
        "label": "RATIO"
    },
    {
        "sentence": "In view of the above, the appeal is allowed.",
        "label": "RULING_PRESENT_COURT"
    }
]


def truncate_for_display(text: str, limit: int = 60) -> str:
    """Return ``text`` cut to ``limit`` characters, adding '...' only if it was cut."""
    if len(text) > limit:
        return text[:limit] + "..."
    return text


print("Sample RRL Training Data:")
for item in example_rrl_data:
    # Label is padded to 25 characters so the sentence previews line up.
    # The ellipsis is only shown for sentences that were actually shortened;
    # previously it was appended to every line, even complete short sentences.
    print(f"[{item['label']:25}] {truncate_for_display(item['sentence'])}")

## 6. Batch Processing Pipeline

In [None]:
def process_documents_batch(documents: List[Dict], output_path: str) -> pd.DataFrame:
    """
    Process a batch of documents and save to file.

    Each document with non-empty text is run through
    ``preprocess_legal_document``; documents without text are skipped and
    appear in neither the saved file nor the statistics.

    Args:
        documents: List of dicts with 'id' and 'text' keys
        output_path: Path to save processed data (JSON). Missing parent
            directories are created.

    Returns:
        DataFrame with processing statistics, one row per processed
        document and the columns 'id', 'word_count', 'sentence_count',
        'num_citations', 'num_statutes' (also present when no document
        was processed).
    """
    stats_columns = ['id', 'word_count', 'sentence_count', 'num_citations', 'num_statutes']

    # Make sure the destination is writable *before* doing the expensive
    # work, so a missing directory cannot discard a finished batch.
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    processed = []
    stats = []

    for doc in tqdm(documents, desc="Processing documents"):
        doc_id = doc.get('id', 'unknown')
        text = doc.get('text', '')

        # Nothing to process for documents without text
        if not text:
            continue

        # Process document
        result = preprocess_legal_document(text)
        entities = result['entities']

        # Store processed
        processed.append({
            'id': doc_id,
            'text': result['text'],
            'sentences': result['sentences'],
            'entities': entities
        })

        # Collect stats
        stats.append({
            'id': doc_id,
            'word_count': result['word_count'],
            'sentence_count': result['sentence_count'],
            'num_citations': len(entities.get('citations', [])),
            'num_statutes': len(entities.get('acts', []))
        })

    # Save processed data
    with open(destination, 'w', encoding='utf-8') as f:
        json.dump(processed, f, ensure_ascii=False, indent=2)

    print(f"\n✓ Saved {len(processed)} documents to {output_path}")

    # Explicit columns keep the frame's schema stable for an empty batch
    return pd.DataFrame(stats, columns=stats_columns)

# Two tiny dummy documents for trying out the batch pipeline
sample_docs = [
    dict(id=doc_id, text=doc_text)
    for doc_id, doc_text in (
        ("doc_001", "In the Supreme Court, the petitioner filed under Article 32."),
        ("doc_002", "FIR No. 456/2021 was registered u/s 420 IPC at PS Sadar."),
    )
]

# Uncomment to run the batch pipeline on the dummy documents
# (the path is relative to the notebook, like PROCESSED_DATA_DIR):
# stats_df = process_documents_batch(sample_docs, '../data/processed/sample_processed.json')
# stats_df.describe()

## 7. Export Functions

In [None]:
def save_for_ner_training(data: List[Dict], output_path: str, format: str = 'jsonl'):
    """Save data in format suitable for NER training.

    Args:
        data: JSON-serializable records to write.
        output_path: Destination file. Missing parent directories are created.
        format: 'jsonl' writes one JSON object per line; any other value
            writes the whole list as a single indented JSON array.
    """
    destination = Path(output_path)
    # Create the target directory so a fresh checkout does not fail on open()
    destination.parent.mkdir(parents=True, exist_ok=True)

    with open(destination, 'w', encoding='utf-8') as f:
        if format == 'jsonl':
            for item in data:
                f.write(json.dumps(item, ensure_ascii=False) + '\n')
        else:
            json.dump(data, f, ensure_ascii=False, indent=2)

    print(f"✓ Saved {len(data)} samples to {output_path}")

# Save examples
# save_for_ner_training(example_ner_data, '../data/processed/ner_samples.jsonl', format='jsonl')
# save_for_ner_training(example_rrl_data, '../data/processed/rrl_samples.jsonl', format='jsonl')

## Next Steps

1. **Collect Data**: Run the Indian Kanoon scraper
2. **Annotate**: Use Label Studio or similar for NER annotations
3. **Train**: Use the InLegalBERT fine-tuning notebooks