In [1]:
import json
import random
import pandas as pd
from typing import Dict, List

In [2]:
class LegalDataGenerator:
    """
    Generates a simulated dataset of Indian legal cases for analysis.
    This approach is used because direct scraping of legal portals like IndianKanoon
    is often restricted.
    """

    def __init__(self):
        # Case type distribution for a balanced dataset
        self.case_distribution = {
            'Criminal': 100,
            'Civil': 80,
            'Constitutional': 60,
            'Corporate': 40,
            'Tax': 20
        }
        self.judges = ["Justice D.Y. Chandrachud", "Justice Hrishikesh Roy", "Justice Ajay Rastogi", "Justice Aniruddha Bose", "Justice Krishna Murari"]
        self.petitioners = ["Rajesh Kumar", "Priya Sharma", "M/s ABC Ltd.", "Bank of India", "Property Owners Association"]
        self.respondents = ["State of Maharashtra", "Sunita Devi", "XYZ Corporation", "Default Borrower Ltd.", "Municipal Corporation"]

    def get_case_templates(self) -> Dict:
        """
        Provides structured templates based on real Indian legal case patterns.
        """
        return {
            'Criminal': {
                'courts': ['Supreme Court of India', 'Delhi High Court', 'Bombay High Court', 'Madras High Court'],
                'fact_patterns': [
                    'Appeal against conviction under Section {section} of Indian Penal Code.',
                    'Bail application in a case registered under Section {section} IPC.',
                    'Criminal revision petition challenging the order of the lower court.'
                ],
                'common_issues': [
                    'Whether evidence is sufficient for conviction.',
                    'Whether the procedure under CrPC was followed correctly.',
                    'Whether the sentence is proportionate to the offence.'
                ],
                'ipc_sections': ['302', '376', '420', '498A', '307']
            },
            'Civil': {
                'courts': ['Supreme Court of India', 'Delhi High Court', 'Karnataka High Court'],
                'fact_patterns': [
                    'Suit for recovery of money and damages regarding a property dispute.',
                    'Contract breach case involving a sum of Rs. {amount} lakhs.',
                    'Matrimonial dispute concerning maintenance and child custody.'
                ],
                'common_issues': [
                    'Whether the contract was validly formed and subsequently breached.',
                    'Whether specific performance can be granted.',
                    'Determination of rightful property ownership.'
                ]
            },
            'Constitutional': {
                'courts': ['Supreme Court of India', 'Delhi High Court', 'Kerala High Court'],
                'fact_patterns': [
                    'Writ petition challenging the constitutional validity of the {act} Act.',
                    'Public Interest Litigation (PIL) regarding the implementation of fundamental rights.',
                    'Challenge to a government policy under Article {article} of the Constitution.'
                ],
                'common_issues': [
                    'Whether the impugned action violates fundamental rights (Article 14, 19, 21).',
                    'Whether the state action passes the test of proportionality.',
                    'Whether the classification made by the law is reasonable.'
                ],
                'articles': ['14', '19', '21', '32']
            }
        }

    def generate_case_from_template(self, case_type: str, templates: Dict, case_id: int) -> Dict:
        """
        Generates a single, randomized case record from a template.
        """
        template = templates.get(case_type, templates['Civil']) # Default to Civil if type is missing
        court_prefix = ''.join([word[0] for word in template['courts'][0].split() if word[0].isupper()])

        case = {
            "case_id": f"{court_prefix}_{2020 + random.randint(0, 5)}_{case_id}",
            "case_title": f"{random.choice(self.petitioners)} vs. {random.choice(self.respondents)}",
            "court": random.choice(template['courts']),
            "date": f"202{random.randint(1, 5)}-{random.randint(1, 12):02d}-{random.randint(1, 28):02d}",
            "judge": random.choice(self.judges),
            "facts": random.choice(template['fact_patterns']).format(
                section=random.choice(template.get('ipc_sections', ['N/A'])),
                amount=random.randint(10, 100),
                act=random.choice(["Citizenship Amendment", "Farm Laws", "Data Protection"]),
                article=random.choice(template.get('articles', ['N/A']))
            ),
            "issues": "1. " + " 2. ".join(random.sample(template['common_issues'], min(len(template['common_issues']), 2))),
            "conclusion": random.choice(["The appeal is allowed.", "The appeal is dismissed.", "The conviction is upheld.", "The case is remanded to the lower court."]),
            "case_type": case_type
        }
        return case

    def generate_expanded_dataset(self, target_size: int = 300) -> pd.DataFrame:
        """
        Generates a full DataFrame of simulated legal cases.
        """
        print(f"Generating simulated legal dataset with {target_size} cases...")
        case_templates = self.get_case_templates()
        expanded_cases = []
        
        case_types_to_generate = list(self.case_distribution.keys())
        
        for i in range(target_size):
            case_type = random.choices(
                population=list(self.case_distribution.keys()),
                weights=list(self.case_distribution.values()),
                k=1
            )[0]
            if case_type not in case_templates:
                case_type = 'Civil'

            case = self.generate_case_from_template(case_type, case_templates, i + 1)
            expanded_cases.append(case)

        df = pd.DataFrame(expanded_cases)
        print("Dataset generation complete.")
        print("\nGenerated Case Type Distribution:")
        print(df['case_type'].value_counts())
        return df

In [3]:
generator = LegalDataGenerator()
legal_df = generator.generate_expanded_dataset(target_size=300)

Generating simulated legal dataset with 300 cases...
Dataset generation complete.

Generated Case Type Distribution:
case_type
Civil             140
Criminal          108
Constitutional     52
Name: count, dtype: int64


In [4]:
legal_df.head()

Unnamed: 0,case_id,case_title,court,date,judge,facts,issues,conclusion,case_type
0,SCI_2023_1,Rajesh Kumar vs. State of Maharashtra,Supreme Court of India,2023-08-06,Justice Krishna Murari,Contract breach case involving a sum of Rs. 76...,1. Whether the contract was validly formed and...,The conviction is upheld.,Civil
1,SCI_2022_2,Property Owners Association vs. Municipal Corp...,Delhi High Court,2023-06-23,Justice Aniruddha Bose,Contract breach case involving a sum of Rs. 44...,1. Whether specific performance can be granted...,The conviction is upheld.,Civil
2,SCI_2025_3,Property Owners Association vs. State of Mahar...,Supreme Court of India,2025-03-11,Justice Krishna Murari,Matrimonial dispute concerning maintenance and...,1. Whether specific performance can be granted...,The case is remanded to the lower court.,Civil
3,SCI_2021_4,Rajesh Kumar vs. XYZ Corporation,Supreme Court of India,2023-03-04,Justice Aniruddha Bose,Appeal against conviction under Section 376 of...,1. Whether the procedure under CrPC was follow...,The appeal is dismissed.,Criminal
4,SCI_2025_5,Bank of India vs. State of Maharashtra,Delhi High Court,2025-01-10,Justice Krishna Murari,Contract breach case involving a sum of Rs. 14...,1. Whether the contract was validly formed and...,The appeal is dismissed.,Civil


In [5]:
# Convert dataframe to a list of dictionaries for JSON compatibility
dataset_list = legal_df.to_dict(orient='records')

# Create a structured output
final_json_output = {
    "dataset_info": {
        "total_cases": len(dataset_list),
        "description": "A simulated dataset of Indian legal cases generated for analysis.",
        "source": "Generated Data (emulating sources like eCourts, SCI)"
    },
    "cases": dataset_list,
    "sample_analysis": {
        "case_id": dataset_list[0]['case_id'],
        "case_title": dataset_list[0]['case_title'],
        "analysis": {
            "key_legal_principles": ["Burden of proof", "Admissibility of evidence", "Procedural fairness"],
            "successful_strategies": ["Corroboration of witness testimony", "Challenging chain of custody"],
            "recommendation": "Focus on strengthening evidence collection and witness preparation."
        }
    }
}

# Save the output to a JSON file
output_filename = 'legal_data_output.json'
with open(output_filename, 'w', encoding='utf-8') as f:
    json.dump(final_json_output, f, indent=2, ensure_ascii=False)

print(f"\n✅ Successfully generated and saved data to '{output_filename}'")


✅ Successfully generated and saved data to 'legal_data_output.json'
