In [1]:
# CELL 1: Libraries & Configuration

import os
import json
from pypdf import PdfReader
from bs4 import BeautifulSoup
from pptx import Presentation

# --- CONFIGURATION ---
# Both paths are relative, so they resolve against the kernel's current
# working directory.
DATA_FOLDER = '../data'  # folder whose files are scanned by the execution loop
OUTPUT_FILE = '../data/extracted_legal_data.json'  # JSON file the save cell writes the records to

# Echo the absolute form of DATA_FOLDER so a wrong working directory is
# visible before any extraction runs.
print(f"‚úÖ Setup Complete. Target Data Folder: {os.path.abspath(DATA_FOLDER)}")

‚úÖ Setup Complete. Target Data Folder: c:\Users\teju_\OneDrive\Desktop\AI bot\ai_legal_project\data


In [2]:
# CELL 2: Extraction Logic (With Metadata)

def extract_pdf(filepath):
    """Pull the text out of every page of a PDF.

    Args:
        filepath: Path of the PDF file to read.

    Returns:
        A ``(text, volume)`` pair. On success ``text`` is each page's text
        followed by a newline, in page order, and ``volume`` is a label such
        as ``"5 Pages"``. If anything raises, ``text`` is
        ``"[ERROR]: <message>"`` and ``volume`` is the integer ``0``.
    """
    try:
        pdf = PdfReader(filepath)
        total_pages = len(pdf.pages)
        # One newline-terminated chunk per page.
        body = "".join(page.extract_text() + "\n" for page in pdf.pages)
        return body, f"{total_pages} Pages"
    except Exception as err:
        # Failures are reported in-band; callers test for the "[ERROR]" prefix.
        return f"[ERROR]: {err}", 0

def extract_pptx(filepath):
    """Extract the text of every slide in a PowerPoint deck.

    Text is taken from shapes that expose ``.text`` and, in addition, from
    table cells and from shapes nested inside group shapes. A bare
    ``hasattr(shape, "text")`` check skips those two, which silently drops
    slide content. For decks without tables or groups the result is
    unchanged.

    Args:
        filepath: Path of the ``.pptx`` file to read.

    Returns:
        A ``(text, volume)`` pair. On success every text fragment is followed
        by a single space, every slide is terminated by a newline, and
        ``volume`` is a label such as ``"20 Slides"``. If anything raises,
        ``text`` is ``"[ERROR]: <message>"`` and ``volume`` is the integer
        ``0``.
    """
    def _shape_texts(shape):
        """Yield the text fragments carried by ``shape``, in document order."""
        if hasattr(shape, "text"):
            yield shape.text
        elif getattr(shape, "has_table", False):
            # Tables sit in graphic frames, which have no ``.text`` of their
            # own; walk the cells row by row instead.
            for row in shape.table.rows:
                for cell in row.cells:
                    yield cell.text
        elif hasattr(shape, "shapes"):
            # Group shape: descend into its members (groups may nest).
            for member in shape.shapes:
                yield from _shape_texts(member)

    try:
        prs = Presentation(filepath)
        slide_count = len(prs.slides)
        chunks = []
        for slide in prs.slides:
            for shape in slide.shapes:
                for fragment in _shape_texts(shape):
                    chunks.append(fragment + " ")
            chunks.append("\n")
        return "".join(chunks), f"{slide_count} Slides"
    except Exception as e:
        # Failures are reported in-band; callers test for the "[ERROR]" prefix.
        return f"[ERROR]: {e}", 0

def extract_html(filepath):
    """Return the text content of an HTML file.

    The file is opened as UTF-8 and parsed with BeautifulSoup's
    ``html.parser``; text fragments are stripped and joined with single
    spaces.

    Args:
        filepath: Path of the ``.html`` / ``.htm`` file to read.

    Returns:
        A ``(text, volume)`` pair. On success ``volume`` is a label such as
        ``"3020 Characters"`` giving the length of ``text``. If anything
        raises, ``text`` is ``"[ERROR]: <message>"`` and ``volume`` is the
        integer ``0``.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as handle:
            document = BeautifulSoup(handle, 'html.parser')
            visible = document.get_text(separator=' ', strip=True)
            size_label = f"{len(visible)} Characters"
            return visible, size_label
    except Exception as err:
        # Failures are reported in-band; callers test for the "[ERROR]" prefix.
        return f"[ERROR]: {err}", 0

# Extension -> extractor lookup. Keys are lower-case and keep the leading dot;
# both HTML spellings share one extractor.
EXTRACTORS = dict([
    ('.pdf', extract_pdf),
    ('.pptx', extract_pptx),
    ('.html', extract_html),
    ('.htm', extract_html),
])

In [3]:
# CELL 3: Execution Loop

# Rebuilt from scratch on every run, so re-executing this cell never
# duplicates records.
extracted_data_list = []

# isdir (rather than exists) so a path that points at a regular file is
# reported as a missing folder instead of crashing os.listdir.
if os.path.isdir(DATA_FOLDER):
    # os.listdir returns names in arbitrary order; sorting keeps the record
    # order reproducible across runs and platforms. Sub-directories are
    # dropped here so the count printed below really is a file count.
    all_files = sorted(
        name for name in os.listdir(DATA_FOLDER)
        if os.path.isfile(os.path.join(DATA_FOLDER, name))
    )
    print(f"üìÇ Found {len(all_files)} files in folder. Starting processing...\n")

    for filename in all_files:
        filepath = os.path.join(DATA_FOLDER, filename)

        # The lower-cased extension selects the extractor.
        _, ext = os.path.splitext(filename)
        ext = ext.lower()

        extractor_func = EXTRACTORS.get(ext)
        if extractor_func is None:
            print(f"‚ö†Ô∏è  Skipped (Unsupported): {filename}")
            continue

        # 1. Run the specific extractor
        text, volume_info = extractor_func(filepath)

        # Extractors signal failure in-band: the text starts with "[ERROR]".
        if text.startswith("[ERROR]"):
            print(f"‚ùå Error in {filename}: {text}")
            continue

        # 2. Build the JSON Structure
        record = {
            "document_title": os.path.splitext(filename)[0],
            "document_type": ext.replace('.', '').upper(),
            "source_file": filename,
            "file_path": filepath,
            "page_or_char_count": volume_info, # Dynamic: "5 Pages" or "3020 Characters"
            "extracted_text": text.strip()
        }
        extracted_data_list.append(record)
        print(f"‚úÖ Processed: {filename} ({volume_info})")

    # Only claim completion when the folder was actually scanned.
    print(f"\nüéâ Extraction Finished! Successfully extracted {len(extracted_data_list)} documents.")
else:
    print(f"‚ùå Error: Folder '{DATA_FOLDER}' does not exist.")

üìÇ Found 14 files in folder. Starting processing...

‚úÖ Processed: Basic_Features_Indian_Constitution.pptx (20 Slides)
‚úÖ Processed: Constitution of India.pdf (402 Pages)
‚úÖ Processed: Criminal Procedure Code.pdf (263 Pages)
‚úÖ Processed: Cyber_Law_Regime_India.pptx (20 Slides)
‚úÖ Processed: Indian Evidence Act.pdf (60 Pages)
‚úÖ Processed: Indian Penal Code.pdf (205 Pages)
‚úÖ Processed: Indian_Legal_Environment.pptx (20 Slides)
‚úÖ Processed: Judiciary.pptx (21 Slides)
‚úÖ Processed: Kesavananda Bharati v. State of Kerala - Wikipedia.html (19236 Characters)
‚úÖ Processed: Maneka Gandhi v. Union of India - Wikipedia.html (5987 Characters)
‚úÖ Processed: Navtej Singh Johar v. Union of India - Wikipedia.html (26112 Characters)
‚úÖ Processed: Public_Interest_Litigation.pptx (19 Slides)
‚úÖ Processed: Puttaswamy v. Union of India - Wikipedia.html (17534 Characters)
‚úÖ Processed: Vishakha and Others v. State of Rajasthan - Wikipedia.html (10506 Characters)

üéâ Extraction Finished

In [4]:
# CELL 4: Save & Verify

# 1. Persist every extracted record to OUTPUT_FILE as indented JSON.
with open(OUTPUT_FILE, 'w', encoding='utf-8') as json_file:
    json.dump(extracted_data_list, json_file, indent=4)

print(f"üíæ Data saved to: {OUTPUT_FILE}")

# 2. Sanity check: show the first record, if there is one.
if extracted_data_list:
    print("\n--- üîç PREVIEW OF FIRST RECORD ---")
    first_record = extracted_data_list[0]
    for key, value in first_record.items():
        if key != "extracted_text":
            print(f"{key}: {value}")
            continue
        # The text body is cut to its first 100 characters to keep the
        # preview short.
        print(f"{key}: {value[:100]}...")

üíæ Data saved to: ../data/extracted_legal_data.json

--- üîç PREVIEW OF FIRST RECORD ---
document_title: Basic_Features_Indian_Constitution
document_type: PPTX
source_file: Basic_Features_Indian_Constitution.pptx
file_path: ../data\Basic_Features_Indian_Constitution.pptx
page_or_char_count: 20 Slides
extracted_text: Introduction to Indian Constitution • The Constitution is the supreme law of India.
• It was drafted...
