In [1]:
from PyPDF2 import PdfReader, PdfWriter
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
import string

def extract_index_from_first_page(pdf_path):
    """Extract text from the first page of PDF"""
    reader = PdfReader(pdf_path)
    first_page = reader.pages[0]
    text = first_page.extract_text()
    lines = [line.strip() for line in text.splitlines() if line.strip()]
    # Assuming topics are listed after the title (skip first line)
    return lines[1:]  # Skipping title line

def create_alphabetical_index(topics, output_index_pdf):
    """Create a new PDF with alphabetical grouped index"""
    c = canvas.Canvas(output_index_pdf, pagesize=A4)
    width, height = A4

    x, y = 50, height - 80  # Start position
    grouped = {}

    # Group topics by starting letter
    for topic in topics:
        first_char = topic[0].upper()
        if first_char not in string.ascii_uppercase:
            first_char = "#"  # For non A–Z topics
        grouped.setdefault(first_char, []).append(topic)

    # Sort groups and topics
    for letter in sorted(grouped.keys()):
        # Add letter heading
        c.setFont("Helvetica-Bold", 14)
        c.drawString(x, y, letter)
        c.line(x, y-2, x+20, y-2)  # underline
        y -= 25

        # Add topics
        c.setFont("Helvetica", 12)
        for t in sorted(grouped[letter], key=lambda s: s.lower()):
            if y < 50:  # New page if space ends
                c.showPage()
                y = height - 80
            c.drawString(x + 20, y, t)
            y -= 20

        y -= 10  # Gap between letter groups

    c.save()

def replace_index_page(input_pdf, index_pdf, output_pdf):
    """Replace first page with new index page"""
    reader = PdfReader(input_pdf)
    writer = PdfWriter()

    # Insert new index page
    index_reader = PdfReader(index_pdf)
    writer.add_page(index_reader.pages[0])

    # Add remaining pages from original PDF
    for page_num in range(1, len(reader.pages)):
        writer.add_page(reader.pages[page_num])

    with open(output_pdf, "wb") as f:
        writer.write(f)


if __name__ == "__main__":
    input_pdf = "C:\\Users\\viswa\\OneDrive\\Desktop\\4.pdf"          # Your original PDF
    temp_index_pdf = "new_index.pdf"  # Temporary index page
    output_pdf = "output.pdf"         # Final PDF with updated index

    topics = extract_index_from_first_page(input_pdf)
    create_alphabetical_index(topics, temp_index_pdf)
    replace_index_page(input_pdf, temp_index_pdf, output_pdf)

    print("✅ New PDF created with alphabetical index at:", output_pdf)


✅ New PDF created with alphabetical index at: output.pdf


In [3]:
from PyPDF2 import PdfReader, PdfWriter
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
import string
import re

def extract_index_from_first_page(pdf_path):
    """Extract section numbers + titles from first page"""
    reader = PdfReader(pdf_path)
    first_page = reader.pages[0]
    text = first_page.extract_text()

    # Extract lines with numbering like "1. Something"
    lines = []
    for line in text.splitlines():
        match = re.match(r"^\s*(\d+[A-Z]?\.)\s*(.*)", line)
        if match:
            number = match.group(1).strip()
            title = match.group(2).strip()
            lines.append((number, title))
    return lines

def create_alphabetical_index(sections, output_index_pdf):
    """Create a new alphabetical index page"""
    c = canvas.Canvas(output_index_pdf, pagesize=A4)
    width, height = A4
    x, y = 50, height - 80

    grouped = {}

    # Group by first letter of title
    for number, title in sections:
        first_char = title[0].upper()
        if first_char not in string.ascii_uppercase:
            first_char = "#"  # for non A–Z
        grouped.setdefault(first_char, []).append((number, title))

    # Sort groups and titles
    for letter in sorted(grouped.keys()):
        # Add letter heading
        c.setFont("Helvetica-Bold", 14)
        c.drawString(x, y, letter)
        c.line(x, y-2, x+20, y-2)  # underline
        y -= 25

        # Add topics
        c.setFont("Helvetica", 12)
        for num, title in sorted(grouped[letter], key=lambda s: s[1].lower()):
            if y < 50:  # new page if space ends
                c.showPage()
                y = height - 80
            c.drawString(x + 20, y, f"{num} {title}")
            y -= 20

        y -= 10  # Gap between letter groups

    c.save()

def replace_index_page(input_pdf, index_pdf, output_pdf):
    """Replace first page with new index page"""
    reader = PdfReader(input_pdf)
    writer = PdfWriter()

    # Insert new index page
    index_reader = PdfReader(index_pdf)
    writer.add_page(index_reader.pages[0])

    # Add remaining pages
    for page_num in range(1, len(reader.pages)):
        writer.add_page(reader.pages[page_num])

    with open(output_pdf, "wb") as f:
        writer.write(f)


if __name__ == "__main__":
    input_pdf = "C:\\Users\\viswa\\OneDrive\\Desktop\\4.pdf"           # Your original PDF
    temp_index_pdf = "new_index.pdf"   # Temporary index page
    output_pdf = "output.pdf"          # Final PDF

    sections = extract_index_from_first_page(input_pdf)
    create_alphabetical_index(sections, temp_index_pdf)
    replace_index_page(input_pdf, temp_index_pdf, output_pdf)

    print("✅ New PDF created with alphabetical index:", output_pdf)


✅ New PDF created with alphabetical index: output.pdf


In [5]:
from PyPDF2 import PdfReader, PdfWriter
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
import string
import re

def extract_index_from_first_page(pdf_path):
    """Extract section titles from first page (ignore original numbers)."""
    reader = PdfReader(pdf_path)
    first_page = reader.pages[0]
    text = first_page.extract_text()

    topics = []
    for line in text.splitlines():
        # Match lines like "12. Something" or "5A. Something"
        match = re.match(r"^\s*\d+[A-Z]?\.\s*(.*)", line)
        if match:
            title = match.group(1).strip()
            topics.append(title)
    return topics

def create_alphabetical_index(topics, output_index_pdf):
    """Create new index pages with alphabetical sorting and renumbering."""
    c = canvas.Canvas(output_index_pdf, pagesize=A4)
    width, height = A4
    x, y = 50, height - 80

    # --- Add Title & Subtitle ---
    c.setFont("Helvetica-Bold", 14)
    c.drawCentredString(width/2, height - 50, "THE MATERNITY BENEFIT ACT, 1961")
    c.drawCentredString(width/2, height - 70, "__________")
    c.drawCentredString(width/2, height - 90, "ARRANGEMENT OF SECTIONS")
    c.drawCentredString(width/2, height - 110, "__________")

    y = height - 150
    c.setFont("Helvetica-Bold", 12)
    c.drawString(x, y, "SECTIONS")
    y -= 30

    # Group by first letter
    grouped = {}
    for title in topics:
        first_char = title[0].upper()
        if first_char not in string.ascii_uppercase:
            first_char = "#"
        grouped.setdefault(first_char, []).append(title)

    new_number = 1
    # Iterate groups in alphabetical order
    for letter in sorted(grouped.keys()):
        # Add letter heading
        if y < 100:
            c.showPage()
            y = height - 80
        c.setFont("Helvetica-Bold", 14)
        c.drawString(x, y, letter)
        c.line(x, y-2, x+20, y-2)
        y -= 25

        # Add topics inside this group
        c.setFont("Helvetica", 12)
        for title in sorted(grouped[letter], key=lambda s: s.lower()):
            if y < 50:  # New page if space ends
                c.showPage()
                y = height - 80
            c.drawString(x + 20, y, f"{new_number}. {title}")
            new_number += 1
            y -= 20

        y -= 10  # Gap between groups

    c.save()

def replace_index_page(input_pdf, index_pdf, output_pdf):
    """Replace first page with new index pages (can be multi-page)."""
    reader = PdfReader(input_pdf)
    writer = PdfWriter()

    # Insert new index pages
    index_reader = PdfReader(index_pdf)
    for page in index_reader.pages:
        writer.add_page(page)

    # Add remaining pages from original PDF (skip old index page)
    for page_num in range(1, len(reader.pages)):
        writer.add_page(reader.pages[page_num])

    with open(output_pdf, "wb") as f:
        writer.write(f)


if __name__ == "__main__":
    input_pdf = "C:\\Users\\viswa\\OneDrive\\Desktop\\4.pdf"           # Your original PDF
    temp_index_pdf = "new_index.pdf"   # Temporary index file
    output_pdf = "output.pdf"          # Final PDF

    topics = extract_index_from_first_page(input_pdf)
    create_alphabetical_index(topics, temp_index_pdf)
    replace_index_page(input_pdf, temp_index_pdf, output_pdf)

    print("✅ New PDF created with alphabetical index:", output_pdf)


✅ New PDF created with alphabetical index: output.pdf


In [6]:
from PyPDF2 import PdfReader, PdfWriter
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
import string
import re

def extract_index_from_first_page(pdf_path):
    """Extract section titles from first page (ignore original numbers)."""
    reader = PdfReader(pdf_path)
    first_page = reader.pages[0]
    text = first_page.extract_text()

    topics = []
    for line in text.splitlines():
        # Match lines like "12. Something" or "5A. Something"
        match = re.match(r"^\s*\d+[A-Z]?\.\s*(.*)", line)
        if match:
            title = match.group(1).strip()
            topics.append(title)
    return topics

def create_alphabetical_index(topics, output_index_pdf):
    """Create new index pages with alphabetical sorting and renumbering."""
    c = canvas.Canvas(output_index_pdf, pagesize=A4)
    width, height = A4
    x, y = 50, height - 80

    # --- Add Title & Subtitle ---
    c.setFont("Helvetica-Bold", 14)
    c.drawCentredString(width/2, height - 50, "THE MATERNITY BENEFIT ACT, 1961")
    c.drawCentredString(width/2, height - 70, "__________")
    c.drawCentredString(width/2, height - 90, "ARRANGEMENT OF SECTIONS")
    c.drawCentredString(width/2, height - 110, "__________")

    y = height - 150
    c.setFont("Helvetica-Bold", 12)
    c.drawString(x, y, "SECTIONS")
    y -= 30

    # Group by first letter
    grouped = {}
    for title in topics:
        first_char = title[0].upper()
        if first_char not in string.ascii_uppercase:
            first_char = "#"
        grouped.setdefault(first_char, []).append(title)

    new_number = 1
    # Iterate groups in alphabetical order
    for letter in sorted(grouped.keys()):
        # Check for new page
        if y < 100:
            c.showPage()
            y = height - 80

        # --- Centered Letter Heading ---
        c.setFont("Helvetica-Bold", 14)
        c.drawCentredString(width/2, y, letter)
        # underline across a short width under the letter
        text_width = c.stringWidth(letter, "Helvetica-Bold", 14)
        c.line((width - text_width)/2, y-2, (width + text_width)/2, y-2)
        y -= 25

        # Add topics under the group
        c.setFont("Helvetica", 12)
        for title in sorted(grouped[letter], key=lambda s: s.lower()):
            if y < 50:  # new page if space ends
                c.showPage()
                y = height - 80
            c.drawString(x + 20, y, f"{new_number}. {title}")
            new_number += 1
            y -= 20

        y -= 10  # Gap between groups

    c.save()

def replace_index_page(input_pdf, index_pdf, output_pdf):
    """Replace first page with new index pages (can be multi-page)."""
    reader = PdfReader(input_pdf)
    writer = PdfWriter()

    # Insert new index pages
    index_reader = PdfReader(index_pdf)
    for page in index_reader.pages:
        writer.add_page(page)

    # Add remaining pages from original PDF (skip old index page)
    for page_num in range(1, len(reader.pages)):
        writer.add_page(reader.pages[page_num])

    with open(output_pdf, "wb") as f:
        writer.write(f)


if __name__ == "__main__":
    input_pdf = "C:\\Users\\viswa\\OneDrive\\Desktop\\4.pdf"            # Your original PDF
    temp_index_pdf = "new_index.pdf"   # Temporary index file
    output_pdf = "output.pdf"          # Final PDF

    topics = extract_index_from_first_page(input_pdf)
    create_alphabetical_index(topics, temp_index_pdf)
    replace_index_page(input_pdf, temp_index_pdf, output_pdf)

    print("✅ New PDF created with centered alphabetical index:", output_pdf)


✅ New PDF created with centered alphabetical index: output.pdf


In [13]:
from PyPDF2 import PdfReader
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from docx import Document
import string
import re

def extract_index_from_first_page(pdf_path):
    """Extract section titles from first page with original numbering."""
    reader = PdfReader(pdf_path)
    first_page = reader.pages[0]
    text = first_page.extract_text()

    topics = []
    for line in text.splitlines():
        # Match lines like "12. Something" or "5A. Something"
        match = re.match(r"^\s*(\d+[A-Z]?\.)\s*(.*)", line)
        if match:
            num = match.group(1).strip()
            title = match.group(2).strip()
            topics.append((num, title))
    return topics

def create_alphabetical_index_pdf(topics, output_pdf):
    """Create alphabetical index in PDF with original numbers."""
    c = canvas.Canvas(output_pdf, pagesize=A4)
    width, height = A4
    x, y = 50, height - 80

    # --- Title ---
    c.setFont("Helvetica-Bold", 14)
    c.drawCentredString(width/2, height - 50, "THE MATERNITY BENEFIT ACT, 1961")
    c.drawCentredString(width/2, height - 70, "__________")
    c.drawCentredString(width/2, height - 90, "ARRANGEMENT OF SECTIONS")
    c.drawCentredString(width/2, height - 110, "__________")

    y = height - 150
    c.setFont("Helvetica-Bold", 12)
    c.drawString(x, y, "SECTIONS")
    y -= 30

    # Group by first letter
    grouped = {}
    for num, title in topics:
        first_char = title[0].upper()
        if first_char not in string.ascii_uppercase:
            first_char = "#"
        grouped.setdefault(first_char, []).append((num, title))

    # Iterate groups in alphabetical order
    for letter in sorted(grouped.keys()):
        if y < 100:
            c.showPage()
            y = height - 80

        # --- Centered Letter Heading ---
        c.setFont("Helvetica-Bold", 14)
        c.drawCentredString(width/2, y, letter)
        text_width = c.stringWidth(letter, "Helvetica-Bold", 14)
        c.line((width - text_width)/2, y-2, (width + text_width)/2, y-2)
        y -= 25

        # Add topics under the group (keep original numbering)
        c.setFont("Helvetica", 12)
        for num, title in sorted(grouped[letter], key=lambda s: s[1].lower()):
            if y < 50:
                c.showPage()
                y = height - 80
            c.drawString(x + 20, y, f"{num} {title}")
            y -= 20

        y -= 10

    c.save()

def create_alphabetical_index_docx(topics, output_docx):
    """Create alphabetical index in Word with original numbers."""
    doc = Document()
    doc.add_heading("THE MATERNITY BENEFIT ACT, 1961", level=0)
    doc.add_paragraph("__________")
    doc.add_paragraph("ARRANGEMENT OF SECTIONS")
    doc.add_paragraph("__________\n")
    doc.add_heading("SECTIONS", level=1)

    grouped = {}
    for num, title in topics:
        first_char = title[0].upper()
        if first_char not in string.ascii_uppercase:
            first_char = "#"
        grouped.setdefault(first_char, []).append((num, title))

    for letter in sorted(grouped.keys()):
        doc.add_heading(letter, level=2)
        for num, title in sorted(grouped[letter], key=lambda s: s[1].lower()):
            doc.add_paragraph(f"{num} {title}")

    doc.save(output_docx)

if __name__ == "__main__":
    input_pdf = "C:\\Users\\viswa\\OneDrive\\Desktop\\4.pdf"
    output_pdf = "alphabetical_index.pdf"
    output_docx = "alphabetical_index.docx"

    topics = extract_index_from_first_page(input_pdf)
    create_alphabetical_index_pdf(topics, output_pdf)
    create_alphabetical_index_docx(topics, output_docx)

    print("✅ Index created in both PDF and Word formats:")
    print("PDF:", output_pdf)
    print("Word:", output_docx)


PermissionError: [Errno 13] Permission denied: 'alphabetical_index.docx'

In [11]:
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.ns import qn
from docx.shared import Pt, Inches, RGBColor
import string

def create_alphabetical_index_docx(topics, output_docx):
    """Create alphabetical index in Word with original numbers, centered headings, and underlines."""
    doc = Document()

    # --- Title ---
    title = doc.add_heading("THE MATERNITY BENEFIT ACT, 1961", level=0)
    title.alignment = WD_ALIGN_PARAGRAPH.CENTER

    underline = doc.add_paragraph("__________")
    underline.alignment = WD_ALIGN_PARAGRAPH.CENTER

    arr = doc.add_paragraph("ARRANGEMENT OF SECTIONS")
    arr.alignment = WD_ALIGN_PARAGRAPH.CENTER

    underline2 = doc.add_paragraph("__________")
    underline2.alignment = WD_ALIGN_PARAGRAPH.CENTER

    doc.add_paragraph("")  # spacing

    # --- Sections Heading ---
    sections = doc.add_paragraph("SECTIONS")
    sections.alignment = WD_ALIGN_PARAGRAPH.CENTER
    sections.runs[0].bold = True

    # --- Group topics by alphabet ---
    grouped = {}
    for num, title in topics:
        first_char = title[0].upper()
        if first_char not in string.ascii_uppercase:
            first_char = "#"
        grouped.setdefault(first_char, []).append((num, title))

    for letter in sorted(grouped.keys()):
        # --- Centered Letter Heading with underline ---
        para = doc.add_paragraph(letter)
        para.alignment = WD_ALIGN_PARAGRAPH.CENTER
        run = para.runs[0]
        run.bold = True
        run.font.size = Pt(14)
        run.underline = True  # underline letter

        doc.add_paragraph("")  # small spacing

        # --- Topics under each letter, centered ---
        for num, title in sorted(grouped[letter], key=lambda s: s[1].lower()):
            p = doc.add_paragraph(f"{num} {title}")
            p.alignment = WD_ALIGN_PARAGRAPH.CENTER

    doc.save(output_docx)


In [14]:
from PyPDF2 import PdfReader
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Pt
import string
import re

# ------------------ Extract Index from PDF ------------------ #
def extract_index_from_first_page(pdf_path):
    """Extract section titles from first page with original numbering."""
    reader = PdfReader(pdf_path)
    first_page = reader.pages[0]
    text = first_page.extract_text()

    topics = []
    for line in text.splitlines():
        # Match lines like "12. Something" or "5A. Something"
        match = re.match(r"^\s*(\d+[A-Z]?\.)\s*(.*)", line)
        if match:
            num = match.group(1).strip()
            title = match.group(2).strip()
            topics.append((num, title))
    return topics

# ------------------ Create PDF ------------------ #
def create_alphabetical_index_pdf(topics, output_pdf):
    """Create alphabetical index in PDF with original numbers."""
    c = canvas.Canvas(output_pdf, pagesize=A4)
    width, height = A4
    x, y = 50, height - 80

    # --- Title ---
    c.setFont("Helvetica-Bold", 14)
    c.drawCentredString(width/2, height - 50, "THE MATERNITY BENEFIT ACT, 1961")
    c.drawCentredString(width/2, height - 70, "__________")
    c.drawCentredString(width/2, height - 90, "ARRANGEMENT OF SECTIONS")
    c.drawCentredString(width/2, height - 110, "__________")

    y = height - 150
    c.setFont("Helvetica-Bold", 12)
    c.drawCentredString(width/2, y, "SECTIONS")
    y -= 30

    # Group by first letter
    grouped = {}
    for num, title in topics:
        first_char = title[0].upper()
        if first_char not in string.ascii_uppercase:
            first_char = "#"
        grouped.setdefault(first_char, []).append((num, title))

    # Iterate groups in alphabetical order
    for letter in sorted(grouped.keys()):
        if y < 100:
            c.showPage()
            y = height - 80

        # --- Centered Letter Heading ---
        c.setFont("Helvetica-Bold", 14)
        c.drawCentredString(width/2, y, letter)
        text_width = c.stringWidth(letter, "Helvetica-Bold", 14)
        c.line((width - text_width)/2, y-2, (width + text_width)/2, y-2)
        y -= 25

        # Add topics under the group (keep original numbering)
        c.setFont("Helvetica", 12)
        for num, title in sorted(grouped[letter], key=lambda s: s[1].lower()):
            if y < 50:
                c.showPage()
                y = height - 80
            c.drawCentredString(width/2, y, f"{num} {title}")  # Centered
            y -= 20

        y -= 10

    c.save()

# ------------------ Create Word ------------------ #
def create_alphabetical_index_docx(topics, output_docx):
    """Create alphabetical index in Word with original numbers, centered headings, and underlines."""
    doc = Document()

    # --- Title ---
    title = doc.add_heading("THE MATERNITY BENEFIT ACT, 1961", level=0)
    title.alignment = WD_ALIGN_PARAGRAPH.CENTER

    underline = doc.add_paragraph("__________")
    underline.alignment = WD_ALIGN_PARAGRAPH.CENTER

    arr = doc.add_paragraph("ARRANGEMENT OF SECTIONS")
    arr.alignment = WD_ALIGN_PARAGRAPH.CENTER

    underline2 = doc.add_paragraph("__________")
    underline2.alignment = WD_ALIGN_PARAGRAPH.CENTER

    doc.add_paragraph("")  # spacing

    # --- Sections Heading ---
    sections = doc.add_paragraph("SECTIONS")
    sections.alignment = WD_ALIGN_PARAGRAPH.CENTER
    sections.runs[0].bold = True

    # --- Group topics by alphabet ---
    grouped = {}
    for num, title in topics:
        first_char = title[0].upper()
        if first_char not in string.ascii_uppercase:
            first_char = "#"
        grouped.setdefault(first_char, []).append((num, title))

    for letter in sorted(grouped.keys()):
        # --- Centered Letter Heading with underline ---
        para = doc.add_paragraph(letter)
        para.alignment = WD_ALIGN_PARAGRAPH.CENTER
        run = para.runs[0]
        run.bold = True
        run.font.size = Pt(14)
        run.underline = True  # underline letter

        doc.add_paragraph("")  # small spacing

        # --- Topics under each letter, centered ---
        for num, title in sorted(grouped[letter], key=lambda s: s[1].lower()):
            p = doc.add_paragraph(f"{num} {title}")
            p.alignment = WD_ALIGN_PARAGRAPH.CENTER

    doc.save(output_docx)

# ------------------ MAIN ------------------ #
if __name__ == "__main__":
    input_pdf = "C:\\Users\\viswa\\OneDrive\\Desktop\\4.pdf"   # Input PDF
    output_pdf = "alphabetical_index.pdf"                      # Output PDF
    output_docx = "alphabetical_index.docx"                    # Output Word

    topics = extract_index_from_first_page(input_pdf)
    create_alphabetical_index_pdf(topics, output_pdf)
    create_alphabetical_index_docx(topics, output_docx)

    print("✅ Index created in both formats:")
    print("PDF:", output_pdf)
    print("Word:", output_docx)


✅ Index created in both formats:
PDF: alphabetical_index.pdf
Word: alphabetical_index.docx


In [3]:
from PyPDF2 import PdfReader
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Pt
import string
import re

# ------------------ Extract Index from PDF ------------------ #
def extract_index_from_first_page(pdf_path):
    """Extract section titles from first page with original numbering."""
    reader = PdfReader(pdf_path)
    first_page = reader.pages[0]
    text = first_page.extract_text()

    topics = []
    for line in text.splitlines():
        # Match lines like "12. Something" or "5A. Something"
        match = re.match(r"^\s*(\d+[A-Z]?\.)\s*(.*)", line)
        if match:
            num = match.group(1).strip()
            title = match.group(2).strip()
            topics.append((num, title))
    return topics

# ------------------ Create PDF ------------------ #
def create_alphabetical_index_pdf(topics, output_pdf):
    """Create alphabetical index in PDF with original numbers."""
    c = canvas.Canvas(output_pdf, pagesize=A4)
    width, height = A4
    x, y = 50, height - 80

    # --- Title ---
    c.setFont("Helvetica-Bold", 14)
    c.drawCentredString(width/2, height - 50, "THE MATERNITY BENEFIT ACT, 1961")
    c.drawCentredString(width/2, height - 70, "__________")
    c.drawCentredString(width/2, height - 90, "ARRANGEMENT OF SECTIONS")
    c.drawCentredString(width/2, height - 110, "__________")

    y = height - 150
    c.setFont("Helvetica-Bold", 12)
    c.drawCentredString(width/2, y, "SECTIONS")
    y -= 30

    # Group by first letter
    grouped = {}
    for num, title in topics:
        first_char = title[0].upper()
        if first_char not in string.ascii_uppercase:
            first_char = "#"
        grouped.setdefault(first_char, []).append((num, title))

    # Iterate groups in alphabetical order
    for letter in sorted(grouped.keys()):
        if y < 100:
            c.showPage()
            y = height - 80

        # --- Centered Letter Heading ---
        c.setFont("Helvetica-Bold", 14)
        c.drawCentredString(width/2, y, letter)
        text_width = c.stringWidth(letter, "Helvetica-Bold", 14)
        c.line((width - text_width)/2, y-2, (width + text_width)/2, y-2)
        y -= 25

        # Add topics under the group (left aligned)
        c.setFont("Helvetica", 12)
        for num, title in sorted(grouped[letter], key=lambda s: s[1].lower()):
            if y < 50:
                c.showPage()
                y = height - 80
            c.drawString(x + 20, y, f"{num} {title}")  # Left aligned
            y -= 20

        y -= 10

    c.save()

# ------------------ Create Word ------------------ #
def create_alphabetical_index_docx(topics, output_docx):
    """Create alphabetical index in Word with proper alignments."""
    doc = Document()

    # --- Title ---
    title = doc.add_heading("THE MATERNITY BENEFIT ACT, 1961", level=0)
    title.alignment = WD_ALIGN_PARAGRAPH.CENTER

    underline = doc.add_paragraph("__________")
    underline.alignment = WD_ALIGN_PARAGRAPH.CENTER

    arr = doc.add_paragraph("ARRANGEMENT OF SECTIONS")
    arr.alignment = WD_ALIGN_PARAGRAPH.CENTER

    underline2 = doc.add_paragraph("__________")
    underline2.alignment = WD_ALIGN_PARAGRAPH.CENTER

    doc.add_paragraph("")  # spacing

    # --- Sections Heading ---
    sections = doc.add_paragraph("SECTIONS")
    sections.alignment = WD_ALIGN_PARAGRAPH.CENTER
    sections.runs[0].bold = True

    # --- Group topics by alphabet ---
    grouped = {}
    for num, title in topics:
        first_char = title[0].upper()
        if first_char not in string.ascii_uppercase:
            first_char = "#"
        grouped.setdefault(first_char, []).append((num, title))

    for letter in sorted(grouped.keys()):
        # --- Centered Letter Heading with underline ---
        para = doc.add_paragraph(letter)
        para.alignment = WD_ALIGN_PARAGRAPH.CENTER
        run = para.runs[0]
        run.bold = True
        run.font.size = Pt(14)
        run.underline = True  # underline letter

        doc.add_paragraph("")  # small spacing

        # --- Topics under each letter (left aligned) ---
        for num, title in sorted(grouped[letter], key=lambda s: s[1].lower()):
            p = doc.add_paragraph(f"{num} {title}")
            p.alignment = WD_ALIGN_PARAGRAPH.LEFT

    doc.save(output_docx)

# ------------------ MAIN ------------------ #
if __name__ == "__main__":
    input_pdf = "C:\\Users\\viswa\\OneDrive\\Desktop\\4.pdf"   # Input PDF
    output_pdf = "4.pdf"                      # Output PDF
    output_docx = "4.docx"                    # Output Word

    topics = extract_index_from_first_page(input_pdf)
    create_alphabetical_index_pdf(topics, output_pdf)
    create_alphabetical_index_docx(topics, output_docx)

    print("✅ Index created in both formats:")
    print("PDF:", output_pdf)
    print("Word:", output_docx)


✅ Index created in both formats:
PDF: 4.pdf
Word: 4.docx


In [27]:
from PyPDF2 import PdfReader
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Pt
import string
import re

# ------------------ Extract Index from first two pages ------------------ #
def extract_index_from_first_pages(pdf_path, max_pages=2):
    """Extract section titles from first N pages (default 2) with original numbering."""
    reader = PdfReader(pdf_path)
    topics = []

    for i in range(min(max_pages, len(reader.pages))):
        page = reader.pages[i]
        text = page.extract_text()
        if not text:
            continue
        for line in text.splitlines():
            # Match lines like "12. Something" or "5A. Something"
            match = re.match(r"^\s*(\d+[A-Z]?\.)\s*(.*)", line)
            if match:
                num = match.group(1).strip()
                title = match.group(2).strip()
                topics.append((num, title))
    return topics

# ------------------ Create PDF ------------------ #
def create_alphabetical_index_pdf(topics, output_pdf):
    """Create alphabetical index in PDF with original numbers."""
    c = canvas.Canvas(output_pdf, pagesize=A4)
    width, height = A4
    x, y = 50, height - 80

    # --- Title --- #
    c.setFont("Helvetica-Bold", 14)
    c.drawCentredString(width/2, height - 50, "THE PAYMENT OF BONUS ACT, 1965")
    c.drawCentredString(width/2, height - 70, "__________")
    c.drawCentredString(width/2, height - 90, "ARRANGEMENT OF SECTIONS")
    c.drawCentredString(width/2, height - 110, "__________")

    y = height - 150
    c.setFont("Helvetica-Bold", 12)
    c.drawCentredString(width/2, y, "SECTIONS")
    y -= 30

    # Group by first letter #
    grouped = {}
    for num, title in topics:
        first_char = title[0].upper()
        if first_char not in string.ascii_uppercase:
            first_char = "#"
        grouped.setdefault(first_char, []).append((num, title))

    # Iterate groups in alphabetical order #
    for letter in sorted(grouped.keys()):
        if y < 100:
            c.showPage()
            y = height - 80

        # --- Centered Letter Heading --- #
        c.setFont("Helvetica-Bold", 14)
        c.drawCentredString(width/2, y, letter)
        text_width = c.stringWidth(letter, "Helvetica-Bold", 14)
        c.line((width - text_width)/2, y-2, (width + text_width)/2, y-2)
        y -= 25

        # --- Topics under group (left aligned) --- #
        c.setFont("Helvetica", 12)
        for num, title in sorted(grouped[letter], key=lambda s: s[1].lower()):
            if y < 50:
                c.showPage()
                y = height - 80
            c.drawString(x + 20, y, f"{num} {title}")
            y -= 20

        y -= 10

    c.save()

# ------------------ Create Word ------------------ #
def create_alphabetical_index_docx(topics, output_docx):
    """Create alphabetical index in Word with proper alignments."""
    doc = Document()

    # --- Title --- #
    title = doc.add_heading("THE PAYMENT OF BONUS ACT, 1965", level=0)
    title.alignment = WD_ALIGN_PARAGRAPH.CENTER

    underline = doc.add_paragraph("__________")
    underline.alignment = WD_ALIGN_PARAGRAPH.CENTER

    arr = doc.add_paragraph("ARRANGEMENT OF SECTIONS")
    arr.alignment = WD_ALIGN_PARAGRAPH.CENTER

    underline2 = doc.add_paragraph("__________")
    underline2.alignment = WD_ALIGN_PARAGRAPH.CENTER

    doc.add_paragraph("")  # spacing

    # --- Sections Heading --- #
    sections = doc.add_paragraph("SECTIONS")
    sections.alignment = WD_ALIGN_PARAGRAPH.CENTER
    sections.runs[0].bold = True

    # --- Group topics by alphabet --- #
    grouped = {}
    for num, title in topics:
        first_char = title[0].upper()
        if first_char not in string.ascii_uppercase:
            first_char = "#"
        grouped.setdefault(first_char, []).append((num, title))

    for letter in sorted(grouped.keys()):
        # --- Centered Letter Heading with underline --- #
        para = doc.add_paragraph(letter)
        para.alignment = WD_ALIGN_PARAGRAPH.CENTER
        run = para.runs[0]
        run.bold = True
        run.font.size = Pt(14)
        run.underline = True  # underline letter

        doc.add_paragraph("")  # spacing

        # --- Topics under each letter (left aligned) --- #
        for num, title in sorted(grouped[letter], key=lambda s: s[1].lower()):
            p = doc.add_paragraph(f"{num} {title}")
            p.alignment = WD_ALIGN_PARAGRAPH.LEFT

    doc.save(output_docx)

# ------------------ MAIN ------------------ #
if __name__ == "__main__":
    input_pdf = "C:\\Users\\viswa\\OneDrive\\Desktop\\11.pdf"   # Input PDF
    output_pdf = "11.pdf"              # Output PDF
    output_docx = "11.docx"            # Output Word

    topics = extract_index_from_first_pages(input_pdf, max_pages=4)
    create_alphabetical_index_pdf(topics, output_pdf)
    create_alphabetical_index_docx(topics, output_docx)

    print("✅ Index created in both formats:")
    print("PDF:", output_pdf)
    print("Word:", output_docx)


✅ Index created in both formats:
PDF: 11.pdf
Word: 11.docx


In [10]:
from PyPDF2 import PdfReader
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Pt
import string
import re

# ------------------ Extract Index ------------------ #
def extract_index_from_first_pages(pdf_path, max_pages=2):
    """Extract section titles with subtopics from first N pages."""
    reader = PdfReader(pdf_path)
    topics = []

    current_parent = None

    for i in range(min(max_pages, len(reader.pages))):
        page = reader.pages[i]
        text = page.extract_text()
        if not text:
            continue

        for line in text.splitlines():
            line = line.strip()
            if not line:
                continue

            # Match numbered topics like "120. Repeal and savings."
            match = re.match(r"^\s*(\d+[A-Z]?\.)\s*(.*)", line)
            if match:
                num = match.group(1).strip()
                title = match.group(2).strip()
                current_parent = (num, title, [])
                topics.append(current_parent)

            else:
                # If no numbering, treat as subtopic of last parent
                if current_parent:
                    current_parent[2].append(line)

    return topics

# ------------------ Create PDF ------------------ #
def create_alphabetical_index_pdf(topics, output_pdf):
    c = canvas.Canvas(output_pdf, pagesize=A4)
    width, height = A4
    x, y = 50, height - 80

    # Title
    c.setFont("Helvetica-Bold", 14)
    c.drawCentredString(width/2, height - 50, "THE PAYMENT OF WAGES ACT, 1936")
    c.drawCentredString(width/2, height - 70, "__________")
    c.drawCentredString(width/2, height - 90, "ARRANGEMENT OF SECTIONS")
    c.drawCentredString(width/2, height - 110, "__________")

    y = height - 150
    c.setFont("Helvetica-Bold", 12)
    c.drawCentredString(width/2, y, "SECTIONS")
    y -= 30

    # Group by alphabet
    grouped = {}
    for num, title, subs in topics:
        first_char = title[0].upper()
        if first_char not in string.ascii_uppercase:
            first_char = "#"
        grouped.setdefault(first_char, []).append((num, title, subs))

    for letter in sorted(grouped.keys()):
        if y < 100:
            c.showPage()
            y = height - 80

        # Letter heading
        c.setFont("Helvetica-Bold", 14)
        c.drawCentredString(width/2, y, letter)
        text_width = c.stringWidth(letter, "Helvetica-Bold", 14)
        c.line((width - text_width)/2, y-2, (width + text_width)/2, y-2)
        y -= 25

        # Topics under the letter
        c.setFont("Helvetica", 12)
        for num, title, subs in sorted(grouped[letter], key=lambda s: s[1].lower()):
            if y < 50:
                c.showPage()
                y = height - 80
            c.drawString(x + 20, y, f"{num} {title}")
            y -= 20

            # Subtopics indented
            for sub in subs:
                if y < 50:
                    c.showPage()
                    y = height - 80
                c.drawString(x + 50, y, f"- {sub}")
                y -= 20

        y -= 10

    c.save()

# ------------------ Create Word ------------------ #
def create_alphabetical_index_docx(topics, output_docx):
    doc = Document()

    # Title
    title = doc.add_heading("THE PAYMENT OF WAGES ACT, 1936", level=0)
    title.alignment = WD_ALIGN_PARAGRAPH.CENTER
    doc.add_paragraph("__________").alignment = WD_ALIGN_PARAGRAPH.CENTER
    arr = doc.add_paragraph("ARRANGEMENT OF SECTIONS")
    arr.alignment = WD_ALIGN_PARAGRAPH.CENTER
    doc.add_paragraph("__________").alignment = WD_ALIGN_PARAGRAPH.CENTER
    doc.add_paragraph("")

    sections = doc.add_paragraph("SECTIONS")
    sections.alignment = WD_ALIGN_PARAGRAPH.CENTER
    sections.runs[0].bold = True

    # Group by alphabet
    grouped = {}
    for num, title, subs in topics:
        first_char = title[0].upper()
        if first_char not in string.ascii_uppercase:
            first_char = "#"
        grouped.setdefault(first_char, []).append((num, title, subs))

    for letter in sorted(grouped.keys()):
        para = doc.add_paragraph(letter)
        para.alignment = WD_ALIGN_PARAGRAPH.CENTER
        run = para.runs[0]
        run.bold = True
        run.font.size = Pt(14)
        run.underline = True

        doc.add_paragraph("")

        for num, title, subs in sorted(grouped[letter], key=lambda s: s[1].lower()):
            p = doc.add_paragraph(f"{num} {title}")
            p.alignment = WD_ALIGN_PARAGRAPH.LEFT

            # Add subtopics indented
            for sub in subs:
                sub_p = doc.add_paragraph(f"- {sub}")
                sub_p.alignment = WD_ALIGN_PARAGRAPH.LEFT
                sub_p.paragraph_format.left_indent = Pt(30)

    doc.save(output_docx)

# ------------------ MAIN ------------------ #
if __name__ == "__main__":
    input_pdf = "C:\\Users\\viswa\\OneDrive\\Desktop\\6.pdf"
    output_pdf = "6.pdf"
    output_docx = "6.docx"

    topics = extract_index_from_first_pages(input_pdf, max_pages=2)
    create_alphabetical_index_pdf(topics, output_pdf)
    create_alphabetical_index_docx(topics, output_docx)

    print("✅ Index created with subtopics handled:")
    print("PDF:", output_pdf)
    print("Word:", output_docx)


✅ Index created with subtopics handled:
PDF: 6.pdf
Word: 6.docx


In [11]:
import re
import string
from collections import defaultdict, OrderedDict

from PyPDF2 import PdfReader
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas

from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Pt

# =========================
# Parsing helpers
# =========================

NUM_SECTION_RE = re.compile(r"^\s*(\d+[A-Z]?\.)\s*(.+?)\s*$")   # 7. Title / 7A. Title / 36A. Title
CHAPTER_RE = re.compile(r"^\s*CHAPTER\b", re.IGNORECASE)
ROMAN_ONLY_RE = re.compile(r"^\s*[IVXLCDM]+\s*$", re.IGNORECASE)
PAGE_NUMBER_RE = re.compile(r"^\s*\d+\s*$")
SECTIONS_HEADING_RE = re.compile(r"^\s*SECTIONS\s*$", re.IGNORECASE)
ALLCAPS_LINE_RE = re.compile(r"^[A-Z0-9 ,’'`\-\.\(\)\/&]+$")     # crude but practical
SCHEDULE_RE = re.compile(r"\bSCHEDULE\b", re.IGNORECASE)

PREFERRED_CATEGORY_ORDER = [
    "PRELIMINARY",
    "THE INSPECTING STAFF",
    "HEALTH",
    "SAFETY",
    "PROVISIONS RELATING TO HAZARDOUS PROCESSES",
    "WELFARE",
]

def normalize_space(s: str) -> str:
    return re.sub(r"\s+", " ", s).strip()

def looks_like_category(line: str) -> bool:
    """A category is typically an ALL-CAPS line that is NOT 'SECTIONS' and NOT a page number."""
    t = normalize_space(line)
    if not t:
        return False
    if SECTIONS_HEADING_RE.match(t):
        return False
    if PAGE_NUMBER_RE.match(t):
        return False
    if CHAPTER_RE.match(t):
        return False
    # e.g., "CHAPTER IVA" might be on one line and the category on the next line (ALL CAPS).
    # accept all-caps lines as categories
    return bool(ALLCAPS_LINE_RE.match(t)) and any(c.isalpha() for c in t)

def first_letter(title: str) -> str:
    """First alphabetic letter for grouping; # for non A-Z."""
    for ch in title:
        if ch.isalpha():
            return ch.upper()
    return "#"

# =========================
# Extraction
# =========================

def extract_factories_act_index(pdf_path: str, max_pages: int = 10):
    """
    Extracts a structured list of sections from the first max_pages of the PDF:
    - Tracks current CHAPTER -> CATEGORY (e.g., PRELIMINARY, HEALTH).
    - Captures numbered sections with their category.
    - Attaches unnumbered SCHEDULE lines as subitems of the last numbered section.
    Returns: list of dicts: {num, title, category, subitems: [str, ...]}
    """
    reader = PdfReader(pdf_path)
    pages_to_read = min(max_pages, len(reader.pages))

    results = []
    current_category = None
    current_parent = None          # last numbered section for attaching schedules

    pending_maybe_category = False # True right after a CHAPTER line

    for i in range(pages_to_read):
        page = reader.pages[i]
        text = page.extract_text() or ""
        for raw in text.splitlines():
            line = normalize_space(raw)
            if not line:
                continue

            # Skip obvious noise
            if PAGE_NUMBER_RE.match(line):
                continue

            # "CHAPTER ..." line? set flag to expect category next
            if CHAPTER_RE.match(line):
                pending_maybe_category = True
                continue

            # A roman numeral alone line (often on a line by itself after "CHAPTER")
            if pending_maybe_category and ROMAN_ONLY_RE.match(line):
                # Still expecting the real category on next all-caps line
                continue

            # If we just saw CHAPTER, the next ALLCAPS line (not 'SECTIONS') is the category
            if pending_maybe_category and looks_like_category(line):
                current_category = line
                pending_maybe_category = False
                continue

            # 'SECTIONS' headers repeat—ignore
            if SECTIONS_HEADING_RE.match(line):
                pending_maybe_category = False
                continue

            # Numbered section?
            m = NUM_SECTION_RE.match(line)
            if m:
                num = m.group(1).strip()
                title = m.group(2).strip().rstrip(".")
                current_parent = {"num": num, "title": title, "category": current_category, "subitems": []}
                results.append(current_parent)
                continue

            # Unnumbered SCHEDULE lines → attach to last numbered section as subitems
            if SCHEDULE_RE.search(line) and current_parent:
                # Keep original casing; ensure trailing period
                item = line if line.endswith(".") else line + "."
                current_parent["subitems"].append(item)
                continue

            # Any other ALLCAPS line that appears between blocks (e.g., repeated category on a new page)
            # Treat as category if we are NOT in a pending chapter and the line looks like category
            if looks_like_category(line):
                current_category = line
                continue

            # Otherwise ignore residual lines (page decorations, wrapped lines already captured by PyPDF2, etc.)

    return results

# =========================
# Grouping: A/B/C → Category → Items
# =========================

def group_by_letter_and_category(sections):
    """
    sections: list of dicts {num,title,category,subitems}
    Returns: Ordered dict: letter -> Ordered dict: category -> list of sections
    Category order: preferred list first, then alphabetical for the rest.
    """
    by_letter = defaultdict(list)
    for s in sections:
        letter = first_letter(s["title"])
        by_letter[letter].append(s)

    # Now per letter, split by category
    grouped = OrderedDict()
    for letter in sorted(by_letter.keys()):
        bucket = by_letter[letter]

        # gather categories present
        cats = defaultdict(list)
        for s in bucket:
            cat = (s["category"] or "").strip() or "MISCELLANEOUS"
            cats[cat].append(s)

        # order categories: preferred order first (only those present), then others alphabetically
        ordered_cats = []
        pref_present = [c for c in PREFERRED_CATEGORY_ORDER if c in cats]
        others = sorted([c for c in cats.keys() if c not in PREFERRED_CATEGORY_ORDER])
        for c in pref_present + others:
            # sort sections within a category by title (case-insensitive), but keep numbering shown
            cats[c].sort(key=lambda t: t["title"].lower())
            ordered_cats.append((c, cats[c]))

        grouped[letter] = ordered_cats

    return grouped

# =========================
# PDF Rendering (ReportLab)
# =========================

def draw_centered_underlined(c, width, y, text, font="Helvetica-Bold", size=14, underline=True):
    c.setFont(font, size)
    c.drawCentredString(width/2, y, text)
    if underline:
        w = c.stringWidth(text, font, size)
        c.line((width - w)/2, y-2, (width + w)/2, y-2)

def create_pdf(grouped, output_pdf, act_title="THE FACTORIES ACT, 1948"):
    c = canvas.Canvas(output_pdf, pagesize=A4)
    width, height = A4
    left = 50
    y = height - 50

    # Title block
    c.setFont("Helvetica-Bold", 14)
    c.drawCentredString(width/2, y, act_title); y -= 20
    c.drawCentredString(width/2, y, "__________"); y -= 20
    c.drawCentredString(width/2, y, "ARRANGEMENT OF SECTIONS"); y -= 20
    c.drawCentredString(width/2, y, "__________"); y -= 30

    c.setFont("Helvetica-Bold", 12)
    c.drawCentredString(width/2, y, "SECTIONS"); y -= 30

    # Content
    for letter, cat_list in grouped.items():
        # new page if low
        if y < 100:
            c.showPage(); y = height - 50

        # Letter heading (centered + underline)
        draw_centered_underlined(c, width, y, letter, font="Helvetica-Bold", size=14, underline=True)
        y -= 25

        # For each category under this letter
        for category, items in cat_list:
            if y < 100:
                c.showPage(); y = height - 50
            # Category heading (centered + underline)
            draw_centered_underlined(c, width, y, category, font="Helvetica-Bold", size=12, underline=True)
            y -= 22

            c.setFont("Helvetica", 12)
            for s in items:
                # Section line
                if y < 60:
                    c.showPage(); y = height - 50; c.setFont("Helvetica", 12)
                line = f"{s['num']} {s['title']}"
                c.drawString(left + 20, y, line); y -= 18

                # Subitems (e.g., schedules) indented
                for sub in s.get("subitems", []):
                    if y < 60:
                        c.showPage(); y = height - 50; c.setFont("Helvetica", 12)
                    c.drawString(left + 50, y, f"- {sub}")
                    y -= 18

            y -= 10  # gap after category

        y -= 8  # gap after letter group

    c.save()

# =========================
# Word Rendering (python-docx)
# =========================

def add_centered_underlined_para(doc: Document, text: str, size_pt=14, bold=True):
    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    run = p.add_run(text)
    run.bold = bold
    run.underline = True
    run.font.size = Pt(size_pt)
    return p

def create_docx(grouped, output_docx, act_title="THE FACTORIES ACT, 1948"):
    doc = Document()

    # Title block
    h = doc.add_heading(act_title, level=0)
    h.alignment = WD_ALIGN_PARAGRAPH.CENTER

    p = doc.add_paragraph("__________"); p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p = doc.add_paragraph("ARRANGEMENT OF SECTIONS"); p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p = doc.add_paragraph("__________"); p.alignment = WD_ALIGN_PARAGRAPH.CENTER

    p = doc.add_paragraph("SECTIONS")
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    if p.runs: p.runs[0].bold = True

    doc.add_paragraph("")  # spacing

    # Content
    for letter, cat_list in grouped.items():
        # Letter heading
        add_centered_underlined_para(doc, letter, size_pt=14, bold=True)

        for category, items in cat_list:
            # Category heading
            add_centered_underlined_para(doc, category, size_pt=12, bold=True)

            # Items
            for s in items:
                p = doc.add_paragraph(f"{s['num']} {s['title']}")
                p.alignment = WD_ALIGN_PARAGRAPH.LEFT

                # Subitems (schedules) indented
                for sub in s.get("subitems", []):
                    sp = doc.add_paragraph(f"- {sub}")
                    sp.alignment = WD_ALIGN_PARAGRAPH.LEFT
                    sp.paragraph_format.left_indent = Pt(30)

            doc.add_paragraph("")  # small gap after category

        doc.add_paragraph("")  # gap after letter

    doc.save(output_docx)

# =========================
# MAIN
# =========================

if __name__ == "__main__":
    # ---- Configure these paths ----
    input_pdf = r"C:\\Users\\viswa\\OneDrive\\Desktop\\7.pdf"  # your source PDF
    output_pdf = r"7.pdf"
    output_docx = r"7.docx"

    # How many index pages to read (you said it can be up to five)
    MAX_INDEX_PAGES = 5

    # 1) Extract structured sections
    sections = extract_factories_act_index(input_pdf, max_pages=MAX_INDEX_PAGES)

    # 2) Group: A/B/C -> Category -> Items
    grouped = group_by_letter_and_category(sections)

    # 3) Render both outputs
    create_pdf(grouped, output_pdf, act_title="THE FACTORIES ACT, 1948")
    create_docx(grouped, output_docx, act_title="THE FACTORIES ACT, 1948")

    print("✅ Done.")
    print("PDF :", output_pdf)
    print("Word:", output_docx)


✅ Done.
PDF : 7.pdf
Word: 7.docx


In [13]:
import re
import string
from collections import defaultdict, OrderedDict

from PyPDF2 import PdfReader
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas

from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Pt

# =========================
# Parsing helpers
# =========================

NUM_SECTION_RE = re.compile(r"^\s*(\d+[A-Z]?\.)\s*(.+?)\s*$")
CHAPTER_RE = re.compile(r"^\s*CHAPTER\b", re.IGNORECASE)
ROMAN_ONLY_RE = re.compile(r"^\s*[IVXLCDM]+\s*$", re.IGNORECASE)
PAGE_NUMBER_RE = re.compile(r"^\s*\d+\s*$")
SECTIONS_HEADING_RE = re.compile(r"^\s*SECTIONS\s*$", re.IGNORECASE)
ALLCAPS_LINE_RE = re.compile(r"^[A-Z0-9 ,’'`\-\.\(\)\/&]+$")
SCHEDULE_RE = re.compile(r"\bSCHEDULE\b", re.IGNORECASE)

PREFERRED_CATEGORY_ORDER = [
    "PRELIMINARY",
    "THE INSPECTING STAFF",
    "HEALTH",
    "SAFETY",
    "PROVISIONS RELATING TO HAZARDOUS PROCESSES",
    "WELFARE",
]

def normalize_space(s: str) -> str:
    return re.sub(r"\s+", " ", s).strip()

def looks_like_category(line: str) -> bool:
    t = normalize_space(line)
    if not t:
        return False
    if SECTIONS_HEADING_RE.match(t):
        return False
    if PAGE_NUMBER_RE.match(t):
        return False
    if CHAPTER_RE.match(t):
        return False
    return bool(ALLCAPS_LINE_RE.match(t)) and any(c.isalpha() for c in t)

def first_letter(title: str) -> str:
    for ch in title:
        if ch.isalpha():
            return ch.upper()
    return "#"

# =========================
# Extraction
# =========================

def extract_factories_act_index(pdf_path: str, max_pages: int = 10):
    reader = PdfReader(pdf_path)
    pages_to_read = min(max_pages, len(reader.pages))

    results = []
    current_category = None
    current_parent = None
    pending_maybe_category = False

    for i in range(pages_to_read):
        page = reader.pages[i]
        text = page.extract_text() or ""
        for raw in text.splitlines():
            line = normalize_space(raw)
            if not line:
                continue
            if PAGE_NUMBER_RE.match(line):
                continue
            if CHAPTER_RE.match(line):
                pending_maybe_category = True
                continue
            if pending_maybe_category and ROMAN_ONLY_RE.match(line):
                continue
            if pending_maybe_category and looks_like_category(line):
                current_category = line
                pending_maybe_category = False
                continue
            # === FIXED: Ignore repeated "SECTIONS" header without resetting category ===
            if SECTIONS_HEADING_RE.match(line):
                continue
            m = NUM_SECTION_RE.match(line)
            if m:
                num = m.group(1).strip()
                title = m.group(2).strip().rstrip(".")
                current_parent = {"num": num, "title": title, "category": current_category, "subitems": []}
                results.append(current_parent)
                continue
            if SCHEDULE_RE.search(line) and current_parent:
                item = line if line.endswith(".") else line + "."
                current_parent["subitems"].append(item)
                continue
            if looks_like_category(line):
                current_category = line
                continue
    return results

# =========================
# Grouping
# =========================

def group_by_letter_and_category(sections):
    by_letter = defaultdict(list)
    for s in sections:
        letter = first_letter(s["title"])
        by_letter[letter].append(s)

    grouped = OrderedDict()
    for letter in sorted(by_letter.keys()):
        bucket = by_letter[letter]
        cats = defaultdict(list)
        for s in bucket:
            cat = (s["category"] or "").strip() or "MISCELLANEOUS"
            cats[cat].append(s)

        ordered_cats = []
        pref_present = [c for c in PREFERRED_CATEGORY_ORDER if c in cats]
        others = sorted([c for c in cats.keys() if c not in PREFERRED_CATEGORY_ORDER])
        for c in pref_present + others:
            cats[c].sort(key=lambda t: t["title"].lower())
            ordered_cats.append((c, cats[c]))

        grouped[letter] = ordered_cats
    return grouped

# =========================
# PDF Rendering
# =========================

def draw_centered_underlined(c, width, y, text, font="Helvetica-Bold", size=14, underline=True):
    c.setFont(font, size)
    c.drawCentredString(width/2, y, text)
    if underline:
        w = c.stringWidth(text, font, size)
        c.line((width - w)/2, y-2, (width + w)/2, y-2)

def create_pdf(grouped, output_pdf, act_title="THE FACTORIES ACT, 1948"):
    c = canvas.Canvas(output_pdf, pagesize=A4)
    width, height = A4
    left = 50
    y = height - 50

    # Title block
    c.setFont("Helvetica-Bold", 14)
    c.drawCentredString(width/2, y, act_title); y -= 20
    c.drawCentredString(width/2, y, "__________"); y -= 20
    c.drawCentredString(width/2, y, "ARRANGEMENT OF SECTIONS"); y -= 20
    c.drawCentredString(width/2, y, "__________"); y -= 30

    c.setFont("Helvetica-Bold", 12)
    c.drawCentredString(width/2, y, "SECTIONS"); y -= 30

    # Content
    for letter, cat_list in grouped.items():
        if y < 100:
            c.showPage(); y = height - 50
        draw_centered_underlined(c, width, y, letter, font="Helvetica-Bold", size=14, underline=True)
        y -= 25

        for category, items in cat_list:
            if y < 100:
                c.showPage(); y = height - 50
            draw_centered_underlined(c, width, y, category, font="Helvetica-Bold", size=12, underline=True)
            y -= 22

            c.setFont("Helvetica", 12)
            for s in items:
                if y < 60:
                    c.showPage(); y = height - 50; c.setFont("Helvetica", 12)
                line = f"{s['num']} {s['title']}"
                c.drawString(left + 20, y, line); y -= 18
                for sub in s.get("subitems", []):
                    if y < 60:
                        c.showPage(); y = height - 50; c.setFont("Helvetica", 12)
                    c.drawString(left + 50, y, f"- {sub}")
                    y -= 18
            y -= 10
        y -= 8
    c.save()

# =========================
# Word Rendering
# =========================

def add_centered_underlined_para(doc: Document, text: str, size_pt=14, bold=True):
    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    run = p.add_run(text)
    run.bold = bold
    run.underline = True
    run.font.size = Pt(size_pt)
    return p

def create_docx(grouped, output_docx, act_title="THE FACTORIES ACT, 1948"):
    doc = Document()

    # Title block
    h = doc.add_heading(act_title, level=0)
    h.alignment = WD_ALIGN_PARAGRAPH.CENTER

    p = doc.add_paragraph("__________"); p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p = doc.add_paragraph("ARRANGEMENT OF SECTIONS"); p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p = doc.add_paragraph("__________"); p.alignment = WD_ALIGN_PARAGRAPH.CENTER

    p = doc.add_paragraph("SECTIONS")
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    if p.runs: p.runs[0].bold = True

    doc.add_paragraph("")  # spacing

    for letter, cat_list in grouped.items():
        add_centered_underlined_para(doc, letter, size_pt=14, bold=True)

        for category, items in cat_list:
            add_centered_underlined_para(doc, category, size_pt=12, bold=True)

            for s in items:
                p = doc.add_paragraph(f"{s['num']} {s['title']}")
                p.alignment = WD_ALIGN_PARAGRAPH.LEFT
                for sub in s.get("subitems", []):
                    sp = doc.add_paragraph(f"- {sub}")
                    sp.alignment = WD_ALIGN_PARAGRAPH.LEFT
                    sp.paragraph_format.left_indent = Pt(30)
            doc.add_paragraph("")  # gap after category
        doc.add_paragraph("")  # gap after letter

    doc.save(output_docx)

# =========================
# MAIN
# =========================

if __name__ == "__main__":
    input_pdf = r"C:\\Users\\viswa\\OneDrive\\Desktop\\7.pdf"
    output_pdf = r"7.pdf"
    output_docx = r"7.docx"

    MAX_INDEX_PAGES = 5 # configurable for longer indexes

    sections = extract_factories_act_index(input_pdf, max_pages=MAX_INDEX_PAGES)


In [16]:
import re
import string
from collections import defaultdict, OrderedDict
from PyPDF2 import PdfReader
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Pt

# -------------------
# Helpers
# -------------------
def normalize_space(s: str) -> str:
    return re.sub(r"\s+", " ", s).strip()

def first_letter(title: str) -> str:
    for ch in title:
        if ch.isalpha():
            return ch.upper()
    return "#"

# -------------------
# Extract sections
# -------------------
def extract_sections_from_pdf(pdf_path, max_pages=10):
    reader = PdfReader(pdf_path)
    pages_to_read = min(max_pages, len(reader.pages))
    sections = []
    current_category = None

    multi_section_split = re.compile(r"(?=\d+[A-Z]?\.|SECTION\s*S\d+\.)", re.IGNORECASE)
    num_section_re = re.compile(r"(?:SECTION\s*)?(\d+[A-Z]?|S\d+)\.\s*(.+?)\s*(?=$|\d+[A-Z]?\.|SECTION\s*S\d+)", re.IGNORECASE)
    chapter_re = re.compile(r"^\s*CHAPTER\b", re.IGNORECASE)
    allcaps_re = re.compile(r"^[A-Z0-9 ,’'`\-\.\(\)\/&]+$")

    for i in range(pages_to_read):
        page = reader.pages[i]
        text = page.extract_text() or ""
        for raw_line in text.splitlines():
            line = normalize_space(raw_line)
            if not line:
                continue
            if chapter_re.match(line):
                continue
            if allcaps_re.match(line) and any(c.isalpha() for c in line):
                current_category = line
                continue
            split_sections = multi_section_split.split(line)
            for sec in split_sections:
                sec = sec.strip()
                if not sec:
                    continue
                m = num_section_re.match(sec)
                if m:
                    num = m.group(1).strip()
                    title = m.group(2).strip().rstrip(".")
                    sections.append({"num": num, "title": title, "category": current_category, "subitems": []})
    return sections

# -------------------
# Group alphabetically & by category
# -------------------
PREFERRED_CATEGORY_ORDER = [
    "PRELIMINARY",
    "THE INSPECTING STAFF",
    "HEALTH",
    "SAFETY",
    "PROVISIONS RELATING TO HAZARDOUS PROCESSES",
    "WELFARE",
    "WORKING HOURS OF ADULTS",
    "EMPLOYMENT OF YOUNG PERSONS",
    "ANNUAL LEAVE WITH WAGES",
    "SPECIAL PROVISIONS",
    "PENALTIES AND PROCEDURE",
    "SUPPLEMENTAL",
]

def group_by_letter_and_category(sections):
    by_letter = defaultdict(list)
    for s in sections:
        letter = first_letter(s["title"])
        by_letter[letter].append(s)

    grouped = OrderedDict()
    for letter in sorted(by_letter.keys()):
        bucket = by_letter[letter]
        cats = defaultdict(list)
        for s in bucket:
            cat = (s["category"] or "").strip() or "MISCELLANEOUS"
            cats[cat].append(s)
        ordered_cats = []
        pref_present = [c for c in PREFERRED_CATEGORY_ORDER if c in cats]
        others = sorted([c for c in cats.keys() if c not in PREFERRED_CATEGORY_ORDER])
        for c in pref_present + others:
            cats[c].sort(key=lambda t: t["title"].lower())
            ordered_cats.append((c, cats[c]))
        grouped[letter] = ordered_cats
    return grouped

# -------------------
# PDF Output
# -------------------
def draw_centered_underlined(c, width, y, text, font="Helvetica-Bold", size=14, underline=True):
    c.setFont(font, size)
    c.drawCentredString(width/2, y, text)
    if underline:
        w = c.stringWidth(text, font, size)
        c.line((width - w)/2, y-2, (width + w)/2, y-2)

def create_pdf(grouped, output_pdf, act_title="THE FACTORIES ACT, 1948"):
    c = canvas.Canvas(output_pdf, pagesize=A4)
    width, height = A4
    left = 50
    y = height - 50

    # Title block
    c.setFont("Helvetica-Bold", 14)
    c.drawCentredString(width/2, y, act_title); y -= 20
    c.drawCentredString(width/2, y, "__________"); y -= 20
    c.drawCentredString(width/2, y, "ARRANGEMENT OF SECTIONS"); y -= 20
    c.drawCentredString(width/2, y, "__________"); y -= 30

    c.setFont("Helvetica-Bold", 12)
    c.drawCentredString(width/2, y, "SECTIONS"); y -= 30

    # Content
    for letter, cat_list in grouped.items():
        if y < 100:
            c.showPage(); y = height - 50
        draw_centered_underlined(c, width, y, letter, font="Helvetica-Bold", size=14, underline=True)
        y -= 25

        for category, items in cat_list:
            if y < 100:
                c.showPage(); y = height - 50
            draw_centered_underlined(c, width, y, category, font="Helvetica-Bold", size=12, underline=True)
            y -= 22

            c.setFont("Helvetica", 12)
            for s in items:
                if y < 60:
                    c.showPage(); y = height - 50; c.setFont("Helvetica", 12)
                line = f"{s['num']} {s['title']}"
                c.drawString(left + 20, y, line); y -= 18
                for sub in s.get("subitems", []):
                    if y < 60:
                        c.showPage(); y = height - 50; c.setFont("Helvetica", 12)
                    c.drawString(left + 50, y, f"- {sub}")
                    y -= 18
            y -= 10
        y -= 8
    c.save()

# -------------------
# Word Output
# -------------------
def add_centered_underlined_para(doc: Document, text: str, size_pt=14, bold=True):
    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    run = p.add_run(text)
    run.bold = bold
    run.underline = True
    run.font.size = Pt(size_pt)
    return p

def create_docx(grouped, output_docx, act_title="THE FACTORIES ACT, 1948"):
    doc = Document()

    # Title block
    h = doc.add_heading(act_title, level=0)
    h.alignment = WD_ALIGN_PARAGRAPH.CENTER

    p = doc.add_paragraph("__________"); p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p = doc.add_paragraph("ARRANGEMENT OF SECTIONS"); p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p = doc.add_paragraph("__________"); p.alignment = WD_ALIGN_PARAGRAPH.CENTER

    p = doc.add_paragraph("SECTIONS")
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    if p.runs: p.runs[0].bold = True

    doc.add_paragraph("")  # spacing

    for letter, cat_list in grouped.items():
        add_centered_underlined_para(doc, letter, size_pt=14, bold=True)

        for category, items in cat_list:
            add_centered_underlined_para(doc, category, size_pt=12, bold=True)

            for s in items:
                p = doc.add_paragraph(f"{s['num']} {s['title']}")
                p.alignment = WD_ALIGN_PARAGRAPH.LEFT
                for sub in s.get("subitems", []):
                    sp = doc.add_paragraph(f"- {sub}")
                    sp.alignment = WD_ALIGN_PARAGRAPH.LEFT
                    sp.paragraph_format.left_indent = Pt(30)
            doc.add_paragraph("")  # gap after category
        doc.add_paragraph("")  # gap after letter

    doc.save(output_docx)

# -------------------
# Main
# -------------------
if __name__ == "__main__":
    input_pdf = r"C:\\Users\\viswa\\OneDrive\\Desktop\\7.pdf"
    output_pdf = r"7.pdf"
    output_docx = r"7.docx"

    MAX_INDEX_PAGES = 5  # read first N pages of index

    sections = extract_sections_from_pdf(input_pdf, max_pages=MAX_INDEX_PAGES)
    grouped = group_by_letter_and_category(sections)
    create_pdf(grouped, output_pdf)
    create_docx(grouped, output_docx)

    print("✅ PDF and Word alphabetical index generated successfully!")


✅ PDF and Word alphabetical index generated successfully!


In [17]:
import re
import string
from collections import defaultdict, OrderedDict
from PyPDF2 import PdfReader
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Pt

# -------------------
# Helpers
# -------------------
def normalize_space(s: str) -> str:
    return re.sub(r"\s+", " ", s).strip()

def first_letter(title: str) -> str:
    for ch in title:
        if ch.isalpha():
            return ch.upper()
    return "#"

# -------------------
# Extract sections
# -------------------
def extract_sections_from_pdf(pdf_path, max_pages=10):
    """
    Extract sections with their chapter/category from PDF.
    Ignores page header 'SECTIONS'.
    Includes subtopics if formatted under a main section.
    """
    reader = PdfReader(pdf_path)
    pages_to_read = min(max_pages, len(reader.pages))
    sections = []
    current_category = None
    last_section = None

    multi_section_split = re.compile(r"(?=\d+[A-Z]?\.|SECTION\s*S\d+\.)", re.IGNORECASE)
    num_section_re = re.compile(r"(?:SECTION\s*)?(\d+[A-Z]?|S\d+)\.\s*(.+?)\s*(?=$|\d+[A-Z]?\.|SECTION\s*S\d+)", re.IGNORECASE)
    chapter_re = re.compile(r"^\s*CHAPTER\b", re.IGNORECASE)
    allcaps_re = re.compile(r"^[A-Z0-9 ,’'`\-\.\(\)\/&]+$")

    for i in range(pages_to_read):
        page = reader.pages[i]
        text = page.extract_text() or ""
        for raw_line in text.splitlines():
            line = normalize_space(raw_line)
            if not line:
                continue

            # Skip page header "SECTIONS"
            if line.upper() == "SECTIONS":
                continue

            # Detect chapter heading and ignore "CHAPTER IV" etc.
            if chapter_re.match(line):
                continue

            # Detect category heading (all uppercase, not numeric)
            if allcaps_re.match(line) and any(c.isalpha() for c in line):
                current_category = line
                continue

            # Check for subtopics under last section (like under "REPEAL AND SAVINGS")
            if last_section and line.startswith("-"):
                last_section["subitems"].append(line.lstrip("- ").strip())
                continue

            # Split multiple sections in one line
            split_sections = multi_section_split.split(line)
            for sec in split_sections:
                sec = sec.strip()
                if not sec:
                    continue
                m = num_section_re.match(sec)
                if m:
                    num = m.group(1).strip()
                    title = m.group(2).strip().rstrip(".")
                    section_obj = {
                        "num": num,
                        "title": title,
                        "category": current_category or "MISCELLANEOUS",
                        "subitems": []
                    }
                    sections.append(section_obj)
                    last_section = section_obj
    return sections

# -------------------
# Group alphabetically & by category
# -------------------
PREFERRED_CATEGORY_ORDER = [
    "PRELIMINARY",
    "THE INSPECTING STAFF",
    "HEALTH",
    "SAFETY",
    "PROVISIONS RELATING TO HAZARDOUS PROCESSES",
    "WELFARE",
    "WORKING HOURS OF ADULTS",
    "EMPLOYMENT OF YOUNG PERSONS",
    "ANNUAL LEAVE WITH WAGES",
    "SPECIAL PROVISIONS",
    "PENALTIES AND PROCEDURE",
    "SUPPLEMENTAL",
]

def group_by_letter_and_category(sections):
    by_letter = defaultdict(list)
    for s in sections:
        letter = first_letter(s["title"])
        by_letter[letter].append(s)

    grouped = OrderedDict()
    for letter in sorted(by_letter.keys()):
        bucket = by_letter[letter]
        cats = defaultdict(list)
        for s in bucket:
            cat = (s["category"] or "").strip() or "MISCELLANEOUS"
            cats[cat].append(s)
        ordered_cats = []
        pref_present = [c for c in PREFERRED_CATEGORY_ORDER if c in cats]
        others = sorted([c for c in cats.keys() if c not in PREFERRED_CATEGORY_ORDER])
        for c in pref_present + others:
            cats[c].sort(key=lambda t: t["title"].lower())
            ordered_cats.append((c, cats[c]))
        grouped[letter] = ordered_cats
    return grouped

# -------------------
# PDF Output
# -------------------
def draw_centered_underlined(c, width, y, text, font="Helvetica-Bold", size=14, underline=True):
    c.setFont(font, size)
    c.drawCentredString(width/2, y, text)
    if underline:
        w = c.stringWidth(text, font, size)
        c.line((width - w)/2, y-2, (width + w)/2, y-2)

def create_pdf(grouped, output_pdf, act_title="THE FACTORIES ACT, 1948"):
    c = canvas.Canvas(output_pdf, pagesize=A4)
    width, height = A4
    left = 50
    y = height - 50

    # Title block
    c.setFont("Helvetica-Bold", 14)
    c.drawCentredString(width/2, y, act_title); y -= 20
    c.drawCentredString(width/2, y, "__________"); y -= 20
    c.drawCentredString(width/2, y, "ARRANGEMENT OF SECTIONS"); y -= 20
    c.drawCentredString(width/2, y, "__________"); y -= 30

    c.setFont("Helvetica-Bold", 12)
    c.drawCentredString(width/2, y, "SECTIONS"); y -= 30

    # Content
    for letter, cat_list in grouped.items():
        if y < 100:
            c.showPage(); y = height - 50
        draw_centered_underlined(c, width, y, letter, font="Helvetica-Bold", size=14, underline=True)
        y -= 25

        for category, items in cat_list:
            if y < 100:
                c.showPage(); y = height - 50
            draw_centered_underlined(c, width, y, category, font="Helvetica-Bold", size=12, underline=True)
            y -= 22

            c.setFont("Helvetica", 12)
            for s in items:
                if y < 60:
                    c.showPage(); y = height - 50; c.setFont("Helvetica", 12)
                line = f"{s['num']} {s['title']}"
                c.drawString(left + 20, y, line); y -= 18
                for sub in s.get("subitems", []):
                    if y < 60:
                        c.showPage(); y = height - 50; c.setFont("Helvetica", 12)
                    c.drawString(left + 50, y, f"- {sub}")
                    y -= 18
            y -= 10
        y -= 8
    c.save()

# -------------------
# Word Output
# -------------------
def add_centered_underlined_para(doc: Document, text: str, size_pt=14, bold=True):
    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    run = p.add_run(text)
    run.bold = bold
    run.underline = True
    run.font.size = Pt(size_pt)
    return p

def create_docx(grouped, output_docx, act_title="THE FACTORIES ACT, 1948"):
    doc = Document()

    # Title block
    h = doc.add_heading(act_title, level=0)
    h.alignment = WD_ALIGN_PARAGRAPH.CENTER

    p = doc.add_paragraph("__________"); p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p = doc.add_paragraph("ARRANGEMENT OF SECTIONS"); p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p = doc.add_paragraph("__________"); p.alignment = WD_ALIGN_PARAGRAPH.CENTER

    p = doc.add_paragraph("SECTIONS")
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    if p.runs: p.runs[0].bold = True

    doc.add_paragraph("")  # spacing

    for letter, cat_list in grouped.items():
        add_centered_underlined_para(doc, letter, size_pt=14, bold=True)

        for category, items in cat_list:
            add_centered_underlined_para(doc, category, size_pt=12, bold=True)

            for s in items:
                p = doc.add_paragraph(f"{s['num']} {s['title']}")
                p.alignment = WD_ALIGN_PARAGRAPH.LEFT
                for sub in s.get("subitems", []):
                    sp = doc.add_paragraph(f"- {sub}")
                    sp.alignment = WD_ALIGN_PARAGRAPH.LEFT
                    sp.paragraph_format.left_indent = Pt(30)
            doc.add_paragraph("")  # gap after category
        doc.add_paragraph("")  # gap after letter

    doc.save(output_docx)

# -------------------
# Main
# -------------------
if __name__ == "__main__":
    input_pdf = r"C:\\Users\\viswa\\OneDrive\\Desktop\\7.pdf"
    output_pdf = r"7.pdf"
    output_docx = r"7.docx"

    sections = extract_sections_from_pdf(input_pdf, max_pages=10)
    grouped = group_by_letter_and_category(sections)

    create_pdf(grouped, output_pdf)
    create_docx(grouped, output_docx)

    print("✅ Alphabetical index created with proper chapters and subtopics.")
    print("PDF:", output_pdf)
    print("Word:", output_docx)


✅ Alphabetical index created with proper chapters and subtopics.
PDF: 7.pdf
Word: 7.docx


In [20]:
import re
import string
from collections import defaultdict, OrderedDict
from PyPDF2 import PdfReader
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Pt

# -------------------
# Helpers
# -------------------
def normalize_space(s: str) -> str:
    return re.sub(r"\s+", " ", s).strip()

def first_letter(title: str) -> str:
    for ch in title:
        if ch.isalpha():
            return ch.upper()
    return "#"

# -------------------
# Extract sections
# -------------------
def extract_sections_from_pdf(pdf_path, max_pages=10):
    """
    Extract sections with their chapter/category from PDF.
    Ignores page header 'SECTIONS'.
    Includes subtopics if formatted under a main section.
    """
    reader = PdfReader(pdf_path)
    pages_to_read = min(max_pages, len(reader.pages))
    sections = []
    current_category = None
    last_section = None

    # Match multiple sections in one line
    multi_section_split = re.compile(r"(?=\d+[A-Z]?\.|S\d+[A-Z]?\.)", re.IGNORECASE)
    # Capture original number with dot (7., 36A., S6.)
    num_section_re = re.compile(r"(\d+[A-Z]?\.|S\d+[A-Z]?\.?)\s*(.+)", re.IGNORECASE)
    chapter_re = re.compile(r"^\s*CHAPTER\b", re.IGNORECASE)
    allcaps_re = re.compile(r"^[A-Z0-9 ,’'`\-\.\(\)\/&]+$")

    for i in range(pages_to_read):
        page = reader.pages[i]
        text = page.extract_text() or ""
        for raw_line in text.splitlines():
            line = normalize_space(raw_line)
            if not line:
                continue

            # Skip page header "SECTIONS"
            if line.upper() == "SECTIONS":
                continue

            # Skip lines like "SECTION" alone
            if re.match(r"^SECTION\s*$", line, re.IGNORECASE):
                continue

            # Detect chapter heading and ignore "CHAPTER IV" etc.
            if chapter_re.match(line):
                continue

            # Detect category heading (all uppercase, not numeric)
            if allcaps_re.match(line) and any(c.isalpha() for c in line):
                current_category = line
                continue

            # Check for subtopics under last section (like under "REPEAL AND SAVINGS")
            if last_section and line.startswith("-"):
                last_section["subitems"].append(line.lstrip("- ").strip())
                continue

            # Split multiple sections in one line
            split_sections = multi_section_split.split(line)
            for sec in split_sections:
                sec = sec.strip()
                if not sec:
                    continue
                m = num_section_re.match(sec)
                if m:
                    num = m.group(1).strip()  # preserve original number with dot
                    title = m.group(2).strip().rstrip(".")
                    section_obj = {
                        "num": num,
                        "title": title,
                        "category": current_category or "MISCELLANEOUS",
                        "subitems": []
                    }
                    sections.append(section_obj)
                    last_section = section_obj
    return sections

# -------------------
# Group alphabetically & by category
# -------------------
PREFERRED_CATEGORY_ORDER = [
    "PRELIMINARY",
    "THE INSPECTING STAFF",
    "HEALTH",
    "SAFETY",
    "PROVISIONS RELATING TO HAZARDOUS PROCESSES",
    "WELFARE",
    "WORKING HOURS OF ADULTS",
    "EMPLOYMENT OF YOUNG PERSONS",
    "ANNUAL LEAVE WITH WAGES",
    "SPECIAL PROVISIONS",
    "PENALTIES AND PROCEDURE",
    "SUPPLEMENTAL",
]

def group_by_letter_and_category(sections):
    by_letter = defaultdict(list)
    for s in sections:
        letter = first_letter(s["title"])
        by_letter[letter].append(s)

    grouped = OrderedDict()
    for letter in sorted(by_letter.keys()):
        bucket = by_letter[letter]
        cats = defaultdict(list)
        for s in bucket:
            cat = (s["category"] or "").strip() or "MISCELLANEOUS"
            cats[cat].append(s)
        ordered_cats = []
        pref_present = [c for c in PREFERRED_CATEGORY_ORDER if c in cats]
        others = sorted([c for c in cats.keys() if c not in PREFERRED_CATEGORY_ORDER])
        for c in pref_present + others:
            cats[c].sort(key=lambda t: t["title"].lower())
            ordered_cats.append((c, cats[c]))
        grouped[letter] = ordered_cats
    return grouped

# -------------------
# PDF Output
# -------------------
def draw_centered_underlined(c, width, y, text, font="Helvetica-Bold", size=14, underline=True):
    c.setFont(font, size)
    c.drawCentredString(width/2, y, text)
    if underline:
        w = c.stringWidth(text, font, size)
        c.line((width - w)/2, y-2, (width + w)/2, y-2)

def create_pdf(grouped, output_pdf, act_title="THE FACTORIES ACT, 1948"):
    c = canvas.Canvas(output_pdf, pagesize=A4)
    width, height = A4
    left = 50
    y = height - 50

    # Title block
    c.setFont("Helvetica-Bold", 14)
    c.drawCentredString(width/2, y, act_title); y -= 20
    c.drawCentredString(width/2, y, "__________"); y -= 20
    c.drawCentredString(width/2, y, "ARRANGEMENT OF SECTIONS"); y -= 20
    c.drawCentredString(width/2, y, "__________"); y -= 30

    c.setFont("Helvetica-Bold", 12)
    c.drawCentredString(width/2, y, "SECTIONS"); y -= 30

    # Content
    for letter, cat_list in grouped.items():
        if y < 100:
            c.showPage(); y = height - 50
        draw_centered_underlined(c, width, y, letter, font="Helvetica-Bold", size=14, underline=True)
        y -= 25

        for category, items in cat_list:
            if y < 100:
                c.showPage(); y = height - 50
            draw_centered_underlined(c, width, y, category, font="Helvetica-Bold", size=12, underline=True)
            y -= 22

            c.setFont("Helvetica", 12)
            for s in items:
                if y < 60:
                    c.showPage(); y = height - 50; c.setFont("Helvetica", 12)
                line = f"{s['num']} {s['title']}"  # preserve original numbering with dot
                c.drawString(left + 20, y, line); y -= 18
                for sub in s.get("subitems", []):
                    if y < 60:
                        c.showPage(); y = height - 50; c.setFont("Helvetica", 12)
                    c.drawString(left + 50, y, f"- {sub}")
                    y -= 18
            y -= 10
        y -= 8
    c.save()

# -------------------
# Word Output
# -------------------
def add_centered_underlined_para(doc: Document, text: str, size_pt=14, bold=True):
    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    run = p.add_run(text)
    run.bold = bold
    run.underline = True
    run.font.size = Pt(size_pt)
    return p

def create_docx(grouped, output_docx, act_title="THE FACTORIES ACT, 1948"):
    doc = Document()

    # Title block
    h = doc.add_heading(act_title, level=0)
    h.alignment = WD_ALIGN_PARAGRAPH.CENTER

    p = doc.add_paragraph("__________"); p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p = doc.add_paragraph("ARRANGEMENT OF SECTIONS"); p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p = doc.add_paragraph("__________"); p.alignment = WD_ALIGN_PARAGRAPH.CENTER

    p = doc.add_paragraph("SECTIONS")
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    if p.runs: p.runs[0].bold = True

    doc.add_paragraph("")  # spacing

    for letter, cat_list in grouped.items():
        add_centered_underlined_para(doc, letter, size_pt=14, bold=True)

        for category, items in cat_list:
            add_centered_underlined_para(doc, category, size_pt=12, bold=True)

            for s in items:
                p = doc.add_paragraph(f"{s['num']} {s['title']}")  # preserve numbering
                p.alignment = WD_ALIGN_PARAGRAPH.LEFT
                for sub in s.get("subitems", []):
                    sp = doc.add_paragraph(f"- {sub}")
                    sp.alignment = WD_ALIGN_PARAGRAPH.LEFT
                    sp.paragraph_format.left_indent = Pt(30)
            doc.add_paragraph("")  # gap after category
        doc.add_paragraph("")  # gap after letter

    doc.save(output_docx)

# -------------------
# Main
# -------------------
if __name__ == "__main__":
    input_pdf = r"C:\\Users\\viswa\\OneDrive\\Desktop\\7.pdf"  # your source PDF
    output_pdf = r"7.pdf"
    output_docx = r"7.docx"

    sections = extract_sections_from_pdf(input_pdf, max_pages=5)
    grouped = group_by_letter_and_category(sections)

    create_pdf(grouped, output_pdf)
    create_docx(grouped, output_docx)

    print("✅ Alphabetical index created with original numbering, chapters, and subtopics.")
    print("PDF:", output_pdf)
    print("Word:", output_docx)


✅ Alphabetical index created with original numbering, chapters, and subtopics.
PDF: 7.pdf
Word: 7.docx


In [28]:
import re
from PyPDF2 import PdfReader
from fpdf import FPDF
from docx import Document

# --- Step 1: Read text from PDF ---
pdf_path = "C:\\Users\\viswa\\OneDrive\\Desktop\\RV\\4.pdf"  # Replace with your PDF file path
reader = PdfReader(pdf_path)

lines = []
for page in reader.pages:
    text = page.extract_text()
    if text:
        # Split text into lines
        lines.extend([line.strip() for line in text.split('\n') if line.strip()])

# --- Step 2: Define sorting key ---
def extract_section_key(line):
    match = re.match(r'(\d+)([A-Z]*)\.', line)
    if match:
        num_part = int(match.group(1))
        alpha_part = match.group(2)
        return (num_part, alpha_part)
    else:
        return (float('inf'), '')  # Lines without numbers go to the end

# --- Step 3: Sort lines ---
sorted_lines = sorted(lines, key=extract_section_key)

# --- Step 4: Save to DOCX ---
doc = Document()
for line in sorted_lines:
    doc.add_paragraph(line)
docx_path = "sorted_output.docx"
doc.save(docx_path)
print(f"SORTED DOCX saved at: {docx_path}")

# --- Step 5: Save to PDF ---
pdf = FPDF()
pdf.set_auto_page_break(auto=True, margin=15)
pdf.add_page()
pdf.set_font("Arial", size=12)
for line in sorted_lines:
    pdf.multi_cell(0, 8, line)
pdf_path = "4.pdf"
pdf.output(pdf_path)
print(f"SORTED PDF saved at: {pdf_path}")


SORTED DOCX saved at: sorted_output.docx


  pdf.set_font("Arial", size=12)


FPDFException: Not enough horizontal space to render a single character

In [30]:
import re
from PyPDF2 import PdfReader
from fpdf import FPDF
from docx import Document

# --- Step 1: Read text from PDF ---
pdf_path = "C:\\Users\\viswa\\OneDrive\\Desktop\\RV\\4.pdf"  # Replace with your PDF file path
reader = PdfReader(pdf_path)

lines = []
for page in reader.pages:
    text = page.extract_text()
    if text:
        lines.extend([line.strip() for line in text.split('\n') if line.strip()])

# --- Step 2: Define sorting key ---
def extract_section_key(line):
    # Matches lines like 2., 2A., 7C., 25B.
    match = re.match(r'(\d+)([A-Z]*)\.', line)
    if match:
        num_part = int(match.group(1))
        alpha_part = match.group(2)
        return (num_part, alpha_part)
    else:
        return (float('inf'), '')  # Lines without numbers go to end

# --- Step 3: Sort lines ---
sorted_lines = sorted(lines, key=extract_section_key)

# --- Step 4: Clean lines for PDF ---
def clean_line(line):
    line = re.sub(r'\s+', ' ', line)  # Replace multiple spaces/tabs/newlines
    return str(line)

cleaned_lines = [clean_line(line) for line in sorted_lines]

# --- Step 5: Save to DOCX ---
doc = Document()
for line in cleaned_lines:
    doc.add_paragraph(line)
docx_path = "sorted_output.docx"
doc.save(docx_path)
print(f"SORTED DOCX saved at: {docx_path}")

# --- Step 6: Save to PDF ---
pdf = FPDF(orientation='P', unit='mm', format='A4')
pdf.set_auto_page_break(auto=True, margin=15)
pdf.add_page()
pdf.set_left_margin(15)
pdf.set_right_margin(15)
pdf.set_font("Helvetica", size=12)


for line in cleaned_lines:
    # Split long lines to prevent "not enough horizontal space" error
    max_chars = 120  # Approximate characters per line
    while len(line) > max_chars:
        pdf.multi_cell(0, 8, line[:max_chars])
        line = line[max_chars:]
    pdf.multi_cell(0, 8, line)

pdf_path = "sorted_output.pdf"
pdf.output(pdf_path)
print(f"SORTED PDF saved at: {pdf_path}")


SORTED DOCX saved at: sorted_output.docx


FPDFException: Not enough horizontal space to render a single character

In [44]:
import re
import string
from PyPDF2 import PdfReader
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Pt

# ------------------ Helper: Extract numeric & alpha parts ------------------ #
def extract_section_key(num_str):
    """
    Converts a section like '2', '2A', '7C', '25B' into a sortable key.
    Returns (numeric part, alphabetical part).
    """
    match = re.match(r'(\d+)([A-Z]*)', num_str)
    if match:
        num = int(match.group(1))
        alpha = match.group(2)
        return (num, alpha)
    else:
        return (float('inf'), '')  # Non-numbered sections go last

# ------------------ Extract Index from first pages ------------------ #
def extract_index_from_first_pages(pdf_path, max_pages=4):
    """Extract section numbers and titles from first N pages."""
    reader = PdfReader(pdf_path)
    topics = []

    for i in range(min(max_pages, len(reader.pages))):
        page = reader.pages[i]
        text = page.extract_text()
        if not text:
            continue
        for line in text.splitlines():
            # Match lines like "12. Something" or "5A. Something"
            match = re.match(r"^\s*(\d+[A-Z]?\.)\s*(.*)", line)
            if match:
                num = match.group(1).strip()
                title = match.group(2).strip()
                topics.append((num, title))
    return topics

# ------------------ Create PDF ------------------ #
def create_alphabetical_index_pdf(topics, output_pdf):
    """Create alphabetical index PDF sorted numerically & grouped alphabetically."""
    c = canvas.Canvas(output_pdf, pagesize=A4)
    width, height = A4
    x, y = 50, height - 80

    # --- Title ---
    c.setFont("Helvetica-Bold", 14)
    c.drawCentredString(width/2, height - 50, "THE PAYMENT OF BONUS ACT, 1965")
    c.drawCentredString(width/2, height - 70, "__________")
    c.drawCentredString(width/2, height - 90, "ARRANGEMENT OF SECTIONS")
    c.drawCentredString(width/2, height - 110, "__________")

    y = height - 150
    c.setFont("Helvetica-Bold", 12)
    c.drawCentredString(width/2, y, "SECTIONS")
    y -= 30

    # Group by first letter of title
    grouped = {}
    for num, title in topics:
        first_char = title[0].upper()
        if first_char not in string.ascii_uppercase:
            first_char = "#"
        grouped.setdefault(first_char, []).append((num, title))

    # Iterate groups in alphabetical order
    for letter in sorted(grouped.keys()):
        if y < 100:
            c.showPage()
            y = height - 80

        # --- Centered Letter Heading ---
        c.setFont("Helvetica-Bold", 14)
        c.drawCentredString(width/2, y, letter)
        text_width = c.stringWidth(letter, "Helvetica-Bold", 14)
        c.line((width - text_width)/2, y-2, (width + text_width)/2, y-2)
        y -= 25

        # --- Topics under group (sorted numerically) ---
        c.setFont("Helvetica", 12)
        for num, title in sorted(grouped[letter], key=lambda s: extract_section_key(s[0])):
            if y < 50:
                c.showPage()
                y = height - 80
            # Clean line
            line = re.sub(r'\s+', ' ', f"{num} {title}")
            max_chars = 120
            while len(line) > max_chars:
                c.drawString(x + 20, y, line[:max_chars])
                line = line[max_chars:]
                y -= 20
            c.drawString(x + 20, y, line)
            y -= 20
        y -= 10

    c.save()

# ------------------ Create Word ------------------ #
def create_alphabetical_index_docx(topics, output_docx):
    """Create alphabetical index Word doc sorted numerically & grouped alphabetically."""
    doc = Document()

    # --- Title ---
    title = doc.add_heading("THE PAYMENT OF BONUS ACT, 1965", level=0)
    title.alignment = WD_ALIGN_PARAGRAPH.CENTER

    underline = doc.add_paragraph("__________")
    underline.alignment = WD_ALIGN_PARAGRAPH.CENTER

    arr = doc.add_paragraph("ARRANGEMENT OF SECTIONS")
    arr.alignment = WD_ALIGN_PARAGRAPH.CENTER

    underline2 = doc.add_paragraph("__________")
    underline2.alignment = WD_ALIGN_PARAGRAPH.CENTER

    doc.add_paragraph("")  # spacing

    # --- Sections Heading ---
    sections = doc.add_paragraph("SECTIONS")
    sections.alignment = WD_ALIGN_PARAGRAPH.CENTER
    sections.runs[0].bold = True

    # Group topics by first letter
    grouped = {}
    for num, title in topics:
        first_char = title[0].upper()
        if first_char not in string.ascii_uppercase:
            first_char = "#"
        grouped.setdefault(first_char, []).append((num, title))

    for letter in sorted(grouped.keys()):
        # --- Centered Letter Heading ---
        para = doc.add_paragraph(letter)
        para.alignment = WD_ALIGN_PARAGRAPH.CENTER
        run = para.runs[0]
        run.bold = True
        run.font.size = Pt(14)
        run.underline = True

        doc.add_paragraph("")  # spacing

        # --- Topics under each letter (sorted numerically) ---
        for num, title in sorted(grouped[letter], key=lambda s: extract_section_key(s[0])):
            p = doc.add_paragraph(f"{num} {title}")
            p.alignment = WD_ALIGN_PARAGRAPH.LEFT

    doc.save(output_docx)

# ------------------ MAIN ------------------ #
if __name__ == "__main__":
    input_pdf = "C:\\Users\\viswa\\OneDrive\\Desktop\\14.pdf"   # Input PDF
    output_pdf = "14.pdf"                               # Output PDF
    output_docx = "14.docx"                             # Output Word

    # Extract topics
    topics = extract_index_from_first_pages(input_pdf, max_pages=2)

    # Create PDF & Word
    create_alphabetical_index_pdf(topics, output_pdf)
    create_alphabetical_index_docx(topics, output_docx)

    print("✅ Index created successfully:")
    print("PDF:", output_pdf)
    print("Word:", output_docx)


✅ Index created successfully:
PDF: 14.pdf
Word: 14.docx
