In [10]:
import fitz  # PyMuPDF
from deep_translator import GoogleTranslator

# Translate long text in chunks
def translate_text_chunked(text, translator, max_len=5000):
    """Translate ``text`` with ``translator``, chunking input longer than ``max_len``.

    Text of at most ``max_len`` characters is sent in a single
    ``translator.translate`` call. Longer text is cut into pieces of at most
    ``max_len`` characters. Each cut is moved back to the closest preceding
    whitespace so that a word is not sent in two halves; a single word longer
    than ``max_len`` is still hard-cut. Translated pieces are joined with one
    space.

    Args:
        text: The source string. ``None``, empty or whitespace-only input
            yields ``""`` without calling the translator.
        translator: Any object exposing ``translate(str) -> str``.
        max_len: Maximum number of characters per request.

    Returns:
        The translated string (whatever the translator returned for short
        input; the space-joined chunk translations for long input).

    Raises:
        ValueError: If chunking is required and ``max_len`` is smaller than 1
            (the loop below could never advance).
    """
    if not text or not text.strip():
        return ""
    if len(text) <= max_len:
        return translator.translate(text)
    if max_len < 1:
        raise ValueError("max_len must be a positive integer")

    translated_chunks = []
    total = len(text)
    start = 0
    while start < total:
        end = min(start + max_len, total)
        # Only look for a better cut when the hard cut lands inside a word.
        if end < total and not text[end].isspace():
            for cut in range(end, start, -1):
                if text[cut - 1].isspace():
                    end = cut
                    break
        chunk = text[start:end].strip()
        if chunk:
            translated = translator.translate(chunk)
            # Skip empty/None results so the final join cannot fail.
            if translated:
                translated_chunks.append(translated)
        start = end

    return " ".join(translated_chunks)

# Translation cache
# Maps (source language, target language, source text) -> translated text.
translation_cache = {}


def translate_with_cache(text, translator):
    """Translate ``text`` once per language pair and reuse the result afterwards.

    The cache key includes the translator's ``source`` and ``target``
    attributes (read with ``getattr``; ``None`` when the translator does not
    expose them), so a translation produced for one language pair is not
    returned for another pair later in the same kernel session.

    Args:
        text: The string to translate.
        translator: Object passed through to ``translate_text_chunked``.

    Returns:
        The cached or freshly computed translation.
    """
    cache_key = (
        getattr(translator, "source", None),
        getattr(translator, "target", None),
        text,
    )
    if cache_key in translation_cache:
        return translation_cache[cache_key]
    translated = translate_text_chunked(text, translator)
    translation_cache[cache_key] = translated
    return translated

# Extract all text spans (including tables) with positions
def extract_text_spans_with_positions(file_bytes):
    """Open a PDF from raw bytes and gather its non-blank text spans page by page.

    Args:
        file_bytes: The PDF file content as bytes.

    Returns:
        A tuple ``(pages_data, doc)``. ``pages_data`` holds one list per page;
        each entry is a dict with the stripped span ``"text"`` and the span's
        ``"bbox"``. ``doc`` is the document opened here and is left open for
        the caller.
    """
    doc = fitz.open(stream=file_bytes, filetype="pdf")
    pages_data = []

    for page in doc:
        page_blocks = page.get_text("dict")["blocks"]
        # Blocks without a "lines" entry carry no spans and are skipped.
        stripped_spans = (
            (span["text"].strip(), span["bbox"])
            for block in page_blocks
            if "lines" in block
            for line in block["lines"]
            for span in line["spans"]
        )
        pages_data.append(
            [{"text": content, "bbox": bbox} for content, bbox in stripped_spans if content]
        )

    return pages_data, doc

# Translate spans
def translate_spans(pages_data, src_lang, tgt_lang):
    """Add a ``"translated"`` entry to every span dict in ``pages_data``.

    A single ``GoogleTranslator`` is built for the language pair and each
    span's ``"text"`` is run through ``translate_with_cache``. The span dicts
    are updated in place and the same ``pages_data`` object is returned.
    """
    translator = GoogleTranslator(source=src_lang, target=tgt_lang)
    every_span = (entry for page_spans in pages_data for entry in page_spans)
    for entry in every_span:
        entry["translated"] = translate_with_cache(entry["text"], translator)
    return pages_data

# Overlay translations on original PDF
def overlay_translations_on_pdf(doc, pages_data, fontname="helv", fontsize=10,
                                fontfile=None, min_fontsize=4):
    """Paint over each original span and write its translation in the same place.

    For every span the bbox is filled with white, then the translated text is
    inserted with ``Page.insert_textbox``. Per the PyMuPDF documentation that
    call writes nothing and returns a negative number when the text does not
    fit the rectangle, so the font size is reduced step by step until the text
    fits or ``min_fontsize`` is reached. If it still does not fit, the text is
    written with ``Page.insert_text`` starting at the bottom-left corner of the
    bbox so the translation is never silently dropped.

    Args:
        doc: Open PyMuPDF document; modified in place.
        pages_data: Per-page lists of dicts with ``"bbox"`` and ``"translated"``.
        fontname: Font reference name handed to PyMuPDF.
        fontsize: Preferred (starting) font size.
        fontfile: Optional path of a font file, passed through to PyMuPDF.
            NOTE(review): the built-in "helv" font is Latin-only, so scripts
            such as Devanagari need a ``fontfile`` together with a
            non-built-in ``fontname`` — confirm with the target language.
        min_fontsize: Smallest font size tried before falling back.

    Returns:
        The same ``doc`` object.
    """
    white = (1, 1, 1)
    for page_num, spans in enumerate(pages_data):
        page = doc[page_num]
        for span in spans:
            rect = fitz.Rect(span["bbox"])
            # Clear original text area with white box
            page.draw_rect(rect, color=white, fill=white)

            text = span["translated"]
            size = fontsize
            while True:
                spare_height = page.insert_textbox(
                    rect,
                    text,
                    fontname=fontname,
                    fontfile=fontfile,
                    fontsize=size,
                    align=0,
                )
                if spare_height >= 0 or size <= min_fontsize:
                    break
                size = max(min_fontsize, size - 1)

            if spare_height < 0:
                # Nothing was written above; place the text unboxed instead.
                page.insert_text(
                    fitz.Point(rect.x0, rect.y1),
                    text,
                    fontname=fontname,
                    fontfile=fontfile,
                    fontsize=size,
                )
    return doc

# Full process
def process_pdf(input_pdf_path, output_pdf_path, src_lang, tgt_lang):
    """Translate the text spans of a PDF and save the result as a new file.

    Reads the input file into memory, extracts the spans, translates them,
    overlays the translations on the opened document and saves it to
    ``output_pdf_path``. The document is closed afterwards, also when
    translation or saving raises.

    Args:
        input_pdf_path: Path of the PDF to read.
        output_pdf_path: Path the translated PDF is written to.
        src_lang: Source language passed to ``translate_spans``.
        tgt_lang: Target language passed to ``translate_spans``.
    """
    with open(input_pdf_path, "rb") as f:
        pdf_bytes = f.read()

    pages_data, doc = extract_text_spans_with_positions(pdf_bytes)
    try:
        pages_data = translate_spans(pages_data, src_lang, tgt_lang)
        translated_doc = overlay_translations_on_pdf(doc, pages_data)
        translated_doc.save(output_pdf_path)
    finally:
        # Release the document even if a step above failed.
        doc.close()
    print(f"✅ Translated PDF saved as: {output_pdf_path}")

# Example usage
if __name__ == "__main__":
    # File names and language pair for the demo run.
    input_pdf, output_pdf = "Test1.pdf", "translated_output2.pdf"
    source_lang, target_lang = "english", "hindi"

    process_pdf(input_pdf, output_pdf, source_lang, target_lang)


✅ Translated PDF saved as: translated_output2.pdf


In [21]:
# PDF Translator with Table Formatting Fix - Compatible Version
# Single file for Jupyter Notebook

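# Required packages. NOTE: the commands below sit inside a string literal, so this
# cell does NOT execute them - run them in their own cell first (prefer `%pip install`).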
"""
!pip install deep-translator
!pip install PyMuPDF
!pip install pdfplumber
"""

import fitz as pymupdf
import pdfplumber
from deep_translator import GoogleTranslator

def translate_pdf_with_tables(input_pdf_path, output_pdf_path):
    """
    Translate PDF with proper table formatting

    Non-table text blocks are translated from English to Hindi and written
    back into their own bounding boxes on an optional-content layer named
    "Hindi"; tables found by ``extract_all_tables`` are redrawn by
    ``process_table_on_page``. The document is always closed, also when a
    step raises.

    Args:
        input_pdf_path: Path of the PDF to translate.
        output_pdf_path: Path the translated PDF is saved to.
    """

    WHITE = pymupdf.pdfcolor["white"]
    textflags = pymupdf.TEXT_DEHYPHENATE
    to_hindi = GoogleTranslator(source="en", target="hi")
    start_fontsize = 12
    min_fontsize = 6  # smallest size tried when the translation does not fit

    # Open PDF
    doc = pymupdf.open(input_pdf_path)
    try:
        ocg = doc.add_ocg("Hindi", on=True)

        # Extract tables from all pages first
        print("Extracting tables...")
        all_tables = extract_all_tables(input_pdf_path)

        for page_num, page in enumerate(doc):
            print(f"Processing page {page_num + 1}...")

            # Get page tables
            page_tables = [t for t in all_tables if t['page'] == page_num]

            # Get all text blocks
            blocks = page.get_text("blocks", flags=textflags)

            # Track which blocks are part of tables (set: O(1) membership test)
            table_blocks = set()
            for table_info in page_tables:
                table_blocks.update(get_table_text_blocks(blocks, table_info['data']))

            # Process non-table blocks first
            for i, block in enumerate(blocks):
                if i in table_blocks:
                    continue  # Table blocks are handled below

                bbox = pymupdf.Rect(block[:4])
                text = block[4].strip()
                if not text:
                    continue

                try:
                    hindit = to_hindi.translate(text)
                    if not hindit:
                        # Keep the original visible rather than erasing it
                        # and writing nothing.
                        continue

                    # Clear original text
                    page.draw_rect(bbox, color=None, fill=WHITE, oc=ocg)

                    # insert_textbox writes nothing and returns a negative
                    # value when the text does not fit (per PyMuPDF docs), so
                    # retry with smaller sizes instead of leaving a blank box.
                    fontsize = start_fontsize
                    while True:
                        spare_height = page.insert_textbox(
                            bbox, hindit,
                            fontsize=fontsize,
                            color=(0, 0, 0),
                            oc=ocg
                        )
                        if spare_height >= 0 or fontsize <= min_fontsize:
                            break
                        fontsize -= 1
                    if spare_height < 0:
                        print(f"Warning: translated text did not fit block {i} on page {page_num + 1}")
                except Exception as e:
                    print(f"Translation error: {e}")

            # Process tables with proper formatting
            for table_info in page_tables:
                process_table_on_page(page, table_info, to_hindi, WHITE, ocg)

        # Save translated PDF
        doc.subset_fonts()
        doc.ez_save(output_pdf_path)
    finally:
        doc.close()

    print(f"✅ Translated PDF saved: {output_pdf_path}")

def extract_all_tables(pdf_path):
    """Extract all tables with their positions

    Uses pdfplumber's ``Page.find_tables()`` so that each table's bounding
    box is the one pdfplumber detected, instead of an estimate derived from
    the first cell's text.

    Args:
        pdf_path: Path of the PDF to scan.

    Returns:
        A list of dicts with keys ``'page'`` (0-based page index), ``'data'``
        (rows of cell strings as returned by ``Table.extract()``) and
        ``'bbox'`` (a ``pymupdf.Rect``).
    """
    all_tables = []

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            for table in page.find_tables():
                data = table.extract()
                if not data:
                    continue

                # table.bbox is (x0, top, x1, bottom) measured from the page's
                # top-left corner. NOTE(review): assumes the page origin is
                # the same for pdfplumber and PyMuPDF - confirm for PDFs with
                # unusual crop boxes.
                all_tables.append({
                    'page': page_num,
                    'data': data,
                    'bbox': pymupdf.Rect(table.bbox)
                })

    return all_tables

def find_table_bbox(page, table_data):
    """Find table bounding box by searching for cell text

    The first non-empty cell is located on the page and used as the table's
    top-left anchor; the size is estimated from the row/column count using a
    fixed 100 x 25 cell size.

    Args:
        page: Object exposing ``extract_words()`` that returns dicts with
            ``'text'``, ``'x0'`` and ``'top'`` (a pdfplumber page here).
        table_data: Rows of cell strings (cells may be ``None``).

    Returns:
        ``None`` when ``table_data`` or its first row is empty; otherwise a
        ``pymupdf.Rect``. When no anchor word is found a fixed default
        rectangle is returned.
    """
    if not table_data or not table_data[0]:
        return None

    # Look for first non-empty cell
    first_cell_text = next(
        (cell.strip() for row in table_data for cell in row if cell and cell.strip()),
        None,
    )

    if first_cell_text:
        # extract_words() returns single words (no blanks by default), so a
        # multi-word cell can only be located through its first token.
        anchor_token = first_cell_text.split()[0]
        words = page.extract_words()

        # Prefer an exact word match; fall back to the looser substring match.
        anchor = next((w for w in words if w['text'] == anchor_token), None)
        if anchor is None:
            anchor = next((w for w in words if anchor_token in w['text']), None)

        if anchor is not None:
            # Estimate table size
            rows = len(table_data)
            cols = len(table_data[0])

            # Calculate approximate table bbox
            cell_width = 100
            cell_height = 25

            x0 = anchor['x0']
            y0 = anchor['top']
            x1 = x0 + (cols * cell_width)
            y1 = y0 + (rows * cell_height)

            return pymupdf.Rect(x0, y0, x1, y1)

    # Default bbox if not found
    return pymupdf.Rect(50, 100, 500, 300)

def get_table_text_blocks(blocks, table_data):
    """Identify which text blocks belong to tables

    A block is reported when, after collapsing every run of whitespace to a
    single space, its text contains a table cell's text or is contained in
    one. Blocks whose text is empty are never reported (an empty string is a
    substring of every cell and would otherwise always match).

    Args:
        blocks: Sequence of block tuples whose element 4 is the block text.
        table_data: Rows of cell strings (cells may be ``None``).

    Returns:
        List of indices into ``blocks``, in ascending order.
    """
    def _squash(value):
        # Newlines vs. spaces differ between extractors; compare normalized text.
        return " ".join(value.split())

    # Create set of all table cell texts
    table_texts = {
        _squash(cell)
        for row in table_data
        for cell in row
        if cell and cell.strip()
    }

    table_block_indices = []
    for i, block in enumerate(blocks):
        block_text = _squash(block[4])
        if not block_text:
            continue
        # Check if block text overlaps any table cell text
        if any(cell_text in block_text or block_text in cell_text for cell_text in table_texts):
            table_block_indices.append(i)

    return table_block_indices

def _insert_cell_text(page, rect, text, fontsize, ocg, min_fontsize=4):
    """Write ``text`` into ``rect``, shrinking the font until it fits.

    ``Page.insert_textbox`` writes nothing and returns a negative number when
    the text does not fit (per the PyMuPDF documentation). The font size is
    lowered by one per attempt down to ``min_fontsize``; if the text still
    does not fit, a message is printed so the loss is visible.

    Returns:
        The last value returned by ``insert_textbox``.
    """
    size = fontsize
    while True:
        spare_height = page.insert_textbox(
            rect,
            text,
            fontsize=size,
            color=(0, 0, 0),
            align=pymupdf.TEXT_ALIGN_LEFT,
            oc=ocg
        )
        if spare_height >= 0 or size <= min_fontsize:
            break
        size -= 1
    if spare_height < 0:
        print(f"Cell text did not fit at font size {size}: {text!r}")
    return spare_height


def process_table_on_page(page, table_info, translator, WHITE, ocg):
    """Process and translate a table while maintaining formatting

    The table area is painted over and replaced by a uniform grid with one
    bordered cell per entry (grey background on the first row); each
    non-empty cell's translation is written into its grid cell.

    Args:
        page: PyMuPDF page to draw on.
        table_info: Dict with ``'data'`` (rows of cell strings) and ``'bbox'``
            (``pymupdf.Rect`` or ``None``).
        translator: Object exposing ``translate(str) -> str``.
        WHITE: Fill color used to clear the table area.
        ocg: Optional-content group reference passed to every drawing call.
    """
    table_data = table_info['data']
    table_bbox = table_info['bbox']

    if not table_data or table_bbox is None:
        return

    rows = len(table_data)
    cols = len(table_data[0])

    if rows == 0 or cols == 0:
        return

    print(f"Translating table: {rows} rows x {cols} cols")

    # Calculate cell dimensions
    cell_width = table_bbox.width / cols
    cell_height = table_bbox.height / rows

    # Ensure minimum cell height
    if cell_height < 20:
        cell_height = 20
        table_bbox = pymupdf.Rect(table_bbox.x0, table_bbox.y0, table_bbox.x1, table_bbox.y0 + (rows * cell_height))

    # Clear the table area only after the bbox has its final size, so the
    # part added by the minimum-height adjustment is cleared as well.
    page.draw_rect(table_bbox, color=None, fill=WHITE, oc=ocg)

    header_fill = (0.9, 0.9, 0.9)

    # Draw table with borders and translated text
    for row_idx, row in enumerate(table_data):
        for col_idx, cell in enumerate(row):
            # Calculate cell position
            x0 = table_bbox.x0 + (col_idx * cell_width)
            y0 = table_bbox.y0 + (row_idx * cell_height)
            x1 = x0 + cell_width
            y1 = y0 + cell_height

            cell_rect = pymupdf.Rect(x0, y0, x1, y1)

            # Border for every cell; header row additionally gets a grey fill
            page.draw_rect(
                cell_rect,
                color=(0, 0, 0),
                fill=header_fill if row_idx == 0 else None,
                width=0.8,
                oc=ocg
            )

            if not (cell and cell.strip()):
                continue

            # Text rect with padding
            text_rect = pymupdf.Rect(x0 + 3, y0 + 2, x1 - 3, y1 - 2)

            try:
                translated_cell = translator.translate(cell.strip())

                # Starting font size based on text length
                text_length = len(translated_cell)
                if text_length > 50:
                    font_size = 7
                elif text_length > 30:
                    font_size = 8
                else:
                    font_size = 9

                _insert_cell_text(page, text_rect, translated_cell, font_size, ocg)

            except Exception as e:
                print(f"Cell translation error: {e}")
                # Insert original text if translation fails
                _insert_cell_text(page, text_rect, cell, 8, ocg)

# Main function to use
def translate_pdf(input_pdf, output_pdf):
    """
    Entry point: translate a PDF, keeping tables formatted.

    Runs ``translate_pdf_with_tables`` and reports the outcome on stdout.
    Any exception is printed together with its traceback instead of being
    propagated.

    Usage:
    translate_pdf('input.pdf', 'output_hindi.pdf')

    Returns:
        True when the translation finished, False when an exception occurred.
    """
    succeeded = False
    try:
        translate_pdf_with_tables(input_pdf, output_pdf)
        print("🎉 Translation completed successfully!")
        succeeded = True
    except Exception as err:
        import traceback
        print(f"❌ Error: {err}")
        traceback.print_exc()
    return succeeded

# Simple test function
def test_translation():
    """Smoke-test Google Translator by translating one English phrase to Hindi."""
    sample = "Hello World"
    hindi = GoogleTranslator(source="en", target="hi").translate(sample)
    print(f"Test: {sample} -> {hindi}")

# Example usage
if __name__ == "__main__":
    # Banner, separator and status line, one per output line.
    print(
        "PDF Translator with Table Formatting - Compatible Version",
        "=" * 50,
        "Testing translator...",
        sep="\n",
    )
    test_translation()

    # Tell the user how to start a real translation.
    print("\nReady to translate!")
    print("Use: translate_pdf('input.pdf', 'output.pdf')")

    # Example usage:
    # translate_pdf('Testing.pdf', 'translated-hindi.pdf')

PDF Translator with Table Formatting - Compatible Version
Testing translator...
Test: Hello World -> हैलो वर्ल्ड

Ready to translate!
Use: translate_pdf('input.pdf', 'output.pdf')


In [22]:
translate_pdf('Test1.pdf', 'translated_hindi.pdf')

Extracting tables...
Processing page 1...
Translating table: 6 rows x 4 cols
Processing page 2...
Translating table: 4 rows x 4 cols
✅ Translated PDF saved: translated_hindi.pdf
🎉 Translation completed successfully!


True