In [10]:
import pymupdf as fitz 
from typing import List, Dict

def validate_pdf(file_bytes: bytes) -> bool:
    print("Validating PDF...")
    fitz.open(stream=file_bytes, filetype="pdf").close()
    print("PDF is valid.")
    return True


with open("Testing.pdf", "rb") as f:
    file_bytes = f.read()

validate_pdf(file_bytes)

Validating PDF...
PDF is valid.


True

In [11]:
def extract_text_blocks(file_bytes: bytes):
    print("Extracting text blocks from PDF...")
    doc = fitz.open(stream=file_bytes, filetype="pdf")
    blocks_info = []
    for page_num, page in enumerate(doc):
        print(f"Processing page {page_num + 1}/{len(doc)}")
        for block in page.get_text("blocks", flags=fitz.TEXT_DEHYPHENATE):
            if len(block) >= 5 and block[4].strip():
                blocks_info.append({
                    'page': page_num,
                    'text': block[4],
                    'bbox': block[:4],
                    'block_type': block[5] if len(block) > 5 else 0,
                    'block_no': block[6] if len(block) > 6 else 0
                })
    doc.close()
    print(f"Total text blocks extracted: {len(blocks_info)}")
    return blocks_info

extract_text_blocks(file_bytes)

Extracting text blocks from PDF...
Processing page 1/2
Processing page 2/2
Total text blocks extracted: 39


[{'page': 0,
  'text': '📌 Task Overview \n',
  'bbox': (72.0, 100.6922607421875, 185.50848388671875, 115.9647216796875),
  'block_type': 1,
  'block_no': 0},
 {'page': 0,
  'text': 'Build a web application that can translate PDF files between Hindi and English, in both \ndirections. The application should allow users to: \n',
  'bbox': (72.0, 130.09716796875, 514.901123046875, 156.9324951171875),
  'block_type': 2,
  'block_no': 0},
 {'page': 0,
  'text': '●\u200b Upload a PDF\u200b\n',
  'bbox': (90.0, 171.189697265625, 180.09730529785156, 183.478759765625),
  'block_type': 3,
  'block_no': 0},
 {'page': 0,
  'text': ' \n●\u200b Choose translation direction (Hindi → English or English → Hindi)\u200b\n',
  'bbox': (90.0, 185.7359619140625, 429.97637939453125, 212.5712890625),
  'block_type': 4,
  'block_no': 0},
 {'page': 0,
  'text': ' \n●\u200b Download a translated PDF that retains the original structure\u200b\n',
  'bbox': (90.0, 214.8284912109375, 405.55523681640625, 241.663818359

In [12]:
def extract_simple_text(file_bytes: bytes) -> str:
    print("Extracting simple text from PDF...")
    doc = fitz.open(stream=file_bytes, filetype="pdf")
    text = "\n\n".join(page.get_text() for page in doc)
    doc.close()
    print(f"Total characters extracted: {len(text)}")
    return text.strip()

extract_simple_text(file_bytes)

Extracting simple text from PDF...
Total characters extracted: 2059


"📌 Task Overview \nBuild a web application that can translate PDF files between Hindi and English, in both \ndirections. The application should allow users to: \n●\u200b Upload a PDF\u200b\n \n●\u200b Choose translation direction (Hindi → English or English → Hindi)\u200b\n \n●\u200b Download a translated PDF that retains the original structure\u200b\n \n \n✅ Functional Requirements \n🔁 Language Translation \n●\u200b The app must support two-way translation: Hindi ↔ English\u200b\n \n●\u200b Translation should be intelligent:\u200b\n \n○\u200b Do not translate abbreviations or acronyms (e.g., “AI”, “NASA”)\u200b\n \n○\u200b Skip translation of fully capitalized words (e.g., “PDF”, “ML”)\u200b\n \n○\u200b Ensure contextual accuracy\u200b\n \n📄 PDF Handling \n●\u200b Input PDFs may include text, tables, images, and complex formatting\u200b\n \n●\u200b Output PDF must preserve:\u200b\n \n○\u200b Original fonts and formatting\u200b\n \n○\u200b Page structure\u200b\n \n○\u200b Tables and im

In [13]:
def get_pdf_info(file_bytes: bytes):
    print("Getting PDF metadata and info...")
    doc = fitz.open(stream=file_bytes, filetype="pdf")
    rect = doc[0].rect if doc else None
    info = {
        'page_count': len(doc),
        'metadata': doc.metadata,
        'is_encrypted': doc.is_encrypted,
        'size_bytes': len(file_bytes),
        'page_width': rect.width if rect else None,
        'page_height': rect.height if rect else None,
    }
    doc.close()
    print(f"PDF Info: {info}")
    return info

get_pdf_info(file_bytes)

Getting PDF metadata and info...
PDF Info: {'page_count': 2, 'metadata': {'format': 'PDF 1.4', 'title': 'PDF Translator Web App', 'author': '', 'subject': '', 'keywords': '', 'creator': '', 'producer': 'Skia/PDF m140 Google Docs Renderer', 'creationDate': '', 'modDate': '', 'trapped': '', 'encryption': None}, 'is_encrypted': False, 'size_bytes': 118420, 'page_width': 612.0, 'page_height': 792.0}


{'page_count': 2,
 'metadata': {'format': 'PDF 1.4',
  'title': 'PDF Translator Web App',
  'author': '',
  'subject': '',
  'keywords': '',
  'creator': '',
  'producer': 'Skia/PDF m140 Google Docs Renderer',
  'creationDate': '',
  'modDate': '',
  'trapped': '',
  'encryption': None},
 'is_encrypted': False,
 'size_bytes': 118420,
 'page_width': 612.0,
 'page_height': 792.0}

In [14]:
def extract_images_info(file_bytes: bytes):
    print("Extracting image info from PDF...")
    doc = fitz.open(stream=file_bytes, filetype="pdf")
    images = []
    for page_num, page in enumerate(doc):
        image_list = page.get_images()
        print(f"Page {page_num + 1}: Found {len(image_list)} images")
        for idx, img in enumerate(image_list):
            images.append({
                'page': page_num,
                'index': idx,
                'xref': img[0],
                'bbox': page.get_image_bbox(img),
                'width': img[2],
                'height': img[3]
            })
    doc.close()
    print(f"Total images extracted: {len(images)}")
    return images

extract_images_info(file_bytes)

Extracting image info from PDF...
Page 1: Found 0 images
Page 2: Found 0 images
Total images extracted: 0


[]

In [15]:
def has_extractable_text(file_bytes: bytes) -> bool:
    print("Checking if PDF has extractable text...")
    result = bool(extract_simple_text(file_bytes).strip())
    print(f"Text extractable: {result}")
    return result
has_extractable_text(file_bytes)

Checking if PDF has extractable text...
Extracting simple text from PDF...
Total characters extracted: 2059
Text extractable: True


True