# Day 7

In [None]:
import os

## CW 05

### pdfplumber

In [None]:
import pdfplumber
def check_pdf_text_layer(pdf_path):
    """Report which pages of a PDF have a text layer and export their text.

    Opens ``pdf_path`` with pdfplumber, prints a per-page report followed by a
    summary, and, when at least one page yields non-blank text, writes the
    text of all such pages to ``<pdf path without extension>_extracted-pdfplumber.md``.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        The path of the exported file, or ``None`` when no page has
        extractable text or when any exception occurs (it is printed as
        ``ERR: ...`` instead of being raised).
    """
    try:
        # Stripped text of every page that has a text layer, in page order.
        page_texts = []

        with pdfplumber.open(pdf_path) as pdf:
            total_pages = len(pdf.pages)
            total_chrs = 0

            print(f"Checking PDF: {pdf_path}")
            print(f"Total pages: {total_pages}\n")

            for i, page in enumerate(pdf.pages, 1):
                # extract_text() may give None/blank for pages without text.
                text = page.extract_text()
                stripped = text.strip() if text else ""

                if stripped:
                    char_count = len(stripped)
                    total_chrs += char_count
                    page_texts.append(stripped)
                    print(f"Page no. {i} has text layer ({char_count} characters)")
                else:
                    print(f"Page no. {i} has no text layer or is blank.")

        pages_w_text = len(page_texts)
        has_text_layer = pages_w_text > 0
        coverage_rate = (pages_w_text / total_pages * 100) if total_pages > 0 else 0

        print(f"\n{'=' * 50}")
        print(f"Check results:")
        print(f"- Page with text layer count: {pages_w_text}/{total_pages}")
        print(f"- Coverage rate: {coverage_rate:.2f}%")
        print(f"- Total characters: {total_chrs}")
        print(f"-Conclusion: {'This pdf has text layer and can be extract directly.' if has_text_layer else 'This pdf has no extractable text layer.'}")

        if not has_text_layer:
            # Nothing to export; previously this path tried to write the last
            # page's (possibly None) text and failed.
            return None

        # Export the text of every page, not just the last one visited.
        output_path = os.path.splitext(pdf_path)[0] + "_extracted-pdfplumber.md"
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write("\n\n".join(page_texts))
        print(f"Markdown file exported to: {output_path}")

        return output_path

    except Exception as e:
        print(f"ERR: {str(e)}")
        return None

# Script entry point: run the text-layer check on the CW 05 sample PDF and
# keep whatever check_pdf_text_layer returns in `result`.
if __name__ == "__main__":
    pdf_file = "./CW/05/example.pdf"
    result = check_pdf_text_layer(pdf_file)

### Docling

In [None]:
from docling.document_converter import DocumentConverter

def check_pdf_text_layer(pdf_path):
    """Convert a PDF to Markdown with Docling and save the result to disk.

    The Markdown is written to ``<pdf path without extension>_extracted-docling.md``
    when the conversion produces non-blank text. Progress and a short summary
    are printed along the way.

    Args:
        pdf_path: Path of the PDF file to convert.

    Returns:
        The path of the written Markdown file, or ``None`` when nothing was
        extracted or when any exception occurred (printed as ``ERR: ...``).
    """
    divider = "=" * 50

    try:
        # The converter is created before anything is printed, so a failure
        # here produces only the ERR line.
        docling_converter = DocumentConverter()

        print(f"Checking PDF: {pdf_path}")
        print("Converting document with Docling...")

        conversion = docling_converter.convert(pdf_path)
        markdown_text = conversion.document.export_to_markdown()
        stripped = markdown_text.strip() if markdown_text else ""

        # Guard clause: nothing usable came back from the conversion.
        if not stripped:
            print("No text content extracted from the document.")
            print("\n" + divider)
            print("Check results:")
            print("- Total characters: 0")
            print("- Conclusion: This PDF has no extractable text layer.")
            return None

        total_chars = len(stripped)
        print("Document successfully converted to markdown")
        print(f"Total characters extracted: {total_chars}")

        # Write the unstripped Markdown next to the source PDF.
        stem, _ext = os.path.splitext(pdf_path)
        output_path = stem + "_extracted-docling.md"
        with open(output_path, 'w', encoding='utf-8') as md_file:
            md_file.write(markdown_text)
        print(f"Markdown file exported to: {output_path}")

        print("\n" + divider)
        print("Check results:")
        print(f"- Total characters: {total_chars}")
        print("- Conclusion: This PDF has text layer and can be extracted directly.")
        return output_path

    except Exception as err:
        print("ERR: " + str(err))
        return None

# Script entry point for the Docling variant. check_pdf_text_layer is defined
# more than once in this notebook, so this calls whichever definition was
# executed most recently.
if __name__ == "__main__":
    pdf_file = "./CW/05/example.pdf"
    result = check_pdf_text_layer(pdf_file)
    
    # A truthy result is the path of the exported Markdown file.
    if result:
        print(f"\nSuccess! Extracted text saved to: {result}")

### Markitdown

In [None]:
from markitdown import MarkItDown

def check_pdf_text_layer(pdf_path):
    """Convert a PDF to Markdown with MarkItDown and save the result to disk.

    The Markdown is written to ``<pdf path without extension>_extracted-markitdown.md``
    when the conversion yields non-blank text. No per-page information is
    used; the page count that is printed is only a guess derived from the
    character count (about 2000 characters per page, at least 1).

    Args:
        pdf_path: Path of the PDF file to convert.

    Returns:
        The path of the written Markdown file, or ``None`` when nothing was
        extracted or when any exception occurred (printed as ``ERR: ...``).
    """
    divider = "=" * 50

    try:
        conversion = MarkItDown().convert(pdf_path)

        # Use the `text_content` attribute when present; otherwise treat the
        # conversion result itself as the text.
        text = getattr(conversion, 'text_content', conversion)

        print(f"Checking PDF: {pdf_path}")

        stripped = text.strip() if text else ""

        # Guard clause: the conversion produced nothing usable.
        if not stripped:
            print("No text content extracted from the document.")
            print("\n" + divider)
            print("Check results:")
            print("- Total characters: 0")
            print("- Conclusion: This PDF has no extractable text layer.")
            return None

        total_chars = len(stripped)
        # Rough page estimate: ~2000 characters per page, never below one.
        estimated_pages = max(1, total_chars // 2000)

        print("Document successfully converted to markdown")
        print(f"Total characters extracted: {total_chars}")
        print(f"Estimated pages (based on character count): {estimated_pages}")

        # Write the unstripped text next to the source PDF.
        stem, _ext = os.path.splitext(pdf_path)
        output_path = stem + "_extracted-markitdown.md"
        with open(output_path, 'w', encoding='utf-8') as md_file:
            md_file.write(text)
        print(f"Markdown file exported to: {output_path}")

        print("\n" + divider)
        print("Check results:")
        print(f"- Total characters: {total_chars}")
        print("- Conclusion: This PDF has text layer and can be extracted directly.")
        return output_path

    except Exception as err:
        print("ERR: " + str(err))
        return None

# Script entry point for the MarkItDown variant. check_pdf_text_layer is
# defined more than once in this notebook, so this calls whichever definition
# was executed most recently.
if __name__ == "__main__":
    pdf_file = "./CW/05/example.pdf"
    result = check_pdf_text_layer(pdf_file)
    
    # A truthy result is the path of the exported Markdown file.
    if result:
        print(f"\nSuccess! Extracted text saved to: {result}")

## CW 06

In [3]:
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, ApiVlmOptions
from docling.document_converter import DocumentConverter, PdfFormatOption


# PDF pipeline options with do_ocr=True.
pdf_options = PdfPipelineOptions(
    do_ocr=True,
)


# Build the document converter, registering the options above for PDF input.
doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options)
    }
)


# Convert the PDF document.
result = doc_converter.convert("./CW/06/sample_table.pdf")

# print(result)
# Export the converted document as Markdown and print it.
a = result.document.export_to_markdown()
print(a)

[32m[INFO] 2026-02-10 15:48:38,226 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2026-02-10 15:48:38,229 [RapidOCR] download_file.py:60: File exists and is valid: D:\Documents\cache\aiworkshop\repo\nutc2504lab_hw\.venv\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2026-02-10 15:48:38,230 [RapidOCR] main.py:53: Using D:\Documents\cache\aiworkshop\repo\nutc2504lab_hw\.venv\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2026-02-10 15:48:38,278 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2026-02-10 15:48:38,280 [RapidOCR] download_file.py:60: File exists and is valid: D:\Documents\cache\aiworkshop\repo\nutc2504lab_hw\.venv\Lib\site-packages\rapidocr\models\ch_ppocr_mobile_v2.0_cls_infer.onnx[0m
[32m[INFO] 2026-02-10 15:48:38,280 [RapidOCR] main.py:53: Using D:\Documents\cache\aiworkshop\repo\nutc2504lab_hw\.venv\Lib\site-packages\rapidocr\models\ch_ppocr_mobile_v2.0_cls_infer.o




In [4]:
def olmocr2_vlm_options(
    model: str = "allenai/olmOCR-2-7B-1025-FP8",
    hostname_and_port: str = "https://ws-01.wade0426.me/v1/",
    prompt: str = "Convert this page to markdown.",
    max_tokens: int = 4096,
    temperature: float = 0.0,
    api_key: str = "",) -> "ApiVlmOptions":
    """Build ``ApiVlmOptions`` for an olmOCR-2 chat-completions endpoint.

    Args:
        model: Model name sent in the request parameters.
        hostname_and_port: Base address of the API. Either a bare
            ``host:port`` (``http://`` is prepended) or a full base URL that
            already carries a scheme, e.g. ``https://host/v1/``. Trailing
            slashes are ignored.
        prompt: Prompt passed to the options object.
        max_tokens: ``max_tokens`` value sent in the request parameters.
        temperature: Temperature passed to the options object.
        api_key: When non-empty, sent as an ``Authorization: Bearer`` header.

    Returns:
        The configured ``ApiVlmOptions`` with Markdown as response format.
    """
    # ResponseFormat was used here without ever being imported; bring it (and
    # ApiVlmOptions) in locally from the module the notebook already imports
    # ApiVlmOptions from, so the function does not fail with NameError.
    from docling.datamodel.pipeline_options import ApiVlmOptions, ResponseFormat

    headers = {}
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"

    # The default already contains a scheme and a trailing slash; blindly
    # prefixing "http://" produced "http://https://...//chat/completions".
    base_url = hostname_and_port.strip().rstrip("/")
    if "://" not in base_url:
        base_url = f"http://{base_url}"

    options = ApiVlmOptions(
        url=f"{base_url}/chat/completions",
        params=dict(
            model=model,
            max_tokens=max_tokens,
        ),
        headers=headers,
        prompt=prompt,
        timeout=120,  # olmocr2 may need a longer processing time
        scale=2.0,  # image scale factor
        temperature=temperature,
        response_format=ResponseFormat.MARKDOWN,
    )
    return options
