# Day. 7

In [1]:
import os

## CW - 05

### pdfplumber

In [9]:
import pdfplumber
def check_pdf_text_layer(pdf_path):
    """Report whether a PDF has an extractable text layer and export its text.

    Opens ``pdf_path`` with pdfplumber, prints a per-page report followed by a
    summary (pages with text, coverage rate, total stripped character count),
    and writes the text of every page that has text to
    ``<pdf_path without extension>_extracted-pdfplumber.md``.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        The path of the exported Markdown file, or ``None`` when no page
        yielded any text or when an exception occurred (the exception is
        printed as ``ERR: ...`` rather than re-raised).
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            total_pages = len(pdf.pages)
            pages_w_text = 0
            total_chrs = 0
            # Text is collected per page so the export holds the whole
            # document, not just the value left in the loop variable after
            # the final iteration.
            page_texts = []

            print(f"Checking PDF: {pdf_path}")
            print(f"Total pages: {total_pages}\n")

            for i, page in enumerate(pdf.pages, 1):
                text = page.extract_text()

                if text and text.strip():
                    pages_w_text += 1
                    char_count = len(text.strip())
                    total_chrs += char_count
                    page_texts.append(text)
                    print(f"Page no. {i} has text layer ({char_count} characters)")
                else:
                    print(f"Page no. {i} has no text layer or is blank.")

            has_text_layer = pages_w_text > 0
            # Guard against division by zero for a PDF with no pages.
            coverage_rate = (pages_w_text / total_pages * 100) if total_pages > 0 else 0

            print(f"\n{'=' * 50}")
            print("Check results:")
            print(f"- Page with text layer count: {pages_w_text}/{total_pages}")
            print(f"- Coverage rate: {coverage_rate:.2f}%")
            print(f"- Total characters: {total_chrs}")
            print(f"-Conclusion: {'This pdf has text layer and can be extract directly.' if has_text_layer else 'This pdf has no extractable text layer.'}")

        # Nothing to export: do not create an empty file.
        if not has_text_layer:
            return None

        output_path = os.path.splitext(pdf_path)[0] + "_extracted-pdfplumber.md"
        with open(output_path, 'w', encoding='utf-8') as f:
            # A blank line between pages keeps page boundaries visible.
            f.write("\n\n".join(page_texts))
        print(f"Markdown file exported to: {output_path}")

        return output_path

    except Exception as e:
        print(f"ERR: {str(e)}")
        return None

if __name__ == "__main__":
    # Entry point for the pdfplumber variant: check the classwork sample PDF.
    pdf_file = "./CW/05/example.pdf"
    result = check_pdf_text_layer(pdf_path=pdf_file)

Checking PDF: ./CW/05/example.pdf
Total pages: 3

Page no. 1 has text layer (979 characters)
Page no. 2 has text layer (383 characters)
Page no. 3 has text layer (1440 characters)

Check results:
- Page with text layer count: 3/3
- Coverage rate: 100.00%
- Total characters: 2802
-Conclusion: This pdf has text layer and can be extract directly.
Markdown file exported to: ./CW/05/example_extracted-pdfplumber.md


### Docling

In [3]:
from docling.document_converter import DocumentConverter

def check_pdf_text_layer(pdf_path):
    """Convert a PDF to Markdown with Docling and report whether it has text.

    On success the Markdown is saved next to the source file as
    ``<pdf_path without extension>_extracted-docling.md``.

    Args:
        pdf_path: Path of the PDF file to convert.

    Returns:
        The path of the written Markdown file, or ``None`` when the export
        contains no non-whitespace text or when an exception occurred (the
        exception is printed as ``ERR: ...`` rather than re-raised).
    """

    def _print_report(total, conclusion):
        # Shared summary footer used by both the success and the empty path.
        for line in (
            f"\n{'=' * 50}",
            "Check results:",
            f"- Total characters: {total}",
            f"- Conclusion: {conclusion}",
        ):
            print(line)

    try:
        docling_converter = DocumentConverter()

        print(f"Checking PDF: {pdf_path}")
        print("Converting document with Docling...")

        conversion = docling_converter.convert(pdf_path)
        markdown_text = conversion.document.export_to_markdown()

        # Guard clause: the export is missing, empty, or whitespace only.
        if not markdown_text or not markdown_text.strip():
            print("No text content extracted from the document.")
            _print_report(0, "This PDF has no extractable text layer.")
            return None

        # The count ignores surrounding whitespace; the file gets the raw text.
        stripped_length = len(markdown_text.strip())
        print("Document successfully converted to markdown")
        print(f"Total characters extracted: {stripped_length}")

        base_name, _extension = os.path.splitext(pdf_path)
        md_path = base_name + "_extracted-docling.md"
        with open(md_path, 'w', encoding='utf-8') as md_file:
            md_file.write(markdown_text)
        print(f"Markdown file exported to: {md_path}")

        _print_report(
            stripped_length,
            "This PDF has text layer and can be extracted directly.",
        )
        return md_path

    except Exception as err:
        print(f"ERR: {str(err)}")
        return None

if __name__ == "__main__":
    # Entry point for the Docling variant: convert the classwork sample PDF.
    pdf_file = "./CW/05/example.pdf"
    result = check_pdf_text_layer(pdf_path=pdf_file)

    # A truthy result is the path of the exported Markdown file.
    if result:
        print(f"\nSuccess! Extracted text saved to: {result}")

[32m[INFO] 2026-02-10 11:50:02,910 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2026-02-10 11:50:02,914 [RapidOCR] download_file.py:60: File exists and is valid: D:\Documents\cache\aiworkshop\repo\nutc2504lab_hw\.venv\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2026-02-10 11:50:02,914 [RapidOCR] main.py:53: Using D:\Documents\cache\aiworkshop\repo\nutc2504lab_hw\.venv\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2026-02-10 11:50:02,962 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2026-02-10 11:50:02,964 [RapidOCR] download_file.py:60: File exists and is valid: D:\Documents\cache\aiworkshop\repo\nutc2504lab_hw\.venv\Lib\site-packages\rapidocr\models\ch_ppocr_mobile_v2.0_cls_infer.onnx[0m
[32m[INFO] 2026-02-10 11:50:02,964 [RapidOCR] main.py:53: Using D:\Documents\cache\aiworkshop\repo\nutc2504lab_hw\.venv\Lib\site-packages\rapidocr\models\ch_ppocr_mobile_v2.0_cls_infer.o

Checking PDF: ./CW/05/example.pdf
Converting document with Docling...
Document successfully converted to markdown
Total characters extracted: 8171
Markdown file exported to: ./CW/05/example_extracted-docling.md

Check results:
- Total characters: 8171
- Conclusion: This PDF has text layer and can be extracted directly.

Success! Extracted text saved to: ./CW/05/example_extracted-docling.md


### MarkItDown

In [4]:
from markitdown import MarkItDown

def check_pdf_text_layer(pdf_path):
    """Convert a PDF to Markdown with MarkItDown and report whether it has text.

    On success the Markdown is saved next to the source file as
    ``<pdf_path without extension>_extracted-markitdown.md``. The result is
    treated as one block of text, so the page count printed here is only an
    estimate derived from the character count.

    Args:
        pdf_path: Path of the PDF file to convert.

    Returns:
        The path of the written Markdown file, or ``None`` when the conversion
        yields no non-whitespace text or when an exception occurred (the
        exception is printed as ``ERR: ...`` rather than re-raised).
    """
    # Heuristic used only for the printed page estimate.
    chars_per_page_estimate = 2000

    try:
        # Announce the file before the conversion so a failure message is
        # always preceded by the path it refers to.
        print(f"Checking PDF: {pdf_path}")

        md = MarkItDown()
        result = md.convert(pdf_path)

        # Fall back to the string form when there is no ``text_content``
        # attribute, so ``text`` is always a str (or None) before .strip().
        # NOTE(review): presumably str(result) is the converted Markdown -
        # confirm against the installed MarkItDown version.
        text = result.text_content if hasattr(result, 'text_content') else str(result)

        if text and text.strip():
            total_chars = len(text.strip())
            # Never report fewer than one page for a document that has text.
            estimated_pages = max(1, total_chars // chars_per_page_estimate)

            print("Document successfully converted to markdown")
            print(f"Total characters extracted: {total_chars}")
            print(f"Estimated pages (based on character count): {estimated_pages}")

            # Export to .md file
            output_path = os.path.splitext(pdf_path)[0] + "_extracted-markitdown.md"
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(text)

            print(f"Markdown file exported to: {output_path}")

            print(f"\n{'=' * 50}")
            print("Check results:")
            print(f"- Total characters: {total_chars}")
            print("- Conclusion: This PDF has text layer and can be extracted directly.")

            return output_path
        else:
            print("No text content extracted from the document.")
            print(f"\n{'=' * 50}")
            print("Check results:")
            print("- Total characters: 0")
            print("- Conclusion: This PDF has no extractable text layer.")

            return None

    except Exception as e:
        print(f"ERR: {str(e)}")
        return None

if __name__ == "__main__":
    # Entry point for the MarkItDown variant: convert the classwork sample PDF.
    pdf_file = "./CW/05/example.pdf"
    result = check_pdf_text_layer(pdf_path=pdf_file)

    # A truthy result is the path of the exported Markdown file.
    if result:
        print(f"\nSuccess! Extracted text saved to: {result}")

Checking PDF: ./CW/05/example.pdf
Document successfully converted to markdown
Total characters extracted: 2946
Estimated pages (based on character count): 1
Markdown file exported to: ./CW/05/example_extracted-markitdown.md

Check results:
- Total characters: 2946
- Conclusion: This PDF has text layer and can be extracted directly.

Success! Extracted text saved to: ./CW/05/example_extracted-markitdown.md
