In [1]:
import pymupdf
import pytesseract
from tqdm import tqdm
import numpy as np
from PIL import Image
import os
import pandas as pd

# Function declarations

In [2]:
def pdf_to_png(pdf_path, output_folder, dpi=400):
    """
    Render every page of a PDF to a PNG file.

    Files are written as ``page_<n>.png`` inside ``output_folder``, where
    ``<n>`` is the 1-based page number (page index + 1).

    Args:
        pdf_path (str): Path of the PDF file to render.
        output_folder (str): Folder that receives the PNG files; created if missing.
        dpi (int): Resolution passed to the page renderer.
    """
    # The context manager closes the document (and its file handle) even if
    # rendering or saving a page fails part-way through.
    with pymupdf.open(pdf_path) as pdf_document:
        os.makedirs(output_folder, exist_ok=True)

        for page_number in tqdm(range(len(pdf_document)), desc="Converting PDF to PNG..."):
            # Load the page
            page = pdf_document.load_page(page_number)

            # Render page to an image (pixmap)
            pix = page.get_pixmap(dpi=dpi)

            # File names are 1-based while page indices are 0-based
            output_image_path = os.path.join(output_folder, f"page_{page_number + 1}.png")

            # Save the image
            pix.save(output_image_path)
        
def pdf_to_text(pdf_path, output_folder, sample_page=None):
    """
    Extract the embedded text layer of a PDF, page by page.

    Args:
        pdf_path (str): Path of the PDF file to read.
        output_folder (str): Folder that is created if missing. Nothing is
            written to it by this function; the creation is kept so existing
            callers see the same side effect.
        sample_page (int | None): Page index passed to ``load_page``. When
            given, only that page is processed; otherwise every page is.

    Returns:
        dict: Maps each processed page index to the text returned by
        ``page.get_text()``.
    """
    # The context manager closes the document (and its file handle) once the
    # text has been collected, or if reading a page raises.
    with pymupdf.open(pdf_path) as pdf_document:
        os.makedirs(output_folder, exist_ok=True)
        if sample_page is not None:
            pages_to_process = [sample_page]
        else:
            pages_to_process = range(len(pdf_document))

        texts = {
            page_number: pdf_document.load_page(page_number).get_text()
            for page_number in pages_to_process
        }

    return texts

def crop_header_footer(image_array: np.ndarray, threshold: int = 127):
    """
    Strip the first text band at the top and at the bottom of a page image.

    A row counts as "inked" when at least one of its pixels is below
    ``threshold``. Scanning the upper half top-down, the crop starts at the
    first blank row that follows the first inked row. Scanning the lower half
    bottom-up, the crop ends at the first blank row that follows the first
    inked row. If no such row exists on a side, that side is left uncropped.

    Args:
        image_array (numpy.ndarray): Input image as a 2D array.
        threshold (int): Pixels strictly below this value count as content.
    Returns:
        numpy.ndarray: The retained rows of ``image_array`` (all columns kept).
    """
    n_rows, _n_cols = image_array.shape
    row_has_ink = (image_array < threshold).any(axis=1)

    def _first_blank_after_ink(flags):
        # Offset (within `flags`) of the first blank row that comes after the
        # first inked row, or None when there is no ink or no blank after it.
        inked = np.flatnonzero(flags)
        if inked.size == 0:
            return None
        blanks = np.flatnonzero(~flags[inked[0]:])
        if blanks.size == 0:
            return None
        return int(inked[0] + blanks[0])

    half = n_rows // 2

    # Header: rows 0 .. half-1, scanned from the top.
    header_offset = _first_blank_after_ink(row_has_ink[:half])
    first_kept_row = 0 if header_offset is None else header_offset

    # Footer: rows n_rows-1 .. half+1, scanned from the bottom, hence the flip.
    footer_offset = _first_blank_after_ink(row_has_ink[half + 1:][::-1])
    end_row = n_rows if footer_offset is None else n_rows - 1 - footer_offset

    return image_array[first_kept_row:end_row, :]
    
def convert_to_binary(image_array:np.ndarray, threshold:int = 90):
    """
    Binarise a grayscale image into pure black (0) and pure white (255).

    Args:
        image_array (numpy.ndarray): Input image as a 2D array.
        threshold (int): Pixels strictly below this value become 0; all
            other pixels become 255.

    Returns:
        numpy.ndarray: ``uint8`` array of the same shape holding only 0 and 255.
    """
    dark_mask = image_array < threshold
    # Start from an all-white canvas and paint the dark pixels black.
    binary_image = np.full(dark_mask.shape, 255, dtype=np.uint8)
    binary_image[dark_mask] = 0
    return binary_image

def apply_pooling(image, pool_size, stride, padding=0, min_pooling=True):
    """
    Apply pooling (min or max) to an image efficiently using NumPy.

    Args:
        image (numpy.ndarray): Input image as a 2D array.
        pool_size (int): Size of the pooling window (n x n). Must be >= 1.
        stride (int): Stride of the pooling window. Must be >= 1.
        padding (int): Amount of zero padding to add on every side of the
            image (default is 0). Values <= 0 add no padding.
        min_pooling (bool): If True, apply min pooling; otherwise, apply max pooling.

    Returns:
        numpy.ndarray: Pooled image of shape
        ``((h - pool_size) // stride + 1, (w - pool_size) // stride + 1)``,
        where ``h`` and ``w`` are the (padded) image dimensions.

    Raises:
        ValueError: If ``pool_size`` or ``stride`` is smaller than 1, or if the
            (padded) image is smaller than the pooling window.
    """
    if pool_size < 1 or stride < 1:
        raise ValueError(
            f"pool_size and stride must be >= 1 (got pool_size={pool_size}, stride={stride})"
        )

    # Add zero padding to the image
    if padding > 0:
        image = np.pad(image, pad_width=padding, mode='constant', constant_values=0)

    # Get dimensions of the input image
    h, w = image.shape
    if h < pool_size or w < pool_size:
        raise ValueError(
            f"image of shape {(h, w)} is smaller than the {pool_size}x{pool_size} pooling window"
        )

    # sliding_window_view yields every window position as a read-only,
    # bounds-checked view; slicing with the stride keeps only the positions
    # the pooling window actually visits.
    windows = np.lib.stride_tricks.sliding_window_view(
        image, (pool_size, pool_size)
    )[::stride, ::stride]

    # Apply pooling operation over the two window axes
    pooled_image = windows.min(axis=(2, 3)) if min_pooling else windows.max(axis=(2, 3))

    return pooled_image

def poolling_and_ocr_image(image_array:np.ndarray, lang:str = 'vie', config:str = '--psm 3', pool_size:int = 2, stride:int = 1, min_pooling:bool=True):
    """
    Pool an image with ``apply_pooling`` (no padding) and OCR the result.

    Args:
        image_array (numpy.ndarray): Input image as a 2D array.
        lang (str): Language passed to Tesseract.
        config (str): Configuration string passed to Tesseract.
        pool_size (int): Size of the pooling window (n x n).
        stride (int): Stride of the pooling window.
        min_pooling (bool): If True, apply min pooling; otherwise, apply max pooling.
    Returns:
        str: Text returned by ``pytesseract.image_to_string`` for the pooled image.
    """
    pooled = apply_pooling(
        image_array,
        pool_size,
        stride,
        padding=0,
        min_pooling=min_pooling,
    )
    # Tesseract is fed a PIL image built from the pooled array.
    return pytesseract.image_to_string(Image.fromarray(pooled), lang=lang, config=config)

def divide_image(image_array:np.ndarray):
    """
    Split a two-column page image at the blank gap between the columns.

    Only the middle 20% of the width is searched. The gap starts at the first
    column there with no pixel below 127 and ends at the next column that
    contains such a pixel. Both edges must lie inside the searched region.

    Args:
        image_array (numpy.ndarray): Input image as a 2D array.
    Returns:
        tuple: ``(left_image, right_image)`` when a gap is found, otherwise
        ``(image_array, None)``.
    """
    _, n_cols = image_array.shape
    dark_per_col = (image_array < 127).sum(axis=0)

    # the middle 20% area is more likely to contain the gap
    centre, margin = n_cols // 2, n_cols // 10
    search_from = centre - margin
    window = dark_per_col[search_from:centre + margin]

    empty_cols = np.flatnonzero(window == 0)
    if empty_cols.size == 0:
        # no blank column at all inside the search window
        return image_array, None

    first_empty = int(empty_cols[0])
    inked_after = np.flatnonzero(window[first_empty:] > 0)
    if inked_after.size == 0:
        # the blank run never closes inside the search window
        return image_array, None

    gap_start = search_from + first_empty
    gap_end = gap_start + int(inked_after[0])
    return image_array[:, :gap_start], image_array[:, gap_end:]

# PDF to PNG

In [3]:
# pdf_to_png(input_pdf, png_output_folder, dpi=400)

# Crop Image

# OCR

Range: pages **6 - 251** (inclusive)

### Parameters
- **Black Threshold:** 90
- **Pool size:** 2
- **Stride:** 1

In [4]:
# OCR preprocessing parameters used by the run cell below.
black_threshold = 90  # pixels below this value are turned black by convert_to_binary
pool_size = 2         # pooling window is pool_size x pool_size
stride = 1            # step of the pooling window
min_pooling = True    # True: min pooling, False: max pooling

page_range = list(range(6, 252)) # pages 6 - 251 inclusive (range() excludes its end value)
results = []

### Run

In [5]:
# path declarations
input_pdf = "../data/raw/Duoc-Dien-Viet-Nam-V-tap-2.pdf"
png_output_folder = "../data/raw/pdf_images"
processed_output_csv = "../data/processed/ocr_output.csv"

# Start from an empty list every time this cell runs; otherwise a re-run
# appends a second copy of every page to the rows collected earlier.
results = []

for page_num in tqdm(page_range, desc="Extracting text from images..."):
    image_path = os.path.join(png_output_folder, f"page_{page_num}.png")
    # The context manager releases the file handle as soon as the pixels
    # have been copied into the array.
    with Image.open(image_path) as page_image:
        image_array = np.array(page_image.convert('L'))

    image_array = convert_to_binary(image_array, threshold=black_threshold)
    # After binarisation the pixels are only 0 or 255, so 127 separates them.
    cropped_image = crop_header_footer(image_array, threshold=127)

    # save cropped image for inspection
    # cropped_image_obj = Image.fromarray(cropped_image)
    # cropped_image_obj.save(os.path.join("test_data", f"page_{page_num}_cropped.png"))

    text = poolling_and_ocr_image(cropped_image, lang='vie', config='--psm 3', pool_size=pool_size, stride=stride, min_pooling=min_pooling)

    results.append({
        "page_id": page_num,
        "text": text.strip()
    })

Extracting text from images...: 100%|██████████| 246/246 [16:28<00:00,  4.02s/it]


### Process the result

In [6]:
# Create the destination folder first so the write cannot fail on a missing
# directory after the long OCR run has already finished.
output_dir = os.path.dirname(processed_output_csv)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# One row per page: page_id and the stripped OCR text.
df = pd.DataFrame(results)
df.to_csv(processed_output_csv, index=False, encoding='utf-8')

In [1]:
# NOTE(review): the execution count restarts here, so this part appears to have
# been run in a fresh kernel; the import is repeated so it works on its own.
import pandas as pd

# Load the OCR results that were written to CSV earlier in the notebook.
# NOTE(review): pandas presumably reads empty text cells back as NaN — confirm
# before treating every value in the 'text' column as a string.
df = pd.read_csv("../data/processed/ocr_output.csv", encoding='utf-8')

# Preview the first rows.
df.head()

Unnamed: 0,page_id,text
0,6,QUI ĐỊNH CHUNG\n\n1]. Tên chính của các chuyên...
1,7,Mát: 10°C đến 20 °C.\n\nNhiệt độ phòng: 20 °C ...
2,8,"1]7. Trong chuyên luận kháng sinh, hl mục định..."
3,9,27. Hỗn hợp của các chất lỏng được ghi theo ký...
4,10,39. Dược liệu dùng. sàn xuất thuốc thành phẩm ...


In [4]:
# join the text into a single txt file
# Pages whose OCR text was empty are read back from the CSV as NaN (a float),
# which str.join rejects with a TypeError, so missing values are replaced by
# empty strings before joining.
page_texts = df['text'].fillna("").astype(str).tolist()
all_text = " ".join(page_texts)
with open("../data/raw/all_text.txt", "w", encoding="utf-8") as f:
    f.write(all_text)