In [14]:
import os, sys, pymupdf  # import the bindings
from PIL import Image
import io
import numpy as np
import matplotlib.pyplot as plt

In [15]:
def get_minmax_gt_threshold(arr,threshold):
    idx = np.argwhere(arr>threshold)
    if len(idx) == 0:#nowhere: black page with no white.
        return len(arr),0 
    min_idx = np.min(idx)
    max_idx = np.max(idx)
    return min_idx,max_idx

In [16]:
def get_mask_rectangles(column, row, threshold = 128):
    column_len = len(column)
    row_len = len(row)
    column_min_idx, column_max_idx = get_minmax_gt_threshold(column,threshold)
    row_min_idx, row_max_idx = get_minmax_gt_threshold(row,threshold)
    #print("Mid Column: White is: %d to %d of %d - %d"%(column_min_idx, column_max_idx,0,column_len))
    #print("Mid Row   : White is: %d to %d of %d - %d"%(row_min_idx, row_max_idx,0,row_len))
    mask_rectangles = []
    if column_min_idx > 0: #top rect required. From (0,0 to row_len,column_min_idx)
        #print("top rect required")
        mask_rectangles.append(pymupdf.Rect(0, 0, row_len, column_min_idx))
    if column_max_idx < column_len: #bottom rect required. From (0,column_max_idx to row_len,column_len)
        #print("bottom rect required")
        mask_rectangles.append(pymupdf.Rect(0, column_max_idx, row_len, column_len))
    
    if row_min_idx > 0: #left rect required. From (0,0 to row_min_idx,column_len)
        #print("left rect required")
        mask_rectangles.append(pymupdf.Rect(0, 0, row_min_idx,column_len))
    if row_max_idx < row_len: #right rect required. From (0,column_max_idx to row_len,column_len)
        #print("right rect required")
        mask_rectangles.append(pymupdf.Rect(row_max_idx, 0, row_len, column_len))
    return mask_rectangles

In [17]:

def process_pdf(long_filename):
    with pymupdf.open(long_filename) as doc:  # Changes here
        
        for page in doc:  # iterate through the pages
            show_figure = False
            show_image = False
            pix = page.get_pixmap(colorspace=pymupdf.csGRAY)  # render page to an image
            w = pix.width
            h = pix.height
            pixbytes = pix.tobytes(output='png')
            image = Image.open(io.BytesIO(pixbytes))
            if show_image:
                image.show()
            pixarr = np.asarray(image)

            threshold = 128
            #select a column and a row at half of page
            column = pixarr[:,int(h/2)]
            row = pixarr[int(w/2),:]
            # Draw a rect on the page using a Shape object
            mask_rectangles = get_mask_rectangles(column, row, threshold = 180)
            shape = page.new_shape()
            for r in mask_rectangles:
                shape.draw_rect(r)
                shape.finish(width = 0, color=(0, 0, 0), fill=True)
            shape.commit(overlay=True)
            if show_figure:
                fig = plt.figure()
                plt.plot(column,label="column")
                plt.plot(row,label="row")
                plt.legend()
                plt.show()
        doc.save(long_filename.replace("pdf","out.pdf"))


In [18]:
directory = "."
for file in sorted(os.listdir(directory)):
    if not file.endswith("-tmp.pdf"):
        continue
    print("Processing %s" % long_filename)
    long_filename = os.path.join(directory,file)
    process_pdf(long_filename)

Processing ./abc-pages69-tmp.pdf
Processing ./abc-pages0001-tmp.pdf
Processing ./abc-pages0002-tmp.pdf
Processing ./abc-pages0003-tmp.pdf
Processing ./abc-pages0004-tmp.pdf
Processing ./abc-pages0005-tmp.pdf
Processing ./abc-pages0006-tmp.pdf
Processing ./abc-pages0007-tmp.pdf
Processing ./abc-pages0008-tmp.pdf
Processing ./abc-pages0009-tmp.pdf
Processing ./abc-pages0010-tmp.pdf
Processing ./abc-pages0011-tmp.pdf
Processing ./abc-pages0012-tmp.pdf
Processing ./abc-pages0013-tmp.pdf
Processing ./abc-pages0014-tmp.pdf
Processing ./abc-pages0015-tmp.pdf
Processing ./abc-pages0016-tmp.pdf
Processing ./abc-pages0017-tmp.pdf
Processing ./abc-pages0018-tmp.pdf
Processing ./abc-pages0019-tmp.pdf
Processing ./abc-pages0020-tmp.pdf
Processing ./abc-pages0021-tmp.pdf
Processing ./abc-pages0022-tmp.pdf
Processing ./abc-pages0023-tmp.pdf
Processing ./abc-pages0024-tmp.pdf
Processing ./abc-pages0025-tmp.pdf
Processing ./abc-pages0026-tmp.pdf
Processing ./abc-pages0027-tmp.pdf
Processing ./abc-pages