In [6]:
import os
from pdf2image import convert_from_path

# Input PDF and the folder that receives one JPEG per PDF page.
PDF_file = "pdf/Padilla Nobleza virtuosa test.pdf"
output_dir = "images"

# Create the output folder up front; an already existing folder is not an error.
os.makedirs(output_dir, exist_ok=True)

# Convert the PDF; each element of `pages` is saved as one image below.
pages = convert_from_path(PDF_file)

# File names are 1-based: page_1.jpg, page_2.jpg, ...
for page_number, page_image in enumerate(pages, start=1):
    jpeg_path = os.path.join(output_dir, f"page_{page_number}.jpg")
    page_image.save(jpeg_path, 'JPEG')

print("Images saved in the directory:", output_dir)


Images saved in the directory: images


In [13]:
import cv2

# Load one rendered page and erase its long straight lines by painting over
# them in white on a copy of the image.
source_path = 'images/page_12.jpg'
image = cv2.imread(source_path)
if image is None:
    # cv2.imread reports a missing/unreadable file by returning None instead of
    # raising; fail here with a clear message rather than at image.copy().
    raise FileNotFoundError(f"Could not read image: {source_path}")

result = image.copy()
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# Inverted binary threshold with Otsu's automatic level: dark strokes become
# the non-zero foreground that the morphological opening works on.
thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

# Kernel sizes are (width, height): the 40x1 kernel isolates long horizontal
# runs, the 1x40 kernel long vertical runs. Both passes read the same `thresh`
# mask and paint onto the same `result`, horizontal first.
for kernel_size in ((40, 1), (1, 40)):
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, kernel_size)
    line_mask = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=2)
    cnts = cv2.findContours(line_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # findContours returns either 2 or 3 values; pick the contour list in both cases.
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]
    for c in cnts:
        # Overdraw each detected line in white with a 5 px stroke.
        cv2.drawContours(result, [c], -1, (255, 255, 255), 5)

# Last expression of the cell: imwrite's boolean result is the cell output.
cv2.imwrite('result.png', result)

True

In [29]:
def cut_half(image_path):
    """Split an image file into its left and right halves.

    Args:
        image_path: Path of the image, loaded with ``cv2.imread``.

    Returns:
        A ``(left, right)`` tuple of column slices of the loaded array. The cut
        is made at ``width // 2``, so for an odd width the right half is one
        column wider than the left.

    Raises:
        FileNotFoundError: If ``cv2.imread`` cannot read ``image_path`` (it
            returns ``None`` in that case instead of raising).
    """
    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")

    # shape[1] is the column count; all rows are kept in both halves.
    width_cutoff = img.shape[1] // 2
    return img[:, :width_cutoff], img[:, width_cutoff:]

In [36]:
import os

def _page_index(filename):
    """Return the integer between the first '_' and the next '.' of a name.

    For ``'page_12.jpg'`` this is ``12``. Names that do not have that shape
    (for example ``.DS_Store`` or ``.ipynb_checkpoints``) yield ``None``.
    """
    try:
        return int(filename.split('_')[1].split('.')[0])
    except (IndexError, ValueError):
        return None


# Skip directory entries without a parseable page number (they would crash the
# sort key) and order the rest numerically, so page_10 comes after page_9.
lis_images = sorted(
    (name for name in os.listdir('images') if _page_index(name) is not None),
    key=_page_index,
)

os.makedirs('pages', exist_ok=True)

# The i-th image (0-based) is cut in two: its left half is written to
# pages/{2*i}.jpg and its right half to pages/{2*i + 1}.jpg.
for index, name in enumerate(lis_images):
    s1, s2 = cut_half('images/' + name)
    cv2.imwrite(f'pages/{index * 2}.jpg', s1)
    cv2.imwrite(f'pages/{index * 2 + 1}.jpg', s2)


In [37]:
# Print the image filenames held in lis_images (ordered by page number in the previous cell).
print(lis_images)

['page_1.jpg', 'page_2.jpg', 'page_3.jpg', 'page_4.jpg', 'page_5.jpg', 'page_6.jpg', 'page_7.jpg', 'page_8.jpg', 'page_9.jpg', 'page_10.jpg', 'page_11.jpg', 'page_12.jpg', 'page_13.jpg', 'page_14.jpg', 'page_15.jpg', 'page_16.jpg']


In [5]:
import cv2
import math

# Detect long horizontal and vertical strokes in pages/1.jpg, outline each one
# in red on a copy of the image, record candidate coordinates in final_x1 /
# final_y1 / final_x2 / final_y2, and write the annotated copy to disk.
# NOTE(review): cv2.imread returns None for a missing file, in which case the
# shape unpacking below fails - confirm pages/1.jpg exists before running.
image = cv2.imread('pages/1.jpg')

image_height, image_width, _ = image.shape

result = image.copy()
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# Inverted binary threshold with Otsu's automatic level.
thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

# Remove horizontal lines
# (Despite the label above, nothing is erased in this cell: the 40x1 opening
# only isolates horizontal runs, which are then measured and outlined.)
horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
remove_horizontal = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, horizontal_kernel, iterations=2)
cnts = cv2.findContours(remove_horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
# findContours returns either 2 or 3 values; pick the contour list in both cases.
cnts = cnts[0] if len(cnts) == 2 else cnts[1]

# Running minima for the horizontal pass (top edge / bottom edge offsets).
dist1_0 = math.inf
dist1_1 = math.inf

# Selected coordinates; they stay 0 if no contour ever updates them.
final_x1 = 0
final_y1 = 0
final_x2 = 0
final_y2 = 0

for c in cnts:
    x, y, w, h = cv2.boundingRect(c)
    print("Horizontal Line - X:", x, "Y:", y, "Width:", w, "Height:", h)
    top_left = x, y
    bottom_right = x + w, y + h

    # Signed offsets of the box's top and bottom edges from the vertical midpoint.
    diff1_0 = y - image_height//2
    diff1_1 = y+h - image_height//2


    # NOTE(review): the offsets are signed, not absolute, so this keeps the
    # smallest signed offset (i.e. the smallest y seen so far), not the edge
    # nearest the centre. If "nearest to the middle" was intended, abs() is
    # missing - confirm the intent.
    if diff1_0 < dist1_0:
        dist1_0 = diff1_0
        final_y1 = y
    
    # Because of the elif, the bottom-edge tracker is only considered for
    # contours that did NOT update the top-edge tracker above.
    elif diff1_1 < dist1_1:
        dist1_1 = diff1_1
        final_y2 = y+h

    # Outline the detected line in red (BGR colour order), 5 px thick.
    cv2.rectangle(result, top_left,bottom_right,(0,0, 255), 5)


# Same procedure for vertical runs, using a 1x40 kernel on the same threshold mask.
vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 40))
remove_vertical = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, vertical_kernel, iterations=2)
cnts = cv2.findContours(remove_vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = cnts[0] if len(cnts) == 2 else cnts[1]

# Running minima for the vertical pass (left edge / right edge offsets).
dist2_0 = math.inf
dist2_1 = math.inf

for c in cnts:
    x, y, w, h = cv2.boundingRect(c)
    print("Vertical Line - X:", x, "Y:", y, "Width:", w, "Height:", h)
    top_left = x, y
    bottom_right = x + w, y + h

    # Signed offsets of the box's left and right edges from the horizontal midpoint.
    diff1 = x - image_width//2
    diff2 = x+w - image_width//2

    # NOTE(review): same signed-comparison and if/elif coupling as in the
    # horizontal pass - this keeps the smallest x seen so far; confirm intent.
    if diff1 < dist2_0:
        dist2_0 = diff1
        final_x1 = x

    elif diff2 < dist2_1:
        dist2_1 = diff2
        final_x2 = x+w

    cv2.rectangle(result, top_left,bottom_right,(0,0, 255), 5)

# Save the annotated image; as the last expression, imwrite's boolean result is
# the cell output. NOTE(review): the file name looks like a placeholder - confirm.
cv2.imwrite('resusdfgslt.png', result)


Horizontal Line - X: 640 Y: 1219 Width: 138 Height: 1
Horizontal Line - X: 0 Y: 1219 Width: 92 Height: 1
Horizontal Line - X: 87 Y: 1122 Width: 260 Height: 7
Horizontal Line - X: 349 Y: 1119 Width: 370 Height: 8
Horizontal Line - X: 199 Y: 1078 Width: 173 Height: 1
Horizontal Line - X: 331 Y: 1074 Width: 239 Height: 3
Horizontal Line - X: 95 Y: 1074 Width: 110 Height: 2
Horizontal Line - X: 611 Y: 520 Width: 110 Height: 7
Horizontal Line - X: 589 Y: 506 Width: 132 Height: 10
Horizontal Line - X: 158 Y: 126 Width: 241 Height: 2
Horizontal Line - X: 90 Y: 79 Width: 609 Height: 8
Horizontal Line - X: 0 Y: 0 Width: 835 Height: 42
Vertical Line - X: 81 Y: 940 Width: 2 Height: 172
Vertical Line - X: 79 Y: 726 Width: 2 Height: 153
Vertical Line - X: 60 Y: 724 Width: 4 Height: 185
Vertical Line - X: 59 Y: 627 Width: 3 Height: 103
Vertical Line - X: 703 Y: 454 Width: 1 Height: 81
Vertical Line - X: 725 Y: 421 Width: 2 Height: 80
Vertical Line - X: 730 Y: 404 Width: 1 Height: 96
Vertical Line - 

True