#### Imports

In [392]:
# ctrl+shift+p -> Notebook: Select Notebook Kernel -> venv
import os
from pprint import pprint

import cv2
import numpy as np
from matplotlib import pyplot as plt
from pdf2image import convert_from_path
from PIL import Image, ImageFilter
from pytesseract import pytesseract

#### Utility functions

In [393]:
def convert_pdf_to_img(
    pdf_file,
    dpi=500,
    poppler_path=r"C:\Program Files\poppler-23.01.0\Library\bin",
):
    """Render every page of a PDF through pdf2image.

    Args:
        pdf_file: Path of the PDF to render.
        dpi: Rendering resolution forwarded to ``convert_from_path``.
        poppler_path: Directory holding the Poppler binaries. The default is
            the Windows install location this notebook was written against;
            pass another directory (or ``None``) on other machines.

    Returns:
        Whatever ``convert_from_path`` returns (one image per page).
    """
    return convert_from_path(
        pdf_path=pdf_file,
        dpi=dpi,
        poppler_path=poppler_path,
    )


def convert_image_to_text(file, lang="por"):
    """Run Tesseract OCR on an image and return the recognised text.

    Args:
        file: Image handed straight to ``pytesseract.image_to_string``.
        lang: Tesseract language code; defaults to ``"por"`` as before.

    Returns:
        The string produced by ``pytesseract.image_to_string``.

    Note:
        Every call re-points pytesseract at the Windows Tesseract install and
        overwrites the ``TESSDATA_PREFIX`` environment variable.
    """
    pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
    os.environ['TESSDATA_PREFIX'] = r'C:\Program Files\Tesseract-OCR\tessdata'
    return pytesseract.image_to_string(file, lang=lang)


def save(img, name):
    """Write ``img`` to the file ``name`` with ``cv2.imwrite``.

    Args:
        img: Image array to write.
        name: Destination path; OpenCV picks the format from the extension.

    Returns:
        The flag returned by ``cv2.imwrite``. OpenCV signals a failed write
        by returning ``False`` rather than raising, so the flag is passed
        back to let callers detect it.
    """
    return cv2.imwrite(name, img)


def get_concat_v(im1, im2):
    """Stack two PIL images vertically, ``im1`` above ``im2``.

    The canvas is as wide as the wider of the two inputs so neither image is
    clipped; both are pasted flush with the left edge. For inputs of equal
    width the result matches a canvas sized from ``im1`` alone.

    Args:
        im1: Image placed at the top.
        im2: Image placed directly underneath ``im1``.

    Returns:
        A new RGB image of size ``(max width, im1.height + im2.height)``.
    """
    canvas_size = (max(im1.width, im2.width), im1.height + im2.height)
    dst = Image.new('RGB', canvas_size)
    dst.paste(im1, (0, 0))
    dst.paste(im2, (0, im1.height))
    return dst


def show(img_array):
    """Open ``img_array`` in the default image viewer via PIL."""
    Image.fromarray(img_array).show()


def process_image(image: np.ndarray, _x: int = 1400, _y: int = 80):
    """Find the outer contours of dilated text regions in a page image.

    Args:
        image: Page image as an RGB array (it is converted to BGR first).
        _x: Width of the rectangular dilation kernel, in pixels.
        _y: Height of the rectangular dilation kernel, in pixels.

    Returns:
        ``(contours, image)`` where ``contours`` is sorted by the y
        coordinate of each contour's bounding rectangle (top of the page
        first) and ``image`` is the BGR version of the input.
    """
    # Convert the RGB (PIL-ordered) array to OpenCV's BGR channel order.
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    # Grayscale, then blur before thresholding.
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (7, 7), 0)
    # Inverted Otsu threshold of the blurred grayscale image.
    _, threshold = cv2.threshold(
        blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    )
    # Dilate with a kernel sized by the caller. The size used to be fixed at
    # (1400, 80) regardless of _x/_y; the defaults keep that behaviour.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (_x, _y))
    dilate = cv2.dilate(threshold, kernel, iterations=1)
    # Find the outer contours of the dilated image.
    found = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # The contour list is element 0 of a 2-item result and element 1
    # otherwise (the return shape differs between OpenCV versions).
    contours = found[0] if len(found) == 2 else found[1]
    # Order the contours by the y coordinate of their bounding rectangle.
    contours = sorted(contours, key=lambda c: cv2.boundingRect(c)[1])
    return contours, image



#### Reading the PDF and combining all of its pages into a single image

In [394]:
pdf_file = "REALFLEX.pdf"
images = convert_pdf_to_img(pdf_file)
if not images:
    raise ValueError(f"No pages were rendered from {pdf_file}")

# Trim 100 px from the top and bottom of every page
# (presumably page header/footer margins; confirm against the document).
_images = [
    page.crop((0, 100, page.width, page.height - 100))
    for page in images
]

# The first page loses a further 330 px from its top. Its own dimensions are
# used here rather than values left over from the last page of the loop.
first_page = _images[0]
im_v = first_page.crop((0, 330, first_page.width, first_page.height))

# Stack the remaining pages underneath; a one-page PDF simply skips the loop.
for page in _images[1:]:
    im_v = get_concat_v(im_v, page)

In [399]:
# Process the stitched image and get the contours (sorted top to bottom)
cnts, image = process_image(np.array(im_v))

# cv2.imwrite does not create missing folders; make sure the target exists.
os.makedirs("./output", exist_ok=True)

# Keep only regions taller than 200 px and wider than 800 px.
boxes = [cv2.boundingRect(c) for c in cnts]
boxes = [(x, y, w, h) for (x, y, w, h) in boxes if h > 200 and w > 800]

# Extract each region. The crop is copied so that drawing on `image` below
# cannot bleed into `cropped_im`, which a later cell passes to OCR.
counter = 0
for x, y, w, h in boxes:
    cropped_im = image[y:y+h, x:x+w].copy()
    cv2.imwrite(f"./output/cropped_{counter}.jpg", cropped_im)
    counter += 1

# Draw the boxes only after every crop has been written, so no saved crop
# contains a rectangle drawn for another region.
for x, y, w, h in boxes:
    cv2.rectangle(image, (x, y), (x+w, y+h), (36, 255, 12), 2)

cv2.imwrite("sample_boxes.jpg", image)

True

In [400]:
# OCR the most recent crop (`cropped_im` is left over from the cropping loop,
# so only the last extracted region is read). The crop is in OpenCV's BGR
# order, while pytesseract treats arrays as RGB, so convert it first.
text = convert_image_to_text(cv2.cvtColor(cropped_im, cv2.COLOR_BGR2RGB))

In [402]:
# Show the OCR result; pprint wraps the long string into quoted chunks so
# newlines and stray characters stay visible. `pprint` is imported with the
# other imports at the top of the notebook.
pprint(text)



('Informações sobre o parcelamento\n'
 'Nenhum registro encontrado\n'
 '\n'
 'Informações sobre os pagamentos efetuados\n'
 '\n'
 'Data Limite Data de Valor Referênci Óraã Data de BancovAgência Número de '
 'Tipo de Crédito\n'
 'Pag Arrecadação Recolhido erereência rgao Recepção - Arquivamento Pp\n'
 '\n'
 '30/11/2021 24/11/2021 R$ 15.479,28 ANTECIPACAO 9999999) 25/11/2021 '
 '999/9999-9 999999999999 Tr REraDO O\n'
 '30/12/2021 10/12/2021 R$ 15.634,09 JANTECIPACAO 9999999 13/12/2021 '
 '999/9999-9 999999999999 e RErADO O\n'
 '31/01/2022 28/01/2022 R$ 15.753,27 JANTECIPACAO 9999999 31/01/2022 '
 '999/9999-9 999999999999 Tr REraDO O\n'
 '25/02/2022 23/02/2022 R$ 15.866,28 JANTECIPACAO 9999999 24/02/2022 '
 '999/9999-9 999999999999 e RErADO O\n'
 '31/03/2022 28/03/2022 R$ 15.983,92 JANTECIPACAO 9999999 29/03/2022 '
 '999/9999-9 999999999999 Tr REraDO O\n'
 '29/04/2022 27/04/2022 R$ 16.127,87 ANTECIPACAO 9999999 28/04/2022 999/9999-9 '
 '999999999999 e RErADO O\n'
 '\n'
 'Informações de oc