In [1]:
import os
import tqdm
from PIL import ImageEnhance
from tempfile import TemporaryDirectory
from pdf2image import convert_from_path
from img2pdf import convert
from typing import List
from img2table.document import PDF
from img2table.ocr import TesseractOCR

In [2]:
def change_pdf_colors_for_ocr(pdf_input_path: str, pdf_output_file: str) -> None:
    """
    Re-renders a PDF as high-contrast grayscale pages for better OCR results.

    Every page is rasterized, converted to PIL mode "L", contrast-enhanced (factor 3)
    and then brightness-enhanced (factor 2.5). The processed pages are written as
    JPEG files into a temporary directory and assembled into a single output PDF.
    :param pdf_input_path: path to the input PDF file
    :param pdf_output_file: path to the output PDF file
    :return: None
    """
    processed_pages = []
    for page in convert_from_path(pdf_input_path):
        gray_page = page.convert("L")
        # Contrast is applied first; brightness is then applied to the contrasted image.
        contrasted_page = ImageEnhance.Contrast(gray_page).enhance(3)
        processed_pages.append(ImageEnhance.Brightness(contrasted_page).enhance(2.5))

    # The JPEG files are only needed until the PDF is assembled, so they live in a
    # temporary directory that is removed when the `with` block exits.
    with TemporaryDirectory() as temp_dir:
        jpeg_paths = []

        # Page files are numbered from 1: page_1.jpeg, page_2.jpeg, ...
        for page_number, page in enumerate(processed_pages, start=1):
            jpeg_path = os.path.join(temp_dir, f"page_{page_number}.jpeg")
            jpeg_paths.append(jpeg_path)
            page.save(jpeg_path, "JPEG")

        with open(pdf_output_file, "bw") as gray_pdf:
            gray_pdf.write(convert(jpeg_paths))

In [5]:
# Bank identifiers passed to extract_tables; each one is used there as a
# sub-directory name under ../data/.
banks = [
    "alior_bank",
    "city",
    "mBank",
    "milenium",
    "PKO",
]

In [4]:
def extract_tables(banks: List[str], data_dir: str = "../data") -> None:
    """
    Extracts tables from PDF files and saves them as XLSX files.

    For each bank, every ``.pdf`` file in ``<data_dir>/<bank>/`` that is not a
    previously generated ``bw_`` copy is first rewritten by
    change_pdf_colors_for_ocr to ``<data_dir>/<bank>/bw_<file>``. That copy is then
    loaded with img2table and exported to ``<data_dir>/tables/<file stem>.xlsx``.
    :param banks: list of banks (names of the sub-directories of data_dir to scan)
    :param data_dir: root data directory; defaults to "../data"
    :return: None
    """
    # Prefix given to the OCR-friendly copies; files carrying it are outputs of a
    # previous run and must not be processed again.
    bw_prefix = "bw_"

    # Create the output directory up front so a missing folder cannot fail the
    # export after the conversion and OCR work has already been done.
    tables_dir = os.path.join(data_dir, "tables")
    os.makedirs(tables_dir, exist_ok=True)

    ocr = TesseractOCR(lang="pol")
    for bank in tqdm.tqdm(banks):
        bank_dir = os.path.join(data_dir, bank)
        for file in os.listdir(bank_dir):
            if not file.endswith(".pdf") or file.startswith(bw_prefix):
                continue

            source_pdf = os.path.join(bank_dir, file)
            bw_pdf = os.path.join(bank_dir, bw_prefix + file)
            change_pdf_colors_for_ocr(source_pdf, bw_pdf)

            # NOTE: the output name uses the file stem only, so identically named
            # PDFs in different bank directories write to the same XLSX file.
            stem = os.path.splitext(file)[0]
            pdf = PDF(src=bw_pdf)
            pdf.to_xlsx(os.path.join(tables_dir, f"{stem}.xlsx"), ocr=ocr)

In [None]:
# Run the extraction for every configured bank; results are written to disk,
# so the call itself returns None and displays nothing.
extract_tables(banks=banks)