In [None]:
from glob import glob
from pdf2image import convert_from_path
import os
import spacy
import pytesseract
import cv2

In [18]:
pdf_paths = glob('./pdfs/*.pdf')
pdf_paths

['./pdfs\\ATRE_20102262361_20240314_7930_00.pdf',
 './pdfs\\ATRE_20104508023_20240404_7954_00.pdf',
 './pdfs\\ATRE_20106026883_20240401_7957_00.pdf',
 './pdfs\\ATRE_20107012011_20240326_7940_00.pdf',
 './pdfs\\ATRE_20324737171_20240520_8032_00.pdf',
 './pdfs\\ATRE_20438252089_20240327_7951_00.pdf',
 './pdfs\\ATRE_20503497990_20240507_8021_00.pdf',
 './pdfs\\ATRE_20509654060_20240506_8020_00.pdf',
 './pdfs\\ATRE_20519388104_20240326_7941_00.pdf',
 './pdfs\\ATRE_20536893530_20240326_7939_00.pdf']

In [24]:
pdf_names = [pdf_path[7:-4] for pdf_path in pdf_paths]
pdf_names

['ATRE_20102262361_20240314_7930_00',
 'ATRE_20104508023_20240404_7954_00',
 'ATRE_20106026883_20240401_7957_00',
 'ATRE_20107012011_20240326_7940_00',
 'ATRE_20324737171_20240520_8032_00',
 'ATRE_20438252089_20240327_7951_00',
 'ATRE_20503497990_20240507_8021_00',
 'ATRE_20509654060_20240506_8020_00',
 'ATRE_20519388104_20240326_7941_00',
 'ATRE_20536893530_20240326_7939_00']

In [None]:
for pdf_path in pdf_paths:
    if not os.path.exists(f'images/{pdf_path[7:-4]}'):
        os.makedirs(f'images/{pdf_path[7:-4]}')
    pages = convert_from_path(pdf_path, 350)
    for i, page in enumerate(pages):
        page.save(f'./images/{pdf_path[7:-4]}/page_{i}.jpg', 'JPEG')

In [19]:
pdf_name_image_list = {}
for pdf_path in pdf_paths:
    pdf_name_image_list[f'{pdf_path[7:-4]}'] = glob(f'./images/{pdf_path[7:-4]}/*.jpg')
pdf_name_image_list

{'ATRE_20102262361_20240314_7930_00': ['./images/ATRE_20102262361_20240314_7930_00\\page_0.jpg',
  './images/ATRE_20102262361_20240314_7930_00\\page_1.jpg',
  './images/ATRE_20102262361_20240314_7930_00\\page_10.jpg',
  './images/ATRE_20102262361_20240314_7930_00\\page_2.jpg',
  './images/ATRE_20102262361_20240314_7930_00\\page_3.jpg',
  './images/ATRE_20102262361_20240314_7930_00\\page_4.jpg',
  './images/ATRE_20102262361_20240314_7930_00\\page_5.jpg',
  './images/ATRE_20102262361_20240314_7930_00\\page_6.jpg',
  './images/ATRE_20102262361_20240314_7930_00\\page_7.jpg',
  './images/ATRE_20102262361_20240314_7930_00\\page_8.jpg',
  './images/ATRE_20102262361_20240314_7930_00\\page_9.jpg'],
 'ATRE_20104508023_20240404_7954_00': ['./images/ATRE_20104508023_20240404_7954_00\\page_0.jpg',
  './images/ATRE_20104508023_20240404_7954_00\\page_1.jpg',
  './images/ATRE_20104508023_20240404_7954_00\\page_10.jpg',
  './images/ATRE_20104508023_20240404_7954_00\\page_2.jpg',
  './images/ATRE_201045

In [28]:
# Path to the Tesseract executable
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Load the Spanish NLP model
nlp = spacy.load("es_core_news_sm")

# Function to process a single page image and extract text from two columns
def process_image(image_path):
    # Read the image
    image = cv2.imread(image_path)
    height, width, _ = image.shape
    # print(f"Image dimensions: {width}x{height}")
    # Divide the image into two columns
    left_col = image[850:3594, :width // 2]
    right_col = image[850:3594, width // 2:]

    # Convert to binary for better OCR results
    ret1, thresh1 = cv2.threshold(left_col, 120, 255, cv2.THRESH_BINARY)
    ret2, thresh2 = cv2.threshold(right_col, 120, 255, cv2.THRESH_BINARY)

    # OCR configuration
    custom_config = r'--oem 3 --psm 4'

    # Extract text from both columns
    text_left = str(pytesseract.image_to_string(thresh1, config=custom_config))
    text_right = str(pytesseract.image_to_string(thresh2, config=custom_config))

    # Combine the text from both columns
    text = text_left + "\n" + text_right

    return text

# Directory to save the output text files
output_dir = r"./texts"
os.makedirs(output_dir, exist_ok=True)

for doc_name in pdf_names:
    
    images = pdf_name_image_list[doc_name]
    
    doc_text = ""
    for image in images:
        if len(images) == 10:
            if "page_0" in image or "page_7" in image or "page_8" in image or "page_9" in image:
                extracted_text = process_image(image)
        elif len(images) == 11:
            if "page_0" in image or "page_8" in image or "page_9" in image or "page_10" in image:
                extracted_text = process_image(image)
        elif len(images) == 12:
            if "page_0" in image or "page_9" in image or "page_10" in image or "page_11" in image:
                extracted_text = process_image(image)

        doc_text = doc_text + "\n" + extracted_text
        
    print(f"Extracted text from {doc_name}.pdf:")
    # print(doc_text)
    output_file = os.path.join(output_dir, f"{doc_name}.txt")
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(doc_text)
    
    print(f"Text extracted and saved to {output_file}")
    
    # doc = nlp(doc_text)
    # # Print extracted entities
    # for ent in doc.ents:
    #     print(
    #         f"""
    #         {ent.text = }
    #         {ent.start_char = }
    #         {ent.end_char = }
    #         {ent.label_ = }
    #         spacy.explain('{ent.label_}') = {spacy.explain(ent.label_)}"""
    #     )


Extracted text from ATRE_20102262361_20240314_7930_00.pdf:

“Contrato }, que celepran ae Und parte Atria CMerBla
S.A.C., con RUC N° 20501860329, con domicilio en Av.
Pardo y Aliaga N° 675, Of. 301, Distrito de San Isidro,
Provincia y Departamento de Lima, debidamente
representada por su Apoderado, el sefior Fernando Javier
Vega Sanchez, con DNI N° 09997706, segun poderes
inscritos en la Partida Electrénica N° 11269325 del
Registro de Personas Juridicas de la Oficina Registral de
Lima, a la que en adelante se denominara, la
“Generadora’ y de la otra parte e! Cliente, cuyos datos de
identificacidn se detallan en ei Anexo 1 de este Contrata,
en los términos y condiciones siguientes:

Para efectos de este Contrato, ej Cliente y a Generadora
podran ser denominados individualmente como [a “Parte”
y conjuntamente come las “Partes”.

Primera - Marco Legal

Este Contrato se encuentra sujeto al régimen de libertad
de precios, segtin jo establecido articulo 89 de la Ley de:
Concesiones Eléctricas

KeyboardInterrupt: 