In [None]:
from glob import glob
from pdf2image import convert_from_path
import os
import spacy
import pytesseract
import cv2

In [18]:
pdf_paths = glob('./pdfs/*.pdf')
pdf_paths

['./pdfs\\ATRE_20102262361_20240314_7930_00.pdf',
 './pdfs\\ATRE_20104508023_20240404_7954_00.pdf',
 './pdfs\\ATRE_20106026883_20240401_7957_00.pdf',
 './pdfs\\ATRE_20107012011_20240326_7940_00.pdf',
 './pdfs\\ATRE_20324737171_20240520_8032_00.pdf',
 './pdfs\\ATRE_20438252089_20240327_7951_00.pdf',
 './pdfs\\ATRE_20503497990_20240507_8021_00.pdf',
 './pdfs\\ATRE_20509654060_20240506_8020_00.pdf',
 './pdfs\\ATRE_20519388104_20240326_7941_00.pdf',
 './pdfs\\ATRE_20536893530_20240326_7939_00.pdf']

In [24]:
pdf_names = [pdf_path[7:-4] for pdf_path in pdf_paths]
pdf_names

['ATRE_20102262361_20240314_7930_00',
 'ATRE_20104508023_20240404_7954_00',
 'ATRE_20106026883_20240401_7957_00',
 'ATRE_20107012011_20240326_7940_00',
 'ATRE_20324737171_20240520_8032_00',
 'ATRE_20438252089_20240327_7951_00',
 'ATRE_20503497990_20240507_8021_00',
 'ATRE_20509654060_20240506_8020_00',
 'ATRE_20519388104_20240326_7941_00',
 'ATRE_20536893530_20240326_7939_00']

In [None]:
for pdf_path in pdf_paths:
    if not os.path.exists(f'images/{pdf_path[7:-4]}'):
        os.makedirs(f'images/{pdf_path[7:-4]}')
    pages = convert_from_path(pdf_path, 350)
    for i, page in enumerate(pages):
        page.save(f'./images/{pdf_path[7:-4]}/page_{i}.jpg', 'JPEG')

In [19]:
pdf_name_image_list = {}
for pdf_path in pdf_paths:
    pdf_name_image_list[f'{pdf_path[7:-4]}'] = glob(f'./images/{pdf_path[7:-4]}/*.jpg')
pdf_name_image_list

{'ATRE_20102262361_20240314_7930_00': ['./images/ATRE_20102262361_20240314_7930_00\\page_0.jpg',
  './images/ATRE_20102262361_20240314_7930_00\\page_1.jpg',
  './images/ATRE_20102262361_20240314_7930_00\\page_10.jpg',
  './images/ATRE_20102262361_20240314_7930_00\\page_2.jpg',
  './images/ATRE_20102262361_20240314_7930_00\\page_3.jpg',
  './images/ATRE_20102262361_20240314_7930_00\\page_4.jpg',
  './images/ATRE_20102262361_20240314_7930_00\\page_5.jpg',
  './images/ATRE_20102262361_20240314_7930_00\\page_6.jpg',
  './images/ATRE_20102262361_20240314_7930_00\\page_7.jpg',
  './images/ATRE_20102262361_20240314_7930_00\\page_8.jpg',
  './images/ATRE_20102262361_20240314_7930_00\\page_9.jpg'],
 'ATRE_20104508023_20240404_7954_00': ['./images/ATRE_20104508023_20240404_7954_00\\page_0.jpg',
  './images/ATRE_20104508023_20240404_7954_00\\page_1.jpg',
  './images/ATRE_20104508023_20240404_7954_00\\page_10.jpg',
  './images/ATRE_20104508023_20240404_7954_00\\page_2.jpg',
  './images/ATRE_201045

In [None]:
# Path to the Tesseract executable
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Load the Spanish NLP model
nlp = spacy.load("es_core_news_sm")

# Function to process a single page image and extract text from two columns
def process_image(image_path):
    # Read the image
    image = cv2.imread(image_path)
    height, width, _ = image.shape
    # print(f"Image dimensions: {width}x{height}")
    # Divide the image into two columns
    left_col = image[850:3594, :width // 2]
    right_col = image[850:3594, width // 2:]

    # Convert to binary for better OCR results
    ret1, thresh1 = cv2.threshold(left_col, 120, 255, cv2.THRESH_BINARY)
    ret2, thresh2 = cv2.threshold(right_col, 120, 255, cv2.THRESH_BINARY)

    # OCR configuration
    custom_config = r'--oem 3 --psm 4'

    # Extract text from both columns
    text_left = str(pytesseract.image_to_string(thresh1, config=custom_config))
    text_right = str(pytesseract.image_to_string(thresh2, config=custom_config))

    # Combine the text from both columns
    text = text_left + "\n" + text_right

    return text

# Directory to save the output text files
output_dir = r"./texts"
os.makedirs(output_dir, exist_ok=True)

for doc_name in pdf_names:
    
    images = pdf_name_image_list[doc_name]
    print(len(images))
    doc_text = ""
    for image in images:
        if len(images) == 10:
            if "page_0" in image or "page_7" in image or "page_8" in image or "page_9" in image:
                extracted_text = process_image(image)
        elif len(images) == 11:
            if "page_0" in image or "page_8" in image or "page_9" in image or "page_90" in image:
                extracted_text = process_image(image)
        elif len(images) == 12:
            if "page_0" in image or "page_9" in image or "page_90" in image or "page_91" in image:
                extracted_text = process_image(image)
        elif len(images) == 13:
            if "page_0" in image or "page_90" in image or "page_91" in image or "page_92" in image:
                extracted_text = process_image(image)

        doc_text = doc_text + "\n" + extracted_text
        
    print(f"Extracted text from {doc_name}.pdf:")
    # print(doc_text)
    output_file = os.path.join(output_dir, f"{doc_name}.txt")
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(doc_text)
    
    print(f"Text extracted and saved to {output_file}")
    
    # doc = nlp(doc_text)
    # # Print extracted entities
    # for ent in doc.ents:
    #     print(
    #         f"""
    #         {ent.text = }
    #         {ent.start_char = }
    #         {ent.end_char = }
    #         {ent.label_ = }
    #         spacy.explain('{ent.label_}') = {spacy.explain(ent.label_)}"""
    #     )


In [None]:
import re
import spacy
import json
import os

# Cargar el modelo de lenguaje de SpaCy
nlp = spacy.load("es_core_news_sm")

def extract_information(text):
    doc = nlp(text)

    # Expresiones regulares para buscar RUC, nombre del cliente, fecha de firma, etc.
    ruc_pattern = re.compile(r'RUC N°? (\d+)')
    client_name_pattern = re.compile(r'(?<=Denominaci[oó]n social:)\s*(.+)')
    client_representative_pattern = re.compile(r'(?<=Representante legal:)\s*(.+)')
    signature_date_pattern = re.compile(r'Fecha de Firma de la Generadora:?\s*([\d/]+)')
    start_date_pattern = re.compile(r'Fecha de Inicio(.+)\n?.*\n?(.+)(\d{2}/\d{2}/\d{4})')
    end_date_pattern = re.compile(r'Fecha de Término\s*\n?.*\n?\s*(\d{2}/\d{2}/\d{4})')
    power_period_pattern = re.compile(r'(?<=Potencia del Periodo:)\s*\d+ kW')

    # Buscar las coincidencias en el texto
    ruc_match = ruc_pattern.search(text)
    client_name_match = client_name_pattern.search(text)
    client_representative_match = client_representative_pattern.search(text)
    signature_date_match = signature_date_pattern.search(text)
    start_date_match = start_date_pattern.search(text)
    end_date_match = end_date_pattern.search(text)
    power_period_match = power_period_pattern.search(text)

    # Extraer las coincidencias si existen
    ruc = ruc_match.group(1) if ruc_match else "No encontrado"
    client_name = client_name_match.group(1).strip() if client_name_match else "No encontrado"
    client_representative = client_representative_match.group(1).strip() if client_representative_match else "No encontrado"
    signature_date = signature_date_match.group(1) if signature_date_match else "No encontrado"
    start_date = start_date_match.group(1) if start_date_match else "No encontrado"
    end_date = end_date_match.group(1) if end_date_match else "No encontrado"
    power_period = power_period_match.group(0) if power_period_match else "No encontrado"

    return {
        "RUC": ruc,
        "Client Name": client_name,
        "Client Representative": client_representative,
        "Signature Date": signature_date,
        "Start Date": start_date,
        "End Date": end_date,
        "Power Period": power_period
    }

# Leer el archivo de texto
with open('./texts/ATRE_20102262361_20240314_7930_00.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Extraer la información del texto
info = extract_information(text)

# Imprimir la información extraída
for key, value in info.items():
    print(f"{key}: {value}")

# Guardar la información en un archivo JSON
output_dir = './jsons'
os.makedirs(output_dir, exist_ok=True)

output_file = os.path.join(output_dir, 'ATRE_20102262361_20240314_7930_00.json')
with open(output_file, 'w', encoding='utf-8') as json_file:
    json.dump(info, json_file, ensure_ascii=False, indent=4)

print(f"Information extracted and saved to {output_file}")


In [None]:
import spacy
import os

# Cargar el modelo de español de spaCy
nlp = spacy.load("es_core_news_sm")

def extraer_dni(texto):
    # Procesar el texto con spaCy
    doc = nlp(texto)
    
    # Buscar patrones que puedan indicar un DNI
    for ent in doc.ents:
        # Buscar entidades que puedan ser un DNI
        if ent.label_ == "PER" or ent.label_ == "MISC":
            # Buscar un número de 8 dígitos cerca de la entidad
            contexto = doc[max(0, ent.start - 5):min(len(doc), ent.end + 5)]
            for token in contexto:
                if token.like_num and len(token.text) == 8:
                    return token.text
    
    # Si no se encuentra, buscar cualquier número de 8 dígitos en el texto
    for token in doc:
        if token.like_num and len(token.text) == 8:
            return token.text
    
    return None

# Directorio que contiene los archivos de texto
directorio = './texts'

# Iterar sobre todos los archivos .txt en el directorio
for nombre_archivo in os.listdir(directorio):
    if nombre_archivo.endswith('.txt'):
        ruta_archivo = os.path.join(directorio, nombre_archivo)
        
        # Leer el contenido del archivo
        with open(ruta_archivo, 'r', encoding='utf-8') as archivo:
            contenido = archivo.read()
        
        # Extraer el DNI
        dni_extraido = extraer_dni(contenido)
        
        # Imprimir el resultado
        if dni_extraido:
            print(f"Archivo: {nombre_archivo}, Denominación social extraído: {dni_extraido}")
        else:
            print(f"Archivo: {nombre_archivo}, No se encontró DNI")