In [8]:
import pymupdf
from pdf2docx import Converter
import re

In [17]:
import re

class CTITParser:
    ITEM_INDEX = 0
    REFERENCE_INDEX = 1
    NCM_INDEX = 2
    DESCRIPTION_INDEX = 3

    SUPPLY_WILDCARD = 'I#'

    PRODUCT_WILDCARD = 'P#'
    REFERENCE_WILDCARD = 'R#'

    SUPPLY_ROW_LENGTH = 4
    PRODUCT_ROW_LENGTH = 4
    RELATION_ROW_LENGTH = 11

    ctit_number_pattern = r"Número de CTIT:\s*(\d+/\d+)"
    vat_number_pattern = r"CUIT:\s*(\d+)"
    release_date_pattern = r"Fecha de emisión:\s*(\d{4}-\d{2}-\d{2})"

    supply_table_pattern = \
        r"INSUMOS \nITEM\nREFERENCIA\nNCM\nDESCRIPCIÓN\n(.*?)(PRODUCTOS|CONJUNTOS INSUMOS SUSTITUTIVOS) \nITEM\nREFERENCIA\nNCM\nDESCRIPCIÓN\n"
    product_table_pattern = \
        r"PRODUCTOS \nITEM\nREFERENCIA\nNCM\nDESCRIPCIÓN\n(.*?)RELACIÓN INSUMO / PRODUCTO \nRELACIÓN\s?INS/CON" #\nPRODUCTOS\nMERMAS\nPÉRDIDAS\nITEM\nITEM\nCANT\nUM\nITEM\nCANT\nUM\nCANT\nUM\nCANT\nUM\n"
    relation_table_pattern = \
        r"RELACIÓN INSUMO / PRODUCTO \nRELACIÓN\s?INS/CON\nPRODUCTOS\nMERMAS\nPÉRDIDAS\nITEM\nITEM\nCANT\nUM\nITEM\nCANT\nUM\nCANT\nUM\nCANT\nUM\n(.*?)Detalle de Merma \nITEM\nINSUMO\nPRODUCTO\nMERMA\nRELACIÓN\nITEM\nITEM\nNCM\nDESCRIPCION\n"

    supply_row_pattern = r"(I#\d+)\n([\s\S]+?)\n?(\d{8})\n((?:[^\n]+\n?)+?)(?=I#\d+|$)"
    product_row_pattern = r"(P#\d+)\n([\s\S]+?)\n(\d{8})\n((?:[^\n]+\n?)+?)(?=P#\d+|$)"
    relation_row_pattern = \
        r"(R#\d+)\n(I#\d+)\n([\d.]+)\n(\w+)\n(P#\d+)\n([\d.]+)\n([\w\n]+)\n([\d.]+)\n([\w\n]+)\n([\d.]+)\n((?:[^\n]+\n?)+?)(?=R#\d+|$)"

    @staticmethod
    def to_NCM_format(number):
        # Verifica si ya está en el formato deseado
        if re.match(r"^\d{4}\.\d{2}\.\d{2}$", number):
            return number  # Si ya está formateado, lo devuelve sin cambios
        else:
            # Si no está en el formato, aplica el formateo
            return f"{number[:4]}.{number[4:6]}.{number[6:]}"

    @classmethod
    def format_component_list(cls, list_of_tuple):
        component_list = [list(component_tuple) for component_tuple in list_of_tuple]
        # Remove Prefefix
        for component in component_list:
            component[cls.ITEM_INDEX] = component[cls.ITEM_INDEX][2:]
            # TODO: remove whitespace from end of string
            component[cls.REFERENCE_INDEX] = component[cls.REFERENCE_INDEX].replace("\n", " ")
            component[cls.NCM_INDEX] = cls.to_NCM_format(component[cls.NCM_INDEX])
            component[cls.DESCRIPTION_INDEX] = component[cls.DESCRIPTION_INDEX].replace("\n", " ")

        return component_list

    @classmethod
    def format_component_dict(cls, list_of_tuple, type_value):
        # component_list = [list(component_tuple) for component_tuple in list_of_tuple]
        keys = ['item', 'reference', 'ncm', 'description']
        component_list = list()
        for component in list_of_tuple:
            component_list.append({
                'item': component[cls.ITEM_INDEX][2:],
                'reference': component[cls.REFERENCE_INDEX].replace("\n", " "),
                'ncm': cls.to_NCM_format(component[cls.NCM_INDEX]),
                'description': component[cls.DESCRIPTION_INDEX].replace("\n", " "),
                'type': type_value,
                # 'default_unit': None
            })

        return component_list

    @classmethod
    def format_component_dict_V2(cls, component_row, type_value):
        # component_list = [list(component_tuple) for component_tuple in list_of_tuple]
        keys = ['item', 'reference', 'ncm', 'description']
        return {
            'item': component_row[0],
            'reference': component_row[1],
            'ncm': cls.to_NCM_format(component_row[2]),
            'description': component_row[3],
            'type': type_value,
            # 'default_unit': None
            }

    @classmethod
    def format_relation_list(cls, list_of_tuple):
        relation_list = [list(relation_tuple) for relation_tuple in list_of_tuple]
        for relation in relation_list:
            relation[cls.ITEM_INDEX] = relation[cls.ITEM_INDEX][2:]
        return relation_list

    @classmethod
    def format_relation_dict(cls, list_of_tuple):
        relation_list = list()
        for relation in list_of_tuple:
            relation_list.append({
                'item': relation[cls.ITEM_INDEX][2:],
                'supply_item': relation[1][2:],
                'supply_qty': relation[2],
                'supply_unit': relation[3],
                'product_item': relation[4][2:],
                'product_qty': relation[5],
                'product_unit': relation[6],
                'waste_qty': relation[7],
                'waste_unit': relation[8].replace("\n", ""),
                'loss_qty': relation[9],
                'loss_unit': relation[10].replace("\n", "")
            })
        return relation_list

    @classmethod
    def format_relation_dict_V2(cls, relation_row):
        return {
                'item': relation_row[0],
                'supply_item': relation_row[1],
                'supply_qty': relation_row[2],
                'supply_unit': relation_row[3],
                'product_item': relation_row[4],
                'product_qty': relation_row[5],
                'product_unit': relation_row[6],
                'waste_qty': relation_row[7],
                'waste_unit': relation_row[8],
                'loss_qty': relation_row[9],
                'loss_unit': relation_row[10]
        }


    def set_unit(self):
        for supply in self._supply_list:
            relation = [element for element in self._relation_list if element.get('supply_item') == supply.get('item')]
            if relation:
                supply['default_unit'] = relation[0]['supply_unit']
            else:
                supply['default_unit'] = 'Unidad'
        for product in self._product_list:
            relation = [element for element in self._relation_list if element.get('product_item') == product.get('item')]
            if relation:
                product['default_unit'] = relation[0]['product_unit']
            else:
                product['default_unit'] = 'Unidad'

    def __init__(self, stream: bytes = None, mode = ''):
        self._document = pymupdf.open(filename='pdf', stream=stream)
        text = ""
        for page in self._document:
            text += page.get_text()

        # print(text)
        ctit_number_search = re.search(self.ctit_number_pattern, text)
        if not ctit_number_search:
            raise Exception("No CTIT number found")
        self._ctit_number = ctit_number_search.group(1)

        vat_number_search = re.search(self.vat_number_pattern, text)
        if not vat_number_search:
            raise Exception("No VAT number found")
        self._vat_number = vat_number_search.group(1)

        release_date_search = re.search(self.release_date_pattern, text)
        if not release_date_search:
            raise Exception("No release date found")
        self._release_date = release_date_search.group(1)

        if mode == 'legacy':
            # Get supply
            supply_section = re.search(self.supply_table_pattern, text, re.DOTALL)
            if not supply_section:
                raise Exception("No supply section found")
            self._supply_list = re.findall(self.supply_row_pattern, supply_section.group(1))
            self._supply_list = self.format_component_dict(self._supply_list, 'S')

            # Get product
            product_section = re.search(self.product_table_pattern, text, re.DOTALL)
            if not product_section:
                raise Exception("No product section found")
            self._product_list = re.findall(self.product_row_pattern, product_section.group(1))
            self._product_list = self.format_component_dict(self._product_list, 'P')

            # Get relation
            relation_section = re.search(self.relation_table_pattern, text, re.DOTALL)
            if not relation_section:
                raise Exception("No relation section found")
            self._relation_list = re.findall(self.relation_row_pattern, relation_section.group(1))
            self._relation_list = self.format_relation_dict(self._relation_list)


        else:
            cv = Converter(stream=stream)
            tables = cv.extract_tables()
            if len(tables) < 3:
                raise Exception("No tables found")
            self._supply_list = []
            self._product_list = []
            self._relation_list = []
            for table in tables:
                # Check table type
                for row in table:
                    item = row[0]
                    if len(row) == self.SUPPLY_ROW_LENGTH and item.startswith(self.SUPPLY_WILDCARD):
                        self._supply_list.append(self.format_component_dict_V2(row, 'S'))
                    elif len(row) == self.PRODUCT_ROW_LENGTH and item.startswith(self.PRODUCT_WILDCARD):
                        self._product_list.append(self.format_component_dict_V2(row, 'P'))
                    elif len(row) == self.RELATION_ROW_LENGTH and item.startswith(self.REFERENCE_WILDCARD):
                        self._relation_list.append(self.format_relation_dict_V2(row))
                    else:
                        continue
        # Set unit correctly
        self.set_unit()

    @property
    def ctit_number(self):
        return self._ctit_number

    @property
    def vat_number(self):
        return self._vat_number

    @property
    def release_date(self):
        return self._release_date

    def get_supply_list(self):
        return self._supply_list

    def get_product_list(self):
        return self._product_list

    def get_relation_list(self):
        return self._relation_list


file =open("./ctit-soimers.pdf", "rb")
# ctit_fondomote = pymupdf.open(stream=file, mode='f')
ctit_obj =  CTITParser(stream=file.read())
# print(ctit_obj.ctit_number)
# print(ctit_obj.release_date)
print(ctit_obj.get_supply_list())
# print(ctit_obj.get_product_list())
# print(ctit_obj.get_relation_list())
# relation = ctit_obj.get_relation_list()


[INFO] [1;36m[1/4] Opening document...[0m
[INFO] [1;36m[2/4] Analyzing document...[0m
[INFO] [1;36m[3/4] Parsing pages...[0m
[INFO] (1/8) Page 1
[INFO] (2/8) Page 2
[INFO] (3/8) Page 3
[INFO] (4/8) Page 4
[INFO] (5/8) Page 5
[INFO] (6/8) Page 6
[INFO] (7/8) Page 7
[INFO] (8/8) Page 8


[{'item': 'I#1', 'reference': 'CH1', 'ncm': '8419.90.20', 'description': 'CHAPA CURVADA DE ACERO AL CARBONO (SA-516 Gr.70) REVESTIDA CON ACERO INOXIDABLE (SA-240 TP317L) PARA CUERPO DE COLUMNA DE DESTILACION, ESPESOR FINAL 15,9mm (12,7 + 3,2mm). ANCHO 2000mm x LARGO 5375mm.\nRADIO INTERIOR 1700mm (SEGÚN PLANO 2271-IN-M-FAB-100)', 'type': 'S', 'default_unit': 'UNIDAD'}, {'item': 'I#2', 'reference': 'CH2', 'ncm': '8419.90.20', 'description': 'CHAPA CURVADA DE ACERO AL CARBONO (SA-516 Gr.70) REVESTIDA CON ACERO INOXIDABLE (SA-240 TP317L) PARA CUERPO DE COLUMNA DE DESTILACION, ESPESOR FINAL 15,9mm (12,7+3,2mm). ANCHO 2000mm x LARGO 5356mm.\nRADIO INTERIOR 1700mm (SEGÚN PLANO 2271-IN-M-FAB-100).', 'type': 'S', 'default_unit': 'UNIDAD'}, {'item': 'I#3', 'reference': 'CH3', 'ncm': '8419.90.20', 'description': 'CHAPA CURVADA DE ACERO AL CARBONO (SA-516 Gr.70) REVESTIDA CON ACERO INOXIDABLE (SA-240 TP317L) PARA CUERPO DE COLUMNA DE DESTILACION, ESPESOR FINAL 12,7mm (9,5+3,2mm). ANCHO 2000mm x L

In [5]:
ctit_fondo = pymupdf.open(filename="ctit-soimers.pdf")

In [17]:
for page in ctit_fondo.pages():
    tables = page.extract_tables()
    
print(tables[0].extract())

AttributeError: 'Page' object has no attribute 'extract_tables'

In [12]:
from pdf2docx import Converter

In [16]:
cv = Converter("./ctit-fondomonte.pdf")
tables = cv.extract_tables()
cv.close()

for table in tables:
    print(table)

[INFO] [1;36m[1/4] Opening document...[0m
[INFO] [1;36m[2/4] Analyzing document...[0m
[INFO] [1;36m[3/4] Parsing pages...[0m
[INFO] (1/3) Page 1
[INFO] (2/3) Page 2
[INFO] (3/3) Page 3


[['<NEST TABLE>']]
[['I#1', 'FMFUNDA8', '54072000', 'TEJIDOS DE HILADOS DE FILAMENTOS SINTETICOS - DE POLIPROPILENO - TUBULARES DE PESO 160 g/m2. De 185 cm de largo x 116 cm de ancho'], ['I#2', 'FMFUNDA9', '54072000', 'TEJIDOS DE HILADOS DE FILAMENTOS SINTETICOS - DE POLIPROPILENO - TUBULARES DE PESO 160 g/m2. De 178,5 cm de largo x 132 cm de ancho'], ['I#3', 'FMFUNDA10', '54072000', 'TEJIDOS DE HILADOS DE FILAMENTOS SINTETICOS - DE POLIPROPILENO - TUBULARES DE PESO 160 g/m2. De 167 cm de largo x 132 cm de ancho']]
[['C#1', 'FMFUNDA10', '54072000', 'TEJIDOS DE HILADOS DE FILAMENTOS SINTETICOS - DE POLIPROPILENO - TUBULARES DE PESO 160 g/m2. De 167 cm de largo x 132 cm de ancho'], ['C#1', 'FMFUNDA8', '54072000', 'TEJIDOS DE HILADOS DE FILAMENTOS SINTETICOS - DE POLIPROPILENO - TUBULARES DE PESO 160 g/m2. De 185 cm de largo x 116 cm de ancho'], ['C#1', 'FMFUNDA9', '54072000', 'TEJIDOS DE HILADOS DE FILAMENTOS SINTETICOS - DE POLIPROPILENO - TUBULARES DE PESO 160 g/m2. De 178,5 cm de larg