In [1]:
import re

import pandas as pd
import pytesseract
from img2table.document import PDF
from img2table.ocr import TesseractOCR
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException
from pdf2image import convert_from_path

# Locate the tables in a PDF, OCR each table region, bucket the recognised
# tokens by language, and save one Excel sheet per table.

# Path to the PDF
pdf_path = "PDFdocs/Certificat d’origine – Eur.1.pdf"

# Tesseract language packs, shared by the table-detection OCR and the
# per-table text extraction so both passes recognise the same scripts.
ocr_langs = "eng+fra+ara"

# langdetect is non-deterministic on short or ambiguous input (and the
# tokens classified below are single words); seeding makes runs repeatable.
DetectorFactory.seed = 0


def _split_by_language(tokens):
    """Bucket OCR tokens into English, French, Arabic and numeric lists.

    Tokens made only of digits go to the numeric bucket. Every other token
    is passed to ``langdetect.detect``; tokens detected as a language other
    than en/fr/ar, or for which detection fails, are dropped.

    Args:
        tokens: Iterable of whitespace-separated strings from one OCR line.

    Returns:
        A 4-tuple ``(english, french, arabic, numeric)`` of lists of str.
    """
    english_parts = []
    french_parts = []
    arabic_parts = []
    numeric_parts = []
    buckets = {"en": english_parts, "fr": french_parts, "ar": arabic_parts}

    for part in tokens:
        if part.isdigit():
            numeric_parts.append(part)
            continue
        try:
            lang = detect(part)
        except LangDetectException:
            # langdetect could not classify the token (e.g. it contains no
            # letters); skipping it keeps the original best-effort behaviour
            # without hiding unrelated errors.
            continue
        bucket = buckets.get(lang)
        if bucket is not None:
            bucket.append(part)

    return english_parts, french_parts, arabic_parts, numeric_parts


# Convert the PDF into images (one image per page)
# NOTE(review): cropping below assumes these images and the img2table
# bounding boxes use the same resolution — confirm both DPI settings match.
pages = convert_from_path(pdf_path)

# Instantiation of the pdf
pdf = PDF(src=pdf_path)

# Instantiation of the OCR, Tesseract, which requires prior installation
ocr = TesseractOCR(lang=ocr_langs)

# Table identification and extraction
pdf_tables = pdf.extract_tables(ocr=ocr)

# Dataframes keyed by sheet name ("Page <n> Table <m>")
dfs = {}

# Process each table
for page_num, tables in pdf_tables.items():
    # Get the image corresponding to the page
    image = pages[page_num]

    for i, table in enumerate(tables):
        # Only the bounding-box access is guarded, so an AttributeError raised
        # by later steps is not misreported as a missing bounding box.
        try:
            bbox = table.bbox
            bbox_tuple = (bbox.x1, bbox.y1, bbox.x2, bbox.y2)
        except AttributeError:
            print("Unable to get bounding box coordinates.")
            continue

        # Crop the page image to the table and OCR just that region
        cropped_image = image.crop(bbox_tuple)
        table_text = pytesseract.image_to_string(cropped_image, lang=ocr_langs)

        # One row per non-blank OCR line, one cell per whitespace-separated token
        rows = [line.split() for line in table_text.split('\n') if line.strip()]
        df = pd.DataFrame(rows)

        # Add the dataframe to the dictionary
        dfs[f'Page {page_num + 1} Table {i + 1}'] = df

        # Report the language breakdown of every line in the table
        for line in rows:
            english_parts, french_parts, arabic_parts, numeric_parts = _split_by_language(line)
            print(f'English: {" ".join(english_parts)}, French: {" ".join(french_parts)}, Arabic: {" ".join(arabic_parts)}, Numeric: {" ".join(numeric_parts)}')

# Write the dataframes to an Excel file. Skipped when nothing was extracted,
# because saving a workbook that contains no sheets fails.
if dfs:
    with pd.ExcelWriter('tables.xlsx') as writer:
        for name, df in dfs.items():
            df.to_excel(writer, sheet_name=name)
else:
    print("No tables were extracted; tables.xlsx was not written.")


tesseract 4.1.1
 leptonica-1.79.0
  libgif 5.1.4 : libjpeg 8d (libjpeg-turbo 2.0.3) : libpng 1.6.37 : libtiff 4.1.0 : zlib 1.2.11 : libwebp 0.6.1 : libopenjp2 2.3.1
 Found AVX2
 Found AVX
 Found FMA
 Found SSE
 Found libarchive 3.4.0 zlib/1.2.11 liblzma/5.2.4 bz2lib/1.0.8 liblz4/1.9.2 libzstd/1.4.4
English: , French: , Arabic: , Numeric: 
English: , French: , Arabic: , Numeric: 1
English: , French: , Arabic: , Numeric: 7
English: , French: , Arabic: , Numeric: 
English: , French: , Arabic: , Numeric: 3
English: , French: uhlisé les échanges préférentiels entre, Arabic: , Numeric: 
English: , French: LE, Arabic: , Numeric: 
English: , French: es Ou lerritoires, Arabic: , Numeric: 5
English: , French: groupe ou groupe ou, Arabic: , Numeric: 
English: , French: les produits destination, Arabic: , Numeric: 
English: , French: sont considères comme, Arabic: , Numeric: 
English: , French: criginaireS, Arabic: , Numeric: 
English: Informetions relatives, French: au transport, Arabic: , Numeri