In [38]:
from PIL import Image
import pytesseract
import PyPDF2
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTChar, LTFigure, LTRect
import pdfplumber
from pdf2image import convert_from_path
import os

# 最小图像尺寸
MIN_WIDTH = 1  
MIN_HEIGHT = 1  

def text_extraction(element):
    line_text = element.get_text()
    line_formats = []
    for text_line in element:
        if isinstance(text_line, LTTextContainer):
            for character in text_line:
                if isinstance(character, LTChar):
                    line_formats.append(character.fontname)
                    line_formats.append(character.size)
    format_per_line = list(set(line_formats))
    return (line_text, format_per_line)

def crop_image(element, pageObj, cropped_pdf_path):
    [image_left, image_top, image_right, image_bottom] = [element.x0, element.y0, element.x1, element.y1]
    
    # 计算宽度和高度
    width = image_right - image_left
    height = image_top - image_bottom
    
    # 检查宽度和高度是否满足最小要求
    if width >= MIN_WIDTH and height >= MIN_HEIGHT:
        pageObj.mediabox.lower_left = (image_left, image_bottom)
        pageObj.mediabox.upper_right = (image_right, image_top)
        cropped_pdf_writer = PyPDF2.PdfWriter()
        cropped_pdf_writer.add_page(pageObj)
        with open(cropped_pdf_path, 'wb') as cropped_pdf_file:
            cropped_pdf_writer.write(cropped_pdf_file)
        return True
    else:
        return False

def is_image_blank(image_path):
    img = Image.open(image_path)
    grayscale_img = img.convert('L')  # 转换为灰度图
    histogram = grayscale_img.histogram()
    
    # 如果所有像素都是白色，则直方图的最后一个值（255）将接近图像总像素数
    if histogram[255] >= img.size[0] * img.size[1] * 0.99:
        return True
    return False

def convert_to_images(input_file, output_file):
    images = convert_from_path(input_file)
    image = images[0]
    image.save(output_file, "PNG")

def image_to_text(image_path):
    img = Image.open(image_path)
    text = pytesseract.image_to_string(img)
    return text

def extract_table(pdf_path, page_num, table_num):
    pdf = pdfplumber.open(pdf_path)
    table_page = pdf.pages[page_num]
    tables = table_page.extract_tables()
    if tables and len(tables) > table_num:
        table = tables[table_num]
        return table
    return None

def table_converter(table):
    if not table:
        return ''
    
    table_string = ''
    for row_num in range(len(table)):
        row = table[row_num]
        cleaned_row = [item.replace('\n', ' ') if item is not None and '\n' in item else 'None' if item is None else item for item in row]
        table_string += ('|' + '|'.join(cleaned_row) + '|' + '\n')
    
    # 检查是否为全空白表格
    if all(cell == 'None' for cell in cleaned_row):
        return ''
    
    return table_string[:-1]

def save_results(pdf_path, page_num, table_num, element, pageObj, image_index, table_index):
    base_filename = os.path.splitext(os.path.basename(pdf_path))[0]
    
    base_output_dir = '/home/xylv/dataset/fabpdf/results_divided'
    table_output_dir = os.path.join(base_output_dir, 'results_dividedtable', base_filename)
    image_output_dir = os.path.join(base_output_dir, 'results_dividedimage', base_filename)
    cropped_pdf_output_dir = os.path.join(base_output_dir, '1', base_filename)
    
    os.makedirs(table_output_dir, exist_ok=True)
    os.makedirs(image_output_dir, exist_ok=True)
    os.makedirs(cropped_pdf_output_dir, exist_ok=True)
    
    # Save cropped image PDF
    cropped_pdf_path = os.path.join(cropped_pdf_output_dir, f'cropped_image_{page_num}_{image_index}.pdf')
    if crop_image(element, pageObj, cropped_pdf_path):
        image_path = os.path.join(image_output_dir, f'PDF_image_{page_num}_{image_index}.png')
        convert_to_images(cropped_pdf_path, image_path)
        
        if not is_image_blank(image_path):
            image_text = image_to_text(image_path)
            return image_text

    # Extract and save table
    table = extract_table(pdf_path, page_num, table_num)
    table_string = table_converter(table)
    if table_string:
        table_file_path = os.path.join(table_output_dir, f'table_{page_num}_{table_index}.txt')
        with open(table_file_path, 'w') as table_file:
            table_file.write(table_string)
    
    return None

# Example usage:
pdf_path = '/home/xylv/dataset/fabpdf/fab_pdf/userguide/calbr_3dstack_user.pdf'
output_dir = '/home/xylv/dataset/fabpdf/results_divide'
os.makedirs(output_dir, exist_ok=True)

pdfFileObj = open(pdf_path, 'rb')
pdfReaded = PyPDF2.PdfReader(pdfFileObj)

for pagenum, page in enumerate(extract_pages(pdf_path)):
    pageObj = pdfReaded.pages[pagenum]
    page_text = []
    line_format = []
    text_from_images = []
    text_from_tables = []
    page_content = []
    table_num = 0
    image_index = 0
    table_index = 0
    first_element = True
    table_extraction_flag = False
    pdf = pdfplumber.open(pdf_path)
    page_tables = pdf.pages[pagenum]
    tables = page_tables.find_tables()

    page_elements = [(element.y1, element) for element in page._objs]
    page_elements.sort(key=lambda a: a[0], reverse=True)

    for i, component in enumerate(page_elements):
        pos = component[0]
        element = component[1]
        
        if isinstance(element, LTTextContainer):
            if not table_extraction_flag:
                (line_text, format_per_line) = text_extraction(element)
                page_text.append(line_text)
                line_format.append(format_per_line)
                page_content.append(line_text)
            else:
                pass

        if isinstance(element, LTFigure):
            image_text = save_results(pdf_path, pagenum, table_num, element, pageObj, image_index, table_index)
            if image_text:
                text_from_images.append(image_text)
                page_content.append(image_text)
                page_text.append('image')
                line_format.append('image')
                image_index += 1

        if isinstance(element, LTRect):
            if first_element and (table_num + 1) <= len(tables):
                lower_side = page.bbox[3] - tables[table_num].bbox[3]
                upper_side = element.y1 
                table = extract_table(pdf_path, pagenum, table_num)
                table_string = table_converter(table)
                if table_string:
                    text_from_tables.append(table_string)
                    page_content.append(table_string)
                    table_extraction_flag = True
                    first_element = False
                    page_text.append('table')
                    line_format.append('table')
                    table_index += 1

            if element.y0 >= lower_side and element.y1 <= upper_side:
                pass
            elif not isinstance(page_elements[i + 1][1], LTRect):
                table_extraction_flag = False
                first_element = True
                table_num += 1

    if text_from_images or text_from_tables:
        dctkey = 'Page_' + str(pagenum)
        result = ''.join(page_content)
        with open(os.path.join(output_dir, f'{dctkey}.txt'), 'w') as f:
            f.write(result)

pdfFileObj.close()


KeyboardInterrupt: 

In [None]:
import pytesseract

pytesseract.pytesseract.tesseract_cmd = '/home/xylv/enviroment/bin/tesseract'


In [None]:
import os
os.environ['TESSDATA_PREFIX'] = '/home/xylv/enviroment/share/'
os.environ['LD_LIBRARY_PATH'] = '/home/xylv/enviroment/lib'
# 添加 Tesseract 和 libtiff.so.5.7.0 的路径
os.environ['PATH'] = '/home/xylv/enviroment/bin:' + os.environ['PATH']
os.environ['LD_LIBRARY_PATH'] = '/home/xylv/enviroment/lib/libtiff.so.5.7.0:' + os.environ.get('LD_LIBRARY_PATH', '')

# 打印确认环境变量
print("Updated PATH:", os.environ['PATH'])
print("Updated LD_LIBRARY_PATH:", os.environ['LD_LIBRARY_PATH'])

Updated PATH: /home/xylv/enviroment/bin:/home/xylv/enviroment/bin:/home/xylv/anaconda3/envs/qa/bin:/home/xylv/.vscode-server/cli/servers/Stable-fee1edb8d6d72a0ddff41e5f71a671c23ed924b9/server/bin/remote-cli:/home/xylv/enviroment/node-v15.14.0-linux-x64/bin:/home/xylv/enviroment/node-v18.17.0-linux-x64/bin:/home/xylv/enviroment:/home/xylv/.local/bin:/home/xylv/enviroment/node-v15.14.0-linux-x64/bin:/home/xylv/enviroment/node-v18.17.0-linux-x64/bin:/home/xylv/enviroment:/home/xylv/.local/bin:/home/xylv/anaconda3/envs/qa/bin:/home/xylv/anaconda3/condabin:/usr/lib64/qt-3.3/bin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/home/xylv/anaconda3/bin:/bin:/usr/bin:/home/xylv/enviroment/node-v10.16.0-linux-x64/bin:/home/xylv/.local/bin:/home/xylv/bin:/home/xylv/anaconda3/bin:/bin:/usr/bin:/home/xylv/enviroment/node-v10.16.0-linux-x64/bin:/home/xylv/.local/bin:/home/xylv/bin
Updated LD_LIBRARY_PATH: /home/xylv/enviroment/lib/libtiff.so.5.7.0:/home/xylv/enviroment/lib
