### 使用 wkhtmltopdf 将 HTML 转换为 PDF
#### 期货数据中包含PDF和HTML两种格式文件，需要将HTML文件转换为PDF文件

In [11]:
# 配置项
DATA_DIR = "../data/20250401"
OUTPUT_DIR = "../output"

import os
from pdfminer.high_level import extract_text

def extract_text_to_txt(pdf_path, txt_dir):
    """将PDF文件内容提取到TXT文件中"""
    try:
        # 创建TXT文件名（与PDF同名）
        txt_filename = os.path.splitext(os.path.basename(pdf_path))[0] + ".txt"
        txt_path = os.path.join(txt_dir, txt_filename)
        
        # 提取PDF文本
        text = extract_text(pdf_path)
        
        # 写入TXT文件
        with open(txt_path, 'w', encoding='utf-8') as f:
            f.write(text)
            
        return True
    except Exception as e:
        print(f"处理文件 {pdf_path} 时出错: {str(e)}")
        return False

def main():
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    txt_output_dir = os.path.join(OUTPUT_DIR, "text_files")
    os.makedirs(txt_output_dir, exist_ok=True)

    # 获取所有PDF文件
    pdf_files = []
    for root, _, files in os.walk(DATA_DIR):
        pdf_files.extend([os.path.join(root, f) for f in files if f.lower().endswith(".pdf")])

    # 使用tqdm显示进度
    for pdf_path in tqdm(pdf_files, desc="提取PDF文本"):
        extract_text_to_txt(pdf_path, txt_output_dir)

    print(f"文本提取完成，结果保存至：{txt_output_dir}")

if __name__ == "__main__":
    main()

提取PDF文本:  78%|███████▊  | 72/92 [02:00<00:06,  2.91it/s]

处理文件 ../data/20250401\弘业期货_323616.PDF 时出错: No /Root object! - Is this really a PDF?
处理文件 ../data/20250401\恒泰期货_323467.PDF 时出错: No /Root object! - Is this really a PDF?
处理文件 ../data/20250401\恒泰期货_323468.PDF 时出错: No /Root object! - Is this really a PDF?
处理文件 ../data/20250401\恒泰期货_323470.PDF 时出错: No /Root object! - Is this really a PDF?
处理文件 ../data/20250401\恒泰期货_323577.PDF 时出错: No /Root object! - Is this really a PDF?
处理文件 ../data/20250401\恒泰期货_323578.PDF 时出错: No /Root object! - Is this really a PDF?


提取PDF文本:  95%|█████████▍| 87/92 [02:03<00:01,  3.00it/s]

处理文件 ../data/20250401\迈科期货_324419.PDF 时出错: No /Root object! - Is this really a PDF?


提取PDF文本: 100%|██████████| 92/92 [02:07<00:00,  1.39s/it]

文本提取完成，结果保存至：../output\text_files





### 需要提前下载 wkhtmltopdf 的可执行文件，并配置路径
下载链接：https://wkhtmltopdf.org/downloads.html

In [None]:
import pdfkit
import os
from pathlib import Path
from tqdm import tqdm  

# 配置wkhtmltopdf路径
path_wkhtmltopdf = r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'
config = pdfkit.configuration(wkhtmltopdf=path_wkhtmltopdf)

for i in range(20250401,20250429):
    input_folder = f'../data/{i}/'  
    output_folder = f'../output/{i}/'
    print(f"正在转换{i}。。。")

    # 1. 递归收集所有HTML文件路径
    html_files = []
    for root, _, files in os.walk(input_folder):
        for file in files:
            if file.lower().endswith('.html'):
                html_files.append(os.path.join(root, file))

    # 2. 初始化tqdm进度条
    with tqdm(total=len(html_files), desc="转换PDF", unit="file", 
            bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]") as pbar:
        
        for input_path in html_files:
            # 3. 动态更新进度条描述（显示当前文件名）
            pbar.set_postfix(file=os.path.basename(input_path)[:20] + "...")
            
            # 4. 构建输出路径（保留原始目录结构）
            relative_path = os.path.relpath(os.path.dirname(input_path), input_folder)
            output_subdir = os.path.join(output_folder, relative_path)
            Path(output_subdir).mkdir(parents=True, exist_ok=True)
            pdf_filename = os.path.splitext(os.path.basename(input_path))[0] + '.pdf'
            output_path = os.path.join(output_subdir, pdf_filename)
            
            try:
                # 5. 转换文件
                pdfkit.from_file(
                    input=input_path,
                    output_path=output_path,
                    configuration=config,
                    options={
                        'encoding': 'UTF-8',
                        'enable-local-file-access': True,
                        'quiet': '' 
                    }
                )
            except OSError as e:
                if "ContentNotFoundError" in str(e):
                    # tqdm.write(f"⚠ 警告: {input_path} 资源缺失（PDF已生成）")
                    pass
                else:
                    tqdm.write(f"✗ 失败: {input_path} - {str(e)}")
            except Exception as e:
                tqdm.write(f"✗ 异常: {input_path} - {str(e)}")
            
            # 6. 更新进度
            pbar.update(1)  # 重要：每次循环后进度+1

    print(f"\n转换完成！PDF保存在: {os.path.abspath(output_folder)}")

正在转换20250408。。。


转换PDF:   6%|▌         | 13/236 [00:33<29:19]  

✗ 失败: ../data/20250408/中衍期货_325216_0.html - wkhtmltopdf reported an error:
Exit with code 1 due to network error: HostNotFoundError



转换PDF:   6%|▌         | 14/236 [00:34<22:12]  

✗ 失败: ../data/20250408/中衍期货_325217_0.html - wkhtmltopdf reported an error:
Exit with code 1 due to network error: HostNotFoundError



转换PDF: 100%|██████████| 236/236 [06:31<00:00]



转换完成！PDF保存在: h:\project\economy\output\20250408
正在转换20250409。。。


转换PDF:   8%|▊         | 13/163 [00:31<19:25]  

✗ 失败: ../data/20250409/中衍期货_325670_0.html - wkhtmltopdf reported an error:
Exit with code 1 due to network error: HostNotFoundError



转换PDF:   9%|▊         | 14/163 [00:33<14:31]  

✗ 失败: ../data/20250409/中衍期货_325671_0.html - wkhtmltopdf reported an error:
Exit with code 1 due to network error: HostNotFoundError



转换PDF: 100%|██████████| 163/163 [04:40<00:00]



转换完成！PDF保存在: h:\project\economy\output\20250409
正在转换20250410。。。


转换PDF:   9%|▊         | 19/218 [00:38<26:04]  

✗ 失败: ../data/20250410/中衍期货_326134_0.html - wkhtmltopdf reported an error:
Exit with code 1 due to network error: HostNotFoundError



转换PDF:   9%|▉         | 20/218 [00:40<19:54]  

✗ 失败: ../data/20250410/中衍期货_326135_0.html - wkhtmltopdf reported an error:
Exit with code 1 due to network error: HostNotFoundError



转换PDF: 100%|██████████| 218/218 [06:26<00:00]



转换完成！PDF保存在: h:\project\economy\output\20250410
正在转换20250411。。。


转换PDF:  12%|█▏        | 17/139 [00:36<16:35]  

✗ 失败: ../data/20250411/中衍期货_326635_0.html - wkhtmltopdf reported an error:
Exit with code 1 due to network error: HostNotFoundError



转换PDF: 100%|██████████| 139/139 [03:56<00:00]



转换完成！PDF保存在: h:\project\economy\output\20250411
正在转换20250412。。。


转换PDF: 100%|██████████| 3/3 [00:36<00:00]



转换完成！PDF保存在: h:\project\economy\output\20250412
正在转换20250413。。。


转换PDF: 100%|██████████| 2/2 [00:04<00:00]



转换完成！PDF保存在: h:\project\economy\output\20250413
正在转换20250414。。。


转换PDF:   6%|▌         | 11/185 [00:29<21:59]  

✗ 失败: ../data/20250414/中衍期货_327598_0.html - wkhtmltopdf reported an error:
Exit with code 1 due to network error: HostNotFoundError



转换PDF:   6%|▋         | 12/185 [00:32<17:31]  

✗ 失败: ../data/20250414/中衍期货_327645_0.html - wkhtmltopdf reported an error:
Exit with code 1 due to network error: HostNotFoundError



转换PDF: 100%|██████████| 185/185 [04:24<00:00]



转换完成！PDF保存在: h:\project\economy\output\20250414
正在转换20250415。。。


转换PDF:   5%|▌         | 10/182 [00:31<23:06] 

✗ 失败: ../data/20250415/中衍期货_327854_0.html - wkhtmltopdf reported an error:
Exit with code 1 due to network error: HostNotFoundError



转换PDF:   6%|▌         | 11/182 [00:34<18:49]  

✗ 失败: ../data/20250415/中衍期货_327878_0.html - wkhtmltopdf reported an error:
Exit with code 1 due to network error: HostNotFoundError



转换PDF: 100%|██████████| 182/182 [04:12<00:00]



转换完成！PDF保存在: h:\project\economy\output\20250415
正在转换20250416。。。


转换PDF:   4%|▍         | 6/151 [00:28<21:16]  

✗ 失败: ../data/20250416/中衍期货_328342_0.html - wkhtmltopdf reported an error:
Exit with code 1 due to network error: HostNotFoundError



转换PDF: 100%|██████████| 151/151 [03:51<00:00]



转换完成！PDF保存在: h:\project\economy\output\20250416
正在转换20250417。。。


转换PDF:   8%|▊         | 17/215 [00:33<24:57]  

✗ 失败: ../data/20250417/中衍期货_328775_0.html - wkhtmltopdf reported an error:
Exit with code 1 due to network error: HostNotFoundError



转换PDF:   8%|▊         | 18/215 [00:38<21:26]  

✗ 失败: ../data/20250417/中衍期货_328780_0.html - wkhtmltopdf reported an error:
Exit with code 1 due to network error: HostNotFoundError



转换PDF: 100%|██████████| 215/215 [05:15<00:00]



转换完成！PDF保存在: h:\project\economy\output\20250417
正在转换20250418。。。


转换PDF:   9%|▉         | 16/171 [00:34<20:03]  

✗ 失败: ../data/20250418/中衍期货_329164_0.html - wkhtmltopdf reported an error:
Exit with code 1 due to network error: HostNotFoundError



转换PDF: 100%|██████████| 171/171 [04:34<00:00]



转换完成！PDF保存在: h:\project\economy\output\20250418
正在转换20250419。。。


转换PDF: 100%|██████████| 7/7 [01:07<00:00]



转换完成！PDF保存在: h:\project\economy\output\20250419
正在转换20250420。。。


转换PDF: 100%|██████████| 1/1 [00:03<00:00]



转换完成！PDF保存在: h:\project\economy\output\20250420
正在转换20250421。。。


转换PDF:   2%|▏         | 4/162 [00:26<26:16]  

✗ 失败: ../data/20250421/中衍期货_329790_0.html - wkhtmltopdf reported an error:
Exit with code 1 due to network error: HostNotFoundError



转换PDF:   3%|▎         | 5/162 [00:27<18:01]  

✗ 失败: ../data/20250421/中衍期货_329791_0.html - wkhtmltopdf reported an error:
Exit with code 1 due to network error: HostNotFoundError



转换PDF: 100%|██████████| 162/162 [05:04<00:00]



转换完成！PDF保存在: h:\project\economy\output\20250421
正在转换20250422。。。


转换PDF:   8%|▊         | 13/155 [00:31<17:58]  

✗ 失败: ../data/20250422/中衍期货_330225_0.html - wkhtmltopdf reported an error:
Exit with code 1 due to network error: HostNotFoundError



转换PDF:   9%|▉         | 14/155 [00:33<14:01]  

✗ 失败: ../data/20250422/中衍期货_330342_0.html - wkhtmltopdf reported an error:
Exit with code 1 due to network error: HostNotFoundError



转换PDF: 100%|██████████| 155/155 [03:48<00:00]



转换完成！PDF保存在: h:\project\economy\output\20250422
正在转换20250423。。。


转换PDF:   8%|▊         | 14/166 [00:31<19:11]  

✗ 失败: ../data/20250423/中衍期货_330769_0.html - wkhtmltopdf reported an error:
Exit with code 1 due to network error: HostNotFoundError



转换PDF:   9%|▉         | 15/166 [00:35<16:24]  

✗ 失败: ../data/20250423/中衍期货_330775_0.html - wkhtmltopdf reported an error:
Exit with code 1 due to network error: HostNotFoundError



转换PDF: 100%|██████████| 166/166 [04:11<00:00]



转换完成！PDF保存在: h:\project\economy\output\20250423
正在转换20250424。。。


转换PDF:  12%|█▏        | 20/170 [00:38<19:46]  

✗ 失败: ../data/20250424/中衍期货_331186_0.html - wkhtmltopdf reported an error:
Exit with code 1 due to network error: HostNotFoundError



转换PDF:  12%|█▏        | 21/170 [00:42<16:24]  

✗ 失败: ../data/20250424/中衍期货_331187_0.html - wkhtmltopdf reported an error:
Exit with code 1 due to network error: HostNotFoundError



转换PDF: 100%|██████████| 170/170 [04:42<00:00]



转换完成！PDF保存在: h:\project\economy\output\20250424
正在转换20250425。。。


转换PDF:   7%|▋         | 8/122 [00:28<15:33]  

✗ 失败: ../data/20250425/中衍期货_331634_0.html - wkhtmltopdf reported an error:
Exit with code 1 due to network error: HostNotFoundError



转换PDF: 100%|██████████| 122/122 [03:01<00:00]



转换完成！PDF保存在: h:\project\economy\output\20250425
正在转换20250426。。。


转换PDF: 100%|██████████| 14/14 [00:35<00:00]



转换完成！PDF保存在: h:\project\economy\output\20250426
正在转换20250427。。。


转换PDF: 100%|██████████| 1/1 [00:01<00:00]



转换完成！PDF保存在: h:\project\economy\output\20250427
正在转换20250428。。。


转换PDF:   1%|          | 1/165 [00:23<1:05:30]

✗ 失败: ../data/20250428/中衍期货_332186_0.html - wkhtmltopdf reported an error:
Exit with code 1 due to network error: HostNotFoundError



转换PDF:   1%|          | 2/165 [00:27<32:29]    

✗ 失败: ../data/20250428/中衍期货_332187_0.html - wkhtmltopdf reported an error:
Exit with code 1 due to network error: HostNotFoundError



转换PDF: 100%|██████████| 165/165 [05:57<00:00]



转换完成！PDF保存在: h:\project\economy\output\20250428
正在转换20250429。。。


转换PDF:   2%|▏         | 1/59 [00:23<23:10]

✗ 失败: ../data/20250429/中衍期货_332706_0.html - wkhtmltopdf reported an error:
Exit with code 1 due to network error: HostNotFoundError



转换PDF:   3%|▎         | 2/59 [00:27<11:18]  

✗ 失败: ../data/20250429/中衍期货_332707_0.html - wkhtmltopdf reported an error:
Exit with code 1 due to network error: HostNotFoundError



转换PDF: 100%|██████████| 59/59 [01:38<00:00]



转换完成！PDF保存在: h:\project\economy\output\20250429
正在转换20250430。。。


转换PDF: |          | 0/0 [00:00<?]


转换完成！PDF保存在: h:\project\economy\output\20250430





In [3]:
import os
from PyPDF2 import PdfReader
from tqdm import tqdm
from openpyxl import Workbook

DATA_DIR = "../data"  # 数据文件夹路径
OUTPUT_DIR = "../output"  # 输出文件夹路径
# 记录损坏文件的 XLSX 文件路径
BAD_PDF_RECORD = os.path.join(OUTPUT_DIR, "bad_pdfs.xlsx")


def is_pdf_valid(pdf_path):
    """检查PDF文件是否损坏，并记录损坏文件信息"""
    try:
        with open(pdf_path, 'rb') as f:
            PdfReader(f)
        return True
    except Exception as e:
        print(f"文件 {pdf_path} 损坏: {str(e)}")
        return False


def copy_valid_pdfs():
    # 创建工作簿和工作表
    wb = Workbook()
    ws = wb.active
    ws.append(['文件名', '来源路径'])

    # 收集所有日期文件夹
    date_folders = []
    for root, dirs, _ in os.walk(DATA_DIR):
        for dir_name in dirs:
            if dir_name.isdigit() and len(dir_name) == 8:  # 假设日期文件夹名为 8 位数字
                date_folders.append(os.path.join(root, dir_name))

    # 遍历每个日期文件夹
    for date_folder in tqdm(date_folders, desc="处理文件夹"):
        pdf_files = []
        for root, _, files in os.walk(date_folder):
            for file in files:
                if file.lower().endswith('.pdf'):
                    pdf_files.append(os.path.join(root, file))

        # 处理当前日期文件夹中的PDF文件
        for pdf_path in tqdm(pdf_files, desc=f"处理 {os.path.basename(date_folder)}", leave=False):
            if not is_pdf_valid(pdf_path):
                # 记录损坏文件信息到 XLSX
                file_name = os.path.basename(pdf_path)
                ws.append([file_name, pdf_path])
            else:
                # 获取日期文件夹名，如 20250401
                date_folder_name = os.path.basename(date_folder)
                output_date_dir = os.path.join(OUTPUT_DIR, date_folder_name)
                os.makedirs(output_date_dir, exist_ok=True)
                output_path = os.path.join(output_date_dir, os.path.basename(pdf_path))
                # 复制文件
                with open(pdf_path, 'rb') as src, open(output_path, 'wb') as dst:
                    dst.write(src.read())

        print(f"文件夹 {date_folder_name} 处理完成")

    # 保存 XLSX 文件
    wb.save(BAD_PDF_RECORD)
    print(f"损坏的 PDF 文件信息已记录到 {BAD_PDF_RECORD}")


if __name__ == "__main__":
    copy_valid_pdfs()

处理文件夹:   3%|▎         | 1/29 [00:01<00:48,  1.72s/it]

文件 ../data\20250401\弘业期货_323616.PDF 损坏: EOF marker not found
文件 ../data\20250401\恒泰期货_323467.PDF 损坏: EOF marker not found
文件 ../data\20250401\恒泰期货_323468.PDF 损坏: EOF marker not found
文件 ../data\20250401\恒泰期货_323470.PDF 损坏: EOF marker not found
文件 ../data\20250401\恒泰期货_323577.PDF 损坏: EOF marker not found
文件 ../data\20250401\恒泰期货_323578.PDF 损坏: EOF marker not found
文件 ../data\20250401\迈科期货_324419.PDF 损坏: EOF marker not found
文件夹 20250401 处理完成




文件 ../data\20250402\恒泰期货_323751.PDF 损坏: EOF marker not found
文件 ../data\20250402\恒泰期货_323752.PDF 损坏: EOF marker not found
文件 ../data\20250402\恒泰期货_323763.PDF 损坏: EOF marker not found
文件 ../data\20250402\迈科期货_324418.PDF 损坏: EOF marker not found


处理文件夹:   7%|▋         | 2/29 [00:03<00:40,  1.49s/it]

文件夹 20250402 处理完成




文件 ../data\20250403\恒泰期货_324315.PDF 损坏: EOF marker not found
文件 ../data\20250403\恒泰期货_324316.PDF 损坏: EOF marker not found
文件 ../data\20250403\恒泰期货_324460.PDF 损坏: EOF marker not found
文件 ../data\20250403\恒泰期货_324461.PDF 损坏: EOF marker not found
文件 ../data\20250403\恒泰期货_324462.PDF 损坏: EOF marker not found
文件 ../data\20250403\恒泰期货_324463.PDF 损坏: EOF marker not found
文件 ../data\20250403\瑞达期货_324468.PDF 损坏: EOF marker not found
文件 ../data\20250403\瑞达期货_324470.PDF 损坏: EOF marker not found




文件 ../data\20250403\瑞达期货_324477.PDF 损坏: EOF marker not found
文件 ../data\20250403\瑞达期货_324478.PDF 损坏: EOF marker not found
文件 ../data\20250403\瑞达期货_324479.PDF 损坏: EOF marker not found
文件 ../data\20250403\迈科期货_324417.PDF 损坏: EOF marker not found


处理文件夹:  10%|█         | 3/29 [00:04<00:39,  1.53s/it]

文件夹 20250403 处理完成




文件夹 20250404 处理完成


处理文件夹:  17%|█▋        | 5/29 [00:04<00:17,  1.39it/s]

文件夹 20250405 处理完成


处理文件夹:  21%|██        | 6/29 [00:09<00:39,  1.72s/it]

文件夹 20250406 处理完成




文件 ../data\20250407\恒泰期货_325057.PDF 损坏: EOF marker not found
文件 ../data\20250407\恒泰期货_325058.PDF 损坏: EOF marker not found
文件 ../data\20250407\恒泰期货_325227.PDF 损坏: EOF marker not found
文件 ../data\20250407\恒泰期货_325228.PDF 损坏: EOF marker not found
文件 ../data\20250407\恒泰期货_325229.PDF 损坏: EOF marker not found
文件 ../data\20250407\恒泰期货_325230.PDF 损坏: EOF marker not found
文件 ../data\20250407\恒泰期货_325231.PDF 损坏: EOF marker not found


处理文件夹:  24%|██▍       | 7/29 [00:19<01:36,  4.41s/it]

文件夹 20250407 处理完成




文件 ../data\20250408\恒泰期货_325458.PDF 损坏: EOF marker not found
文件 ../data\20250408\恒泰期货_325459.PDF 损坏: EOF marker not found
文件 ../data\20250408\恒泰期货_325460.PDF 损坏: EOF marker not found
文件 ../data\20250408\恒泰期货_325699.PDF 损坏: EOF marker not found
文件 ../data\20250408\恒泰期货_325700.PDF 损坏: EOF marker not found
文件 ../data\20250408\恒泰期货_325701.PDF 损坏: EOF marker not found
文件 ../data\20250408\恒泰期货_325705.PDF 损坏: EOF marker not found




文件 ../data\20250408\迈科期货_327522.PDF 损坏: EOF marker not found
文件 ../data\20250408\迈科期货_327523.PDF 损坏: EOF marker not found


处理文件夹:  28%|██▊       | 8/29 [00:24<01:33,  4.45s/it]

文件夹 20250408 处理完成





文件 ../data\20250409\恒泰期货_326004.PDF 损坏: EOF marker not found
文件 ../data\20250409\恒泰期货_326005.PDF 损坏: EOF marker not found
文件 ../data\20250409\恒泰期货_326013.PDF 损坏: EOF marker not found
文件 ../data\20250409\恒泰期货_326150.PDF 损坏: EOF marker not found
文件 ../data\20250409\恒泰期货_326151.PDF 损坏: EOF marker not found
文件 ../data\20250409\恒泰期货_326152.PDF 损坏: EOF marker not found
文件 ../data\20250409\恒泰期货_326153.PDF 损坏: EOF marker not found


处理文件夹:  31%|███       | 9/29 [00:28<01:26,  4.31s/it]45it/s][A

文件夹 20250409 处理完成




文件 ../data\20250410\恒泰期货_326280.PDF 损坏: EOF marker not found
文件 ../data\20250410\恒泰期货_326281.PDF 损坏: EOF marker not found
文件 ../data\20250410\恒泰期货_326321.PDF 损坏: EOF marker not found
文件 ../data\20250410\恒泰期货_326657.PDF 损坏: EOF marker not found
文件 ../data\20250410\恒泰期货_326658.PDF 损坏: EOF marker not found
文件 ../data\20250410\恒泰期货_326659.PDF 损坏: EOF marker not found
文件 ../data\20250410\恒泰期货_326666.PDF 损坏: EOF marker not found


处理文件夹:  34%|███▍      | 10/29 [00:32<01:21,  4.28s/it]

文件夹 20250410 处理完成




文件 ../data\20250411\恒泰期货_327619.PDF 损坏: EOF marker not found
文件 ../data\20250411\恒泰期货_327620.PDF 损坏: EOF marker not found
文件 ../data\20250411\恒泰期货_327684.PDF 损坏: EOF marker not found




文件 ../data\20250411\瑞达期货_327637.PDF 损坏: EOF marker not found
文件 ../data\20250411\瑞达期货_327639.PDF 损坏: EOF marker not found




文件 ../data\20250411\迈科期货_327519.PDF 损坏: EOF marker not found
文件 ../data\20250411\迈科期货_327520.PDF 损坏: EOF marker not found
文件 ../data\20250411\迈科期货_327521.PDF 损坏: EOF marker not found


处理文件夹:  38%|███▊      | 11/29 [00:38<01:24,  4.70s/it]

文件夹 20250411 处理完成


处理文件夹:  41%|████▏     | 12/29 [00:43<01:20,  4.75s/it]

文件夹 20250412 处理完成


处理文件夹:  45%|████▍     | 13/29 [00:51<01:34,  5.90s/it]

文件夹 20250413 处理完成





文件 ../data\20250414\恒泰期货_327617.PDF 损坏: EOF marker not found
文件 ../data\20250414\恒泰期货_327618.PDF 损坏: EOF marker not found
文件 ../data\20250414\恒泰期货_327683.PDF 损坏: EOF marker not found
文件 ../data\20250414\恒泰期货_327704.PDF 损坏: EOF marker not found
文件 ../data\20250414\恒泰期货_327886.PDF 损坏: EOF marker not found
文件 ../data\20250414\恒泰期货_327887.PDF 损坏: EOF marker not found
文件 ../data\20250414\恒泰期货_327888.PDF 损坏: EOF marker not found
文件 ../data\20250414\恒泰期货_327889.PDF 损坏: EOF marker not found
文件 ../data\20250414\恒泰期货_327890.PDF 损坏: EOF marker not found


处理 20250414:  67%|██████▋   | 110/165 [00:07<00:01, 33.83it/s][A

文件 ../data\20250414\迈科期货_329529.PDF 损坏: EOF marker not found


处理文件夹:  48%|████▊     | 14/29 [01:03<01:53,  7.59s/it]

文件夹 20250414 处理完成




文件 ../data\20250415\恒泰期货_328073.PDF 损坏: EOF marker not found
文件 ../data\20250415\恒泰期货_328168.PDF 损坏: EOF marker not found




文件 ../data\20250415\迈科期货_329528.PDF 损坏: EOF marker not found


处理文件夹:  52%|█████▏    | 15/29 [01:06<01:28,  6.33s/it]

文件夹 20250415 处理完成




文件 ../data\20250416\恒泰期货_328569.PDF 损坏: EOF marker not found
文件 ../data\20250416\恒泰期货_328570.PDF 损坏: EOF marker not found
文件 ../data\20250416\恒泰期货_328571.PDF 损坏: EOF marker not found
文件 ../data\20250416\恒泰期货_328572.PDF 损坏: EOF marker not found
文件 ../data\20250416\恒泰期货_328631.PDF 损坏: EOF marker not found
文件 ../data\20250416\恒泰期货_328647.PDF 损坏: EOF marker not found
文件 ../data\20250416\恒泰期货_328661.PDF 损坏: EOF marker not found
文件 ../data\20250416\恒泰期货_329419.PDF 损坏: EOF marker not found
文件 ../data\20250416\恒泰期货_329420.PDF 损坏: EOF marker not found
文件 ../data\20250416\恒泰期货_329421.PDF 损坏: EOF marker not found
文件 ../data\20250416\恒泰期货_329422.PDF 损坏: EOF marker not found


处理文件夹:  55%|█████▌    | 16/29 [01:11<01:14,  5.71s/it]

文件夹 20250416 处理完成




文件 ../data\20250417\恒泰期货_328982.PDF 损坏: EOF marker not found
文件 ../data\20250417\恒泰期货_328983.PDF 损坏: EOF marker not found
文件 ../data\20250417\恒泰期货_328991.PDF 损坏: EOF marker not found


处理文件夹:  59%|█████▊    | 17/29 [01:14<01:00,  5.05s/it]

文件 ../data\20250417\迈科期货_329527.PDF 损坏: EOF marker not found
文件夹 20250417 处理完成




文件 ../data\20250418\恒泰期货_329390.PDF 损坏: EOF marker not found
文件 ../data\20250418\恒泰期货_329398.PDF 损坏: EOF marker not found
文件 ../data\20250418\恒泰期货_329416.PDF 损坏: EOF marker not found
文件 ../data\20250418\恒泰期货_329417.PDF 损坏: EOF marker not found
文件 ../data\20250418\恒泰期货_329418.PDF 损坏: EOF marker not found
文件 ../data\20250418\恒泰期货_329475.PDF 损坏: EOF marker not found
文件 ../data\20250418\恒泰期货_329483.PDF 损坏: EOF marker not found
文件 ../data\20250418\恒泰期货_329611.PDF 损坏: EOF marker not found




文件 ../data\20250418\瑞达期货_329571.PDF 损坏: EOF marker not found
文件 ../data\20250418\瑞达期货_329572.PDF 损坏: EOF marker not found
文件 ../data\20250418\瑞达期货_329573.PDF 损坏: EOF marker not found
文件 ../data\20250418\瑞达期货_329574.PDF 损坏: EOF marker not found




文件 ../data\20250418\瑞达期货_329579.PDF 损坏: EOF marker not found
文件 ../data\20250418\瑞达期货_329580.PDF 损坏: EOF marker not found
文件 ../data\20250418\瑞达期货_329581.PDF 损坏: EOF marker not found




文件 ../data\20250418\迈科期货_329525.PDF 损坏: EOF marker not found
文件 ../data\20250418\迈科期货_329526.PDF 损坏: EOF marker not found


处理文件夹:  62%|██████▏   | 18/29 [01:22<01:04,  5.87s/it]

文件夹 20250418 处理完成


处理文件夹:  66%|██████▌   | 19/29 [01:30<01:04,  6.48s/it]

文件夹 20250419 处理完成


处理文件夹:  69%|██████▉   | 20/29 [01:35<00:54,  6.05s/it]

文件夹 20250420 处理完成




文件 ../data\20250421\华龙期货_329963.PDF 损坏: EOF marker not found




文件 ../data\20250421\恒泰期货_330600.PDF 损坏: EOF marker not found
文件 ../data\20250421\恒泰期货_330601.PDF 损坏: EOF marker not found
文件 ../data\20250421\恒泰期货_330780.PDF 损坏: EOF marker not found
文件 ../data\20250421\恒泰期货_330781.PDF 损坏: EOF marker not found
文件 ../data\20250421\恒泰期货_330792.PDF 损坏: EOF marker not found
文件 ../data\20250421\恒泰期货_330793.PDF 损坏: EOF marker not found




文件 ../data\20250421\瑞达期货_330052.PDF 损坏: EOF marker not found
文件 ../data\20250421\瑞达期货_330053.PDF 损坏: EOF marker not found




文件 ../data\20250421\迈科期货_331995.PDF 损坏: EOF marker not found


处理文件夹:  72%|███████▏  | 21/29 [01:50<01:10,  8.87s/it]

文件夹 20250421 处理完成




文件 ../data\20250422\恒泰期货_330598.PDF 损坏: EOF marker not found
文件 ../data\20250422\恒泰期货_330599.PDF 损坏: EOF marker not found
文件 ../data\20250422\恒泰期货_330776.PDF 损坏: EOF marker not found
文件 ../data\20250422\恒泰期货_330777.PDF 损坏: EOF marker not found
文件 ../data\20250422\恒泰期货_330778.PDF 损坏: EOF marker not found
文件 ../data\20250422\恒泰期货_330779.PDF 损坏: EOF marker not found


处理文件夹:  76%|███████▌  | 22/29 [01:54<00:50,  7.25s/it]

文件夹 20250422 处理完成


incorrect startxref pointer(3)



文件 ../data\20250423\恒泰期货_331071.PDF 损坏: EOF marker not found
文件 ../data\20250423\恒泰期货_331072.PDF 损坏: EOF marker not found
文件 ../data\20250423\恒泰期货_331074.PDF 损坏: EOF marker not found
文件 ../data\20250423\恒泰期货_331182.PDF 损坏: EOF marker not found
文件 ../data\20250423\恒泰期货_331183.PDF 损坏: EOF marker not found
文件 ../data\20250423\恒泰期货_331184.PDF 损坏: EOF marker not found
文件 ../data\20250423\恒泰期货_331185.PDF 损坏: EOF marker not found


处理文件夹:  79%|███████▉  | 23/29 [01:58<00:37,  6.27s/it]2it/s][A

文件夹 20250423 处理完成




文件 ../data\20250424\恒泰期货_331356.PDF 损坏: EOF marker not found
文件 ../data\20250424\恒泰期货_331357.PDF 损坏: EOF marker not found
文件 ../data\20250424\恒泰期货_331630.PDF 损坏: EOF marker not found
文件 ../data\20250424\恒泰期货_331631.PDF 损坏: EOF marker not found
文件 ../data\20250424\恒泰期货_331632.PDF 损坏: EOF marker not found
文件 ../data\20250424\恒泰期货_331633.PDF 损坏: EOF marker not found




文件 ../data\20250424\迈科期货_331992.PDF 损坏: EOF marker not found
文件 ../data\20250424\迈科期货_331993.PDF 损坏: EOF marker not found
文件 ../data\20250424\迈科期货_331994.PDF 损坏: EOF marker not found


处理文件夹:  83%|████████▎ | 24/29 [02:02<00:28,  5.66s/it]

文件夹 20250424 处理完成


incorrect startxref pointer(1)


文件 ../data\20250425\恒泰期货_331748.PDF 损坏: EOF marker not found
文件 ../data\20250425\恒泰期货_331749.PDF 损坏: EOF marker not found
文件 ../data\20250425\恒泰期货_331751.PDF 损坏: EOF marker not found
文件 ../data\20250425\恒泰期货_332069.PDF 损坏: EOF marker not found




文件 ../data\20250425\瑞达期货_332058.PDF 损坏: EOF marker not found
文件 ../data\20250425\瑞达期货_332059.PDF 损坏: EOF marker not found
文件 ../data\20250425\瑞达期货_332060.PDF 损坏: EOF marker not found
文件 ../data\20250425\瑞达期货_332061.PDF 损坏: EOF marker not found




文件 ../data\20250425\迈科期货_331991.PDF 损坏: EOF marker not found


处理文件夹:  86%|████████▌ | 25/29 [02:09<00:23,  5.98s/it]

文件夹 20250425 处理完成


处理文件夹:  90%|████████▉ | 26/29 [02:17<00:19,  6.58s/it]

文件夹 20250426 处理完成


处理文件夹:  93%|█████████▎| 27/29 [02:19<00:10,  5.40s/it]

文件夹 20250427 处理完成




文件 ../data\20250428\恒泰期货_332536.PDF 损坏: EOF marker not found
文件 ../data\20250428\恒泰期货_332537.PDF 损坏: EOF marker not found
文件 ../data\20250428\恒泰期货_332538.PDF 损坏: EOF marker not found
文件 ../data\20250428\恒泰期货_332716.PDF 损坏: EOF marker not found
文件 ../data\20250428\恒泰期货_332717.PDF 损坏: EOF marker not found
文件 ../data\20250428\恒泰期货_332718.PDF 损坏: EOF marker not found
文件 ../data\20250428\恒泰期货_332719.PDF 损坏: EOF marker not found




文件 ../data\20250428\瑞达期货_332419.PDF 损坏: EOF marker not found
文件 ../data\20250428\瑞达期货_332420.PDF 损坏: EOF marker not found


处理文件夹:  97%|█████████▋| 28/29 [02:31<00:07,  7.26s/it]

文件夹 20250428 处理完成




文件 ../data\20250429\恒泰期货_333123.PDF 损坏: EOF marker not found
文件 ../data\20250429\恒泰期货_333124.PDF 损坏: EOF marker not found
文件 ../data\20250429\恒泰期货_333125.PDF 损坏: EOF marker not found
文件 ../data\20250429\恒泰期货_333126.PDF 损坏: EOF marker not found


处理文件夹: 100%|██████████| 29/29 [02:36<00:00,  5.40s/it]

文件夹 20250429 处理完成
损坏的 PDF 文件信息已记录到 ../output\bad_pdfs.xlsx



