### 将PDF文件的大写后缀PDF转化为小写pdf

In [2]:
import os
from tqdm import tqdm

# 定义 output 文件夹路径，确保路径正确
output_folder = os.path.join(os.getcwd(), '../output')

# 遍历 output 文件夹及其子文件夹
for root, dirs, files in tqdm(os.walk(output_folder), desc='处理文件夹进度'): 
    for file in tqdm(files, desc='处理文件进度', leave=False):
        if file.upper().endswith('.PDF'):
            old_file_path = os.path.join(root, file)
            file_name, _ = os.path.splitext(file)
            new_file_path = os.path.join(root, file_name + '.pdf')
            
            # 重命名文件
            try:
                os.rename(old_file_path, new_file_path)
            except Exception as e:
                print(f'重命名文件 {old_file_path} 时出错: {e}')

处理文件夹进度: 190it [00:16, 11.20it/s]


### 重命名所有pdf文件，将期货公司名称替换为对应的映射数字
此举是为了将数据上传到kaggle的Dataset中，避免文件名中包含中文，导致上传失败

ps：若是使用的是原始data数据（html+pdf），执行完html-pdf.ipynb后无需执行这一个单元格

In [3]:
import os
import shutil
from tqdm import tqdm

# 定义 output 文件夹路径和新文件夹路径
output_folder = os.path.join(os.getcwd(), '../output')
new_output_folder = os.path.join(os.getcwd(), '../output_renamed')

# 收集所有公司名
company_names = set()
for root, dirs, files in os.walk(output_folder):
    for file in files:
        if file.lower().endswith('.pdf'):
            company_name = file[:4]
            company_names.add(company_name)

# 生成公司名到数字的映射字典
company_mapping = {name: idx for idx, name in enumerate(sorted(company_names), start=1)}
print('公司名到数字的映射字典：')
print(company_mapping)

# 创建新文件夹
os.makedirs(new_output_folder, exist_ok=True)

# 遍历 output 文件夹，复制并重命名文件
for root, dirs, files in tqdm(os.walk(output_folder), desc='处理文件夹进度'):
    for file in tqdm(files, desc='处理文件进度', leave=False):
        if file.lower().endswith('.pdf'):
            company_name = file[:4]
            if company_name in company_mapping:
                new_prefix = str(company_mapping[company_name])
                new_file_name = new_prefix + file[4:]
                old_file_path = os.path.join(root, file)
                # 计算新文件在新文件夹中的相对路径
                relative_path = os.path.relpath(root, output_folder)
                new_sub_folder = os.path.join(new_output_folder, relative_path)
                os.makedirs(new_sub_folder, exist_ok=True)
                new_file_path = os.path.join(new_sub_folder, new_file_name)
                try:
                    shutil.copy2(old_file_path, new_file_path)
                except Exception as e:
                    print(f'复制并重命名文件 {old_file_path} 时出错: {e}')

公司名到数字的映射字典：
{'上海中期': 1, '东海期货': 2, '中原期货': 3, '中融汇信': 4, '中衍期货': 5, '中金财富': 6, '中银期货': 7, '五矿期货': 8, '倍特期货': 9, '光大期货': 10, '兴业期货': 11, '冠通期货': 12, '华融融达': 13, '华金期货': 14, '华鑫期货': 15, '华龙期货': 16, '国信期货': 17, '国元期货': 18, '国联期货': 19, '大越期货': 20, '宏源期货': 21, '宝城期货': 22, '山金期货': 23, '广金期货': 24, '徽商期货': 25, '新湖期货': 26, '格林大华': 27, '汇鑫期货': 28, '浙商期货': 29, '瑞达期货': 30, '紫金天风': 31, '财信期货': 32, '道通期货': 33, '金信期货': 34, '金元期货': 35, '金石期货': 36, '铜冠金源': 37, '长安期货': 38, '长江期货': 39}


处理文件夹进度: 190it [08:34,  2.71s/it]


#### 变换 “紫金天风”（此处做单独处理，是因为之前使用的 wkhtmltopdf 转换的紫金天风的相关 pdf 文件为空白）
此处直接使用 bs4 转化成最终入库需要读取的 txt 文件

In [3]:
import os
from bs4 import BeautifulSoup

# 定义文件夹路径
data_folder = 'h:/project/economy/data'
output_folder = 'h:/project/economy/output'

# 遍历 output 文件夹
for root, dirs, files in os.walk(output_folder):
    for file in files:
        if file.startswith('紫金天风') and file.lower().endswith('.pdf'):
            # 获取对应的 HTML 文件路径
            relative_path = os.path.relpath(root, output_folder)
            html_date_folder = os.path.join(data_folder, relative_path)
            base_name = os.path.splitext(file)[0]
            html_file_name = next((f for f in os.listdir(html_date_folder) if f.startswith(base_name) and f.endswith('.html')), None)
            if html_file_name:
                html_file_path = os.path.join(html_date_folder, html_file_name)
                # 读取 HTML 文件
                try:
                    with open(html_file_path, 'r', encoding='utf-8') as f:
                        html_content = f.read()
                    # 使用 BeautifulSoup 提取文本
                    soup = BeautifulSoup(html_content, 'html.parser')
                    text = soup.get_text()
                    # 生成 TXT 文件路径
                    txt_file_path = os.path.join(root, base_name + '.txt')
                    # 删除原 PDF 文件
                    pdf_file_path = os.path.join(root, file)
                    os.remove(pdf_file_path)
                    # 将提取的文本保存到 TXT 文件
                    with open(txt_file_path, 'w', encoding='utf-8') as f:
                        f.write(text)
                    print(f'已将 {pdf_file_path} 替换为 {txt_file_path}')
                except Exception as e:
                    print(f'处理文件 {html_file_path} 时出错: {e}')

已将 h:/project/economy/output\20250401\紫金天风_323359_0.pdf 替换为 h:/project/economy/output\20250401\紫金天风_323359_0.txt
已将 h:/project/economy/output\20250401\紫金天风_323360_0.pdf 替换为 h:/project/economy/output\20250401\紫金天风_323360_0.txt
已将 h:/project/economy/output\20250401\紫金天风_323361_0.pdf 替换为 h:/project/economy/output\20250401\紫金天风_323361_0.txt
已将 h:/project/economy/output\20250401\紫金天风_323362_0.pdf 替换为 h:/project/economy/output\20250401\紫金天风_323362_0.txt
已将 h:/project/economy/output\20250401\紫金天风_323363_0.pdf 替换为 h:/project/economy/output\20250401\紫金天风_323363_0.txt
已将 h:/project/economy/output\20250401\紫金天风_323364_0.pdf 替换为 h:/project/economy/output\20250401\紫金天风_323364_0.txt
已将 h:/project/economy/output\20250401\紫金天风_323365_0.pdf 替换为 h:/project/economy/output\20250401\紫金天风_323365_0.txt
已将 h:/project/economy/output\20250402\紫金天风_323901_0.pdf 替换为 h:/project/economy/output\20250402\紫金天风_323901_0.txt
已将 h:/project/economy/output\20250402\紫金天风_323902_0.pdf 替换为 h:/project/economy/output\20250402\紫

### 观察后，发现这几个期货公司的pdf文件转换后，没有有用的期货信息，执行删除

In [None]:
import os

# 定义目标前缀列表
target_prefixes = ['汇鑫期货','财信期货', '中原期货', '兴业期贷', 
    '中融汇信', '国元期货', '中银期货', '华融融达']
# 定义 output 文件夹路径
output_folder = '../output'

# 遍历 output 文件夹及其子文件夹
for root, dirs, files in os.walk(output_folder):
    for file in files:
        # 检查文件是否符合前缀和后缀条件
        if any(file.startswith(prefix) for prefix in target_prefixes) and file.endswith('_0.pdf'):
            file_path = os.path.join(root, file)
            try:
                os.remove(file_path)
                print(f'已删除文件: {file_path}')
            except Exception as e:
                print(f'删除文件 {file_path} 时出错: {e}')

已删除文件: ../output\20250401\汇鑫期货_323386_0.pdf
已删除文件: ../output\20250401\汇鑫期货_323387_0.pdf
已删除文件: ../output\20250401\汇鑫期货_323388_0.pdf
已删除文件: ../output\20250401\汇鑫期货_323389_0.pdf
已删除文件: ../output\20250402\汇鑫期货_323895_0.pdf
已删除文件: ../output\20250402\汇鑫期货_323896_0.pdf
已删除文件: ../output\20250402\汇鑫期货_323897_0.pdf
已删除文件: ../output\20250402\汇鑫期货_323898_0.pdf
已删除文件: ../output\20250403\汇鑫期货_324325_0.pdf
已删除文件: ../output\20250403\汇鑫期货_324326_0.pdf
已删除文件: ../output\20250407\汇鑫期货_325547_0.pdf
已删除文件: ../output\20250407\汇鑫期货_325548_0.pdf
已删除文件: ../output\20250407\汇鑫期货_325549_0.pdf
已删除文件: ../output\20250407\汇鑫期货_325550_0.pdf
已删除文件: ../output\20250408\汇鑫期货_325543_0.pdf
已删除文件: ../output\20250408\汇鑫期货_325544_0.pdf
已删除文件: ../output\20250408\汇鑫期货_325545_0.pdf
已删除文件: ../output\20250408\汇鑫期货_325546_0.pdf
已删除文件: ../output\20250409\汇鑫期货_325847_0.pdf
已删除文件: ../output\20250409\汇鑫期货_325848_0.pdf
已删除文件: ../output\20250409\汇鑫期货_325849_0.pdf
已删除文件: ../output\20250409\汇鑫期货_325850_0.pdf
已删除文件: ../output\20250410\汇鑫期货_3

### 使用 mineru 将 pdf 文件转化为 md 文件
详见 pdf-md.ipynb

以下代码为已完成转化的 md 文件提取三元组信息，并转化为 txt 文件

### 将之前单独处理的“紫金天风”复制进 output_txt 文件夹中

In [5]:
import os
import shutil

# 定义 output_txt 和 output_md 文件夹路径
output_txt_folder = os.path.join(os.getcwd(), '../output_txt')
output_md_folder = os.path.join(os.getcwd(), '../output_md')

# 遍历 output_txt 文件夹及其子文件夹
for root, dirs, files in os.walk(output_txt_folder):
    for file in files:
        if file.startswith('紫金天风') and file.endswith('.txt'):
            src_path = os.path.join(root, file)
            # 获取相对路径，用于确定在 output_md 中的目标位置
            relative_path = os.path.relpath(root, output_txt_folder)
            dst_folder = os.path.join(output_md_folder, relative_path)
            
            # 创建目标文件夹（如果不存在）
            os.makedirs(dst_folder, exist_ok=True)
            
            dst_path = os.path.join(dst_folder, file)
            
            try:
                # 复制文件到目标位置
                shutil.copy2(src_path, dst_path)
                print(f'已复制文件: {src_path} -> {dst_path}')
            except Exception as e:
                print(f'复制文件 {src_path} 时出错: {e}')

已复制文件: h:\project\economy\src\../output_txt\20250401\紫金天风_323359_0.txt -> h:\project\economy\src\../output_md\20250401\紫金天风_323359_0.txt
已复制文件: h:\project\economy\src\../output_txt\20250401\紫金天风_323360_0.txt -> h:\project\economy\src\../output_md\20250401\紫金天风_323360_0.txt
已复制文件: h:\project\economy\src\../output_txt\20250401\紫金天风_323361_0.txt -> h:\project\economy\src\../output_md\20250401\紫金天风_323361_0.txt
已复制文件: h:\project\economy\src\../output_txt\20250401\紫金天风_323362_0.txt -> h:\project\economy\src\../output_md\20250401\紫金天风_323362_0.txt
已复制文件: h:\project\economy\src\../output_txt\20250401\紫金天风_323363_0.txt -> h:\project\economy\src\../output_md\20250401\紫金天风_323363_0.txt
已复制文件: h:\project\economy\src\../output_txt\20250401\紫金天风_323364_0.txt -> h:\project\economy\src\../output_md\20250401\紫金天风_323364_0.txt
已复制文件: h:\project\economy\src\../output_txt\20250401\紫金天风_323365_0.txt -> h:\project\economy\src\../output_md\20250401\紫金天风_323365_0.txt
已复制文件: h:\project\economy\src\../output_t

### 以下代码为跳过了 html->pdf 处理，直接使用 kaggle 的 economy_pdf 数据集的人使用
按照公司映射字典，将 pdf 文件重命名，然后再使用 mineru 将 pdf 文件转化为 md 文件

In [None]:
import os
import re

# 读取company.txt创建映射字典
def load_company_mapping(company_file):
    mapping = {}
    with open(company_file, 'r', encoding='utf-8') as f:
        for line in f:
            if ':' in line:
                company, num = line.strip().split(':')
                mapping[num.strip()] = company.strip()
    return mapping

# 重命名PDF文件
def rename_pdf_files(pdf_dir, mapping):
    for root, dirs, files in os.walk(pdf_dir):
        for file in files:
            if file.lower().endswith('.pdf'):
                # 提取前面的数字
                match = re.match(r'(\d+)_', file)
                if match:
                    num = match.group(1)
                    if num in mapping:
                        # 构建新文件名
                        new_name = file.replace(f"{num}_", f"{mapping[num]}_")
                        # 重命名文件
                        old_path = os.path.join(root, file)
                        new_path = os.path.join(root, new_name)
                        os.rename(old_path, new_path)
                        print(f"Renamed: {file} -> {new_name}")

if __name__ == "__main__":
    # 设置路径 - 使用原始字符串或双反斜杠
    company_file = r"h:\project\economy\company.txt" 
    pdf_dir = r"h:\project\economy\output_pdf"
    
    # 加载映射关系
    mapping = load_company_mapping(company_file)
    
    # 重命名PDF文件
    rename_pdf_files(pdf_dir, mapping)

Renamed: 10_323366.pdf -> 光大期货_323366.pdf
Renamed: 10_323367.pdf -> 光大期货_323367.pdf
Renamed: 10_323391.pdf -> 光大期货_323391.pdf
Renamed: 10_323455.pdf -> 光大期货_323455.pdf
Renamed: 10_323456.pdf -> 光大期货_323456.pdf
Renamed: 10_323457.pdf -> 光大期货_323457.pdf
Renamed: 10_323459.pdf -> 光大期货_323459.pdf
Renamed: 10_323554.pdf -> 光大期货_323554.pdf
Renamed: 10_323580.pdf -> 光大期货_323580.pdf
Renamed: 11_323332_0.pdf -> 兴业期货_323332_0.pdf
Renamed: 12_323335.pdf -> 冠通期货_323335.pdf
Renamed: 12_323583.pdf -> 冠通期货_323583.pdf
Renamed: 12_323584.pdf -> 冠通期货_323584.pdf
Renamed: 12_323585.pdf -> 冠通期货_323585.pdf
Renamed: 12_323586.pdf -> 冠通期货_323586.pdf
Renamed: 12_323587.pdf -> 冠通期货_323587.pdf
Renamed: 12_323588.pdf -> 冠通期货_323588.pdf
Renamed: 12_323589.pdf -> 冠通期货_323589.pdf
Renamed: 12_323590.pdf -> 冠通期货_323590.pdf
Renamed: 12_323591.pdf -> 冠通期货_323591.pdf
Renamed: 12_323612.pdf -> 冠通期货_323612.pdf
Renamed: 13_323745.pdf -> 华融融达_323745.pdf
Renamed: 13_323746.pdf -> 华融融达_323746.pdf
Renamed: 13_323869_0.pdf -> 华融

In [5]:
import os
import re

def load_reverse_company_mapping(company_file):
    reverse_mapping = {}
    with open(company_file, 'r', encoding='utf-8') as f:
        for line in f:
            if ':' in line:
                company, num = line.strip().split(':')
                reverse_mapping[company.strip()] = num.strip()
    return reverse_mapping

def rename_pdf_files(pdf_dir, reverse_mapping):
    for root, dirs, files in os.walk(pdf_dir):
        for file in files:
            if file.lower().endswith('.pdf'):
                # 匹配格式：公司名称_数字其他内容.pdf
                match = re.match(r'([\u4e00-\u9fa5]{4})_(\d+)(.*)\.pdf', file)
                if match:
                    prefix = match.group(1)  # 前四个汉字（公司名称）
                    num = match.group(2)     # 数字部分
                    suffix = match.group(3)  # 剩余部分
                    
                    # 仅处理剩余部分中包含公司名称的情况
                    new_suffix = suffix
                    for company, code in reverse_mapping.items():
                        if company in new_suffix:
                            new_suffix = new_suffix.replace(company, code)
                    
                    if new_suffix != suffix:
                        new_file = f"{prefix}_{num}{new_suffix}.pdf"
                        old_path = os.path.join(root, file)
                        new_path = os.path.join(root, new_file)
                        os.rename(old_path, new_path)
                        print(f"Renamed: {file} -> {new_file}")

if __name__ == "__main__":
    company_file = r"h:\project\economy\company.txt"
    pdf_dir = r"h:\project\economy\output_pdf"
    
    reverse_mapping = load_reverse_company_mapping(company_file)
    rename_pdf_files(pdf_dir, reverse_mapping)

Renamed: 中原期货_32356中原期货_0.pdf -> 中原期货_323563_0.pdf
Renamed: 中原期货_32398中原期货_0.pdf -> 中原期货_323983_0.pdf
Renamed: 中融汇信_32338中融汇信_0.pdf -> 中融汇信_323384_0.pdf
Renamed: 中融汇信_32357中融汇信_0.pdf -> 中融汇信_323574_0.pdf
Renamed: 倍特期货_32346倍特期货_0.pdf -> 倍特期货_323469_0.pdf
Renamed: 国信期货_3235国信期货_0.pdf -> 国信期货_323517_0.pdf
Renamed: 道通期货_3233道通期货_0.pdf -> 道通期货_323333_0.pdf
Renamed: 中原期货_32430中原期货_0.pdf -> 中原期货_324303_0.pdf
Renamed: 华融融达_3244华融融达_0.pdf -> 华融融达_324413_0.pdf
Renamed: 华融融达_3245华融融达_0.pdf -> 华融融达_324513_0.pdf
Renamed: 上海中期_32431上海中期_0.pdf -> 上海中期_324311_0.pdf
Renamed: 倍特期货_32424倍特期货_0.pdf -> 倍特期货_324249_0.pdf
Renamed: 大越期货_3242大越期货_0.pdf -> 大越期货_324220_0.pdf
Renamed: 中原期货_32550中原期货_0.pdf -> 中原期货_325503_0.pdf
Renamed: 华融融达_3257华融融达_0.pdf -> 华融融达_325713_0.pdf
Renamed: 上海中期_32568上海中期_0.pdf -> 上海中期_325681_0.pdf
Renamed: 中融汇信_32542中融汇信_0.pdf -> 中融汇信_325424_0.pdf
Renamed: 华融融达_3264华融融达_0.pdf -> 华融融达_326413_0.pdf
Renamed: 国信期货_3256国信期货_0.pdf -> 国信期货_325617_0.pdf
Renamed: 中原期货_32635中原期货_0.pdf -> 中原期货_3

### 去除子文件夹

In [1]:
import os
import shutil

def flatten_pdf_folders(output_pdf_dir):
    """
    将output_pdf目录下所有日期子文件夹中的文件移动到对应的日期文件夹中
    并删除空的子文件夹
    """
    for date_folder in os.listdir(output_pdf_dir):
        date_path = os.path.join(output_pdf_dir, date_folder)
        
        if os.path.isdir(date_path):
            # 遍历日期文件夹下的所有子文件夹
            for root, dirs, files in os.walk(date_path):
                for file in files:
                    # 获取文件完整路径
                    file_path = os.path.join(root, file)
                    
                    # 如果不是直接在日期文件夹下的文件
                    if root != date_path:
                        # 移动文件到日期文件夹
                        new_path = os.path.join(date_path, file)
                        
                        # 处理文件名冲突
                        counter = 1
                        while os.path.exists(new_path):
                            name, ext = os.path.splitext(file)
                            new_path = os.path.join(date_path, f"{name}_{counter}{ext}")
                            counter += 1
                        
                        shutil.move(file_path, new_path)
                        print(f"Moved: {file_path} -> {new_path}")
            
            # 删除空的子文件夹
            for root, dirs, files in os.walk(date_path, topdown=False):
                if root != date_path and not os.listdir(root):
                    os.rmdir(root)
                    print(f"Removed empty folder: {root}")

if __name__ == "__main__":
    output_pdf_dir = r"h:\project\economy\output_pdf"
    flatten_pdf_folders(output_pdf_dir)

Moved: h:\project\economy\output_pdf\20250401\323354\浙商期货_323354_0.pdf -> h:\project\economy\output_pdf\20250401\浙商期货_323354_0.pdf
Moved: h:\project\economy\output_pdf\20250401\323355\浙商期货_323355_0.pdf -> h:\project\economy\output_pdf\20250401\浙商期货_323355_0.pdf
Moved: h:\project\economy\output_pdf\20250401\323356\浙商期货_323356_0.pdf -> h:\project\economy\output_pdf\20250401\浙商期货_323356_0.pdf
Moved: h:\project\economy\output_pdf\20250401\323357\浙商期货_323357_0.pdf -> h:\project\economy\output_pdf\20250401\浙商期货_323357_0.pdf
Moved: h:\project\economy\output_pdf\20250401\323358\浙商期货_323358_0.pdf -> h:\project\economy\output_pdf\20250401\浙商期货_323358_0.pdf
Moved: h:\project\economy\output_pdf\20250401\323761\浙商期货_323761_0.pdf -> h:\project\economy\output_pdf\20250401\浙商期货_323761_0.pdf
Removed empty folder: h:\project\economy\output_pdf\20250401\323354
Removed empty folder: h:\project\economy\output_pdf\20250401\323355
Removed empty folder: h:\project\economy\output_pdf\20250401\323356
Removed em