# 2.1開始
使用pdfplumber套件讀取PL檔案名稱為"PSE_件號_PL"的pdf檔，並轉換成txt檔


In [9]:
import os
import pdfplumber

# Input and output folder locations.
input_folder = r"_PL\pdf_file"  # folder holding the source PDF files
output_folder = r"_PL\pdf2txt"  # folder receiving the extracted .txt files

# Make sure the output folder exists. exist_ok=True replaces the separate
# "check, then create" steps, so there is no gap between the two.
os.makedirs(output_folder, exist_ok=True)

# Select the PDFs to convert: names that start with "PSE_", end with ".pdf"
# and contain "PL". os.listdir returns entries in arbitrary order, so the
# result is sorted to make the processing order reproducible.
pdf_files = sorted(
    f for f in os.listdir(input_folder)
    if f.startswith('PSE_') and f.endswith('.pdf') and 'PL' in f
)

# Collects the processed text of every PDF, one entry per file.
all_texts = []

# Collapse the letter-spaced words found in the extracted PDF text.
def replace_multiple_texts(s):
    """
    Replace every known fragment in *s* with its compact spelling.

    The replacements are applied one after another, in the order listed,
    each over the whole string.

    Args:
        s: Text to clean up.

    Returns:
        The text with all listed fragments substituted.
    """
    replacements = {
        'C O R P O R A T E': 'CORPORATE',
        # Was 'OFFICE': removing the spacing must not drop the final letter.
        'O F F I C E S': 'OFFICES',
        'N U M B E R': 'NUMBER',
        'R E V': 'REV',
        'S H E E T': 'SHEET',
        'B O E I N G': 'BOEING',
        # Was misspelled 'SETTLE'.
        'S E A T T L E': 'SEATTLE',
        'W A': 'WA',
        '98 1 2 4': '98124',
        'L IS T': 'LIST',
        'T IT L E': 'TITLE',
        'M O D E L': 'MODEL',
        'C O N T R A C T': 'CONTRACT',
        'D A T E': 'DATE',
        # Two words whose letters were interleaved by the extraction.
        'TL II TS LT E': 'LIST TITLE',
        'S H T': 'SHT',
        # Not a spacing fix: 'INSTL' is rewritten to 'ASSY'.
        'INSTL': 'ASSY',
    }

    for old, new in replacements.items():
        s = s.replace(old, new)

    return s

# Drop every line that has fewer than two whitespace-separated words.
def remove_single_word_lines(text):
    """
    Return *text* without its blank and single-word lines.

    The text is split on '\\n'; a line is kept only when splitting it on
    whitespace yields more than one word. Kept lines are joined with '\\n'
    again, in their original order.
    """
    kept = []
    for row in text.split('\n'):
        if len(row.split()) <= 1:
            continue
        kept.append(row)
    return '\n'.join(kept)

# Convert every selected PDF into one cleaned-up text file.
for pdf_file in pdf_files:
    input_path = os.path.join(input_folder, pdf_file)

    # One chunk per page, joined once below instead of repeated string "+=".
    page_chunks = []

    with pdfplumber.open(input_path) as pdf:
        # The page at index 0 is skipped; each remaining page is labelled
        # "Page <index + 2>", exactly as before.
        # NOTE(review): the original comment said extraction starts at page 3,
        # but index 1 is the second page of the PDF. Confirm which is meant;
        # the behaviour is left unchanged here.
        for page_number in range(1, len(pdf.pages)):
            page = pdf.pages[page_number]
            # Depending on the pdfplumber version, a page without text may
            # yield None; fall back to an empty string so that the string
            # processing below cannot fail.
            text = page.extract_text() or ""

            # Collapse the letter-spaced words.
            replaced_text = replace_multiple_texts(text)

            # Drop the lines that hold a single word only.
            filtered_text = remove_single_word_lines(replaced_text)

            page_chunks.append(f"Page {page_number + 2}:\n{filtered_text}\n{'='*50}\n")

    combined_text = "".join(page_chunks)
    all_texts.append(combined_text)

    # The second "_"-separated field of the file name becomes the output name.
    short_name = pdf_file.split('_')[1]

    # Write the cleaned-up text into the output folder.
    output_path = os.path.join(output_folder, f"{short_name}.txt")
    with open(output_path, "w", encoding="utf-8") as text_file:
        text_file.write(combined_text)

# Report completion.
print("Text extraction, replacing, and combining complete.")


Text extraction, replacing, and combining complete.


# 2.1 結束

# 2.2開始

對轉換成txt檔的文件進行關鍵字(如件號、名稱、材質、規範)擷取並填入進去excel中

In [10]:
import os
import openpyxl
from openpyxl.styles import Alignment, PatternFill, Font

# Key-information extraction from one converted parts-list text file.
def _is_part_row(word, content):
    """
    Return True when a line looks like a part-number row.

    Args:
        word: The non-empty list of whitespace-separated tokens of the line.
        content: The raw line the tokens came from.

    Raises:
        IndexError: If the line has too few tokens for the clause being
            evaluated (the clauses index word[1] and word[-4]).
    """
    first_char_is_digit = word[0][0].isdigit()
    last_char_is_alpha = word[-1][-1].isalpha()
    return (
        (first_char_is_digit and last_char_is_alpha and len(word[0]) < 4 and 'PER' not in content and 'INFO' not in content and 'SECONDS' not in content) or
        (first_char_is_digit and 'MD' in content) or
        (first_char_is_digit and word[-1] == '-' and 'INFO' not in content and 'SECONDS' not in content) or
        (first_char_is_digit and word[1][-1].isalpha() and len(word[0]) < 4 and 'PER' not in content and 'INFO' not in content and 'SECONDS' not in content) or
        ('ASSY' == word[0] and word[1][0] == '-' and word[1][1:].isdigit() and word[-2] != 'SHEET' and word[-1] != 'PAGE)') or
        ('ASSY' == word[0] and word[1][0] == '-' and word[1][1:].isdigit() and word[-4] == 'DRAWING') or
        ('ASSY' == word[0] and word[1][0] == '-' and word[1][-1].isalpha() and word[-2] != 'SHEET' and word[-1] != 'PAGE)')
    )


def _starts_new_item(raw, words):
    """
    Return True when *raw* opens a new item in the material/spec scan.

    Args:
        raw: The line as read from the file (normally ending in a newline,
            so raw[-2] is its last visible character).
        words: The whitespace-separated tokens of *raw*.
    """
    if raw[:1].isdigit():
        # Every digit-led rule excludes lines holding these tokens.
        if 'PER' in words or 'CHANGE' in words or 'INFO' in words:
            return False
        # Missing positions become '' so that very short lines simply do not
        # match instead of raising IndexError.
        penultimate = raw[-2] if len(raw) > 1 else ''
        third = raw[2] if len(raw) > 2 else ''
        return (
            'MD' in words
            or penultimate == '-'
            or third == '-' or third.isalpha()
            or raw[-1] == '-'
            or (penultimate.isalpha() and 'PAGE' not in words and 'VOLATILES' not in words
                and 'PRIOR' not in words and 'SECONDS' not in words)
        )
    return raw[0:4] == 'ASSY' and (
        'DRAWING' in raw or ('(CONTINUED' not in raw and 'SHEET' not in raw)
    )


def _stock_and_spec(lines, start, stop):
    """
    Find the first 'STOCK' line in lines[start:stop] followed by a 'PER' line.

    Returns:
        (text before 'PER', text after 'PER') taken from the following line,
        or None when there is no such pair. The last line of the file is never
        used as the 'STOCK' line because it has no following line.
    """
    for k in range(start, min(stop, len(lines) - 1)):
        if 'STOCK' in lines[k] and 'PER' in lines[k + 1]:
            following = lines[k + 1]
            cut = following.find('PER')
            return following[:cut].strip(), following[cut + len('PER'):].strip()
    return None


def _make_from(raw):
    """Return the text after 'FROM' in *raw*, without its last two characters."""
    cut = raw.find('FROM')
    return raw[cut + len('FROM'):-2].strip()


def read_txt(filename):
    """
    Extract part numbers, names, materials and specs from one text file.

    The results are stored in the module-level names pl, pl1, list_title,
    stock and per (read later by creat_excel) and are also returned.

    Steps:
      1. From the second line on, find the first line whose first token is
         'PARTS' and that has a 'PL' token; the tokens after 'PL', minus the
         last one, form pl.
      2. Only when pl was found: for every part row in lines[11:-14], store
         the second token in pl1 and the tokens between it and the first '-'
         token in list_title.
      3. Over all lines: each line that opens a new item closes the previous
         one, for which one entry is appended to stock and per: the
         'STOCK'/'PER' pair if present, else the 'MAKE FROM' text, else " ".
      4. The last item is resolved against the remaining lines of the file.

    Args:
        filename: Path of the UTF-8 text file to parse.

    Returns:
        The tuple (pl, pl1, list_title, stock, per). pl is a string when
        found and an empty list otherwise; the others are lists of strings.

    Raises:
        ValueError: If a part row has no standalone '-' token.
        IndexError: If a part row is too short for the matching rules.
    """
    global pl, pl1, list_title, stock, per
    pl = []
    pl1 = []
    list_title = []
    stock = []
    per = []

    with open(filename, 'r', encoding='utf-8') as file:
        line = file.readlines()

    # Step 1: the first line is skipped, as before.
    pl_found = False
    for content in line[1:]:
        word = content.split()
        # Blank lines have no tokens; they used to raise IndexError here.
        if word and word[0] == 'PARTS' and 'PL' in word:
            index = word.index('PL')
            pl = ' '.join(word[index + 1:-1]).strip()
            pl_found = True
            break

    # Step 2: part numbers and names.
    if pl_found:
        for content in line[11:-14]:
            word = content.split()
            if not word or not _is_part_row(word, content):
                continue
            # A matching row starts with either a number or 'ASSY', so the
            # part number is always the second token.
            pl1.append(' '.join(word[1:2]).strip())
            print(pl1)
            # The name is everything between the part number and the first '-'.
            dash = word.index('-')
            list_title.append(' '.join(word[2:dash]).strip())

    # Step 3: materials and specs, one entry per item.
    start_line = None  # index of the line that opened the current item
    for j, raw in enumerate(line):
        if not _starts_new_item(raw, raw.split()):
            continue
        print(f"Condition triggered at line {j + 1}")
        if start_line is not None:
            # The 'STOCK' search includes the new item line itself; the
            # 'MAKE FROM' search stops just before it.
            found = _stock_and_spec(line, start_line, j + 1)
            if found is None:
                found = (" ", " ")
                for k in range(start_line, j):
                    if 'MAKE FROM' in line[k]:
                        found = (_make_from(line[k]), " ")
                        break
            stock.append(found[0])
            per.append(found[1])
        start_line = j

    # Step 4: the last item. An entry is always appended, also when the item
    # sits on the final line, so every item gets exactly one entry.
    if start_line is not None:
        material, spec = " ", " "
        stock_seen = False
        for k in range(start_line + 1, len(line)):
            if 'STOCK' in line[k]:
                stock_seen = True
                found = _stock_and_spec(line, k, k + 1)
                if found is not None:
                    material, spec = found
                    break
            # As before, 'MAKE FROM' only counts once a 'STOCK' line was seen.
            if stock_seen and 'MAKE FROM' in line[k]:
                # The extracted text used to be computed and then discarded
                # in favour of a blank; it is stored now, as in step 3.
                material = _make_from(line[k])
                break
        stock.append(material)
        per.append(spec)
    print(per)

    return pl, pl1, list_title, stock, per


# Write the extracted key information into a workbook laid out in MBOM format.
def creat_excel(output_folder, file_name):
    """
    Build "<file_name>_output.xlsx" in *output_folder*.

    The data comes from the module-level names pl, pl1, list_title, stock
    and per, which are only read here.

    Layout of the sheet:
      - row 1 holds the headers: 'NO' in B, 0-8 in C-K, titles in L-Q;
      - column L: part numbers (pl is prefixed when the entry is shorter
        than 6 characters), column M: names, column P: materials,
        column Q: specs, all starting at row 2.

    Args:
        output_folder: Folder the workbook is saved into.
        file_name: Base name of the workbook, without suffix.
    """
    book = openpyxl.Workbook()
    ws = book.active

    # Header row: 'NO' in column B, the numbers 0..8 in columns C..K.
    ws.cell(row=1, column=2).value = 'NO'
    for number in range(9):
        ws.cell(row=1, column=number + 3).value = number

    # Titles of columns L..Q.
    titles = {
        12: '件號\nPart Number',
        13: '名稱\nNOMENCLA TURE',
        14: '類別\nCode',
        15: '零件代碼\nShape',
        16: '材質\nMaterial',
        17: '規範\nSpec.',
    }
    for column, title in titles.items():
        ws.cell(row=1, column=column).value = title

    # Column L: short entries are suffixes and get the PL number in front.
    for row, part in enumerate(pl1, start=2):
        ws.cell(row=row, column=12).value = pl + part if len(part) < 6 else part

    # Columns M, P and Q are filled straight from their lists.
    for column, values in ((13, list_title), (16, stock), (17, per)):
        for row, value in enumerate(values, start=2):
            ws.cell(row=row, column=column).value = value

    # Header formatting: centred, wrapped, light-green fill; the font depends
    # on whether the header text holds a character in U+4E00..U+9FFF.
    centred = Alignment(horizontal='center', wrapText=True)
    for column in range(1, ws.max_column + 1):
        header = ws.cell(row=1, column=column)
        header.alignment = centred
        header.fill = PatternFill(start_color="CCFFCC", end_color="CCFFCC", fill_type="solid")
        text = header.value
        has_cjk = isinstance(text, str) and any('\u4e00' <= ch <= '\u9fff' for ch in text)
        font_name = 'Malgun Gothic' if has_cjk else 'Calibri'
        header.font = Font(name=font_name, color='000000', bold=False)

    # Column widths: 30 for the text columns, 10 for the two code columns.
    widths = {'L': 30, 'M': 30, 'P': 30, 'Q': 30, 'N': 10, 'O': 10}
    for letter, width in widths.items():
        ws.column_dimensions[letter].width = width

    book.save(os.path.join(output_folder, f"{file_name}_output.xlsx"))


# Run the txt -> Excel conversion for every text file of a folder.
def read_folder(input_folder, output_folder):
    """
    Parse each '.txt' file in *input_folder* and write one workbook per file.

    For every file, read_txt is called with its path and creat_excel with
    the file name minus its suffix. The module-level counter index is reset
    to 1 and incremented once per processed file.

    Args:
        input_folder: Folder holding the text files.
        output_folder: Folder receiving the workbooks; it is created when
            missing, since saving a workbook into a missing folder fails.
    """
    global index
    index = 1

    os.makedirs(output_folder, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for file_name in sorted(os.listdir(input_folder)):
        if not file_name.endswith('.txt'):
            continue
        read_txt(os.path.join(input_folder, file_name))
        creat_excel(output_folder, os.path.splitext(file_name)[0])
        index += 1

# Edit these two paths to choose the folder of .txt files to read and the
# folder that receives the generated workbooks, then run the conversion.
input_folder_path = r"_PL\pdf2txt"  
output_folder_path = r"_PL\final_output"
read_folder(input_folder_path, output_folder_path)


['-1']
['-1', '284W1803-20']
['-1', '284W1803-20', '401T2801-3']
['-1', '284W1803-20', '401T2801-3', '-2']
['-1', '284W1803-20', '401T2801-3', '-2', '284T0383-2']
['-1', '284W1803-20', '401T2801-3', '-2', '284T0383-2', '284W1803-1']
['-1', '284W1803-20', '401T2801-3', '-2', '284T0383-2', '284W1803-1', '401T2801-3']
['-1', '284W1803-20', '401T2801-3', '-2', '284T0383-2', '284W1803-1', '401T2801-3', '-3']
['-1', '284W1803-20', '401T2801-3', '-2', '284T0383-2', '284W1803-1', '401T2801-3', '-3', '284T0383-2']
['-1', '284W1803-20', '401T2801-3', '-2', '284T0383-2', '284W1803-1', '401T2801-3', '-3', '284T0383-2', '284W1803-2']
['-1', '284W1803-20', '401T2801-3', '-2', '284T0383-2', '284W1803-1', '401T2801-3', '-3', '284T0383-2', '284W1803-2', '401T2801-3']
['-1', '284W1803-20', '401T2801-3', '-2', '284T0383-2', '284W1803-1', '401T2801-3', '-3', '284T0383-2', '284W1803-2', '401T2801-3', '-4']
['-1', '284W1803-20', '401T2801-3', '-2', '284T0383-2', '284W1803-1', '401T2801-3', '-3', '284T0383-2

# 2.2 結束

# 2.3開始

將所有輸出的excel整合起來並依照MBOM格式填入

In [11]:
import pandas as pd
import glob
import os

def combine_excel_files(folder_path, output_file):
    """
    Concatenate every '.xlsx' file of a folder into one workbook.

    Args:
        folder_path: Folder whose '*.xlsx' files are read.
        output_file: Path of the workbook to write. When it lies inside
            *folder_path* it is left out of the inputs; otherwise a rerun
            would read the previous result back in and duplicate its rows.

    Returns:
        None. The combined data is written to *output_file* without the
        index column; an empty sheet is written when there are no inputs.
    """
    pattern = os.path.join(folder_path, '*.xlsx')

    # Compare normalised absolute paths so that the output file is recognised
    # however the two paths were spelled.
    target = os.path.normcase(os.path.abspath(output_file))
    sources = [
        path for path in sorted(glob.glob(pattern))
        if os.path.normcase(os.path.abspath(path)) != target
    ]

    # Read everything first and concatenate once, rather than growing a
    # DataFrame inside the loop.
    frames = [pd.read_excel(path) for path in sources]
    combined_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    combined_data.to_excel(output_file, index=False)

# Example usage
input_folder = r'_PL\final_output'  # replace with your own folder path
output_file = r'_PL\final_output\combine.xlsx'  # name of the combined Excel file

combine_excel_files(input_folder, output_file)


In [12]:
import pandas as pd
import glob
import os
from openpyxl import load_workbook
from openpyxl.styles import Alignment, Font, PatternFill

def mbom_excel_files(folder_path, output_file):
    """
    Combine every '.xlsx' file of a folder and format the result as an MBOM.

    Args:
        folder_path: Folder whose '*.xlsx' files are read.
        output_file: Path of the workbook to write. When it lies inside
            *folder_path* it is left out of the inputs; otherwise a workbook
            left there by an earlier run would be read back in and its rows
            duplicated.

    Returns:
        None. The combined data is written to *output_file* and the header
        row of that file is then formatted in place.
    """
    pattern = os.path.join(folder_path, '*.xlsx')

    # Compare normalised absolute paths so that the output file is recognised
    # however the two paths were spelled.
    target = os.path.normcase(os.path.abspath(output_file))
    sources = [
        path for path in sorted(glob.glob(pattern))
        if os.path.normcase(os.path.abspath(path)) != target
    ]

    # Read everything first and concatenate once.
    frames = [pd.read_excel(path) for path in sources]
    combined_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    combined_data.to_excel(output_file, index=False)

    # Reopen the written file to format its header row.
    wb = load_workbook(output_file)
    sheet = wb.active
    # Taken before A1/B1 are written below, as in the original order.
    last_col = sheet.max_column

    # Column A gets a single-space header and column B the header 'NO'.
    sheet.cell(row=1, column=1).value = ' '
    sheet.cell(row=1, column=2).value = 'NO'

    # Header formatting: centred, wrapped, light-green fill; the font depends
    # on whether the header text holds a character in U+4E00..U+9FFF.
    alignment = Alignment(horizontal='center', wrapText=True)
    for col_index in range(1, last_col + 1):
        cell = sheet.cell(row=1, column=col_index)
        cell.alignment = alignment
        cell.fill = PatternFill(start_color="CCFFCC", end_color="CCFFCC", fill_type="solid")
        if isinstance(cell.value, str) and any('\u4e00' <= char <= '\u9fff' for char in cell.value):
            cell.font = Font(name='Malgun Gothic', color='000000', bold=False)
        else:
            cell.font = Font(name='Calibri', color='000000', bold=False)

    # Column widths: 30 for L, M, P, Q and 10 for N, O.
    for col_letter in ('L', 'M', 'P', 'Q'):
        sheet.column_dimensions[col_letter].width = 30
    for col_letter in ('N', 'O'):
        sheet.column_dimensions[col_letter].width = 10

    # Save the formatted workbook over the combined file.
    wb.save(output_file)

# Example usage
input_folder = r'_PL\final_output'  # replace with your own folder path
output_file = r'_PL\final_output\combine.xlsx'  # name of the combined Excel file

mbom_excel_files(input_folder, output_file)


# 2.3結束