In [1]:
from bs4 import BeautifulSoup
import json
import os

# Path of the HTML document to process.
html_file = r'your html file'

# Read the whole document as UTF-8 text and parse it with the lxml backend.
with open(html_file, encoding='utf-8') as file:
    html_content = file.read()
soup = BeautifulSoup(html_content, 'lxml')

# Result containers filled in by the extraction steps that follow:
#   content_data -> (heading text, [paragraph text, ...]) tuples
#   tables_data  -> one {"Table N": rows} dict per <table>
content_data, tables_data = [], []

# 1. Extract headings together with the paragraphs that follow them
for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
    # <p> tags are matched by the search, but only headings open a section.
    if not tag.name.startswith('h'):
        continue

    title = tag.get_text(strip=True)
    paragraphs = []

    # Walk the heading's following siblings: collect <p> text, step over
    # <table>/<img>, and stop at the first element of any other kind.
    for elem in tag.find_next_siblings():
        if elem.name in ('table', 'img'):
            continue
        if elem.name != 'p':
            break
        paragraphs.append(elem.get_text(strip=True))

    content_data.append((title, paragraphs))

# Fallback: when the document has no headings at all, keep every paragraph
# (except those wrapping a table or an image) under a placeholder title.
if not content_data:
    paragraphs = [
        p.get_text(strip=True)
        for p in soup.find_all('p')
        if not p.find('table') and not p.find('img')
    ]
    content_data.append(('No Title', paragraphs))

# 2. Extract table data
# Every <table> is stored as a single-entry dict {"Table N": rows}, where
# rows is a list of rows and each row is the list of stripped <td>/<th> texts.
tables = soup.find_all('table')

for i, table in enumerate(tables):
    table_rows = [
        [cell.get_text(strip=True) for cell in row.find_all(['td', 'th'])]
        for row in table.find_all('tr')
    ]
    tables_data.append({f"Table {i+1}": table_rows})

# Extract price data
def _price_after(row, label, offset=2):
    """Return the cell `offset` columns after `label` in `row`.

    Returns None when `label` is not an exact cell value of `row`, or when
    the row is too short to hold a cell at that offset, so that a truncated
    row is skipped instead of raising IndexError.
    """
    try:
        label_position = row.index(label)
    except ValueError:
        return None
    price_position = label_position + offset
    if price_position >= len(row):
        return None
    return row[price_position]


henry_hub_price = None
columbia_gulf_price = None
texas_gas_zone_price = None

henry_hub_index = None
columbia_gulf_index = None

# Smallest row distance seen so far between a Henry Hub row and the most
# recently found Columbia Gulf Mainline row.
min_distance = float('inf')

# Scan every row of every extracted table for the three price labels.
for table in tables_data:
    for rows in table.values():
        for index, row in enumerate(rows):
            if not isinstance(row, list):
                continue

            # Columbia Gulf Mainline: the last matching row wins.
            price = _price_after(row, "Columbia Gulf Mainline")
            if price is not None:
                columbia_gulf_price = price
                columbia_gulf_index = index

            # Henry Hub: among the rows seen after a Columbia Gulf Mainline
            # row has been found, keep the one with the smallest row distance
            # to it. Henry Hub rows seen earlier are ignored.
            # NOTE(review): `index` restarts at 0 for every table, so this
            # distance may compare rows from different tables - confirm that
            # this is intended.
            price = _price_after(row, "Henry Hub")
            if price is not None and columbia_gulf_index is not None:
                distance = abs(index - columbia_gulf_index)
                if distance < min_distance:
                    min_distance = distance
                    henry_hub_price = price
                    henry_hub_index = index

            # Texas Gas Zone 1: the last matching row wins.
            price = _price_after(row, "Texas Gas Zone 1")
            if price is not None:
                texas_gas_zone_price = price

# Print the extracted prices
print(f"Henry Hub Price: {henry_hub_price}")
print(f"Columbia Gulf Mainline Price: {columbia_gulf_price}")
print(f"Texas Gas Zone 1 Price: {texas_gas_zone_price}")

# Build the output file names from the input file's base name
base_filename = os.path.splitext(os.path.basename(html_file))[0]
output_dir = r"your output directory"

# Create the output directory when it is missing; otherwise the open()
# calls below fail with FileNotFoundError.
os.makedirs(output_dir, exist_ok=True)

content_data_file = os.path.join(output_dir, f"{base_filename}_content_data.txt")
tables_data_file = os.path.join(output_dir, f"{base_filename}_tables_data.json")

# 3. Write headings and paragraphs to the TXT file
with open(content_data_file, 'w', encoding='utf-8') as txt_file:
    for title, paragraphs in content_data:
        txt_file.write(f"Title: {title}\n")
        # One paragraph per line, then a blank line to close the section.
        txt_file.writelines(f"{paragraph}\n" for paragraph in paragraphs)
        txt_file.write("\n")

# 4. Write the table data to the JSON file (non-ASCII text kept readable)
with open(tables_data_file, 'w', encoding='utf-8') as json_file:
    json.dump(tables_data, json_file, ensure_ascii=False, indent=4)

# Report where the results were saved
print(f"標題和段落數據已保存至 {content_data_file}，表格數據已保存至 {tables_data_file}")


Henry Hub Price: 3.310
Columbia Gulf Mainline Price: 3.160
Texas Gas Zone 1 Price: 3.145
標題和段落數據已保存至 C:\Users\N000189549\Desktop\Python\auto price import\auto price import\reprocessed txt and json\NGI daily index_20250207_content_data.txt，表格數據已保存至 C:\Users\N000189549\Desktop\Python\auto price import\auto price import\reprocessed txt and json\NGI daily index_20250207_tables_data.json
