In [1]:
!pip install pdfplumber -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.8/67.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m50.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m69.0 MB/s[0m eta [36m0:00:00[0m
[?25h

### English content process(Run)

1. 遇到連續幾個文字大小超過15的字時，設定參數 chapter_start=True；當後續遇到連續文字大小小於15的字時，設定 chapter_start=False。將這段區間內的文字取出：大小超過15的字取出並去除換行，作為檔案名稱（.txt）；再刪去內容中的字段 "OceanofPDF.com"，之後每個段落寫入文件的一行來保存檔案，以便後續資料操作。

In [None]:
import pdfplumber
from collections import Counter
import re
import os

# Font-size threshold: characters whose size exceeds this value are treated as chapter-heading text
FONT_SIZE_THRESHOLD = 15

def clean_filename(text):
    r"""
    Turn a chapter title into a string that can be used as a file name.

    The characters \ / * ? : " < > | are removed, every newline becomes a
    single space, carriage returns are dropped, and surrounding whitespace
    is trimmed.

    The docstring is a raw string so that the backslash shown above is not
    parsed as an (invalid) escape sequence.

    Args:
        text (str): Raw title text.

    Returns:
        str: The cleaned title (may be empty if nothing remains).
    """
    # Remove characters that are not allowed in file names
    text = re.sub(r'[\\/*?:"<>|]', "", text)
    # Flatten line breaks so the title stays on one line
    text = text.replace("\n", " ").replace("\r", "")
    return text.strip()

def get_chapter_header(page):
    """
    Look for chapter-heading text on a page.

    Every character in ``page.chars`` whose ``size`` exceeds
    FONT_SIZE_THRESHOLD is collected. Their text is concatenated in the
    order they appear in ``page.chars`` and passed through clean_filename.

    Returns:
        tuple: (cleaned heading text, largest ``bottom`` value among the
        heading characters), or (None, 0) when no character is large enough.
    """
    heading_chars = list(filter(lambda ch: ch["size"] > FONT_SIZE_THRESHOLD, page.chars))

    if len(heading_chars) == 0:
        return None, 0

    # Body text is expected to start below the lowest heading character
    bottom_edge = max(ch["bottom"] for ch in heading_chars)
    title = "".join(ch["text"] for ch in heading_chars)

    return clean_filename(title), bottom_edge

def save_chapter(output_dir,filename, paragraphs):
    """
    Write one chapter to ``<output_dir>/<filename>.txt``, one paragraph per line.

    The watermark text "OceanofPDF.com" is removed from every paragraph;
    paragraphs that are blank after the removal are not written.

    Args:
        output_dir (str): Destination directory; created if it does not exist.
        filename (str): File name without the ``.txt`` extension.
        paragraphs (list[str]): Paragraph strings. If empty, nothing is
            written and no file is created.

    Returns:
        None. A summary line with the number of lines actually written is printed.
    """
    if not paragraphs:
        return

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(output_dir, exist_ok=True)

    full_path = os.path.join(output_dir, f"{filename}.txt")

    written = 0
    with open(full_path, "w", encoding="utf-8") as f:
        for p in paragraphs:
            # Remove the watermark text from the paragraph
            clean_p = p.replace("OceanofPDF.com", "")
            # Skip paragraphs that contained nothing but the watermark
            if clean_p.strip():
                f.write(clean_p + "\n")
                written += 1

    # Report what was really written, not the size of the unfiltered input
    print(f"已儲存章節: {full_path} (段落數: {written})")

def is_new_paragraph_logic(line_obj, prev_line_obj, left_margin_mode, page_width):
    """
    Decide whether a line opens a new paragraph.

    Args:
        line_obj (dict): Current line; its 'x0' (left edge) is read.
        prev_line_obj (dict | None): Previous line; its 'text' and 'x1'
            (right edge) are read. A falsy value means there is no previous line.
        left_margin_mode: Left margin used as the indentation baseline.
        page_width: Page width, used to estimate the right margin.

    Returns:
        bool: True when the line should start a new paragraph.
    """
    # The very first line always starts a paragraph
    if not prev_line_obj:
        return True

    x0 = line_obj['x0']
    previous_text = prev_line_obj['text'].strip()
    closers = ('.', '?', '!', '"', '”', '’')

    if previous_text.endswith(closers):
        # After a finished sentence a small indent is enough ...
        indented = x0 > (left_margin_mode + 10)
        # ... and so is a previous line that stopped well short of the right margin
        right_edge = page_width - left_margin_mode
        return indented or prev_line_obj['x1'] < (right_edge - 30)

    # Mid-sentence: only a large indent counts as a new paragraph
    return x0 > (left_margin_mode + 50)

def process_pdf_to_chapters(pdf_path,output_dir):
    """
    Split a PDF into chapters and save each chapter as a numbered text file.

    A page on which get_chapter_header() returns a non-empty title starts a
    new chapter: the paragraphs collected so far are saved through
    save_chapter() as "<index:03d>_<previous chapter name>" and the index is
    incremented. Words on each page are grouped into lines, and lines are
    merged into paragraphs using is_new_paragraph_logic(); the paragraph
    buffer is carried across page boundaries.

    Args:
        pdf_path: Path of the PDF, passed to pdfplumber.open().
        output_dir: Directory handed to save_chapter().

    Returns:
        None. Files are written (and progress printed) as side effects.
    """
    current_chapter_name = "Prologue_or_Start" # default name for anything found before the first heading
    chapter_index = 0 # running chapter counter used as the file-name prefix
    current_paragraphs = []

    # Buffer holding the paragraph being assembled (survives page breaks)
    buffer_paragraph = ""
    prev_line_obj = None

    # Document-wide left margin estimate (simplified: sampled from a fixed slice of pages)
    common_x0 = 0
    with pdfplumber.open(pdf_path) as pdf:
        # --- Estimate the left margin up front ---
        # Take x0 of the first word on each sampled page and keep the most common integer value
        starts = []
        for p in pdf.pages[4:18]:
            words = p.extract_words()
            if words: starts.append(words[0]['x0'])
        if starts:
            common_x0 = Counter([int(x) for x in starts]).most_common(1)[0][0]

        # --- Process the pages one by one ---
        for i, page in enumerate(pdf.pages):
            width = page.width

            # 1. [Track 1] Check for a chapter heading (characters larger than FONT_SIZE_THRESHOLD)
            header_text, header_bottom = get_chapter_header(page)

            if header_text:
                # A new chapter starts on this page
                # A. Flush whatever is left in the buffer into the previous chapter
                if buffer_paragraph:
                    current_paragraphs.append(buffer_paragraph.strip())
                    buffer_paragraph = ""
                    prev_line_obj = None

                # B. Save the previous chapter; the index is zero-padded to 3 digits (e.g. 000_Prologue, 001_Chapter One)
                numbered_filename = f"{chapter_index:03d}_{current_chapter_name}"
                save_chapter(output_dir,numbered_filename, current_paragraphs)

                chapter_index += 1  # advance the index for the chapter that starts now

                # C. Reset state for the new chapter
                current_chapter_name = header_text
                current_paragraphs = []
                print(f"--- 發現新章節: {header_text} (頁數: {i+1}) ---")

            # 2. [Track 2] Extract body words
            # Only words below header_bottom are kept so the heading is not repeated in the body
            words = page.extract_words(x_tolerance=3, y_tolerance=6)

            # Keep words whose top is below the heading, with a small +5 buffer.
            # header_bottom is 0 on pages without large characters, so the cut-off there is top > 5.
            content_words = [w for w in words if w['top'] > header_bottom + 5]

            if not content_words:
                continue

            # 3. Group words into lines: a word joins the current line when its top
            #    is within 10 of the previous word's top
            lines = []
            current_line = [content_words[0]]
            for word in content_words[1:]:
                if abs(word['top'] - current_line[-1]['top']) < 10:
                    current_line.append(word)
                else:
                    lines.append(current_line)
                    current_line = [word]
            lines.append(current_line)

            # 4. Merge lines into paragraphs
            for line in lines:
                line_text = " ".join([w['text'] for w in line])

                # Early watermark filter (save_chapter removes it as well);
                # a line that held only the watermark is dropped entirely
                if "OceanofPDF.com" in line_text:
                    line_text = line_text.replace("OceanofPDF.com", "").strip()
                    if not line_text: continue

                line_x0 = line[0]['x0']
                line_x1 = line[-1]['x1']
                current_line_obj = {'x0': line_x0, 'x1': line_x1, 'text': line_text}

                is_new = is_new_paragraph_logic(current_line_obj, prev_line_obj, common_x0, width)

                if is_new:
                    if buffer_paragraph:
                        current_paragraphs.append(buffer_paragraph.strip())
                    buffer_paragraph = line_text
                else:
                    # A trailing "-" is treated as a line-break hyphen: it is removed and the
                    # next line is joined without a space
                    if buffer_paragraph.endswith("-"):
                        buffer_paragraph = buffer_paragraph[:-1] + line_text
                    else:
                        buffer_paragraph += " " + line_text

                prev_line_obj = current_line_obj

    # 5. After the loop, flush the buffer and save the last chapter
    if buffer_paragraph:
        current_paragraphs.append(buffer_paragraph.strip())

    # The last chapter gets a numbered file name as well
    numbered_filename = f"{chapter_index:03d}_{current_chapter_name}"
    save_chapter(output_dir,numbered_filename, current_paragraphs)

# Usage: split the English PDF into numbered per-chapter .txt files under output_dir
output_dir = "output_text_EN"
process_pdf_to_chapters("PAUL CLEAVE/Trust_No_One.pdf",output_dir)

  清理標題以作為合法的檔名 (移除 / \ : * ? " < > |)


--- 發現新章節: To Miss Roberts—my favorite teacher. Bees (頁數: 4) ---
已儲存章節: /content/output_text_EN/001_To Miss Roberts—my favorite teacher. Bees.txt (段落數: 77)
--- 發現新章節: DAY ONE (頁數: 12) ---
已儲存章節: /content/output_text_EN/002_DAY ONE.txt (段落數: 72)
--- 發現新章節: DAY FOUR (頁數: 22) ---
已儲存章節: /content/output_text_EN/003_DAY FOUR.txt (段落數: 46)
--- 發現新章節: DAY FIVE (頁數: 32) ---
已儲存章節: /content/output_text_EN/004_DAY FIVE.txt (段落數: 67)
--- 發現新章節: DAY TEN (頁數: 44) ---
已儲存章節: /content/output_text_EN/005_DAY TEN.txt (段落數: 67)
--- 發現新章節: DAY FIFTEEN (頁數: 52) ---
已儲存章節: /content/output_text_EN/006_DAY FIFTEEN.txt (段落數: 98)
--- 發現新章節: DAY TWENTY (頁數: 61) ---
已儲存章節: /content/output_text_EN/007_DAY TWENTY.txt (段落數: 62)
--- 發現新章節: DAY THIRTY (頁數: 71) ---
已儲存章節: /content/output_text_EN/008_DAY THIRTY.txt (段落數: 58)
--- 發現新章節: DAY THIRTY-ONE (頁數: 78) ---
已儲存章節: /content/output_text_EN/009_DAY THIRTY-ONE.txt (段落數: 157)
--- 發現新章節: DAY FORTY (頁數: 90) ---
已儲存章節: /content/output_text_EN/010_DAY FORTY.txt (段落數: 155)

### 從純文本文件重新載入段落列表

In [None]:
# Reload one chapter that was saved as plain text (one paragraph per line).
PREVIEW_COUNT = 20  # number of paragraphs echoed below as a sanity check
output_file_path = '/content/output_chapters/DAY ONE MILLION.txt'
with open(output_file_path, "r", encoding="utf-8") as f:
    loaded_paragraphs = [line.strip() for line in f]

print(f"已從 {output_file_path} 載入 {len(loaded_paragraphs)} 個段落。")
# The label is derived from PREVIEW_COUNT so it always matches the slice that is printed
print(f"載入的前{PREVIEW_COUNT}個段落:")
for i, p in enumerate(loaded_paragraphs[:PREVIEW_COUNT]):
    print(f"段落 {i+1}: {p}")

已從 /content/output_chapters/DAY ONE MILLION.txt 載入 149 個段落。
載入的前3個段落:
段落 1: Okay, so it’s not really day one million, and I’m not sure how liberal I was with exaggerations in the books. Derek (it’s actually Eric, but I’ve come to think of him as a Derek) told me this morning it’s been eight months since I checked in. Which, by my calculations, is nine hundred and ninety-nine thousand days and change short of a million. Still, it feels like I’ve been here forever.
段落 2: Today is a Jerry is Jerry day.
段落 3: Jerry has Alzheimer’s—check.
段落 4: Jerry used to be a crime writer—check.
段落 5: Jerry knows he shouldn’t trust Derek—check.
段落 6: Or Eric—check.
段落 7: Jerry is making a checklist—check.
段落 8: I’ve been flicking through the journal and seeing I’ve been piling crazy on top of crazy, and among some of those entries is evidence that Henry has been coming out to play. I’ve been having conversations with him. Henry and me shooting the breeze. There are two points here, Future Me, that I wan

### Chinese content process

In [None]:
from IPython.utils import process
# 只檢查第一頁與最後一頁是否有冗詞贅字要去除
import pdfplumber
import os
import logging

# Silence warning messages from the pdfminer.pdffont logger
logging.getLogger('pdfminer.pdffont').setLevel(logging.ERROR)

base_dir = "/content/drive/MyDrive/PAUL CLEAVE/trust_no_one_ZH"
files = os.listdir(base_dir)
# Sort by the integer before the first "_" in each name; int() raises ValueError
# for any directory entry that does not start with such a numeric prefix
files = list(sorted(files,key=lambda x:int(x.split('_')[0])))

class PDFProcessor:
    """Holds a PDF path and prints the text at the document's two ends for manual inspection."""

    def __init__(self, pdf_path):
        self.pdf_path = pdf_path

    def check_first_and_end_content(self):
        """Print the extracted text of the first page and of the last page."""
        with pdfplumber.open(self.pdf_path) as pdf:
            pages = pdf.pages
            page_total = len(pages)
            sections = (
                ("first page content:", 0),
                ("last page content:", page_total - 1),
            )
            for position, (label, index) in enumerate(sections):
                # Separator line between the two sections
                if position:
                    print("="*50)
                print(label)
                print(pages[index].extract_text())

if __name__ == "__main__":
    # Print the first and last page text of every file listed in `files`
    for file in files:
        full_path = os.path.join(base_dir, file)
        print(f"<<executing file {full_path}>>")
        # check_first_and_end_content() has no return statement, so `processor` ends up as None
        processor = PDFProcessor(full_path).check_first_and_end_content()


<<executing file /content/drive/MyDrive/PAUL CLEAVE/trust_no_one_ZH/001_序_犯罪小說家_保羅克利夫_九九藏書.pdf>>
first page content:
序 犯罪小說家 保羅 克利夫 九九藏書
_ _ · _
read.99csw.com/book/10377/374616.html
99%
序
序
獻給羅伯特女士，我最親愛的老師
「魔鬼就潛伏在細節中。」傑瑞說。說這話時，傑瑞就是魔鬼，這些天所有細節都變幻莫測，很
難臨摹和重繪。他能回想起那個女人的臉，她張開嘴巴，卻只能發出一聲「噢」。當然，臨終之
前，人們永遠不知道該說什麼。據說奧斯卡·王爾德在辭世之前盯著床前的窗帘，他說那些窗帘
是多麼醜陋，他正在和它們決一死戰，只有其中的一個才能走，另一個留下。不過，傑瑞也記得
在哪兒讀過，沒有人敢肯定王爾德是不是真的說了這些話。想象一下，要是傑瑞也潛入王爾德的
家中，用刀將他釘在牆上，恐怕他也不會說出什麼發人深省、精闢有力的遺言來，也許他會說：
「這一刀比我預料的要更加痛苦。」不過，這句話也不會被歷史書記錄下來。
他思緒紛飛，但他不喜歡這種感覺，十分不喜歡。
一個女警官盯著他，臉上的表情像是她收留了一隻受傷的貓咪。她二十五六歲，生了一張可以喚
醒他體內邪欲的臉龐。她雙腿筆直而修長，金髮齊肩，身體曲線婀娜，穿著緊身黑色短裙和修身
的深藍色上衣，一雙湛藍的眼眸惹得他心神蕩漾。她拇指不停地揉擦著無名指，撫弄著老繭，這
種老繭他曾在一個吉他手的手上見過。另一個穿著制服的男警察靠在牆上，粗壯的雙臂交叉在胸
前，上唇留著二十世紀八十年代電視上的警察才留的鬍鬚，腰間系著武裝帶，上面掛著槍械等管
制工具。他看起來一副無所事事的模樣。
傑瑞繼續說：「女人大約三十歲，估計與實際年紀相差不過一歲。她的名字叫蘇珊，但總喜歡把
『珊』寫成『姍』。現在的人們喜歡以各種千奇百怪的方式寫字，我把這一切都歸咎於手機。」
他等著她點頭同意，但她沒有，男警察也只是靠著牆壁，毫無表示。他意識到自己的思緒再次紛
飛飄蕩了。
他深深地吸了一口氣，雙手抓緊椅子的扶手，改變了一下姿勢，好讓自己更舒服些。他閉上了眼
睛，定氣凝神，把遊離的思緒拉回來，重新回想那個會把「珊」寫成「姍」的蘇姍。那個蘇姍會
將一頭烏髮紮成一個馬尾辮，皮膚曬得黑黝黝的，

chapter -> paragraph -> sentence

EN:
 - chapter(rule based)
 - paragraph(rule based)
 - sentence(model based)

ZH:
 - chapter(download form)
 - paragraph(rule based)
 - sentence(model based)

In [None]:
# Intended to print the text and inspect the distance between consecutive lines.
# NOTE(review): `path` is not assigned in any cell visible here, and this cell only
# stores pdf.pages without printing anything — confirm whether it is still needed.
with pdfplumber.open(path) as pdf:
    texts = pdf.pages


In [None]:
# 分割paragraph，間隔採用paragraph_gap_threshold=10
def extract_chinese_by_spacing(pdf_path, paragraph_gap_threshold=10):
    """
    Split Chinese text into paragraphs using the vertical gap between lines.

    Args:
        pdf_path (str): Path of the PDF file.
        paragraph_gap_threshold (float):
            A line starts a new paragraph when the distance between its top
            and the bottom of the previous line on the same page is larger
            than this value. Raise it if paragraphs come out too fragmented,
            lower it if paragraphs are not being separated.

    Returns:
        list: Paragraph strings in reading order.
    """
    paragraphs = []
    pending = ""          # paragraph currently being assembled
    last_bottom = None    # bottom of the previous line on the current page

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # A slightly larger x_tolerance keeps the characters of one line glued together
            words = page.extract_words(x_tolerance=5, y_tolerance=3, keep_blank_chars=False)
            if not words:
                continue

            # Group words into rows: a word stays on the current row while its
            # top is within 5 of the previous word's top
            rows = [[words[0]]]
            for word in words[1:]:
                if abs(word['top'] - rows[-1][-1]['top']) < 5:
                    rows[-1].append(word)
                else:
                    rows.append([word])

            for row in rows:
                # Chinese text is joined without separators
                row_text = "".join(w['text'] for w in row)
                row_top = min(w['top'] for w in row)
                row_bottom = max(w['bottom'] for w in row)

                # No gap can be measured for the first row of a page, so it
                # always continues the pending paragraph
                starts_new = (
                    last_bottom is not None
                    and (row_top - last_bottom) > paragraph_gap_threshold
                )

                if starts_new:
                    if pending:
                        paragraphs.append(pending)
                    pending = row_text
                else:
                    pending += row_text

                last_bottom = row_bottom

            # Coordinates restart on every page: forget the last bottom so the
            # next page's first row is appended to the pending paragraph
            last_bottom = None

    # Flush the final paragraph
    if pending:
        paragraphs.append(pending)

    return paragraphs

# --- Usage example ---
# Adjust paragraph_gap_threshold and inspect the preview:
#   paragraphs too fragmented -> raise it (e.g. 15, 20)
#   paragraphs not separated  -> lower it (e.g. 5, 8)
pdf_file = "/content/drive/MyDrive/PAUL CLEAVE/trust_no_one_ZH/031_桑德拉死後的第二天_犯罪小說家_保羅克利夫_九九藏書.pdf"

logging.getLogger('pdfminer.pdffont').setLevel(logging.ERROR)

base_dir = "/content/drive/MyDrive/PAUL CLEAVE/trust_no_one_ZH"
# Only PDF files take part; other directory entries would break the numeric sort key
files = [f for f in os.listdir(base_dir) if f.lower().endswith('.pdf')]
files = list(sorted(files,key=lambda x:int(x.split('_')[0])))
for file_name in files:
    full_path = os.path.join(base_dir, file_name)
    paragraphs = extract_chinese_by_spacing(full_path, paragraph_gap_threshold=10)
    # Use the loop variable; `file` is a leftover from an earlier cell and is stale here
    print(f"<<{file_name}>>")
    for i, p in enumerate(paragraphs[:7]):
        print(f"段落 [{i}]: {p[:50]}...")
    print("="*50)

<<001_序_犯罪小說家_保羅克利夫_九九藏書.pdf>>
段落 [0]: 序犯罪小說家保羅克利夫九九藏書__·_...
段落 [1]: read.99csw.com/book/10377/374616.html...
段落 [2]: 99%...
段落 [3]: 序...
段落 [4]: 序...
段落 [5]: 獻給羅伯特女士，我最親愛的老師...
段落 [6]: 「魔鬼就潛伏在細節中。」傑瑞說。說這話時，傑瑞就是魔鬼，這些天所有細節都變幻莫測，很難臨摹和重繪。他...
<<002_第一天_犯罪小說家_保羅克利夫_九九藏書.pdf>>
段落 [0]: 第一天犯罪小說家保羅克利夫九九藏書__·_...
段落 [1]: read.99csw.com/book/10377/374617.html...
段落 [2]: 99%...
段落 [3]: 第一天...
段落 [4]: 第一天...
段落 [5]: 基本情況如下：今天是星期五。儘管還心有餘悸，但好歹你神志清醒。你的名字是傑瑞·格雷，此刻你正驚魂未定...
段落 [6]: 此時此刻，你的人生還沒有糟糕到失控的地步。是的，沒錯，你昨天丟了手機，上星期你丟了車，最近你甚至忘了...
<<003_第四天_犯罪小說家_保羅克利夫_九九藏書.pdf>>
段落 [0]: 第四天犯罪小說家保羅克利夫九九藏書__·_...
段落 [1]: read.99csw.com/book/10377/374618.html...
段落 [2]: 99%...
段落 [3]: 第四天...
段落 [4]: 第四天...
段落 [5]: 不，你沒有漏掉第二天和第三天，其實你對這兩天記憶猶新（比如你把咖啡放錯了地方，桑德拉在游泳池邊發現了...
段落 [6]: 伊娃周末過來了，並帶來一個重大消息：她要結婚了。你早就知道這一天總會來臨的，但是你仍然表現得十分驚喜...
<<004_第五天_犯罪小說家_保羅克利夫_九九藏書.pdf>>
段落 [0]: 第五天犯罪小說家保羅克利夫九九藏書__·_...
段落 [1]: read.99csw.com/book/10377/374619.html...
段落 [2]: 99%...
段落 [3]: 第五天...
段落 [4]: 第五天...
段落 [5]: 「我叫傑瑞·格雷，這是我被診斷出患有阿爾茨海默

__另外處理檔案(不提取內容，直接寫入檔案)__
> 022_搞砸婚禮的當天

- 檔名：022_搞砸婚禮的當天
- 內容："今天是星期天。現在是凌晨一點。這意味著搞砸婚禮的那天已經過去了,新的一天開始了。"

__所有檔案共同處理規則__
- 去除每頁結尾出現的頁碼"n/m"，n為當前頁數，m為總頁數

__各檔案開頭去除行數__

首頁去除開頭3個段落內容
> 015_第六十天<br>
> 017_婚禮前的第七天_<br>
> 024_搞砸婚禮當天的兩個小時后<br>
> 030_某一天<br>
> 032_天知道是哪一天<br>
> 035_第三十八天<br>
> 038_親愛的日記<br>

首頁去除開頭5個段落內容
> 036_不要相信漢斯亨利卡特的短篇小說

首頁去除開頭6個段落內容
> 001_序

*其餘檔案皆去除開頭5個段落內容

### 依照段落拆分中文檔案並保存成.txt(Run)

In [3]:
import pdfplumber
import os
import re

# ==========================================
# 1. 配置區域 (Configuration)
# ==========================================

# Paragraph gap threshold passed to extract_chinese_by_spacing_filtered (value already confirmed)
GAP_THRESHOLD = 10

# Files whose content is not extracted but written verbatim
# Key: first three characters of the file name, Value: (name fragment used to confirm the match, hard-coded content)
SPECIAL_CONTENT_RULES = {
    "022": ("022_搞砸婚禮的當天", "今天是星期天。現在是凌晨一點。這意味著搞砸婚禮的那天已經過去了,新的一天開始了。")
}

# Number of leading paragraphs to drop per file
# Key: first three characters of the file name, Value: number of paragraphs to drop
SKIP_PARAGRAPH_RULES = {
    # drop 6 paragraphs
    "001": 6,

    # drop 3 paragraphs
    "015": 3, "017": 3, "024": 3, "030": 3, "032": 3, "035": 3, "038": 3,

    # drop 5 paragraphs (same as the default, but listed explicitly for 036)
    "036": 5
}

# Default number of leading paragraphs to drop (all other files)
DEFAULT_SKIP_COUNT = 5

# ==========================================
# 2. 核心功能函數
# ==========================================

def extract_chinese_by_spacing_filtered(pdf_path, paragraph_gap_threshold=10):
    """
    Extract Chinese paragraphs from a PDF and drop page-number tokens ("n/m").

    A line starts a new paragraph when the distance between its top and the
    bottom of the previous line on the same page exceeds
    ``paragraph_gap_threshold``. The first line of every page continues the
    paragraph that was open at the end of the previous page.

    Args:
        pdf_path (str): Path of the PDF file.
        paragraph_gap_threshold (float): Minimum vertical gap that separates paragraphs.

    Returns:
        list: Paragraph strings in reading order.
    """
    # digits, a slash, digits — optional whitespace around the slash ("1/200", "1 / 200")
    page_number_re = re.compile(r'^\d+\s*/\s*\d+$')

    paragraphs = []
    pending = ""          # paragraph currently being assembled
    last_bottom = None    # bottom of the previous line on the current page

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            raw_words = page.extract_words(x_tolerance=5, y_tolerance=3, keep_blank_chars=False)

            # Drop every extracted word that, once stripped, is exactly a page number
            words = [w for w in raw_words if not page_number_re.match(w['text'].strip())]
            if not words:
                continue

            # Group words into rows: same row while the top differs by less than 5
            rows = [[words[0]]]
            for word in words[1:]:
                if abs(word['top'] - rows[-1][-1]['top']) < 5:
                    rows[-1].append(word)
                else:
                    rows.append([word])

            # Walk the rows and split on large vertical gaps
            for row in rows:
                row_text = "".join(w['text'] for w in row)
                row_top = min(w['top'] for w in row)
                row_bottom = max(w['bottom'] for w in row)

                starts_new = (
                    last_bottom is not None
                    and (row_top - last_bottom) > paragraph_gap_threshold
                )

                if starts_new:
                    if pending:
                        paragraphs.append(pending)
                    pending = row_text
                else:
                    pending += row_text

                last_bottom = row_bottom

            last_bottom = None  # reset at the page break

    if pending:
        paragraphs.append(pending)

    return paragraphs

def get_skip_count(filename):
    """Return how many leading paragraphs to drop, looked up by the first three characters of the file name (e.g. "015")."""
    return SKIP_PARAGRAPH_RULES.get(filename[:3], DEFAULT_SKIP_COUNT)

def process_all_files(input_folder=None, output_folder=None):
    """
    Convert every PDF in the input folder to a .txt file with one paragraph per line.

    For each PDF (processed in sorted file-name order):
      * If the 3-character prefix is in SPECIAL_CONTENT_RULES and the rule's
        name fragment occurs in the file name, the rule's hard-coded content
        is written instead of extracted text.
      * Otherwise paragraphs are extracted with
        extract_chinese_by_spacing_filtered() and the first
        get_skip_count(filename) paragraphs are dropped.

    Args:
        input_folder (str | None): Folder containing the PDFs. Defaults to the
            global INPUT_FOLDER when omitted.
        output_folder (str | None): Folder receiving the .txt files. Defaults
            to the global OUTPUT_FOLDER when omitted.

    Returns:
        None. Files are written and progress is printed as side effects.
    """
    # Fall back to the notebook-level globals so existing zero-argument calls keep working
    if input_folder is None:
        input_folder = INPUT_FOLDER
    if output_folder is None:
        output_folder = OUTPUT_FOLDER

    # Create the output folder (no error if it already exists)
    os.makedirs(output_folder, exist_ok=True)

    # Zero-padded numeric prefixes make the plain string sort follow chapter order
    pdf_files = sorted(f for f in os.listdir(input_folder) if f.lower().endswith('.pdf'))

    print(f"找到 {len(pdf_files)} 個 PDF 檔案，開始處理...\n")

    for filename in pdf_files:
        prefix = filename[:3]
        file_base_name = os.path.splitext(filename)[0] # strip ".pdf"
        output_path = os.path.join(output_folder, f"{file_base_name}.txt")
        pdf_path = os.path.join(input_folder, filename)

        special_rule = SPECIAL_CONTENT_RULES.get(prefix)

        # --- Rule A: special file, write the hard-coded content ---
        # The prefix must match AND the file name must contain the rule's name fragment
        if special_rule is not None and special_rule[0] in filename:
            paragraphs_to_write = [special_rule[1]]
            print(f"[{filename}] -> 特殊檔案，寫入指定內容")

        # --- Rule B: regular file, extract and trim ---
        else:
            if special_rule is not None:
                # Prefix is special but the name does not match: treat as a regular file
                print(f"[{filename}] -> 編號特殊但檔名不匹配，走一般流程")

            # 1. Extract paragraphs (page numbers already removed)
            raw_paragraphs = extract_chinese_by_spacing_filtered(pdf_path, GAP_THRESHOLD)

            # 2. Decide how many leading paragraphs to drop
            skip_n = get_skip_count(filename)

            # 3. Drop them; the slice is empty when nothing would remain
            paragraphs_to_write = raw_paragraphs[skip_n:]
            if not paragraphs_to_write:
                print(f"Warning: [{filename}] 段落數 ({len(raw_paragraphs)}) 少於需去除數 ({skip_n})")

            print(f"[{filename}] -> 去除前 {skip_n} 段 (原始 {len(raw_paragraphs)} -> 剩餘 {len(paragraphs_to_write)})")

        # --- Write the file (one paragraph per line) ---
        with open(output_path, "w", encoding="utf-8") as f:
            f.writelines(p + "\n" for p in paragraphs_to_write)

# ==========================================
# 3. 執行
# ==========================================

if __name__ == "__main__":
    # INPUT_FOLDER must point at the folder that holds the chapter PDFs

    # Input and output folders
    INPUT_FOLDER = "/content/drive/MyDrive/PAUL CLEAVE/trust_no_one_ZH"
    OUTPUT_FOLDER = "output_texts_ZH" # name of the folder that receives the .txt files

    if os.path.exists(INPUT_FOLDER):
        process_all_files()
        print("\n所有檔案處理完成！")
    else:
        print(f"錯誤: 找不到輸入資料夾 '{INPUT_FOLDER}'")

找到 38 個 PDF 檔案，開始處理...





[001_序_犯罪小說家_保羅克利夫_九九藏書.pdf] -> 去除前 6 段 (原始 82 -> 剩餘 76)




[002_第一天_犯罪小說家_保羅克利夫_九九藏書.pdf] -> 去除前 5 段 (原始 74 -> 剩餘 69)




[003_第四天_犯罪小說家_保羅克利夫_九九藏書.pdf] -> 去除前 5 段 (原始 48 -> 剩餘 43)




[004_第五天_犯罪小說家_保羅克利夫_九九藏書.pdf] -> 去除前 5 段 (原始 69 -> 剩餘 64)




[005_第十天_犯罪小說家_保羅克利夫_九九藏書.pdf] -> 去除前 5 段 (原始 67 -> 剩餘 62)




[006_第十五天_犯罪小說家_保羅克利夫_九九藏書.pdf] -> 去除前 5 段 (原始 95 -> 剩餘 90)




[007_第二十天_犯罪小說家_保羅克利夫_九九藏書.pdf] -> 去除前 5 段 (原始 67 -> 剩餘 62)




[008_第三十一天_犯罪小說家_保羅克利夫_九九藏書.pdf] -> 去除前 5 段 (原始 156 -> 剩餘 151)




[009_第三十天_犯罪小說家_保羅克利夫_九九藏書.pdf] -> 去除前 5 段 (原始 62 -> 剩餘 57)




[010_第四十天_犯罪小說家_保羅克利夫_九九藏書.pdf] -> 去除前 5 段 (原始 149 -> 剩餘 144)




[011_第五十天_犯罪小說家_保羅克利夫_九九藏書.pdf] -> 去除前 5 段 (原始 104 -> 剩餘 99)




[012_第五十一天_犯罪小說家_保羅克利夫_九九藏書.pdf] -> 去除前 5 段 (原始 77 -> 剩餘 72)




[013_第五十三天_犯罪小說家_保羅克利夫_九九藏書.pdf] -> 去除前 5 段 (原始 140 -> 剩餘 135)




[014_第五十四天_犯罪小說家_保羅克利夫_九九藏書.pdf] -> 去除前 5 段 (原始 65 -> 剩餘 60)




[015_第六十天_犯罪小說家_保羅克利夫_九九藏書.pdf] -> 去除前 3 段 (原始 5 -> 剩餘 2)




[016_誰他媽知道是第幾天_犯罪小說家_保羅克利夫_九九藏書.pdf] -> 去除前 5 段 (原始 106 -> 剩餘 101)




[017_婚禮前的第七天_犯罪小說家_保羅克利夫_九九藏書.pdf] -> 去除前 3 段 (原始 28 -> 剩餘 25)




[018_婚禮前第五天_犯罪小說家_保羅克利夫_九九藏書.pdf] -> 去除前 5 段 (原始 62 -> 剩餘 57)




[019_搞砸婚禮前第三天_犯罪小說家_保羅克利夫_九九藏書.pdf] -> 去除前 5 段 (原始 85 -> 剩餘 80)




[020_婚禮的倒數第二天_犯罪小說家_保羅克利夫_九九藏書.pdf] -> 去除前 5 段 (原始 139 -> 剩餘 134)




[021_搞砸婚禮的當天_犯罪小說家_保羅克利夫_九九藏書.pdf] -> 去除前 5 段 (原始 144 -> 剩餘 139)
[022_搞砸婚禮的當天_犯罪小說家_保羅·克利夫_九九藏書.pdf] -> 特殊檔案，寫入指定內容




[023_搞砸婚禮當天的一個小時后_犯罪小說家_保羅克利夫_九九藏書.pdf] -> 去除前 5 段 (原始 105 -> 剩餘 100)




[024_搞砸婚禮當天的兩個小時后_犯罪小說家_保羅克利夫_九九藏書.pdf] -> 去除前 3 段 (原始 41 -> 剩餘 38)




[025_搞砸婚禮后的第一天_犯罪小說家_保羅克利夫_九九藏書.pdf] -> 去除前 5 段 (原始 130 -> 剩餘 125)




[026_傑瑞死了_犯罪小說家_保羅克利夫_九九藏書.pdf] -> 去除前 5 段 (原始 71 -> 剩餘 66)




[027_最後一天_犯罪小說家_保羅克利夫_九九藏書.pdf] -> 去除前 5 段 (原始 95 -> 剩餘 90)




[028_苟延殘喘_犯罪小說家_保羅克利夫_九九藏書.pdf] -> 去除前 5 段 (原始 147 -> 剩餘 142)




[029_我不知道我不知道_犯罪小說家_保羅克利夫_九九藏書.pdf] -> 去除前 5 段 (原始 112 -> 剩餘 107)




[030_某一天_犯罪小說家_保羅克利夫_九九藏書.pdf] -> 去除前 3 段 (原始 54 -> 剩餘 51)




[031_桑德拉死後的第二天_犯罪小說家_保羅克利夫_九九藏書.pdf] -> 去除前 5 段 (原始 105 -> 剩餘 100)




[032_天知道是哪一天_犯罪小說家_保羅克利夫_九九藏書.pdf] -> 去除前 3 段 (原始 25 -> 剩餘 22)




[033_又是不知道的哪一天的一天_犯罪小說家_保羅克利夫_九九藏書.pdf] -> 去除前 5 段 (原始 143 -> 剩餘 138)




[034_第一百萬天_犯罪小說家_保羅克利夫_九九藏書.pdf] -> 去除前 5 段 (原始 152 -> 剩餘 147)




[035_第三十八天_犯罪小說家_保羅克利夫_九九藏書.pdf] -> 去除前 3 段 (原始 35 -> 剩餘 32)




[036_不要相信漢斯亨利卡特的短篇小說_犯罪小說家_保羅克利夫_九九藏書.pdf] -> 去除前 5 段 (原始 103 -> 剩餘 98)




[037_我的懺悔書_犯罪小說家_保羅克利夫_九九藏書.pdf] -> 去除前 5 段 (原始 64 -> 剩餘 59)




[038_親愛的日記_犯罪小說家_保羅克利夫_九九藏書.pdf] -> 去除前 3 段 (原始 11 -> 剩餘 8)

所有檔案處理完成！


## alignment EN and ZH texts

### hierarchical processing: align paragraph -> align sentence(Run)

In [4]:
# install library and download models
!pip install spacy sentence_transformers -q
!python -m spacy download zh_core_web_sm

Collecting zh-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.8.0/zh_core_web_sm-3.8.0-py3-none-any.whl (48.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting spacy-pkuseg<2.0.0,>=1.0.0 (from zh-core-web-sm==3.8.0)
  Downloading spacy_pkuseg-1.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading spacy_pkuseg-1.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.1/4.1 MB[0m [31m43.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: spacy-pkuseg, zh-core-web-sm
Successfully installed spacy-pkuseg-1.0.1 zh-core-web-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('zh_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a 

In [5]:
import torch
import spacy

# --- Change A: select the compute device ---
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Running on: {device}")

# --- Change B: ask spaCy to use the GPU (must run before the models are loaded) ---
if device == "cuda":
    spacy.prefer_gpu()

# Load the spaCy pipelines
# NOTE(review): the install cell only downloads zh_core_web_sm; presumably en_core_web_sm
# is already present in the runtime — confirm
print("Loading SpaCy...")
nlp_en = spacy.load("en_core_web_sm") # or the trf variant
nlp_zh = spacy.load("zh_core_web_sm") # or the trf variant

Running on: cuda
Loading SpaCy...


function_GPU version: align_sentences_extended_gpu()(Run)

In [6]:
from sentence_transformers import SentenceTransformer, util

# Load the LaBSE sentence-embedding model and move it to the selected device
print("Loading LaBSE...")
model_labse = SentenceTransformer('sentence-transformers/LaBSE')
model_labse.to(device) # key step: moves the model to `device` ("cuda" when available)

def align_sentences_extended_gpu(en_sentences, zh_sentences, threshold=0.60, max_merge_window=4):
    """
    Greedily align English and Chinese sentences using LaBSE cosine similarity.

    At each position (i, j) the function scores:
      * 1:k  - one English sentence against k concatenated Chinese sentences,
      * k:1  - k joined English sentences against one Chinese sentence (k > 1),
      * swap - the two crossed 1:1 pairs (i, j+1) and (i+1, j),
    for k from 1 to max_merge_window, and keeps the highest-scoring candidate.
    If that score is below ``threshold`` nothing is emitted for this step and
    one or both pointers are advanced instead.

    Args:
        en_sentences (list[str]): English sentences in order.
        zh_sentences (list[str]): Chinese sentences in order.
        threshold (float): Minimum similarity for a candidate to be accepted.
        max_merge_window (int): Largest number of sentences merged on one side.

    Returns:
        list[dict]: One dict per aligned pair with keys "en", "zh", "type"
        ("1:k", "k:1", "swap_1" or "swap_2") and "score".
    """
    aligned_pairs = []

    # --- Change D: pre-compute single-sentence embeddings as tensors ---
    # convert_to_tensor=True makes encode() return a tensor; device=device selects where encoding runs
    en_embeddings = model_labse.encode(en_sentences, convert_to_tensor=True, device=device)
    zh_embeddings = model_labse.encode(zh_sentences, convert_to_tensor=True, device=device)

    i = 0
    j = 0

    while i < len(en_sentences) and j < len(zh_sentences):
        candidates = []

        # --- A. Try merges (loop over window size k) ---
        for k in range(1, max_merge_window + 1):

            # 1:k (one English sentence vs. k merged Chinese sentences)
            if j + k <= len(zh_sentences):
                combined_zh_text = "".join(zh_sentences[j : j+k])

                # --- Change E: encode the merged text on the same device ---
                emb_comb_zh = model_labse.encode(combined_zh_text, convert_to_tensor=True, device=device, show_progress_bar=False)

                # Cosine similarity between the two embeddings
                sim = util.cos_sim(en_embeddings[i], emb_comb_zh).item() # item() yields a plain Python number for the comparisons below

                candidates.append({
                    "score": sim, "type": f"1:{k}", "i_step": 1, "j_step": k,
                    "en_text": en_sentences[i], "zh_text": combined_zh_text
                })

            # k:1 (k merged English sentences vs. one Chinese sentence); k == 1 is already covered by 1:1 above
            if k > 1 and i + k <= len(en_sentences):
                combined_en_text = " ".join(en_sentences[i : i+k])

                # --- Change E (same as above) ---
                emb_comb_en = model_labse.encode(combined_en_text, convert_to_tensor=True, device=device, show_progress_bar=False)

                sim = util.cos_sim(emb_comb_en, zh_embeddings[j]).item()

                candidates.append({
                    "score": sim, "type": f"{k}:1", "i_step": k, "j_step": 1,
                    "en_text": combined_en_text, "zh_text": zh_sentences[j]
                })

        # --- B. Try a swap (crossed order) ---
        if i + 1 < len(en_sentences) and j + 1 < len(zh_sentences):
            # Similarities of the two crossed pairs, from the pre-computed embeddings
            s1 = util.cos_sim(en_embeddings[i], zh_embeddings[j+1]).item()
            s2 = util.cos_sim(en_embeddings[i+1], zh_embeddings[j]).item()

            avg_score = (s1 + s2) / 2

            # Both crossed pairs must clear a slightly relaxed threshold; the candidate is ranked by their mean
            if min(s1, s2) > threshold - 0.1:
                candidates.append({
                    "score": avg_score, "type": "swap", "i_step": 2, "j_step": 2,
                    "swap_data": [(en_sentences[i], zh_sentences[j+1], s1),
                                  (en_sentences[i+1], zh_sentences[j], s2)]
                })

        # --- C. Decision ---
        if not candidates: break
        best_candidate = max(candidates, key=lambda x: x['score'])

        if best_candidate['score'] < threshold:
            # Lookahead: nothing aligns well here, so decide which side to skip
            skip_zh_score = 0
            if j + 1 < len(zh_sentences):
                skip_zh_score = util.cos_sim(en_embeddings[i], zh_embeddings[j+1]).item()

            skip_en_score = 0
            if i + 1 < len(en_sentences):
                skip_en_score = util.cos_sim(en_embeddings[i+1], zh_embeddings[j]).item()

            # Skip the Chinese sentence if the next one matches, else the English one, else both
            if skip_zh_score > threshold: j += 1
            elif skip_en_score > threshold: i += 1
            else: i += 1; j += 1
            continue

        if best_candidate['type'] == 'swap':
            # A swap produces two output pairs, each with its own similarity score
            d1, d2 = best_candidate['swap_data']
            aligned_pairs.append({"en": d1[0], "zh": d1[1], "type": "swap_1", "score": d1[2]})
            aligned_pairs.append({"en": d2[0], "zh": d2[1], "type": "swap_2", "score": d2[2]})
        else:
            aligned_pairs.append({
                "en": best_candidate['en_text'],
                "zh": best_candidate['zh_text'],
                "type": best_candidate['type'],
                "score": best_candidate['score']
            })

        # Advance both pointers by the number of sentences the chosen candidate consumed
        i += best_candidate['i_step']
        j += best_candidate['j_step']

    return aligned_pairs

Loading LaBSE...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

2_Dense/model.safetensors:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

function_non GPU version: align_sentences_extended()

In [None]:
def align_sentences_extended(en_sentences, zh_sentences, threshold=0.60, max_merge_window=4):
    """
    Greedily align English sentences with Chinese sentences by embedding similarity.

    Two cursors (``i`` into ``en_sentences``, ``j`` into ``zh_sentences``) walk
    forward. At every position the following candidates are scored with cosine
    similarity and the highest-scoring one wins:

    * ``1:k``  - one English sentence against ``k`` consecutive Chinese
      sentences (``k`` = 1 .. ``max_merge_window``), concatenated with no
      separator.
    * ``k:1``  - ``k`` consecutive English sentences (``k`` = 2 ..
      ``max_merge_window``), joined with a single space, against one Chinese
      sentence.
    * ``swap`` - a crossed 2x2 pairing (EN[i] with ZH[j+1], EN[i+1] with ZH[j]),
      offered only when both crossed similarities exceed ``threshold - 0.1``;
      its score is the mean of the two.

    When the best candidate scores below ``threshold`` nothing is emitted and a
    one-step lookahead decides which cursor to advance (see inline comments).

    Relies on the notebook-level globals ``model`` (must provide
    ``encode(..., convert_to_tensor=True)``) and ``util`` (must provide
    ``cos_sim``) -- presumably the LaBSE SentenceTransformer and
    ``sentence_transformers.util`` loaded in an earlier cell; verify there.

    Args:
        en_sentences: Sequence of English strings.
        zh_sentences: Sequence of Chinese strings.
        threshold: Minimum similarity for a candidate to be accepted.
        max_merge_window: Largest ``k`` tried for 1:k and k:1 merges.

    Returns:
        A list of dicts with keys ``"en"``, ``"zh"``, ``"type"`` and
        ``"score"``. ``"type"`` is ``"1:k"``, ``"k:1"``, ``"swap_1"`` or
        ``"swap_2"``. Sentences remaining once either side is exhausted are
        not emitted. Returns ``[]`` when either input is empty.
    """
    # Nothing can be aligned when one side is empty; return before asking the
    # model to encode an empty batch.
    if len(en_sentences) == 0 or len(zh_sentences) == 0:
        return []

    aligned_pairs = []

    # Encode every single sentence once up front. Merged texts do not exist
    # yet, so they are encoded on demand inside the loop.
    en_embeddings = model.encode(en_sentences, convert_to_tensor=True)
    zh_embeddings = model.encode(zh_sentences, convert_to_tensor=True)

    i = 0
    j = 0

    while i < len(en_sentences) and j < len(zh_sentences):
        # Each candidate is a dict: score, type, i_step, j_step plus the texts
        # (or "swap_data" for the swap candidate).
        candidates = []

        # --- A. Merge candidates (1:k and k:1) ---
        for k in range(1, max_merge_window + 1):

            # Case 1: one English sentence vs. k Chinese sentences (1:k).
            if j + k <= len(zh_sentences):
                # Chinese sentences are concatenated without a separator.
                combined_zh_text = "".join(zh_sentences[j : j+k])
                if k == 1:
                    # The 1:1 case is just zh_sentences[j]; reuse the
                    # precomputed embedding instead of encoding it again.
                    emb_comb_zh = zh_embeddings[j]
                else:
                    emb_comb_zh = model.encode(combined_zh_text, convert_to_tensor=True)

                sim = util.cos_sim(en_embeddings[i], emb_comb_zh).item()
                candidates.append({
                    "score": sim,
                    "type": f"1:{k}",
                    "i_step": 1,
                    "j_step": k,
                    "en_text": en_sentences[i],
                    "zh_text": combined_zh_text
                })

            # Case 2: k English sentences vs. one Chinese sentence (k:1).
            # k == 1 is skipped here because it is already covered by 1:1 above.
            if k > 1 and i + k <= len(en_sentences):
                combined_en_text = " ".join(en_sentences[i : i+k])
                emb_comb_en = model.encode(combined_en_text, convert_to_tensor=True)

                sim = util.cos_sim(emb_comb_en, zh_embeddings[j]).item()
                candidates.append({
                    "score": sim,
                    "type": f"{k}:1",
                    "i_step": k,
                    "j_step": 1,
                    "en_text": combined_en_text,
                    "zh_text": zh_sentences[j]
                })

        # --- B. Crossed-order (swap) candidate ---
        # Only a 2x2 exchange is checked: EN[i]->ZH[j+1] and EN[i+1]->ZH[j].
        if i + 1 < len(en_sentences) and j + 1 < len(zh_sentences):
            s1 = util.cos_sim(en_embeddings[i], zh_embeddings[j+1]).item()
            s2 = util.cos_sim(en_embeddings[i+1], zh_embeddings[j]).item()

            avg_score = (s1 + s2) / 2

            # Require BOTH crossed scores to be decent so that one very high
            # and one very low score cannot pass on their average alone.
            if min(s1, s2) > threshold - 0.1:
                candidates.append({
                    "score": avg_score,
                    "type": "swap",
                    "i_step": 2,
                    "j_step": 2,
                    # A swap is emitted as two separate output pairs.
                    "swap_data": [
                        (en_sentences[i], zh_sentences[j+1], s1),
                        (en_sentences[i+1], zh_sentences[j], s2)
                    ]
                })

        # --- C. Decision ---
        if not candidates:
            # Only possible when max_merge_window < 1 and no swap qualified.
            break

        best_candidate = max(candidates, key=lambda x: x['score'])

        if best_candidate['score'] < threshold:
            # No acceptable match at (i, j): assume one side has an extra
            # sentence and use a one-step lookahead to decide which one.
            skip_zh_score = 0
            if j + 1 < len(zh_sentences):
                skip_zh_score = util.cos_sim(en_embeddings[i], zh_embeddings[j+1]).item()

            skip_en_score = 0
            if i + 1 < len(en_sentences):
                skip_en_score = util.cos_sim(en_embeddings[i+1], zh_embeddings[j]).item()

            if skip_zh_score > threshold:
                j += 1  # Chinese has an extra sentence; skip it.
            elif skip_en_score > threshold:
                i += 1  # English has an extra sentence; skip it.
            else:
                # Neither lookahead matches: drop one sentence from each side
                # so the loop always makes progress.
                i += 1
                j += 1
            continue

        # Emit the winning candidate.
        if best_candidate['type'] == 'swap':
            d1, d2 = best_candidate['swap_data']
            aligned_pairs.append({"en": d1[0], "zh": d1[1], "type": "swap_1", "score": d1[2]})
            aligned_pairs.append({"en": d2[0], "zh": d2[1], "type": "swap_2", "score": d2[2]})
        else:
            # Merge result (1:1, 1:k or k:1).
            aligned_pairs.append({
                "en": best_candidate['en_text'],
                "zh": best_candidate['zh_text'],
                "type": best_candidate['type'],
                "score": best_candidate['score']
            })

        # Advance both cursors past the sentences that were consumed.
        i += best_candidate['i_step']
        j += best_candidate['j_step']

    return aligned_pairs

process(Run)

In [8]:
# run with GPU
def split_sentences_spacy(text, lang='en'):
    """
    Split ``text`` into sentences using a notebook-level pipeline.

    ``lang == 'en'`` selects the global ``nlp_en``; any other value selects
    the global ``nlp_zh`` (presumably spaCy pipelines loaded in an earlier
    cell -- verify there).

    Returns a list of whitespace-stripped sentence strings. Sentences whose
    stripped text is one character or shorter are dropped. Empty, ``None`` or
    whitespace-only ``text`` yields ``[]`` without invoking a pipeline.
    """
    if not (text and text.strip()):
        return []

    pipeline = nlp_en if lang == 'en' else nlp_zh

    sentences = []
    for span in pipeline(text).sents:
        candidate = span.text.strip()
        # Keep only sentences longer than a single character.
        if len(candidate) > 1:
            sentences.append(candidate)
    return sentences

def process_chapter_alignment(en_chapter_path, zh_chapter_path, output_path):
    """
    Align one chapter hierarchically: paragraphs first, then sentences.

    Both chapter files are read as UTF-8 with one paragraph per line (blank
    lines ignored). Paragraphs are aligned with
    ``align_sentences_extended_gpu`` (threshold 0.50, merge window 3). Each
    aligned paragraph pair is split into sentences with
    ``split_sentences_spacy`` and aligned again (threshold 0.65, merge
    window 4). Every resulting sentence pair gets a ``source_para_score``
    field holding the score of the paragraph pair it came from, and all pairs
    are written to ``output_path`` as JSON Lines (UTF-8, non-ASCII kept as is).

    Progress is reported with ``print``. Returns ``None``.
    """
    import json

    def _load_paragraphs(path):
        # One paragraph per line; lines that are empty after stripping are dropped.
        with open(path, 'r', encoding='utf-8') as handle:
            stripped_lines = (raw.strip() for raw in handle)
            return [text for text in stripped_lines if text]

    en_paragraphs = _load_paragraphs(en_chapter_path)
    zh_paragraphs = _load_paragraphs(zh_chapter_path)

    print(f"Loaded: {len(en_paragraphs)} EN paragraphs, {len(zh_paragraphs)} ZH paragraphs.")

    # ---- Stage 1: paragraph-level alignment ----
    # The sentence aligner is reused on paragraphs, with a lower threshold and
    # a smaller merge window than the sentence stage.
    print("Stage 1: Aligning Paragraphs...")
    aligned_paragraphs = align_sentences_extended_gpu(
        en_paragraphs, zh_paragraphs, threshold=0.50, max_merge_window=3
    )
    print(f"Paragraph alignment done. Found {len(aligned_paragraphs)} pairs.")

    # ---- Stage 2: sentence-level alignment inside each paragraph pair ----
    print("Stage 2: Aligning Sentences within Paragraphs...")
    final_sentence_pairs = []

    for paragraph in aligned_paragraphs:
        en_text, zh_text = paragraph['en'], paragraph['zh']

        en_sents = split_sentences_spacy(en_text, 'en')
        zh_sents = split_sentences_spacy(zh_text, 'zh')

        # Only align when both sides produced at least one sentence.
        if en_sents and zh_sents:
            sentence_pairs = align_sentences_extended_gpu(
                en_sents, zh_sents, threshold=0.65, max_merge_window=4
            )
            for sentence_pair in sentence_pairs:
                # Keep the paragraph-level score as metadata for debugging.
                sentence_pair['source_para_score'] = paragraph['score']
                final_sentence_pairs.append(sentence_pair)

    # ---- Output ----
    print(f"Total sentence pairs aligned: {len(final_sentence_pairs)}")

    with open(output_path, 'w', encoding='utf-8') as sink:
        for record in final_sentence_pairs:
            json.dump(record, sink, ensure_ascii=False)
            sink.write('\n')

# Align chapter 1: English and Chinese chapter text files in, sentence pairs
# out as JSON Lines in the current working directory.
en_chapter_path = r'/content/output_text_EN/001_To Miss Roberts—my favorite teacher. Bees.txt'
zh_chapter_path = r'/content/output_texts_ZH/001_序_犯罪小說家_保羅克利夫_九九藏書.txt'
process_chapter_alignment(
    en_chapter_path=en_chapter_path,
    zh_chapter_path=zh_chapter_path,
    output_path='aligned_ch1.jsonl',
)

Loaded: 77 EN paragraphs, 76 ZH paragraphs.
Stage 1: Aligning Paragraphs...
Paragraph alignment done. Found 72 pairs.
Stage 2: Aligning Sentences within Paragraphs...
Total sentence pairs aligned: 177


### 打印'aligned_ch1.jsonl'

In [9]:
# Echo the aligned sentence pairs, one JSON record per output line.
with open('aligned_ch1.jsonl', 'r', encoding='utf-8') as aligned_file:
    for record_line in map(str.strip, aligned_file):
        print(record_line)

{"en": "“The devil is in the details,” Jerry says, and back then the devil was him, and these days those details are hard to hang on to.", "zh": "「魔鬼就潛伏在細節中。」傑瑞說。說這話時，傑瑞就是魔鬼，這些天所有細節都變幻莫測，很難臨摹和重繪。", "type": "1:3", "score": 0.7246894836425781, "source_para_score": 0.8459680080413818}
{"en": "He can remember the woman’s face, the way her mouth opened when all she could manage was an oh.", "zh": "他能回想起那個女人的臉，她張開嘴巴，卻只能發出一聲「噢」。", "type": "1:1", "score": 0.7349810600280762, "source_para_score": 0.8459680080413818}
{"en": "Of course people never know what they’re going to say when their time is up.", "zh": "當然，臨終之前，人們永遠不知道該說什麼。", "type": "1:1", "score": 0.7495071887969971, "source_para_score": 0.8459680080413818}
{"en": "Oscar Wilde said something about curtains when he was on his deathbed, about how ugly they were and either they must go or he would.", "zh": "據說奧斯卡·王爾德在辭世之前盯著床前的窗帘，他說那些窗帘是多麼醜陋，他正在和它們決一死戰，只有其中的一個才能走，另一個留下。", "type": "1:1", "score": 0.6834971308708191, "source_para_score": 0.845