<a href="https://colab.research.google.com/github/waynelee9511cloud/my-colab-notebooks/blob/main/BioMistral_7B_expert_mode_V0_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1st GitHub test

In [None]:
#@title 0,1：安裝與import(安裝環境) [2 只print]
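# ===========================
# BioMistral 7B xlsx validation - one-click script (Colab A100)
# v0.2: first complete development version for the BioMistral 7B model
# v0.3: start fine-tuning
# ===========================
# 0) Installation and environment fixes
# - Remove RAPIDS first (a "not installed" message can be ignored)
# - Pin pandas==2.2.2 (compatible with Colab / RAPIDS)
# - Install the remaining dependencies afterwards, so pandas is not upgraded to 2.3.x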
!pip -q uninstall -y cudf-cu12 dask-cudf-cu12 || true
!pip -q install -U "pandas==2.2.2"
!pip -q install -U "transformers>=4.50.0" accelerate huggingface_hub openpyxl pillow einops safetensors tqdm trl peft datasets

import os, json, re, time
import pandas as pd
import torch
from tqdm.auto import tqdm
from huggingface_hub import login
from transformers import AutoProcessor, AutoModelForCausalLM, AutoTokenizer

# ===========================
# 1) Basic settings
# ===========================
MODEL_ID = "BioMistral/BioMistral-7B"  # BioMistral 7B model
#MODEL_ID = "WayneLee9511/SFT-BMS-7B-1003-1529-v02"  # Fine tune model: BioMistral_sft_out/final
EXCEL_PATH = None  # Path to the xlsx file as a string; None means the file is chosen through the upload dialog
# NOTE(review): OUTPUT_CSV is not read by any visible cell; the result files
# are written under hard-coded "validation_results__*.csv" names.
OUTPUT_CSV = "validation_results.csv"
MAX_ROWS = None  # e.g. 500 to cap the rows taken from each worksheet, or None to process everything
#DO_FINETUNE = True  # Set to True to fine-tune (training data must be provided/generated)
DO_FINETUNE = False  # Set to True to fine-tune (training data must be provided/generated)


# Per-worksheet prompt settings, keyed by worksheet name. Each row of the
# table below is: (worksheet, target column, error label, worked misspelling
# example, correctly spelled example value).
PROMPT_CONFIG = {
    sheet: {
        "target_column": column,
        "error_type": label,
        "example_error": bad_example,
        "example_correct": good_example,
    }
    for sheet, column, label, bad_example, good_example in (
        ("MH", "[Diagnosis/ Conditions]", "Misspelled diagnosis name",
         "[Mypia] should be [Myopia].", "Hypertension"),
        ("CM", "[Generic/Brand Name]", "Misspelled drug name",
         "[Lisnopril] should be [Lisinopril].", "Aspirin 100mg"),
        ("AE", "[Event]", "Misspelled event name",
         "[Headchae] should be [Headache].", "Nausea"),
    )
}



# Generic system prompt covering all three target fields.
# It is a tuple of text fragments, not a single string: every comma ends one
# element, while adjacent string literals without a comma (the JSON examples)
# are concatenated into one element. The text itself is English despite the
# _ZH suffix. Section 6 builds its own per-worksheet prompt instead.
SYSTEM_PROMPT_ZH = (
    "You are a specialized tool for identifying spelling errors in specific medical data fields.",
    "Your ONLY task is to check for **actual spelling errors** in the values of the following three fields: '[Diagnosis/ Conditions]', '[Event]', and '[Generic/Brand Name]'.",
    "You MUST IGNORE all other fields, such as IDs, dates, flags, sequence numbers, or any other columns. Do not report any issues for these other fields.",

    # --- Core rules: tell the model what is NOT an error ---
    "You MUST IGNORE the following types of variations, as they are NOT considered errors:",
    "- **Capitalization**: Differences in upper or lower case are not errors (e.g., 'Hypertension' and 'hypertension' are both correct).",
    "- **Spacing**: Missing or extra spaces between words and numbers are not errors (e.g., 'Aspirin 100mg' and 'Aspirin100mg' are both correct).",
    "- **Plurals**: Singular and plural forms are not errors (e.g., 'headache' and 'headaches' are both correct).",
    "- **Regional Spelling**: Both American and British English spellings are acceptable. Do not correct one to the other (e.g., 'oestrogen' vs 'estrogen'). Focus only on actual misspellings.",
    "- **Abbreviations**: Do not report abbreviations as errors. It is acceptable to use 'MI' instead of 'Myocardial Infarction'.",
    "- **Drug Names**: Both generic and brand/trade names are acceptable (e.g., 'Paracetamol' vs 'Tylenol').",
    # ----------------------------------------------------

    "If you find a genuine spelling error, your reason MUST be specific and follow the format from the examples below. If there are no spelling errors in the specified fields after considering the rules above, your summary must be 'No issues found.'.",
    "Your entire response MUST be a single, valid JSON object and nothing else.",

    # --- Example 1: misspelled diagnosis name ---
    "Example Input Data:",
    "[Diagnosis/ Conditions]: Mypia",

    "Example JSON Output:",
    '{\n'
    '  "row_index": 0,\n'
    '  "issues": [\n'
    '    {\n'
    '      "field": "[Diagnosis/ Conditions]",\n'
    '      "reason": "Misspelled diagnosis name: [Mypia] should be [Myopia].",\n'
    '      "severity": "error"\n'
    '    }\n'
    '  ],\n'
    '  "summary": "Misspelled diagnosis name: [Mypia] should be [Myopia]."\n'
    '}',

    # --- Example 2: misspelled drug name ---
    "Example Input Data:",
    "[Generic/Brand Name]: Lisnopril 10mg",

    "Example JSON Output:",
    '{\n'
    '  "row_index": 1,\n'
    '  "issues": [\n'
    '    {\n'
    '      "field": "[Generic/Brand Name]",\n'
    '      "reason": "Misspelled drug name: [Lisnopril] should be [Lisinopril].",\n'
    '      "severity": "error"\n'
    '    }\n'
    '  ],\n'
    '  "summary": "Misspelled drug name: [Lisnopril] should be [Lisinopril]."\n'
    '}',

    # --- Example 3: data with no spelling problem ---
    "Example Input Data:",
    "[Diagnosis/ Conditions]: Hypertension",

    "Example JSON Output:",
    '{\n'
    '  "row_index": 2,\n'
    '  "issues": [],\n'
    '  "summary": "No issues found."\n'
    '}'
)

# ===========================
# 2) Skip HF login for now (BioMistral may not require authentication)
# ===========================
# Printed unconditionally when the cell reaches this line; it does not itself
# inspect the result of the pip commands above.
print("Packages installed successfully. Ready to load model.")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.6/564.6 kB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m506.3/506.3 kB[0m [31m44.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m65.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pylibcudf-cu12 25.6.0 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 21.0.0 which is incompatible.[0m[31m
[0mPackages installed successfully. Ready to load model.


In [None]:
#@title  3：掛載drive與模型

# ===========================
# 3) Load the model and tokenizer (bf16 on GPU, automatic device placement)
# - Uses `dtype`, which replaces the deprecated `torch_dtype` argument
# ===========================
from google.colab import drive
drive.mount('/content/drive')

import os

# Keep downloaded weights on Drive so they survive a runtime reset.
CACHE_DIR = '/content/drive/MyDrive/LLM cache3'
# transformers / huggingface_hub were already imported by the first cell, and
# they read these variables at import time, so setting them here is not
# guaranteed to move the cache. They are kept for any process started later;
# the cache location is enforced through the explicit cache_dir below.
os.environ['HF_HOME'] = CACHE_DIR
os.environ['TRANSFORMERS_CACHE'] = CACHE_DIR


MODEL_ID = "BioMistral/BioMistral-7B"
#MODEL_ID = "WayneLee9511/SFT-BMS-7B-1003-1529-v02"
# bf16 when a GPU is present, fp32 otherwise. The name is kept because other
# cells read this module-level variable.
torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    dtype=torch_dtype,
    device_map="auto",
    cache_dir=CACHE_DIR,
)

# Load the tokenizer and make sure a padding token exists (needed for
# batched tokenization with padding=True).
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# The rest of the notebook refers to the tokenizer as `processor`.
processor = tokenizer
model.eval()

Mounted at /content/drive


config.json:   0%|          | 0.00/567 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


pytorch_model.bin:   0%|          | 0.00/14.5G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): MistralRMSNorm((4096,)

In [None]:
#@title  4：上傳/讀excel。  5,6：資料前處理與模型匯入、推論。 (如果開微調)7重新訓練

# ===========================
# 4) Read the Excel workbook (all worksheets in one call)
# ===========================
from google.colab import files

# Honour a preconfigured EXCEL_PATH; open the upload dialog only when no path
# has been set. EXCEL_PATH itself is left untouched so that re-running this
# cell with EXCEL_PATH = None asks for a file again.
if EXCEL_PATH is None:
    print("請上傳 xlsx 檔案...")
    uploaded = files.upload()
    if not uploaded:
        raise RuntimeError("未上傳任何檔案")
    # Take only the first uploaded file name (a str), never a list.
    excel_path = next(iter(uploaded))
else:
    excel_path = EXCEL_PATH
print("已選擇檔案：", excel_path)

# sheet_name=None makes read_excel return a dict: {worksheet name: DataFrame}.
sheets = pd.read_excel(excel_path, sheet_name=None, engine="openpyxl")
if MAX_ROWS is not None:
    # Cap every worksheet at its first MAX_ROWS rows.
    sheets = {name: frame.head(MAX_ROWS) for name, frame in sheets.items()}
total_rows = sum(len(frame) for frame in sheets.values())
print(f"已讀取工作表數：{len(sheets)}，總筆數：{total_rows}")

# ===========================
# 5) 工具函式：產生 messages、推論、解析 JSON
# ===========================
def row_to_yaml(row_dict):
    """Render one row as compact YAML text for the model prompt.

    Values whose type name contains "Timestamp" (e.g. Excel dates) are
    converted to strings first. The caller's dict is never modified. When
    PyYAML is not installed, or it cannot represent one of the values, the
    row is rendered as plain ``key: value`` lines instead.

    Args:
        row_dict: Mapping of column name to cell value.

    Returns:
        The row as a string.
    """
    # Work on a copy so the caller's row is left untouched.
    cleaned = {
        key: (str(value) if "Timestamp" in str(type(value)) else value)
        for key, value in row_dict.items()
    }

    def as_plain_lines():
        return "\n".join(f"{key}: {value}" for key, value in cleaned.items())

    try:
        import yaml
    except ImportError:
        return as_plain_lines()

    try:
        return yaml.safe_dump(cleaned, allow_unicode=True, sort_keys=False)
    except yaml.YAMLError:
        # safe_dump raises for value types it has no representer for; fall
        # back to plain lines rather than aborting the whole run.
        return as_plain_lines()

# build_messages takes the per-worksheet (dynamic) prompt as an argument.
def build_messages(row_index, row_dict, dynamic_system_prompt):
    """Build the single-turn chat message list for one row.

    Only columns whose name occurs in ``dynamic_system_prompt`` are sent to
    the model. The instruction text and the row data are combined into one
    "user" message, separated by real newlines.

    Args:
        row_index: Position of the row. Accepted for call-site symmetry; it
            is not used when building the message.
        row_dict: Mapping of column name to cell value for the row.
        dynamic_system_prompt: Instruction text placed before the row data.

    Returns:
        A one-element list ``[{"role": "user", "content": ...}]``, or None
        when the row has no column named in the prompt.
    """
    # NOTE(review): this is a substring match against the whole prompt, so a
    # column whose name happens to occur anywhere in the instruction text is
    # also kept; passing the target column explicitly would be stricter.
    filtered_dict = {
        key: value
        for key, value in row_dict.items()
        # Non-string headers (numbers, dates) cannot match, and `key in str`
        # would raise TypeError for them.
        if isinstance(key, str) and key in dynamic_system_prompt
    }
    if not filtered_dict:
        return None

    yaml_text = row_to_yaml(filtered_dict)
    user_text = (
        f"The following is a single data entry（YAML）：\n{yaml_text}\n"
        " Please review it according to the guidelines and return valid JSON."
    )
    return [{"role": "user", "content": f"{dynamic_system_prompt}\n{user_text}"}]


def _extract_json_str(text):
    """Return the first substring of ``text`` that parses as a JSON object.

    Scans from every "{" and returns the exact span the JSON decoder
    accepts, or None when no JSON object is found.
    """
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            obj, end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            pass
        else:
            if isinstance(obj, dict):
                return text[start:end]
        start = text.find("{", start + 1)
    return None


# Single-row variant: render the chat template to a string first, then
# tokenize that string.
@torch.inference_mode()
def validate_one(row_index, row_dict, dynamic_system_prompt=None):
    """Validate a single row and try to parse the model's JSON answer.

    Args:
        row_index: Position of the row (forwarded to build_messages).
        row_dict: Mapping of column name to cell value.
        dynamic_system_prompt: Instruction text. Defaults to the lines of
            SYSTEM_PROMPT_ZH joined with newlines.

    Returns:
        A dict with "raw" (decoded generation), "json_str" (extracted JSON
        text or None), "json_obj" (parsed object or None) and
        "is_valid_json" (True only for a non-empty parsed object).
    """
    if dynamic_system_prompt is None:
        dynamic_system_prompt = "\n".join(SYSTEM_PROMPT_ZH)

    messages = build_messages(row_index, row_dict, dynamic_system_prompt)
    if messages is None:
        # The row has no column the prompt asks about.
        return {
            "raw": "Skipped: No target column found.",
            "json_str": None,
            "json_obj": None,
            "is_valid_json": False,
        }

    # 1) Turn the messages into a text prompt (no tokenization yet).
    prompt = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=False,
    )

    # 2) Tokenize. Only the device is moved: the tokenizer output holds
    #    integer ids, and its .to() does not take a dtype.
    inputs = processor(
        text=prompt,
        return_tensors="pt",
        padding=True,
    ).to(model.device)

    # 3) Generate and decode only the newly generated tokens.
    input_len = inputs["input_ids"].shape[-1]
    gen_ids = model.generate(
        **inputs,
        max_new_tokens=512,
        do_sample=False,
        temperature=None,
        repetition_penalty=1.05,
        use_cache=True,
        pad_token_id=processor.pad_token_id,
    )
    text = processor.batch_decode(gen_ids[:, input_len:], skip_special_tokens=True)[0]

    jtxt = _extract_json_str(text)
    try:
        parsed = json.loads(jtxt) if jtxt else None
    except json.JSONDecodeError:
        parsed = None

    return {
        "raw": text,
        "json_str": jtxt,
        "json_obj": parsed,
        "is_valid_json": bool(parsed),
    }

# Patterns that pull the verdict out of the generated text without requiring
# the model to emit strictly valid JSON.
_SUMMARY_RE = re.compile(r'"summary":\s*"(.*?)"', re.DOTALL)
_REASON_RE = re.compile(r'"reason":\s*"(.*?)"', re.DOTALL)


def _extract_summary(text):
    """Return the first "summary" value, else all "reason" values joined, else a fixed skip message."""
    summaries = _SUMMARY_RE.findall(text)
    if summaries:
        return summaries[0]
    reasons = _REASON_RE.findall(text)
    if reasons:
        # Several issues may each carry their own reason.
        return "; ".join(reasons)
    return "Skipped validation due to uncertain words."


# validate_many takes the per-worksheet (dynamic) prompt as an argument.
def validate_many(start_index, rows, dynamic_system_prompt):
    """Validate a batch of rows with one generate() call.

    Args:
        start_index: Row index of the first element of ``rows``.
        rows: List of row dicts (column name -> cell value).
        dynamic_system_prompt: Instruction text for this worksheet.

    Returns:
        A list aligned with ``rows``. Each element is a dict with "raw" (the
        decoded generation) and "summary". Rows without a target column get
        the "Skipped" placeholder.
    """
    prompts = []
    for row_index, row_dict in enumerate(rows, start=start_index):
        messages = build_messages(row_index, row_dict, dynamic_system_prompt)
        if messages is None:
            prompts.append(None)
            continue
        prompts.append(
            processor.apply_chat_template(
                messages,
                add_generation_prompt=True,
                tokenize=False,
            )
        )

    # Start with a placeholder for every row so positions stay aligned, then
    # overwrite the rows that actually went through the model.
    outs = [
        {"raw": "Skipped: No target column found.", "summary": "Skipped"}
        for _ in prompts
    ]
    valid_positions = [i for i, p in enumerate(prompts) if p is not None]
    if not valid_positions:
        return outs

    # Tokenize only real prompts; a None entry would make the tokenizer fail.
    valid_prompts = [prompts[i] for i in valid_positions]
    # NOTE(review): batched generation with a decoder-only model expects left
    # padding; confirm processor.padding_side == "left".
    inputs = processor(text=valid_prompts, return_tensors="pt", padding=True).to(model.device)

    input_len = inputs["input_ids"].shape[-1]
    gen_ids = model.generate(
        **inputs,
        max_new_tokens=512,
        do_sample=False,
        repetition_penalty=1.05,
        use_cache=True,
        # Explicit pad id avoids the per-call "Setting pad_token_id" warning.
        pad_token_id=processor.pad_token_id,
    )
    texts = processor.batch_decode(gen_ids[:, input_len:], skip_special_tokens=True)

    for position, text in zip(valid_positions, texts):
        outs[position] = {"raw": text, "summary": _extract_summary(text)}
    return outs

# ===========================
# 6) 逐表逐列驗證並輸出結果 (動態 Prompt 版)
# ===========================
def _safe_name(name):
    """Return ``name`` as text that is safe to embed in a file name.

    Every run of characters outside ``A-Za-z0-9_.-`` is collapsed into a
    single underscore.
    """
    as_text = str(name)
    unsafe_run = re.compile(r"[^A-Za-z0-9_.-]+")
    return unsafe_run.sub("_", as_text)

BATCH = 64  # rows sent to the model per generate() call
all_records = []
per_sheet_records = {}

# Matches "[original] should be [suggestion]" inside a summary.
_CORRECTION_RE = re.compile(r'\[(.*?)\]\s*should be\s*\[(.*?)\]')


def _build_sheet_prompt(config):
    """Build the instruction text for one worksheet from its PROMPT_CONFIG entry."""
    target = config["target_column"]
    error_type = config["error_type"]
    example_error = config["example_error"]
    # The misspelled word of the worked example, without its surrounding
    # brackets: "[Mypia] should be [Myopia]." -> "Mypia".
    misspelled_example = example_error.split(' should be ')[0].strip('[]')

    parts = (
        f"You are a specialized tool for identifying spelling errors in the '{target}' field.",
        f"Your ONLY task is to check for spelling errors in the value of the '{target}' field.",
        "You MUST IGNORE all other fields.",
        f"If you find a spelling error, your reason MUST be '{error_type}: {example_error}'.",
        "If there are no spelling errors, your summary must be 'No issues found.'.",
        "Your entire response MUST be a single, valid JSON object and nothing else.",

        "\n--- Example 1: Data with a spelling error ---\n"
        "Example Input Data:\n"
        f"{target}: {misspelled_example}\n"
        "Example JSON Output:\n"
        '{\n'
        '  "row_index": 0,\n'
        '  "issues": [\n'
        '    {\n'
        f'      "field": "{target}",\n'
        f'      "reason": "{error_type}: {example_error}",\n'
        '      "severity": "error"\n'
        '    }\n'
        '  ],\n'
        f'  "summary": "{error_type}: {example_error}"\n'
        '}',

        "\n--- Example 2: Data with no spelling issues ---\n"
        "Example Input Data:\n"
        f"{target}: {config['example_correct']}\n"
        "Example JSON Output:\n"
        '{\n'
        '  "row_index": 1,\n'
        '  "issues": [],\n'
        '  "summary": "No issues found."\n'
        '}',
    )
    # Join the fragments into the single string the model receives.
    return "\n".join(parts)


def _drop_case_only_correction(summary):
    """Replace a "correction" that differs only by letter case with the no-issue summary."""
    match = _CORRECTION_RE.search(summary)
    if match and match.group(1).lower() == match.group(2).lower():
        # Original and suggestion are the same word ignoring case, so this is
        # not a spelling error. The trailing space marks overridden rows.
        return "No issues found. "
    return summary


for sheet_name, df in sheets.items():
    print(f"處理工作表：{sheet_name}，筆數：{len(df)}")

    # Per-worksheet settings; worksheets without an entry are skipped.
    config = PROMPT_CONFIG.get(sheet_name)
    if not config:
        print(f"警告：在 PROMPT_CONFIG 中找不到工作表 '{sheet_name}' 的設定，將略過此工作表。")
        continue

    dynamic_system_prompt = _build_sheet_prompt(config)

    sheet_recs = []
    rows = df.to_dict('records')

    for s in range(0, len(rows), BATCH):
        batch_rows = rows[s:s+BATCH]
        outs = validate_many(s, batch_rows, dynamic_system_prompt)

        for j, out in enumerate(outs):
            summary = _drop_case_only_correction(
                out.get("summary", "Error: Summary not found.")
            )
            rec = {
                "sheet_name": sheet_name,
                "row_index": int(s + j),
                "BioMistral_summary": summary,  # summary after the case-only filter
            }
            sheet_recs.append(rec)
            all_records.append(rec)

    per_sheet_records[sheet_name] = sheet_recs

# 6a) Write the overview CSV with one line per validated row.
overview_df = pd.DataFrame(all_records)
overview_df.to_csv("validation_results__overview.csv", index=False, encoding="utf-8-sig")
print("已輸出：validation_results__overview.csv")

# 6b) Write one CSV per worksheet: original data merged with its results.
for sheet_name, df in sheets.items():
    records = per_sheet_records.get(sheet_name)
    if records is None:
        # Worksheets without a PROMPT_CONFIG entry were never validated, so
        # there is nothing to merge for them.
        print(f"略過未驗證的工作表：{sheet_name}")
        continue

    # Expose the positional index as a row_index column to merge on.
    df_with_index = df.reset_index().rename(columns={'index': 'row_index'})

    if records:
        result_df = pd.DataFrame(records)
        merged = pd.merge(df_with_index, result_df, on="row_index", how="left")
    else:
        # A worksheet with no data rows yields no records, hence no
        # row_index column to merge on; emit the (empty) frame with the same
        # result columns instead.
        merged = df_with_index.assign(sheet_name=sheet_name, BioMistral_summary="")

    out_name = f"validation_results__{_safe_name(sheet_name)}.csv"
    merged.to_csv(out_name, index=False, encoding="utf-8-sig")
    print(f"已輸出：{out_name}")

# ==============================================================================
#  ↑↑↑ 請用這段全新的第 6 部分，替換掉您現有的版本 ↑↑↑
# ==============================================================================


# ===========================
# 7) (Optional) Fine-tuning example: SFT (text-only), disabled by default
# - Runs SFT on pseudo-labelled samples; in practice, manually reviewed,
#   high-quality labels are recommended instead
# - Pre-processing mirrors inference: template -> processor(text=...)
# ===========================
if DO_FINETUNE:
    from datasets import Dataset
    from transformers import Trainer, TrainingArguments, DataCollatorForSeq2Seq
    from peft import LoraConfig, get_peft_model

    # 7.1 Prepare a small training set (pseudo-labels from the run above).
    # Only records flagged is_valid_json == True are used.
    # NOTE(review): the records appended to all_records in section 6 carry
    # only "sheet_name", "row_index" and "BioMistral_summary", so this filter
    # appears to select nothing and the "too few samples" branch below would
    # always be taken - confirm.
    valid_records = [r for r in all_records if r.get("is_valid_json", False)]

    train_samples = []
    for r in valid_records:
        row_idx = r["row_index"]
        sheet_name = r["sheet_name"]
        # Look up the original DataFrame of the worksheet.
        original_df = sheets.get(sheet_name)
        if original_df is None or row_idx >= len(original_df):
            continue # original data not found, skip

        # Replace missing cells (NaN/NaT) by None before rendering the row.
        row_dict = {k: (None if pd.isna(v) else v) for k, v in original_df.iloc[row_idx].to_dict().items()}
        user_yaml = row_to_yaml(row_dict)
        # NOTE(review): unlike the inference prompt, this text contains the
        # whole row and no per-worksheet instructions - confirm that is intended.
        user_text = f"Based on the following data（YAML）：\n{user_yaml}\n Please review according to the guidelines and return valid JSON."
        # NOTE(review): "medgemma_json" is not a key of the section 6 records
        # either; reaching this line with such a record raises KeyError.
        assistant_text = r["medgemma_json"] # the successfully parsed JSON string is the training target
        messages = [
            {"role": "user", "content": user_text},
            {"role": "assistant", "content": assistant_text},
        ]
        train_samples.append({"messages": messages})

    if len(train_samples) < 10:
        print(f"可用有效訓練樣本太少 ({len(train_samples)} < 10)，略過微調。請自行提供標註資料或使用能產出更多有效 JSON 的模型，並重試。")
    else:
        print(f"找到 {len(train_samples)} 個有效訓練樣本，開始微調。")
        raw_ds = Dataset.from_list(train_samples)

        def encode_fn(ex):
            """Tokenize one chat sample; labels are the same ids as the inputs."""
            msgs = ex["messages"]
            # Same as inference: render the template to a string, then tokenize.
            prompt = processor.apply_chat_template(
                msgs, add_generation_prompt=False, tokenize=False
            )
            tokenized = processor(text=prompt, return_tensors=None)
            return {"input_ids": tokenized["input_ids"], "labels": tokenized["input_ids"]}

        ds = raw_ds.map(encode_fn, remove_columns=raw_ds.column_names, batched=False)

        # Wrap the model with LoRA adapters on the attention and MLP
        # projections; on failure the unwrapped model is trained instead.
        try:
            peft_config = LoraConfig(
                r=8, lora_alpha=16, lora_dropout=0.05,
                target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
                task_type="CAUSAL_LM",
            )
            model.enable_input_require_grads()
            model = get_peft_model(model, peft_config)
        except Exception as e:
            print(f"LoRA 套用失敗，改用全參數微調（小批次）：{e}")

        # Pads inputs per batch and pads labels with -100.
        data_collator = DataCollatorForSeq2Seq(
            tokenizer=processor,
            model=model,
            padding=True,
            max_length=4096,
            label_pad_token_id=-100,
        )

        # NOTE(review): checkpoints go to "medgemma_sft_out", while the final
        # model is saved under the Drive path below.
        args = TrainingArguments(
            output_dir="medgemma_sft_out",
            per_device_train_batch_size=1,
            gradient_accumulation_steps=8,
            learning_rate=2e-5,
            num_train_epochs=1,
            logging_steps=10,
            save_steps=200,
            save_total_limit=2,
            bf16=torch.cuda.is_available(),
            gradient_checkpointing=True,
            optim="adamw_torch",
            lr_scheduler_type="cosine",
            warmup_ratio=0.03,
            report_to="none",
        )

        # No eval_dataset is passed to the Trainer here.
        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=ds,
            tokenizer=processor,
            data_collator=data_collator,
        )
        trainer.train()
        # Save the trained model and the tokenizer side by side on Drive.
        trainer.save_model("/content/drive/MyDrive/BioMistral_train_backup/BioMistral_sft_out/final")
        processor.save_pretrained("/content/drive/MyDrive/BioMistral_train_backup/BioMistral_sft_out/final")
        print("微調完成，輸出目錄：/content/drive/MyDrive/BioMistral_train_backup/BioMistral_sft_out/final")

請上傳 xlsx 檔案...


Saving FB825CLRS02_DataCenter_DataSet_List_20251008144131_sort.xlsx to FB825CLRS02_DataCenter_DataSet_List_20251008144131_sort.xlsx
已選擇檔案： FB825CLRS02_DataCenter_DataSet_List_20251008144131_sort.xlsx


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


已讀取工作表數：3，總筆數：1071
處理工作表：MH，筆數：149


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


處理工作表：AE，筆數：20


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


處理工作表：CM，筆數：902


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


已輸出：validation_results__overview.csv
已輸出：validation_results__MH.csv
已輸出：validation_results__AE.csv
已輸出：validation_results__CM.csv


In [None]:
#@title 刪除colab檔案

import os

# Result files (and the uploaded workbook) to remove from the Colab working
# directory. Paths are relative to the current directory; prefix a folder
# when a file lives elsewhere.
files_to_delete = [
    'validation_results__AE.csv',
    'validation_results__CM.csv',
    'validation_results__MH.csv',
    'validation_results__overview.csv',
    'validation_set_wrong dummy2.xlsx'
]


def delete_files(paths):
    """Delete every path in ``paths`` and print one status line per file.

    A missing file is reported as a warning, any other OS error as an error;
    neither stops the remaining deletions.

    Args:
        paths: Iterable of file paths to remove.

    Returns:
        The list of paths that were actually removed.
    """
    removed = []
    for file_name in paths:
        # Attempt the removal directly instead of checking existence first,
        # so a file that disappears in between is still reported correctly.
        try:
            os.remove(file_name)
        except FileNotFoundError:
            print(f"⚠️ 檔案不存在：{file_name}")
        except OSError as e:
            print(f"❌ 錯誤：無法刪除 {file_name} - {e}")
        else:
            print(f"✅ 已刪除：{file_name}")
            removed.append(file_name)
    return removed


print("開始刪除以下檔案：")

delete_files(files_to_delete)

# The backslash is escaped: "\c" is not a valid escape sequence.
print("\n刪除程序完成。請點擊檔案側邊欄的重新整理按鈕 ($\\circlearrowleft$) 查看結果。")

In [None]:
#@title  存檔訓練後模型到drive
!cp -r medgemma_sft_out /content/drive/MyDrive/BioMistral_train_backup/BioMistral_sft_out/final


In [None]:
# Evaluate the fine-tuned model and report its loss.
# `trainer` only exists after section 7 actually ran a fine-tuning job.
_trainer = globals().get("trainer")
if _trainer is None:
    print("尚未建立 trainer（需先啟用 DO_FINETUNE 並完成微調），略過評估。")
else:
    # Section 7 builds the Trainer without an eval_dataset, and evaluate()
    # requires one; fall back to the training set in that case. The reported
    # value is then a training-set loss, not a held-out loss.
    _eval_ds = _trainer.eval_dataset
    if _eval_ds is None:
        _eval_ds = _trainer.train_dataset
    eval_results = _trainer.evaluate(eval_dataset=_eval_ds)
    print(f"評估損失: {eval_results['eval_loss']}")
