# 🚀 CyberPuppy 霸凌偵測訓練 - A100 優化版

**GPU**: A100 (40GB)  
**目標**: F1 ≥ 0.75  
**優化**: 大 batch + bf16 + 快速訓練  

**預計時間**: 1-2 小時（A100 加速）

---

## 1️⃣ GPU 驗證 - 確認 A100

In [None]:
import torch
# Report the PyTorch build and whether a CUDA device is usable.
cuda_ready = torch.cuda.is_available()
print(f"PyTorch: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")

if not cuda_ready:
    print("❌ 未檢測到 GPU！")
else:
    # Describe device 0: name and total memory (bytes -> decimal GB).
    gpu_name = torch.cuda.get_device_name(0)
    device_props = torch.cuda.get_device_properties(0)
    gpu_memory = device_props.total_memory / 1e9
    print(f"GPU: {gpu_name}")
    print(f"Memory: {gpu_memory:.1f} GB")

    # This notebook treats compute capability major version >= 8 as
    # "BF16 supported".
    compute_capability = torch.cuda.get_device_capability(0)
    major_version = compute_capability[0]
    print(f"Compute Capability: {compute_capability}")
    print(f"BF16 Support: {major_version >= 8}")

    # Warn when the device name does not mention A100, since the batch
    # sizes further down were chosen for that card.
    is_a100 = "A100" in gpu_name
    if not is_a100:
        print(f"\n⚠️ 警告: 當前 GPU 是 {gpu_name}，不是 A100")
        print("此 notebook 針對 A100 優化，其他 GPU 可能需要調整 batch size")

## 2️⃣ GitHub 認證設置

In [None]:
from getpass import getpass

# 從 Colab Secrets 或手動輸入
try:
    from google.colab import userdata
    GITHUB_TOKEN = userdata.get('GITHUB_TOKEN')
    print("✅ 從 Colab Secrets 載入 token")
except:
    GITHUB_TOKEN = getpass("GitHub Token: ")

GITHUB_USERNAME = "thc1006"
REPO_NAME = "cyberbully-zh-moderation-bot"

!git config --global user.email "colab-a100@example.com"
!git config --global user.name "Colab A100 Training"

print("✅ Git 配置完成")

## 3️⃣ Clone Repository + 拉取 LFS 資料

In [None]:
import os
import subprocess
import shutil

# 清理舊目錄
if os.path.exists(REPO_NAME):
    shutil.rmtree(REPO_NAME, ignore_errors=True)

os.chdir("/content")

# Clone
print("📥 Cloning repository...")
repo_url = f"https://{GITHUB_TOKEN}@github.com/{GITHUB_USERNAME}/{REPO_NAME}.git"
!git clone {repo_url}

os.chdir(REPO_NAME)
print(f"📁 目錄: {os.getcwd()}")

# 拉取 LFS 資料
print("\n📦 拉取 Git LFS 資料...")
!git lfs install
!git lfs pull

# 驗證訓練資料
train_file = "data/processed/training_dataset/train.json"
file_size = os.path.getsize(train_file)
print(f"\n{'✅' if file_size > 1000000 else '❌'} 訓練資料: {file_size / 1024 / 1024:.1f} MB")

## 4️⃣ 安裝依賴 - CUDA 12.x + PyTorch 2.8

In [None]:
# Install pinned package versions via pip, then print the resulting
# PyTorch / NumPy / CUDA versions and BF16 availability.
print("📦 安裝套件（A100 優化版本）...")

# PyTorch 2.8 + CUDA 12.6 wheels from the PyTorch package index
!pip install -q --index-url https://download.pytorch.org/whl/cu126 \
  torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0

# NumPy 2.x (below 2.3)
!pip install -q "numpy>=2,<2.3"

# Transformers + Accelerate + Datasets, then scikit-learn / tqdm / pandas
!pip install -q transformers==4.46.3 accelerate==1.2.1 datasets==3.2.0
!pip install -q scikit-learn==1.6.1 tqdm==4.67.1 pandas==2.2.2

# TensorBoard
!pip install -q "tensorboard~=2.19.0"

# Verify the installation
# NOTE(review): torch is already imported by the GPU check cell earlier in
# this notebook, so this import presumably returns the already-loaded module;
# the printed version may not reflect the freshly installed wheel until the
# runtime is restarted — confirm.
import torch
import numpy as np
print(f"\n✅ PyTorch: {torch.__version__}")
print(f"✅ NumPy: {np.__version__}")
print(f"✅ CUDA: {torch.version.cuda}")
print(f"✅ BF16 可用: {torch.cuda.is_bf16_supported()}")

## 5️⃣ 驗證資料集

In [None]:
import json

data_dir = "data/processed/training_dataset"

# A Git LFS pointer is a small text stub whose first line starts with this
# header; checking the header avoids misclassifying small real JSON files.
LFS_POINTER_PREFIX = b"version https://git-lfs.github.com/spec/"


def check_split(filepath):
    """Inspect one dataset split file.

    Args:
        filepath: Path to a ``<split>.json`` file.

    Returns:
        A ``(ok, detail)`` tuple. ``ok`` is True only when the file exists,
        is not a Git LFS pointer stub, and parses as JSON; ``detail`` is the
        human-readable message fragment to print after the file name.
    """
    if not os.path.isfile(filepath):
        return False, "不存在"

    file_size = os.path.getsize(filepath)
    with open(filepath, 'rb') as f:
        head = f.read(len(LFS_POINTER_PREFIX))
    if head == LFS_POINTER_PREFIX:
        return False, f"是 LFS pointer ({file_size} bytes)"

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        return False, f"無法解析 ({exc})"

    return True, f": {len(data):,} 樣本 ({file_size / 1024 / 1024:.1f} MB)"


# Check every split and only announce readiness when all of them passed.
all_splits_ok = True
for split in ["train", "dev", "test"]:
    ok, detail = check_split(os.path.join(data_dir, f"{split}.json"))
    if ok:
        print(f"✅ {split}.json{detail}")
    else:
        all_splits_ok = False
        print(f"❌ {split}.json {detail}")

if all_splits_ok:
    print("\n✅ 資料集就緒！")
else:
    print("\n❌ 資料集未就緒，請確認 git lfs pull 是否成功")

## 6️⃣ A100 優化訓練配置

In [None]:
from dataclasses import dataclass

@dataclass
class A100Config:
    """Hyperparameters for one training run launched from this notebook.

    Each field is forwarded to ``scripts/train_simple_with_args.py`` as the
    command-line flag of the same (or closely matching) name.

    Raises:
        ValueError: If ``batch_size``, ``num_epochs`` or
            ``accumulation_steps`` is below 1, or ``learning_rate`` is not
            positive.
    """

    name: str  # experiment name; also the output sub-directory name
    base_model: str  # passed as --model_name
    learning_rate: float
    batch_size: int  # per-step batch size
    num_epochs: int
    early_stopping_patience: int
    bullying_weight: float  # passed as --bullying_weight (semantics defined by the script)
    accumulation_steps: int = 1

    def __post_init__(self):
        # Fail fast on values that cannot describe a valid run, instead of
        # discovering them after the training script has been launched.
        for field_name in ("batch_size", "num_epochs", "accumulation_steps"):
            value = getattr(self, field_name)
            if value < 1:
                raise ValueError(f"{field_name} must be >= 1, got {value!r}")
        if self.learning_rate <= 0:
            raise ValueError(
                f"learning_rate must be > 0, got {self.learning_rate!r}"
            )

    @property
    def effective_batch_size(self) -> int:
        """Return ``batch_size * accumulation_steps``."""
        return self.batch_size * self.accumulation_steps

# Runs tuned for the A100: larger batches (via gradient accumulation) and
# faster training. The list order matters: the training cells index into it.
configs = [
    # MacBERT: 32 x 2 accumulation steps -> effective batch of 64
    A100Config(
        name="macbert_a100_optimized", base_model="hfl/chinese-macbert-base",
        learning_rate=2e-5, bullying_weight=2.5,
        batch_size=32, accumulation_steps=2,
        num_epochs=15, early_stopping_patience=4,
    ),
    # RoBERTa: 24 x 3 accumulation steps -> effective batch of 72
    A100Config(
        name="roberta_a100_aggressive", base_model="hfl/chinese-roberta-wwm-ext",
        learning_rate=3e-5, bullying_weight=2.8,
        batch_size=24, accumulation_steps=3,
        num_epochs=18, early_stopping_patience=3,
    ),
]

# Print a numbered, human-readable overview of every configured run.
print("📋 A100 訓練配置:")
for index, entry in enumerate(configs, start=1):
    summary_lines = [
        f"{index}. {entry.name}",
        f"   - Batch: {entry.batch_size} × {entry.accumulation_steps} = {entry.batch_size * entry.accumulation_steps}",
        f"   - LR: {entry.learning_rate}",
        "   - BF16: Yes",
    ]
    print("\n".join(summary_lines))

## 7️⃣ 訓練 Model A - MacBERT (A100 優化)

In [None]:
import os
os.chdir(f"/content/{REPO_NAME}")

config = configs[0]
print(f"🚀 訓練: {config.name}")
print(f"💾 等效 Batch Size: {config.batch_size * config.accumulation_steps}")
print("="*60)

!python scripts/train_simple_with_args.py \
  --model_name {config.base_model} \
  --output_dir models/experiments/{config.name} \
  --train_file data/processed/training_dataset/train.json \
  --dev_file data/processed/training_dataset/dev.json \
  --test_file data/processed/training_dataset/test.json \
  --learning_rate {config.learning_rate} \
  --batch_size {config.batch_size} \
  --num_epochs {config.num_epochs} \
  --early_stopping_patience {config.early_stopping_patience} \
  --bullying_weight {config.bullying_weight} \
  --accumulation_steps {config.accumulation_steps} \
  --bf16

print(f"\n✅ {config.name} 訓練完成")

## 8️⃣ 訓練 Model B - RoBERTa (激進配置)

In [None]:
config = configs[1]
print(f"🚀 訓練: {config.name}")
print(f"💾 等效 Batch Size: {config.batch_size * config.accumulation_steps}")
print("="*60)

!python scripts/train_simple_with_args.py \
  --model_name {config.base_model} \
  --output_dir models/experiments/{config.name} \
  --train_file data/processed/training_dataset/train.json \
  --dev_file data/processed/training_dataset/dev.json \
  --test_file data/processed/training_dataset/test.json \
  --learning_rate {config.learning_rate} \
  --batch_size {config.batch_size} \
  --num_epochs {config.num_epochs} \
  --early_stopping_patience {config.early_stopping_patience} \
  --bullying_weight {config.bullying_weight} \
  --accumulation_steps {config.accumulation_steps} \
  --bf16

print(f"\n✅ {config.name} 訓練完成")

## 9️⃣ 評估所有模型

In [None]:
import json
import os
import glob

# Collect the bullying F1 of every configured run from its output directory
# and pick the best one.
results = []

print("🔍 評估訓練結果...\n")

for config in configs:
    model_dir = f"models/experiments/{config.name}"

    if not os.path.exists(model_dir):
        print(f"⚠️ {config.name}: 目錄不存在")
        continue

    # Look for the evaluation output; final_results.json takes priority over
    # eval_results.json, and each file uses its own metric key.
    eval_file = os.path.join(model_dir, "eval_results.json")
    final_file = os.path.join(model_dir, "final_results.json")

    metrics = None
    if os.path.exists(final_file):
        with open(final_file, 'r', encoding='utf-8') as f:
            metrics = json.load(f)
        # A missing key is treated as F1 = 0.0
        f1_score = metrics.get('test_bullying_f1', 0.0)
    elif os.path.exists(eval_file):
        with open(eval_file, 'r', encoding='utf-8') as f:
            metrics = json.load(f)
        f1_score = metrics.get('bullying_f1', 0.0)
    else:
        print(f"⚠️ {config.name}: 未找到評估結果")
        continue

    results.append({
        'name': config.name,
        'f1': f1_score,
        'path': model_dir,
        'metrics': metrics
    })

    status = "✅" if f1_score >= 0.75 else "⚠️"
    print(f"{status} {config.name}: F1 = {f1_score:.4f}")

# Pick the best model
if results:
    best_model = max(results, key=lambda x: x['f1'])
    print("\n" + "="*60)
    print(f"🏆 最佳: {best_model['name']}")
    print(f"📊 F1: {best_model['f1']:.4f}")
    print("="*60)

    # ensure_ascii=False can emit non-ASCII characters, so write UTF-8
    # explicitly rather than relying on the platform default encoding.
    with open('best_model_a100.json', 'w', encoding='utf-8') as f:
        json.dump(best_model, f, indent=2, ensure_ascii=False)
else:
    # Drop any `best_model` left over from an earlier execution of this cell
    # so the later cells (which test for the name's existence) do not copy
    # or push a stale model.
    globals().pop('best_model', None)
    print("\n❌ 沒有可用結果")

## 🔟 複製最佳模型 (如果 F1 ≥ 0.70)

In [None]:
import shutil
import pandas as pd

# Minimum F1 required before the best model is copied to the deploy folder.
# NOTE(review): this is lower than the 0.75 target used by the push step —
# presumably intentional so near-miss models are still kept; confirm.
MIN_COPY_F1 = 0.70

if 'best_model' in locals() and best_model['f1'] >= MIN_COPY_F1:
    deploy_dir = "models/bullying_a100_best"
    os.makedirs(deploy_dir, exist_ok=True)

    # Copy the model: only regular files directly inside the experiment
    # directory are copied; sub-directories are skipped.
    for filename in os.listdir(best_model['path']):
        src = os.path.join(best_model['path'], filename)
        if os.path.isfile(src):
            shutil.copy2(src, os.path.join(deploy_dir, filename))

    # Save deployment metadata next to the copied files
    deploy_info = {
        'model_name': best_model['name'],
        'f1_score': best_model['f1'],
        'metrics': best_model['metrics'],
        'trained_on': 'Google Colab A100',
        'timestamp': pd.Timestamp.now().isoformat()
    }

    # ensure_ascii=False can emit non-ASCII characters, so write UTF-8
    # explicitly rather than relying on the platform default encoding.
    info_path = os.path.join(deploy_dir, 'deployment_info.json')
    with open(info_path, 'w', encoding='utf-8') as f:
        json.dump(deploy_info, f, indent=2, ensure_ascii=False)

    print(f"✅ 模型已複製到: {deploy_dir}")
    print(f"📊 F1: {best_model['f1']:.4f}")
else:
    print("⚠️ 未達標或無可用模型")

## 1️⃣1️⃣ 推送到 GitHub (如果達標 ≥0.75)

In [None]:
TARGET_F1 = 0.75

if 'best_model' in locals() and best_model['f1'] >= TARGET_F1:
    print(f"🎉 達標！F1 = {best_model['f1']:.4f} ≥ {TARGET_F1}")
    print("推送到 GitHub...\n")
    
    # Git LFS
    !git lfs install
    !git lfs track "models/bullying_a100_best/**/*.safetensors"
    !git lfs track "models/bullying_a100_best/**/*.bin"
    !git add .gitattributes
    
    # 添加模型
    !git add models/bullying_a100_best/
    
    # 提交
    commit_msg = f"feat: A100 trained model (F1={best_model['f1']:.4f})"
    !git commit -m "{commit_msg}"
    
    # 推送
    !git push origin main
    
    print("\n✅ 推送完成！")
elif 'best_model' in locals():
    print(f"⚠️ 未達標: F1 = {best_model['f1']:.4f} < {TARGET_F1}")
else:
    print("❌ 無可用模型")

## 1️⃣2️⃣ 訓練總結

In [None]:
# Final report: every evaluated run ranked by F1, then the best model
# compared against the target.
banner = "=" * 80
print(banner)
print("🎯 A100 訓練總結")
print(banner)

have_results = 'results' in locals() and bool(results)
if not have_results:
    print("\n❌ 無訓練結果")
else:
    print("\n訓練結果:")
    # Highest F1 first
    ranked = sorted(results, key=lambda entry: entry['f1'], reverse=True)
    for entry in ranked:
        mark = "✅" if entry['f1'] >= TARGET_F1 else "⚠️"
        print(f"  {mark} {entry['name']}: F1 = {entry['f1']:.4f}")

    if 'best_model' in locals():
        best_f1 = best_model['f1']
        print(f"\n🏆 最佳: {best_model['name']}")
        print(f"📊 F1: {best_f1:.4f}")
        print(f"🎯 目標: {TARGET_F1}")

        # Either celebrate or show how far the best run is from the target.
        if best_f1 < TARGET_F1:
            shortfall = TARGET_F1 - best_f1
            print(f"\n⚠️ 距離目標: {shortfall:.4f}")
        else:
            print("\n✅ 成功！模型已推送到 GitHub")

print("\n" + banner)