In [23]:
# ===========================
# Cell 1: 導入套件（GPU版本）
# ===========================
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

# GPU-accelerated data processing (only if RAPIDS is installed).
# USE_RAPIDS records whether the optional cuDF / CuPy / cuML imports succeeded;
# later cells check this flag before touching any of those names.
try:
    import cudf
    import cupy as cp
    from cuml.ensemble import RandomForestClassifier as cuRF
    from cuml.linear_model import LogisticRegression as cuLR
    from cuml.preprocessing import StandardScaler as cuScaler
    from cuml.model_selection import train_test_split as cu_train_test_split
    USE_RAPIDS = True
    print("✅ RAPIDS GPU加速已啟用！")
except ImportError:
    # Any missing RAPIDS package disables the whole RAPIDS path.
    USE_RAPIDS = False
    print("⚠️ RAPIDS未安裝，使用CPU版本")

# GPU加速的模型
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

# PyTorch for Neural Networks
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

# Report whether PyTorch can see a CUDA device and, if so, the name and
# total memory (in GB) of device 0.
print(f"PyTorch CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")


⚠️ RAPIDS未安裝，使用CPU版本
PyTorch CUDA available: True
GPU: NVIDIA GeForce RTX 3090
GPU Memory: 25.30 GB


In [24]:
# ===========================
# Cell 2: GPU記憶體監控
# ===========================
def print_gpu_memory():
    """Print currently allocated vs. total GPU memory in GB.

    Does nothing when CUDA is unavailable. The allocated figure comes from
    ``torch.cuda.memory_allocated()`` and the total from the properties of
    device 0.
    """
    if not torch.cuda.is_available():
        return

    allocated_gb = torch.cuda.memory_allocated() / 1e9
    total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU記憶體使用: {allocated_gb:.2f} GB / {total_gb:.2f} GB")

In [None]:
# ===========================
# Cell 3: 使用GPU加速的資料處理（RAPIDS）
# ===========================
def load_data_gpu(file_path, use_rapids=USE_RAPIDS):
    """Read the CSV at ``file_path`` and return it as a DataFrame.

    Args:
        file_path: Path of the CSV file to read.
        use_rapids: When true the file is read with ``cudf.read_csv``;
            otherwise with ``pd.read_csv(..., low_memory=False)``.
            Defaults to the module-level ``USE_RAPIDS`` flag.

    Returns:
        The loaded frame (a cuDF frame on the RAPIDS path, a pandas frame
        otherwise). Its shape is printed before returning.
    """
    print("載入資料...")

    if not use_rapids:
        # CPU path: plain pandas reader.
        frame = pd.read_csv(file_path, low_memory=False)
        print("使用pandas (CPU)載入資料")
    else:
        # GPU path: cuDF reader.
        frame = cudf.read_csv(file_path)
        print("✅ 使用cuDF (GPU)載入資料")

    print(f"資料集大小: {frame.shape}")
    return frame


# Load the dataset (the path is relative to the notebook's working directory).
file_path = 'us-accidents/US_Accidents_March23.csv'
df = load_data_gpu(file_path)


載入資料...


In [None]:
# ===========================
# Cell 3: 處理日期時間欄位
# ===========================
def process_datetime_columns(df):
    """Parse the timestamp columns and drop rows with implausible durations.

    ``Start_Time``, ``End_Time`` and ``Weather_Timestamp`` (whichever exist)
    are converted with ``pd.to_datetime(..., errors='coerce', format='mixed')``;
    values that cannot be parsed become ``NaT`` and are counted in a warning.
    When both start and end times are present, ``Duration_minutes`` is derived
    and only rows with ``0 < Duration_minutes < 1440`` are kept. Rows whose
    duration is ``NaN`` fail that comparison and are dropped as well.

    Args:
        df: Frame to process. The timestamp columns (and ``Duration_minutes``)
            are written onto this frame before filtering.

    Returns:
        The filtered rows as an independent copy when a duration could be
        computed, otherwise ``df`` itself.
    """
    print("\n處理日期時間欄位...")

    # Lenient parsing: each value may use a different timestamp format.
    date_columns = ['Start_Time', 'End_Time', 'Weather_Timestamp']

    for col in date_columns:
        if col not in df.columns:
            continue

        print(f"處理 {col}...")
        df[col] = pd.to_datetime(df[col], errors='coerce', format='mixed')

        # Report how many values ended up missing after parsing.
        null_count = df[col].isnull().sum()
        if null_count > 0:
            print(f"  警告: {col} 有 {null_count} 筆無法解析的日期")

    # Duration in minutes.
    if 'Start_Time' in df.columns and 'End_Time' in df.columns:
        df['Duration_minutes'] = (df['End_Time'] - df['Start_Time']).dt.total_seconds() / 60
        duration = df['Duration_minutes']

        print(f"\n持續時間統計:")
        print(f"  最小值: {duration.min():.2f} 分鐘")
        print(f"  最大值: {duration.max():.2f} 分鐘")
        print(f"  平均值: {duration.mean():.2f} 分鐘")
        print(f"  中位數: {duration.median():.2f} 分鐘")

        # Keep only positive durations shorter than 24 hours.
        original_len = len(df)
        plausible = (duration > 0) & (duration < 1440)
        # .copy() detaches the result from the input frame, so later cells can
        # add columns to it without SettingWithCopyWarning.
        df = df[plausible].copy()
        print(f"\n移除異常持續時間後，資料從 {original_len} 筆減少到 {len(df)} 筆")

    return df

# Run the datetime processing; `df` is rebound to the filtered result.
df = process_datetime_columns(df)

In [None]:
# ===========================
# Cell 4: 特徵工程
# ===========================
def feature_engineering(df):
    """Derive calendar and weather features and return them on a new frame.

    When ``Start_Time`` is present (it must already be a datetime column),
    rows without a start time are dropped and these columns are added:
    ``Hour``, ``DayOfWeek``, ``Month``, ``Year``, ``IsWeekend``,
    ``TimeOfDay`` and ``Season``. When ``Weather_Condition`` is present, a
    coarse ``Weather_Category`` is added.

    The work is done on a copy, so the caller's frame is never modified and
    no SettingWithCopyWarning is raised by the column assignments.

    Args:
        df: Input frame.

    Returns:
        A new frame containing the surviving rows plus the derived columns.
    """
    print("\n執行特徵工程...")

    if 'Start_Time' in df.columns:
        # Drop rows without a start time. The explicit copy makes the result
        # an independent frame rather than a flagged slice of the input.
        df = df[df['Start_Time'].notna()].copy()

        # Calendar features.
        start = df['Start_Time']
        df['Hour'] = start.dt.hour
        df['DayOfWeek'] = start.dt.dayofweek
        df['Month'] = start.dt.month
        df['Year'] = start.dt.year
        df['IsWeekend'] = (df['DayOfWeek'] >= 5).astype(int)

        # Part of day; bins are right-inclusive:
        # 0-6 Night, 7-12 Morning, 13-18 Afternoon, 19-23 Evening.
        df['TimeOfDay'] = pd.cut(df['Hour'],
                                 bins=[-1, 6, 12, 18, 24],
                                 labels=['Night', 'Morning', 'Afternoon', 'Evening'])

        # Quarter-based buckets: Jan-Mar 'Winter', Apr-Jun 'Spring',
        # Jul-Sep 'Summer', Oct-Dec 'Fall'.
        df['Season'] = pd.cut(df['Month'],
                              bins=[0, 3, 6, 9, 12],
                              labels=['Winter', 'Spring', 'Summer', 'Fall'])
    else:
        # Still work on a copy so the caller's frame is left untouched.
        df = df.copy()

    # Simplified weather category.
    if 'Weather_Condition' in df.columns:
        # Order matters: the first group with a matching keyword wins, so the
        # groups are listed from most to least severe. Otherwise
        # "Light Snow Showers" matches 'Shower' and is labelled Rain, and
        # "Light Rain with Thunder" never reaches Storm.
        weather_keywords = {
            'Storm': ['Storm', 'Thunder'],
            'Snow': ['Snow', 'Sleet', 'Hail'],
            'Rain': ['Rain', 'Drizzle', 'Shower'],
            'Fog': ['Fog', 'Mist'],
            'Cloudy': ['Cloud', 'Overcast'],
            'Clear': ['Clear', 'Fair']
        }

        def categorize_weather(condition):
            # Missing -> 'Unknown'; no keyword match -> 'Other'.
            if pd.isna(condition):
                return 'Unknown'
            condition = str(condition)
            for category, keywords in weather_keywords.items():
                if any(keyword in condition for keyword in keywords):
                    return category
            return 'Other'

        df['Weather_Category'] = df['Weather_Condition'].apply(categorize_weather)

    print(f"特徵工程完成，目前資料集大小: {df.shape}")

    return df

# Run feature engineering; `df` is rebound to the returned frame.
df = feature_engineering(df)

In [None]:
# ===========================
# Cell 5: 缺失值分析與處理
# ===========================
def analyze_missing_values(df):
    """Summarise missing values per column and plot the worst offenders.

    Args:
        df: Frame to inspect.

    Returns:
        A DataFrame with columns ``Column``, ``Missing_Count`` and
        ``Percentage`` (share of rows, 0-100), restricted to columns that
        have at least one missing value and sorted by ``Percentage``
        descending. The top 20 rows are printed and, when the report is not
        empty, drawn as a horizontal bar chart.
    """
    print("\n分析缺失值...")

    null_counts = df.isnull().sum()
    null_share = (null_counts / len(df)) * 100

    report = pd.DataFrame({
        'Column': null_counts.index,
        'Missing_Count': null_counts.values,
        'Percentage': null_share.values
    })

    # Keep only columns that actually have gaps, worst first.
    has_gaps = report['Missing_Count'] > 0
    report = report[has_gaps].sort_values('Percentage', ascending=False)

    print("\n缺失值統計（前20個）:")
    print(report.head(20))

    if len(report) == 0:
        return report

    # Bar chart of the 20 columns with the highest missing share.
    plt.figure(figsize=(12, 8))
    plt.barh(report.head(20)['Column'], report.head(20)['Percentage'])
    plt.xlabel('Missing Percentage (%)')
    plt.title('Top 20 Columns with Missing Values')
    plt.tight_layout()
    plt.show()

    return report

# Build the missing-value report for the current frame.
missing_df = analyze_missing_values(df)


In [None]:
# ===========================
# Cell 6: 處理缺失值
# ===========================
def handle_missing_values(df, missing_threshold=60):
    """Drop sparse columns and impute the remaining gaps.

    Columns whose missing share exceeds ``missing_threshold`` percent are
    dropped. Remaining numeric columns are filled with their median and
    remaining ``object`` columns with their most frequent value (or
    ``'Unknown'`` when the column has no mode).

    Filled columns are assigned back with ``df[col] = df[col].fillna(...)``.
    The former ``df[col].fillna(..., inplace=True)`` acts on a temporary
    Series: pandas warns about it as chained assignment, and under
    Copy-on-Write it leaves the frame unfilled.

    Args:
        df: Frame to clean. When no column is dropped, the filled columns are
            written onto this same frame.
        missing_threshold: Percentage (0-100) above which a column is dropped.

    Returns:
        The cleaned frame.
    """
    print(f"\n處理缺失值（閾值: {missing_threshold}%）...")

    # Share of missing values per column, in percent.
    missing_percentage = (df.isnull().sum() / len(df)) * 100

    # Drop columns that are mostly empty.
    high_missing_cols = missing_percentage[missing_percentage > missing_threshold].index.tolist()

    if len(high_missing_cols) > 0:
        print(f"\n刪除高缺失率欄位 ({len(high_missing_cols)} 個):")
        print(high_missing_cols[:10])  # show only the first 10
        df = df.drop(columns=high_missing_cols)

    # Numeric columns: fill with the median.
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        if df[col].isnull().any():
            df[col] = df[col].fillna(df[col].median())

    # Categorical (object) columns: fill with the mode, computed once.
    categorical_cols = df.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        if df[col].isnull().any():
            modes = df[col].mode()
            fill_value = modes.iloc[0] if len(modes) > 0 else 'Unknown'
            df[col] = df[col].fillna(fill_value)

    print(f"\n處理後資料集大小: {df.shape}")
    print(f"剩餘缺失值總數: {df.isnull().sum().sum()}")

    return df

# Drop columns with more than 60% missing values and impute the rest.
df = handle_missing_values(df, missing_threshold=60)

In [None]:
# ===========================
# Cell 7: 特徵選擇與準備
# ===========================
def select_features(df):
    """Pick the model features present in ``df`` and encode them.

    Three fixed candidate lists (numeric, categorical, boolean) are narrowed
    to the columns that exist in ``df``; ``Duration_minutes`` joins the
    numeric list when available. Boolean columns are cast to ``int`` in
    place, and every categorical column gets a companion ``<name>_encoded``
    column produced by a ``LabelEncoder`` fitted on its string form.

    Args:
        df: Frame to read from; it is modified (boolean casts and encoded
            columns are written onto it).

    Returns:
        Tuple ``(df, encoded_features, label_encoders)`` where
        ``encoded_features`` lists numeric, then boolean, then encoded
        categorical column names, and ``label_encoders`` maps each original
        categorical column name to its fitted encoder.
    """
    print("\n選擇特徵...")

    # Candidate columns, grouped by how they are handled.
    numeric_candidates = [
        'Temperature(F)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)',
        'Wind_Speed(mph)', 'Distance(mi)', 'Hour', 'DayOfWeek', 'Month',
        'Year', 'IsWeekend'
    ]
    categorical_candidates = [
        'Side', 'State', 'Weather_Category', 'TimeOfDay', 'Season',
        'Sunrise_Sunset'
    ]
    boolean_candidates = [
        'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit',
        'Railway', 'Roundabout', 'Station', 'Stop', 'Traffic_Calming',
        'Traffic_Signal'
    ]

    # Duration is optional: only used when it was computed earlier.
    if 'Duration_minutes' in df.columns:
        numeric_candidates.append('Duration_minutes')

    # Keep only candidates that exist in the frame.
    numeric_features = [name for name in numeric_candidates if name in df.columns]
    categorical_features = [name for name in categorical_candidates if name in df.columns]
    boolean_features = [name for name in boolean_candidates if name in df.columns]

    print(f"\n選擇的特徵:")
    print(f"  數值型特徵: {len(numeric_features)} 個")
    print(f"  類別型特徵: {len(categorical_features)} 個")
    print(f"  布林型特徵: {len(boolean_features)} 個")

    # Booleans become 0/1 integers.
    for name in boolean_features:
        df[name] = df[name].astype(int)

    # Label-encode categoricals into new '<name>_encoded' columns.
    label_encoders = {}
    encoded_features = [*numeric_features, *boolean_features]

    for name in categorical_features:
        encoder = LabelEncoder()
        encoded_name = name + '_encoded'
        df[encoded_name] = encoder.fit_transform(df[name].astype(str))
        label_encoders[name] = encoder
        encoded_features.append(encoded_name)

    print(f"\n總特徵數: {len(encoded_features)}")

    return df, encoded_features, label_encoders

# Select and encode the model features; also keep the fitted label encoders.
df, selected_features, label_encoders = select_features(df)

In [None]:
# ===========================
# Cell 8: 準備訓練資料
# ===========================
def prepare_training_data(df, features, target_col='Severity'):
    """Build the feature matrix and zero-based target vector.

    Rows with a missing value in any of ``features`` or in ``target_col``
    are discarded. The target is shifted down by one so that its smallest
    original level (1) becomes class index 0.

    Args:
        df: Source frame.
        features: List of feature column names.
        target_col: Name of the target column.

    Returns:
        Tuple ``(X, y)``: the feature columns and the shifted target, both
        restricted to the complete rows.
    """
    print("\n準備訓練資料...")

    # Keep only rows that are complete across features and target.
    complete_rows = df[features + [target_col]].dropna()
    print(f"清理後的資料大小: {complete_rows.shape}")

    X = complete_rows[features]
    # Shift labels so they start at 0.
    y = complete_rows[target_col] - 1

    # Class distribution after the shift.
    print("\n調整後的目標變數分布:")
    print(pd.Series(y).value_counts().sort_index())

    return X, y

# Build the feature matrix X and the zero-based target y.
X, y = prepare_training_data(df, selected_features)


In [None]:
# ===========================
# Cell 9: 處理資料不平衡
# ===========================
def handle_imbalanced_data(X, y, method='smote'):
    """Resample ``(X, y)`` to counter class imbalance.

    Args:
        X: Feature matrix.
        y: Class labels.
        method: ``'smote'`` (``SMOTE`` with ``k_neighbors=5``),
            ``'undersample'`` (``RandomUnderSampler``) or ``'combine'``
            (``SMOTEENN``), each seeded with ``random_state=42``. Any other
            value returns the inputs unchanged.

    Returns:
        Tuple ``(X_resampled, y_resampled)``. The class distribution is
        printed before and after resampling.
    """
    print(f"\n使用 {method} 處理資料不平衡...")

    print("原始類別分布:")
    print(pd.Series(y).value_counts().sort_index())

    # Samplers are built lazily so that only the selected one is constructed.
    sampler_factories = {
        'smote': lambda: SMOTE(random_state=42, k_neighbors=5),
        'undersample': lambda: RandomUnderSampler(random_state=42),
        'combine': lambda: SMOTEENN(random_state=42),
    }
    build_sampler = sampler_factories.get(method)

    if build_sampler is None:
        # Unknown method: pass the data through untouched.
        X_resampled, y_resampled = X, y
    else:
        X_resampled, y_resampled = build_sampler().fit_resample(X, y)

    print("\n處理後類別分布:")
    print(pd.Series(y_resampled).value_counts().sort_index())

    return X_resampled, y_resampled

# Split into train / test sets (80/20, stratified on the target).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("訓練集大小:", X_train.shape)
print("測試集大小:", X_test.shape)

# Rebalance the training set only; the test set keeps its original distribution.
# NOTE(review): handle_imbalanced_data references SMOTE / RandomUnderSampler /
# SMOTEENN, but no import for them is visible in this notebook — confirm they
# are imported (presumably from imbalanced-learn) before this cell runs.
X_train_balanced, y_train_balanced = handle_imbalanced_data(X_train, y_train, method='smote')


In [None]:
# ===========================
# Cell 9: GPU加速的模型訓練
# ===========================

# 1. XGBoost GPU版本
def train_xgboost_gpu(X_train, X_test, y_train, y_test):
    """Train a 4-class XGBoost booster with the GPU histogram method and score it.

    The native ``xgb.train`` API is used, so the number of trees is set by
    ``num_boost_round``. The scikit-learn style ``n_estimators`` key is not a
    native booster parameter and is ignored when passed in ``params``, so it
    is no longer passed.

    Args:
        X_train, y_train: Training features and zero-based class labels.
        X_test, y_test: Held-out features and labels used for evaluation.

    Returns:
        Tuple ``(model, accuracy, f1)`` where ``model`` is the trained
        ``Booster`` and ``f1`` is the weighted F1 score on the test set.
    """
    print("\n訓練 XGBoost (GPU)...")

    # DMatrix is XGBoost's own data container.
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    # Number of boosting rounds (trees per class) for the native API.
    num_boost_round = 100

    # NOTE(review): 'gpu_hist', 'predictor' and 'gpu_id' are the pre-2.0
    # spelling of the GPU options — confirm against the installed XGBoost
    # version before changing them.
    params = {
        'objective': 'multi:softprob',
        'num_class': 4,
        'max_depth': 6,
        'learning_rate': 0.1,
        'tree_method': 'gpu_hist',  # GPU-accelerated training
        'predictor': 'gpu_predictor',  # GPU prediction
        'gpu_id': 0,
        'random_state': 42
    }

    # Train and time the model.
    import time
    start_time = time.time()
    model = xgb.train(params, dtrain, num_boost_round=num_boost_round)
    train_time = time.time() - start_time

    # 'multi:softprob' yields one probability per class; take the arg-max.
    y_pred_proba = model.predict(dtest)
    y_pred = np.argmax(y_pred_proba, axis=1)

    # Evaluate on the held-out set.
    from sklearn.metrics import accuracy_score, f1_score
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    print(f"  訓練時間: {train_time:.2f} 秒")
    print(f"  準確率: {accuracy:.4f}")
    print(f"  F1分數: {f1:.4f}")
    print_gpu_memory()

    return model, accuracy, f1

# 2. LightGBM GPU版本
def train_lightgbm_gpu(X_train, X_test, y_train, y_test):
    """Fit a 4-class ``LGBMClassifier`` with ``device='gpu'`` and score it.

    Args:
        X_train, y_train: Training features and class labels.
        X_test, y_test: Held-out features and labels used for evaluation.

    Returns:
        Tuple ``(model, accuracy, f1)`` where ``f1`` is the weighted F1 score
        on the test set.
    """
    print("\n訓練 LightGBM (GPU)...")

    import time
    from sklearn.metrics import accuracy_score, f1_score

    # Same settings as before, passed as keyword arguments.
    classifier = lgb.LGBMClassifier(
        objective='multiclass',
        num_class=4,
        boosting_type='gbdt',
        metric='multi_logloss',
        device='gpu',
        gpu_platform_id=0,
        gpu_device_id=0,
        max_depth=6,
        learning_rate=0.1,
        n_estimators=100,
        random_state=42,
        verbose=-1,
    )

    # Time only the fit call.
    started = time.time()
    classifier.fit(X_train, y_train)
    elapsed = time.time() - started

    # Predict and evaluate on the held-out set.
    predictions = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions, average='weighted')

    print(f"  訓練時間: {elapsed:.2f} 秒")
    print(f"  準確率: {accuracy:.4f}")
    print(f"  F1分數: {f1:.4f}")
    print_gpu_memory()

    return classifier, accuracy, f1

# 3. CatBoost GPU版本
def train_catboost_gpu(X_train, X_test, y_train, y_test):
    """Fit a ``CatBoostClassifier`` (``task_type='GPU'``, MultiClass loss) and score it.

    Args:
        X_train, y_train: Training features and class labels.
        X_test, y_test: Held-out features and labels used for evaluation.

    Returns:
        Tuple ``(model, accuracy, f1)`` where ``f1`` is the weighted F1 score
        on the test set.
    """
    print("\n訓練 CatBoost (GPU)...")

    import time
    from sklearn.metrics import accuracy_score, f1_score

    settings = dict(
        iterations=100,
        learning_rate=0.1,
        depth=6,
        loss_function='MultiClass',
        task_type='GPU',
        devices='0',
        random_state=42,
        verbose=False,
    )
    classifier = CatBoostClassifier(**settings)

    # Time only the fit call.
    started = time.time()
    classifier.fit(X_train, y_train)
    elapsed = time.time() - started

    # Predict and evaluate on the held-out set.
    predictions = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions, average='weighted')

    print(f"  訓練時間: {elapsed:.2f} 秒")
    print(f"  準確率: {accuracy:.4f}")
    print(f"  F1分數: {f1:.4f}")
    print_gpu_memory()

    return classifier, accuracy, f1

# 4. PyTorch神經網路 (GPU)
class AccidentSeverityNN(nn.Module):
    """Fully connected classifier for accident severity.

    Each hidden stage is ``Linear -> BatchNorm1d -> ReLU -> Dropout``; a final
    ``Linear`` layer maps to ``num_classes`` raw scores (no softmax is applied
    here). The whole stack is stored as ``self.model``.
    """

    def __init__(self, input_size, hidden_sizes=[256, 128, 64], num_classes=4, dropout_rate=0.3):
        super().__init__()

        # Layer widths from the input through every hidden stage.
        widths = [input_size, *hidden_sizes]

        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.BatchNorm1d(fan_out))
            stack.append(nn.ReLU())
            stack.append(nn.Dropout(dropout_rate))

        # Output head on top of the last hidden width.
        stack.append(nn.Linear(widths[-1], num_classes))

        self.model = nn.Sequential(*stack)

    def forward(self, x):
        """Return the raw class scores for the batch ``x``."""
        return self.model(x)

def train_pytorch_gpu(X_train, X_test, y_train, y_test, epochs=50, batch_size=1024):
    """Train ``AccidentSeverityNN`` on standardized features and score it.

    Features are standardized with a ``StandardScaler`` fitted on the
    training data only. Training uses Adam (lr 0.001), cross-entropy loss and
    a ``ReduceLROnPlateau`` scheduler driven by the mean training loss.

    Args:
        X_train, y_train: Training features and zero-based class labels
            (arrays or pandas objects).
        X_test, y_test: Held-out features and labels used for evaluation.
        epochs: Number of passes over the training data.
        batch_size: Mini-batch size for the training loader.

    Returns:
        Tuple ``(model, scaler, accuracy, f1)``: the trained network, the
        fitted scaler, and accuracy / weighted F1 on the test set.
    """
    print("\n訓練 PyTorch Neural Network (GPU)...")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"使用設備: {device}")

    # Standardize features; statistics come from the training set only.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Build tensors from NumPy arrays. Passing a pandas Series straight to a
    # tensor constructor makes torch index it like a sequence (series[0],
    # series[1], ...), which is label-based and fails or misorders when the
    # index is not 0..n-1, as it is after a shuffled train/test split.
    X_train_tensor = torch.as_tensor(X_train_scaled, dtype=torch.float32).to(device)
    y_train_tensor = torch.as_tensor(np.asarray(y_train), dtype=torch.long).to(device)
    X_test_tensor = torch.as_tensor(X_test_scaled, dtype=torch.float32).to(device)

    # Mini-batch loader over the training tensors.
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Model sized to the number of input features.
    input_size = X_train.shape[1]
    model = AccidentSeverityNN(input_size).to(device)

    # Loss, optimizer and LR scheduler.
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5)

    # Training loop.
    import time
    start_time = time.time()

    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Step the scheduler every epoch: its patience is counted in calls to
        # step(), so stepping only on logging epochs would stretch
        # patience=5 to 50+ epochs and the LR would never drop.
        avg_loss = total_loss / max(len(train_loader), 1)
        scheduler.step(avg_loss)

        # Log every 10th epoch.
        if (epoch + 1) % 10 == 0:
            print(f"  Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

    train_time = time.time() - start_time

    # Evaluate on the held-out set.
    model.eval()
    with torch.no_grad():
        outputs = model(X_test_tensor)
        _, predicted = torch.max(outputs, 1)
        y_pred = predicted.cpu().numpy()

    from sklearn.metrics import accuracy_score, f1_score
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    print(f"  訓練時間: {train_time:.2f} 秒")
    print(f"  準確率: {accuracy:.4f}")
    print(f"  F1分數: {f1:.4f}")
    print_gpu_memory()

    return model, scaler, accuracy, f1

In [None]:
# ===========================
# Cell 10: RAPIDS GPU加速的隨機森林
# ===========================
def train_rapids_rf(X_train, X_test, y_train, y_test):
    """Fit a cuML random forest on the GPU and score it.

    Returns ``(None, 0, 0)`` immediately when ``USE_RAPIDS`` is false.

    Args:
        X_train, y_train: Training features and class labels.
        X_test, y_test: Held-out features and labels used for evaluation.

    Returns:
        Tuple ``(model, accuracy, f1)`` where ``f1`` is the weighted F1 score
        on the test set.
    """
    if not USE_RAPIDS:
        print("RAPIDS未安裝，跳過此模型")
        return None, 0, 0

    print("\n訓練 Random Forest (RAPIDS GPU)...")

    import time
    from sklearn.metrics import accuracy_score, f1_score

    # Convert the inputs to CuPy arrays.
    train_features = cp.asarray(X_train)
    train_labels = cp.asarray(y_train)
    test_features = cp.asarray(X_test)

    forest = cuRF(n_estimators=100, max_depth=10, random_state=42)

    # Time only the fit call.
    started = time.time()
    forest.fit(train_features, train_labels)
    elapsed = time.time() - started

    # Predict with cuML, then convert back to NumPy for the sklearn metrics.
    predictions = cp.asnumpy(forest.predict(test_features))

    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions, average='weighted')

    print(f"  訓練時間: {elapsed:.2f} 秒")
    print(f"  準確率: {accuracy:.4f}")
    print(f"  F1分數: {f1:.4f}")

    return forest, accuracy, f1

In [None]:
# ===========================
# Cell 11: 主要執行函數
# ===========================
def main_gpu_training(X_train, X_test, y_train, y_test):
    """Train every GPU model in turn and print a comparison table.

    Runs XGBoost, LightGBM, CatBoost and the PyTorch network, plus the RAPIDS
    random forest when ``USE_RAPIDS`` is true. Only the metrics are kept; the
    trained model objects are discarded.

    Args:
        X_train, X_test, y_train, y_test: Forwarded unchanged to each trainer.

    Returns:
        Dict mapping model label to ``{'accuracy': ..., 'f1': ...}``, in
        training order.
    """
    results = {}
    split = (X_train, X_test, y_train, y_test)

    def _record(label, accuracy, f1):
        # Store one model's scores under its display label.
        results[label] = {'accuracy': accuracy, 'f1': f1}

    # 1. XGBoost GPU
    _model, acc, f1 = train_xgboost_gpu(*split)
    _record('XGBoost_GPU', acc, f1)

    # 2. LightGBM GPU
    _model, acc, f1 = train_lightgbm_gpu(*split)
    _record('LightGBM_GPU', acc, f1)

    # 3. CatBoost GPU
    _model, acc, f1 = train_catboost_gpu(*split)
    _record('CatBoost_GPU', acc, f1)

    # 4. PyTorch network (also returns its scaler)
    _model, _scaler, acc, f1 = train_pytorch_gpu(*split)
    _record('PyTorch_NN', acc, f1)

    # 5. RAPIDS random forest, only when available
    if USE_RAPIDS:
        _model, acc, f1 = train_rapids_rf(*split)
        _record('RAPIDS_RF', acc, f1)

    # Comparison table.
    rule = "=" * 50
    print("\n" + rule)
    print("模型性能比較（GPU加速版）:")
    print(rule)

    for label, scores in results.items():
        print(f"{label:15} - 準確率: {scores['accuracy']:.4f}, F1: {scores['f1']:.4f}")

    # Highest accuracy wins; ties go to the model trained first.
    best_label = max(results, key=lambda label: results[label]['accuracy'])
    print(f"\n最佳模型: {best_label} (準確率: {results[best_label]['accuracy']:.4f})")

    return results

In [None]:
# ===========================
# Cell 12: 特徵重要性分析
# ===========================
def analyze_feature_importance(model, feature_names, top_n=20):
    """Rank, print and plot the model's feature importances.

    Args:
        model: Any object; only its ``feature_importances_`` attribute is used.
        feature_names: Names aligned with ``model.feature_importances_``.
        top_n: How many of the highest-ranked features to print and plot.

    Returns:
        A DataFrame with columns ``feature`` and ``importance`` sorted by
        importance descending, or ``None`` when the model has no
        ``feature_importances_`` attribute.
    """
    print("\n分析特徵重要性...")

    if not hasattr(model, 'feature_importances_'):
        print("此模型不支援特徵重要性分析")
        return None

    ranking = pd.DataFrame({
        'feature': feature_names,
        'importance': model.feature_importances_
    })
    ranking = ranking.sort_values('importance', ascending=False)

    # Print the leading features.
    print(f"\nTop {top_n} 重要特徵:")
    print(ranking.head(top_n))

    # Horizontal bars, reversed so the most important feature is drawn on top.
    plt.figure(figsize=(10, 8))
    plt.barh(ranking.head(top_n)['feature'][::-1],
             ranking.head(top_n)['importance'][::-1])
    plt.xlabel('Importance')
    plt.title(f'Top {top_n} Feature Importances')
    plt.tight_layout()
    plt.show()

    return ranking

# If the best model is a tree model, analyse its feature importances.
# NOTE(review): `best_model_name` and `best_model` are not assigned anywhere in
# the visible notebook (main_gpu_training is defined but never called and only
# returns metrics), and the names listed here do not match the result keys it
# produces ('XGBoost_GPU', 'LightGBM_GPU', ...). Confirm where these come from;
# on a fresh kernel this cell would raise NameError.
if best_model_name in ['Random Forest', 'Gradient Boosting']:
    feature_importance = analyze_feature_importance(best_model, selected_features)

In [None]:
# ===========================
# Cell 13: 超參數調整（選擇性執行）
# ===========================
def hyperparameter_tuning(X_train, y_train, model_type='rf'):
    """Grid-search hyperparameters with stratified 5-fold cross-validation.

    Args:
        X_train: Training features.
        y_train: Training labels.
        model_type: ``'rf'`` for ``RandomForestClassifier`` or ``'gb'`` for
            ``GradientBoostingClassifier``.

    Returns:
        The best estimator found by ``GridSearchCV`` (scored by weighted F1).

    Raises:
        ValueError: If ``model_type`` is neither ``'rf'`` nor ``'gb'``.
    """
    print(f"\n進行{model_type}超參數調整（這可能需要一些時間）...")

    # Fail early and clearly instead of hitting an unrelated error later.
    if model_type not in ('rf', 'gb'):
        raise ValueError(
            f"Unsupported model_type {model_type!r}; expected 'rf' or 'gb'"
        )

    # Imported here because the notebook's import cell does not bring in the
    # estimators or the search utilities used by this function.
    from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
    from sklearn.model_selection import GridSearchCV, StratifiedKFold

    if model_type == 'rf':
        # Random Forest parameter grid.
        param_grid = {
            'n_estimators': [50, 100, 200],
            'max_depth': [5, 10, 15, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }
        model = RandomForestClassifier(random_state=42, n_jobs=-1)
    else:
        # Gradient Boosting parameter grid.
        param_grid = {
            'n_estimators': [50, 100, 150],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 5, 7],
            'min_samples_split': [2, 5, 10]
        }
        model = GradientBoostingClassifier(random_state=42)

    # Stratified k-fold keeps the class proportions in every fold.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    grid_search = GridSearchCV(
        model, param_grid,
        cv=cv,
        scoring='f1_weighted',
        verbose=1,
        n_jobs=-1
    )

    grid_search.fit(X_train, y_train)

    print(f"\n最佳參數: {grid_search.best_params_}")
    print(f"最佳交叉驗證分數: {grid_search.best_score_:.4f}")

    return grid_search.best_estimator_

# 如果需要，可以執行超參數調整
# best_rf_tuned = hyperparameter_tuning(X_train_balanced, y_train_balanced, model_type='rf')

In [None]:
# ===========================
# Cell 14: 儲存模型
# ===========================
import joblib

def save_model(model, scaler, label_encoders, feature_names, output_dir='./model_output/'):
    """Persist the model and its preprocessing artefacts with joblib.

    Writes ``best_model.pkl``, ``scaler.pkl``, ``label_encoders.pkl`` and
    ``feature_names.pkl`` into ``output_dir``, creating the directory when
    needed. Paths are built with ``os.path.join`` so ``output_dir`` works
    with or without a trailing separator.

    Args:
        model: Trained model object.
        scaler: Fitted scaler.
        label_encoders: Mapping of column name to fitted encoder.
        feature_names: Ordered list of feature names.
        output_dir: Target directory.
    """
    import os

    # Create the output directory if it does not exist yet.
    os.makedirs(output_dir, exist_ok=True)

    artefacts = {
        'best_model.pkl': model,
        'scaler.pkl': scaler,
        'label_encoders.pkl': label_encoders,
        'feature_names.pkl': feature_names,
    }
    for filename, payload in artefacts.items():
        joblib.dump(payload, os.path.join(output_dir, filename))

    print(f"模型已儲存到 {output_dir}")

# Save the best model together with its scaler, encoders and feature list.
# NOTE(review): `best_model` and `scaler` are not assigned at notebook level in
# the visible cells (the training functions return them but are never called
# here) — confirm they exist before this cell runs, otherwise it raises
# NameError on a fresh kernel.
save_model(best_model, scaler, label_encoders, selected_features)

In [None]:
# ===========================
# Cell 15: 預測函數
# ===========================
def predict_severity(model, scaler, label_encoders, feature_names, input_data):
    """Predict the severity level for a single observation.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        scaler: Fitted scaler; applied only when ``model`` is a scikit-learn
            ``LogisticRegression`` or ``MLPClassifier``.
        label_encoders: Accepted for interface compatibility; not used here,
            so ``input_data`` must already contain encoded values.
        feature_names: Ordered feature names the model was trained on.
        input_data: Mapping of feature name to value.

    Returns:
        ``None`` when a required feature is missing from ``input_data``;
        otherwise ``(severity_level, prediction_proba)`` where
        ``severity_level`` is the predicted class index plus one.
    """
    # Every required feature must be present; stop at the first missing one.
    for feature in feature_names:
        if feature not in input_data:
            print(f"警告: 缺少特徵 {feature}")
            return None

    # Imported here because the notebook's import cell does not bring these
    # classes in; referencing them unimported raised NameError.
    from sklearn.linear_model import LogisticRegression
    from sklearn.neural_network import MLPClassifier

    # Single-row frame with columns in training order.
    X_new = pd.DataFrame([input_data])[feature_names]

    # Scale-sensitive models get standardized input.
    if isinstance(model, (LogisticRegression, MLPClassifier)):
        X_new = scaler.transform(X_new)

    prediction = model.predict(X_new)[0]
    prediction_proba = model.predict_proba(X_new)[0]

    # Class indices are zero-based; severity levels start at 1.
    severity_level = prediction + 1

    print(f"\n預測結果:")
    print(f"嚴重度級別: {severity_level}")
    print(f"各級別機率:")
    for i, prob in enumerate(prediction_proba):
        print(f"  級別 {i+1}: {prob:.4f}")

    return severity_level, prediction_proba

# Example prediction (adjust to the actual feature set before running)
# example_input = {
#     'Temperature(F)': 70.0,
#     'Humidity(%)': 80.0,
#     # ... remaining features
# }
# predict_severity(best_model, scaler, label_encoders, selected_features, example_input)

# Closing messages shown when the notebook finishes.
print("\n模型訓練完成！")
print("您可以使用 predict_severity 函數進行新的預測。")