In [1]:
# ===========================
# Cell 1: 導入套件和設定
# ===========================
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import gc  # 垃圾回收
warnings.filterwarnings('ignore')

# 基本套件
import os
import time
import joblib
import json
from collections import Counter

# Scikit-learn
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, f1_score, classification_report, 
    confusion_matrix, balanced_accuracy_score, 
    cohen_kappa_score, make_scorer
)
from sklearn.utils.class_weight import compute_class_weight

# 處理不平衡資料
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.ensemble import BalancedRandomForestClassifier

# Boosting模型
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler
import torch.nn.functional as F 

print("環境檢查:")
print(f"PyTorch: {torch.__version__}")
print(f"XGBoost: {xgb.__version__}")
print(f"LightGBM: {lgb.__version__}")
print(f"CUDA可用: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")


環境檢查:
PyTorch: 2.7.0+cu118
XGBoost: 3.0.2
LightGBM: 4.3.0
CUDA可用: True
GPU: NVIDIA GeForce RTX 3090


In [2]:
# ===========================
# Cell 2: 記憶體優化函數
# ===========================

def reduce_memory_usage(df, verbose=True):
    """
    通過改變數據類型來減少DataFrame的記憶體使用
    參考自Kaggle的記憶體優化技術
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print(f'記憶體使用減少了 {100 * (start_mem - end_mem) / start_mem:.1f}%')
        print(f'{start_mem:.2f} MB --> {end_mem:.2f} MB')
    
    return df

def clean_memory():
    """清理記憶體"""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

In [3]:
# ===========================
# Cell 3: 載入資料（優化版）
# ===========================

def load_data_optimized(file_path, sample_frac=None, chunksize=None):
    """
    優化的資料載入，支援採樣和分塊讀取
    """
    print(f"載入資料: {file_path}")
    
    # 先讀取一小部分來了解資料
    sample_df = pd.read_csv(file_path, nrows=5)
    print("資料欄位預覽:")
    print(sample_df.columns.tolist())
    
    # 定義需要的欄位（排除不需要的文字欄位以節省記憶體）
    # 根據其他Kaggle notebook的經驗，這些是最重要的欄位
    important_cols = [
        'Severity', 'Start_Time', 'End_Time', 'Start_Lat', 'Start_Lng',
        'Distance(mi)', 'Temperature(F)', 'Humidity(%)', 'Pressure(in)',
        'Visibility(mi)', 'Wind_Speed(mph)', 'Precipitation(in)',
        'Weather_Condition', 'Amenity', 'Bump', 'Crossing', 'Give_Way',
        'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station', 'Stop',
        'Traffic_Calming', 'Traffic_Signal', 'Sunrise_Sunset', 'State',
        'Side', 'Weather_Timestamp'
    ]
    
    # 過濾存在的欄位
    existing_cols = [col for col in important_cols if col in sample_df.columns]
    
    # 定義數據類型以減少記憶體
    dtype_dict = {
        'Severity': 'int8',
        'Distance(mi)': 'float32',
        'Temperature(F)': 'float32',
        'Humidity(%)': 'float32',
        'Pressure(in)': 'float32',
        'Visibility(mi)': 'float32',
        'Wind_Speed(mph)': 'float32',
        'Precipitation(in)': 'float32',
        'Amenity': 'bool',
        'Bump': 'bool',
        'Crossing': 'bool',
        'Give_Way': 'bool',
        'Junction': 'bool',
        'No_Exit': 'bool',
        'Railway': 'bool',
        'Roundabout': 'bool',
        'Station': 'bool',
        'Stop': 'bool',
        'Traffic_Calming': 'bool',
        'Traffic_Signal': 'bool'
    }
    
    # 載入資料
    if sample_frac:
        # 隨機採樣
        print(f"載入 {sample_frac*100}% 的資料...")
        df = pd.read_csv(file_path, usecols=existing_cols, dtype=dtype_dict)
        df = df.sample(frac=sample_frac, random_state=42)
    elif chunksize:
        # 分塊載入
        print(f"分塊載入，每塊 {chunksize} 行...")
        chunks = []
        for chunk in pd.read_csv(file_path, usecols=existing_cols, 
                                dtype=dtype_dict, chunksize=chunksize):
            chunks.append(chunk)
            if len(chunks) * chunksize >= 1000000:  # 限制在100萬行
                break
        df = pd.concat(chunks, ignore_index=True)
    else:
        # 完整載入
        df = pd.read_csv(file_path, usecols=existing_cols, dtype=dtype_dict)
    
    print(f"載入資料大小: {df.shape}")
    print(f"記憶體使用: {df.memory_usage().sum() / 1024**2:.2f} MB")
    
    # 顯示目標變數分布
    print("\n目標變數分布:")
    severity_counts = df['Severity'].value_counts().sort_index()
    for sev, count in severity_counts.items():
        print(f"Severity {sev}: {count:,} ({count/len(df)*100:.2f}%)")
    
    return df

# 執行載入（建議先用小樣本測試）
file_path = 'us-accidents/US_Accidents_March23.csv'

# 選項1: 使用部分資料（推薦用於測試）
# df = load_data_optimized(file_path, sample_frac=0.1)  # 10%資料

# 選項2: 分塊載入
# df = load_data_optimized(file_path, chunksize=500000)  # 每次50萬行

# 選項3: 完整載入（需要大量記憶體）
df = load_data_optimized(file_path)


載入資料: us-accidents/US_Accidents_March23.csv
資料欄位預覽:
['ID', 'Source', 'Severity', 'Start_Time', 'End_Time', 'Start_Lat', 'Start_Lng', 'End_Lat', 'End_Lng', 'Distance(mi)', 'Description', 'Street', 'City', 'County', 'State', 'Zipcode', 'Country', 'Timezone', 'Airport_Code', 'Weather_Timestamp', 'Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Direction', 'Wind_Speed(mph)', 'Precipitation(in)', 'Weather_Condition', 'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop', 'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight']
載入資料大小: (7728394, 28)
記憶體使用: 773.89 MB

目標變數分布:
Severity 1: 67,366 (0.87%)
Severity 2: 6,156,981 (79.67%)
Severity 3: 1,299,337 (16.81%)
Severity 4: 204,710 (2.65%)


In [4]:
# ===========================
# Cell 4: 日期時間處理（優化版）
# ===========================

def process_datetime_features(df):
    """處理日期時間特徵"""
    print("\n處理日期時間特徵...")
    
    # 轉換日期時間
    df['Start_Time'] = pd.to_datetime(df['Start_Time'], errors='coerce')
    df['End_Time'] = pd.to_datetime(df['End_Time'], errors='coerce')
    
    # 計算持續時間
    df['Duration_minutes'] = (df['End_Time'] - df['Start_Time']).dt.total_seconds() / 60
    
    # 過濾異常值（使用更寬鬆的範圍）
    df = df[(df['Duration_minutes'] > 0) & (df['Duration_minutes'] < 1440*7)]  # 小於7天
    
    # 移除日期時間為空的記錄
    df = df.dropna(subset=['Start_Time'])
    
    # 提取時間特徵
    df['Hour'] = df['Start_Time'].dt.hour.astype('int8')
    df['DayOfWeek'] = df['Start_Time'].dt.dayofweek.astype('int8')
    df['Month'] = df['Start_Time'].dt.month.astype('int8')
    df['Year'] = df['Start_Time'].dt.year.astype('int16')
    
    # 衍生特徵
    df['IsWeekend'] = (df['DayOfWeek'] >= 5).astype('int8')
    df['IsRushHour'] = df['Hour'].apply(
        lambda x: 1 if (6 <= x <= 9) or (16 <= x <= 19) else 0
    ).astype('int8')
    
    # 時段分類
    df['TimeOfDay'] = pd.cut(df['Hour'], 
                            bins=[-1, 6, 12, 18, 24], 
                            labels=[0, 1, 2, 3]).astype('int8')  # 轉換為數值
    
    # 季節
    df['Season'] = pd.cut(df['Month'], 
                         bins=[0, 3, 6, 9, 12], 
                         labels=[0, 1, 2, 3]).astype('int8')  # 轉換為數值
    
    # 刪除原始時間欄位以節省記憶體
    df = df.drop(['Start_Time', 'End_Time', 'Weather_Timestamp'], axis=1, errors='ignore')
    
    print(f"處理後大小: {df.shape}")
    clean_memory()
    
    return df

df = process_datetime_features(df)


處理日期時間特徵...
處理後大小: (7721568, 34)


In [5]:
# ===========================
# Cell 5: 天氣特徵處理
# ===========================

def process_weather_features(df):
    """處理天氣相關特徵"""
    print("\n處理天氣特徵...")
    
    if 'Weather_Condition' in df.columns:
        # 簡化天氣分類
        def categorize_weather(condition):
            if pd.isna(condition):
                return 0  # Unknown
            condition = str(condition).lower()
            if any(word in condition for word in ['clear', 'fair']):
                return 1  # Clear
            elif any(word in condition for word in ['cloud', 'overcast']):
                return 2  # Cloudy
            elif any(word in condition for word in ['rain', 'drizzle']):
                return 3  # Rain
            elif any(word in condition for word in ['snow', 'sleet']):
                return 4  # Snow
            elif any(word in condition for word in ['fog', 'mist']):
                return 5  # Fog
            elif any(word in condition for word in ['storm', 'thunder']):
                return 6  # Storm
            else:
                return 7  # Other
        
        df['Weather_Category'] = df['Weather_Condition'].apply(categorize_weather).astype('int8')
        df = df.drop('Weather_Condition', axis=1)
    
    # 處理其他天氣數值特徵的缺失值
    weather_numeric_cols = ['Temperature(F)', 'Humidity(%)', 'Pressure(in)', 
                           'Visibility(mi)', 'Wind_Speed(mph)', 'Precipitation(in)']
    
    for col in weather_numeric_cols:
        if col in df.columns:
            # 使用中位數填充
            median_val = df[col].median()
            df[col] = df[col].fillna(median_val)
    
    clean_memory()
    return df

df = process_weather_features(df)



處理天氣特徵...


In [6]:
# ===========================
# Cell 6: 處理缺失值和編碼類別變數
# ===========================

def handle_missing_and_encode(df):
    """處理缺失值並編碼類別變數"""
    print("\n處理缺失值和編碼...")
    
    # 刪除缺失值過多的欄位
    missing_pct = df.isnull().sum() / len(df)
    high_missing_cols = missing_pct[missing_pct > 0.5].index.tolist()
    
    # 保留Severity
    if 'Severity' in high_missing_cols:
        high_missing_cols.remove('Severity')
    
    df = df.drop(columns=high_missing_cols, errors='ignore')
    print(f"刪除高缺失率欄位: {len(high_missing_cols)}")
    
    # 對類別變數進行標籤編碼
    categorical_cols = ['State', 'Side', 'Sunrise_Sunset']
    label_encoders = {}
    
    for col in categorical_cols:
        if col in df.columns:
            le = LabelEncoder()
            # 填充缺失值
            df[col] = df[col].fillna('Unknown')
            # 編碼
            df[col] = le.fit_transform(df[col].astype(str))
            label_encoders[col] = le
    
    # 填充數值型缺失值
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        if col != 'Severity':
            df[col] = df[col].fillna(df[col].median())
    
    # 確保布林型欄位是整數
    bool_cols = df.select_dtypes(include=['bool']).columns
    for col in bool_cols:
        df[col] = df[col].astype('int8')
    
    print(f"處理後資料大小: {df.shape}")
    print(f"剩餘缺失值: {df.isnull().sum().sum()}")
    
    clean_memory()
    return df, label_encoders

df, label_encoders = handle_missing_and_encode(df)



處理缺失值和編碼...
刪除高缺失率欄位: 0
處理後資料大小: (7721568, 34)
剩餘缺失值: 0


In [7]:
# ===========================
# Cell 7: 特徵選擇和準備最終數據
# ===========================

def prepare_final_data(df):
    """準備最終的訓練數據"""
    print("\n準備最終數據...")
    
    # 刪除任何仍有缺失值的行
    df = df.dropna()
    
    # 確保Severity是正確的值
    df = df[df['Severity'].isin([1, 2, 3, 4])]
    
    # 根據Kaggle上的建議，考慮合併Severity 1和2
    # 因為Severity 1的樣本太少
    print("\n原始類別分布:")
    print(df['Severity'].value_counts().sort_index())
    
    # 選項：合併類別（可選）
    # df['Severity'] = df['Severity'].replace({1: 2})
    
    # 分離特徵和目標
    feature_cols = [col for col in df.columns if col != 'Severity']
    X = df[feature_cols].values
    y = df['Severity'].values - 1  # 轉換為0-3
    
    print(f"\n最終數據大小: X={X.shape}, y={y.shape}")
    print("最終類別分布:")
    unique, counts = np.unique(y, return_counts=True)
    for cls, cnt in zip(unique, counts):
        print(f"  類別 {cls} (Severity {cls+1}): {cnt:,} ({cnt/len(y)*100:.2f}%)")
    
    return X, y, feature_cols

X, y, feature_names = prepare_final_data(df)


準備最終數據...

原始類別分布:
1      67365
2    6151445
3    1299251
4     203507
Name: Severity, dtype: int64

最終數據大小: X=(7721568, 33), y=(7721568,)
最終類別分布:
  類別 0 (Severity 1): 67,365 (0.87%)
  類別 1 (Severity 2): 6,151,445 (79.67%)
  類別 2 (Severity 3): 1,299,251 (16.83%)
  類別 3 (Severity 4): 203,507 (2.64%)


In [8]:
# ===========================
# Cell 8: 資料分割
# ===========================

# 分層分割
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"訓練集: {X_train.shape}")
print(f"測試集: {X_test.shape}")

# 計算類別權重
class_weights = compute_class_weight('balanced', 
                                   classes=np.unique(y_train), 
                                   y=y_train)
class_weight_dict = dict(enumerate(class_weights))

print("\n類別權重:")
for cls, weight in class_weight_dict.items():
    print(f"  類別 {cls}: {weight:.4f}")


訓練集: (6177254, 33)
測試集: (1544314, 33)

類別權重:
  類別 0: 28.6557
  類別 1: 0.3138
  類別 2: 1.4858
  類別 3: 9.4857


In [9]:
# ===========================
# Cell 9: 處理不平衡 - 混合採樣策略
# ===========================

def apply_mixed_sampling(X_train, y_train, strategy='mixed'):
    """
    應用混合採樣策略
    參考Kaggle最佳實踐：結合過採樣和欠採樣
    """
    print(f"\n應用採樣策略: {strategy}")
    
    if strategy == 'none':
        return X_train, y_train
    
    # 計算各類別數量
    unique, counts = np.unique(y_train, return_counts=True)
    class_counts = dict(zip(unique, counts))
    print("原始分布:", class_counts)
    
    if strategy == 'mixed':
        # 混合策略：對多數類欠採樣，對少數類過採樣
        # 目標：讓所有類別接近中位數
        median_count = int(np.median(counts))
        target_count = int(median_count * 1.5)  # 目標數量設為中位數的1.5倍
        
        # 第一步：欠採樣 - 只對超過目標數量的類別進行欠採樣
        undersample_strategy = {}
        for cls, cnt in class_counts.items():
            if cnt > target_count:
                undersample_strategy[cls] = target_count
            else:
                undersample_strategy[cls] = cnt  # 保持原樣
        
        if len(undersample_strategy) > 0 and any(cnt < class_counts[cls] for cls, cnt in undersample_strategy.items()):
            rus = RandomUnderSampler(sampling_strategy=undersample_strategy, random_state=42)
            X_temp, y_temp = rus.fit_resample(X_train, y_train)
        else:
            X_temp, y_temp = X_train, y_train
        
        # 第二步：過採樣 - 只對少於目標數量的類別進行過採樣
        temp_unique, temp_counts = np.unique(y_temp, return_counts=True)
        temp_class_counts = dict(zip(temp_unique, temp_counts))
        
        oversample_strategy = {}
        for cls, cnt in temp_class_counts.items():
            if cnt < target_count:
                oversample_strategy[cls] = target_count
            else:
                oversample_strategy[cls] = cnt  # 保持原樣
        
        if len(oversample_strategy) > 0 and any(cnt > temp_class_counts[cls] for cls, cnt in oversample_strategy.items()):
            ros = RandomOverSampler(sampling_strategy=oversample_strategy, random_state=42)
            X_resampled, y_resampled = ros.fit_resample(X_temp, y_temp)
        else:
            X_resampled, y_resampled = X_temp, y_temp
            
    elif strategy == 'smote':
        # SMOTE策略：只過採樣到最多類別的50%
        max_count = max(counts)
        target_count = int(max_count * 0.5)
        
        # 確保目標數量不小於當前數量
        sampling_strategy = {}
        for cls, cnt in class_counts.items():
            if cnt < target_count:
                sampling_strategy[cls] = target_count
            else:
                sampling_strategy[cls] = cnt
        
        smote = SMOTE(sampling_strategy=sampling_strategy, k_neighbors=5, random_state=42)
        X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
    
    elif strategy == 'undersample_only':
        # 只欠採樣到最少類別的2倍
        min_count = min(counts)
        target_count = min_count * 2
        
        sampling_strategy = {}
        for cls, cnt in class_counts.items():
            sampling_strategy[cls] = min(cnt, target_count)
        
        rus = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=42)
        X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
    
    # 顯示新分布
    unique_new, counts_new = np.unique(y_resampled, return_counts=True)
    new_distribution = dict(zip(unique_new, counts_new))
    print("採樣後分布:", new_distribution)
    
    # 顯示變化
    print("\n採樣變化:")
    for cls in range(4):
        original = class_counts.get(cls, 0)
        new = new_distribution.get(cls, 0)
        change = ((new - original) / original * 100) if original > 0 else 0
        print(f"  類別 {cls}: {original:,} → {new:,} ({change:+.1f}%)")
    
    return X_resampled, y_resampled

# 應用混合採樣
# 可以嘗試不同策略
X_train_balanced, y_train_balanced = apply_mixed_sampling(X_train, y_train, 'mixed')

# 如果混合策略還是有問題，可以嘗試其他策略：
# X_train_balanced, y_train_balanced = apply_mixed_sampling(X_train, y_train, 'undersample_only')
# 或者不進行採樣：
# X_train_balanced, y_train_balanced = X_train, y_train



應用採樣策略: mixed
原始分布: {0: 53892, 1: 4921156, 2: 1039401, 3: 162805}
採樣後分布: {0: 901654, 1: 901654, 2: 901654, 3: 901654}

採樣變化:
  類別 0: 53,892 → 901,654 (+1573.1%)
  類別 1: 4,921,156 → 901,654 (-81.7%)
  類別 2: 1,039,401 → 901,654 (-13.3%)
  類別 3: 162,805 → 901,654 (+453.8%)


In [10]:
# ===========================
# Cell 10: LightGBM模型（優化版）
# ===========================

def train_lightgbm_optimized(X_train, X_test, y_train, y_test, class_weights):
    """訓練優化的LightGBM模型"""
    print("\n訓練 LightGBM (優化版)...")
    
    # 創建數據集
    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
    
    # 參數設置（基於Kaggle最佳實踐）
    params = {
        'objective': 'multiclass',
        'num_class': 4,
        'metric': 'multi_logloss',
        'boosting_type': 'gbdt',
        'num_leaves': 127,
        'max_depth': -1,
        'learning_rate': 0.05,
        'n_estimators': 1000,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'reg_alpha': 0.1,
        'reg_lambda': 0.1,
        'min_child_samples': 20,
        'min_split_gain': 0.02,
        'class_weight': 'balanced',
        'device': 'gpu' if torch.cuda.is_available() else 'cpu',
        'gpu_platform_id': 0,
        'gpu_device_id': 0,
        'verbose': -1,
        'random_state': 42,
        'n_jobs': -1
    }
    
    # 訓練
    start_time = time.time()
    
    model = lgb.train(
        params,
        train_data,
        valid_sets=[valid_data],
        num_boost_round=1000,
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(period=100)
        ]
    )
    
    train_time = time.time() - start_time
    
    # 預測
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)
    y_pred = np.argmax(y_pred, axis=1)
    
    # 評估
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    balanced_acc = balanced_accuracy_score(y_test, y_pred)
    
    print(f"\n訓練時間: {train_time:.2f} 秒")
    print(f"最佳迭代次數: {model.best_iteration}")
    print(f"準確率: {accuracy:.4f}")
    print(f"F1分數: {f1:.4f}")
    print(f"平衡準確率: {balanced_acc:.4f}")
    
    # 詳細報告
    print("\n分類報告:")
    print(classification_report(y_test, y_pred, 
                              target_names=[f'Severity {i+1}' for i in range(4)]))
    
    return model, accuracy, f1, balanced_acc

# 訓練模型
lgb_model, lgb_acc, lgb_f1, lgb_balanced_acc = train_lightgbm_optimized(
    X_train_balanced, X_test, y_train_balanced, y_test, class_weight_dict
)


訓練 LightGBM (優化版)...
Training until validation scores don't improve for 50 rounds
[100]	valid_0's multi_logloss: 0.651958
[200]	valid_0's multi_logloss: 0.604593
[300]	valid_0's multi_logloss: 0.584771
[400]	valid_0's multi_logloss: 0.570723
[500]	valid_0's multi_logloss: 0.558812
[600]	valid_0's multi_logloss: 0.548499
[700]	valid_0's multi_logloss: 0.539978
[800]	valid_0's multi_logloss: 0.53255
[900]	valid_0's multi_logloss: 0.52578
[1000]	valid_0's multi_logloss: 0.520255
Did not meet early stopping. Best iteration is:
[1000]	valid_0's multi_logloss: 0.520255

訓練時間: 2173.86 秒
最佳迭代次數: 1000
準確率: 0.7711
F1分數: 0.8043
平衡準確率: 0.8533

分類報告:
              precision    recall  f1-score   support

  Severity 1       0.39      0.96      0.56     13473
  Severity 2       0.98      0.75      0.85   1230289
  Severity 3       0.57      0.85      0.68    259850
  Severity 4       0.19      0.86      0.32     40702

    accuracy                           0.77   1544314
   macro avg       0.53    

In [11]:
# ===========================
# Cell 11: XGBoost模型（穩定版）
# ===========================

def train_xgboost_stable(X_train, X_test, y_train, y_test, use_sample_weight=True):
    """穩定版XGBoost"""
    print("\n訓練 XGBoost (穩定版)...")
    
    # 使用原始的類別權重，但不要太極端
    if use_sample_weight:
        # 溫和的類別權重
        unique, counts = np.unique(y_train, return_counts=True)
        weight_dict = {}
        max_count = max(counts)
        for cls, count in zip(unique, counts):
            # 權重不超過10倍
            weight_dict[cls] = min(max_count / count, 10.0)
        
        sample_weights = np.array([weight_dict[y] for y in y_train])
    else:
        sample_weights = None
    
    # XGBoost參數
    params = {
        'objective': 'multi:softprob',
        'num_class': 4,
        'max_depth': 6,
        'learning_rate': 0.1,
        'n_estimators': 300,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'min_child_weight': 5,  # 增加以防止過擬合
        'gamma': 0.1,
        'reg_alpha': 0.1,
        'reg_lambda': 1,
        'tree_method': 'gpu_hist' if torch.cuda.is_available() else 'hist',
        'random_state': 42,
        'use_label_encoder': False,
        'eval_metric': 'mlogloss'
    }
    
    # 訓練
    model = xgb.XGBClassifier(**params)
    
    start_time = time.time()
    model.fit(
        X_train, y_train,
        sample_weight=sample_weights,
        eval_set=[(X_test, y_test)],
        # early_stopping_rounds=50,
        verbose=100
    )
    train_time = time.time() - start_time
    
    # 預測
    y_pred = model.predict(X_test)
    
    # 評估
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    balanced_acc = balanced_accuracy_score(y_test, y_pred)
    
    print(f"\n訓練時間: {train_time:.2f} 秒")
    print(f"準確率: {accuracy:.4f}")
    print(f"F1分數: {f1:.4f}")
    print(f"平衡準確率: {balanced_acc:.4f}")
    
    print("\n分類報告:")
    print(classification_report(y_test, y_pred, 
                              target_names=[f'Severity {i+1}' for i in range(4)]))
    
    return model, accuracy, f1, balanced_acc

# 執行訓練
xgb_model, xgb_acc, xgb_f1, xgb_balanced_acc = train_xgboost_stable(
    X_train_balanced, X_test, y_train_balanced, y_test
)


訓練 XGBoost (穩定版)...
[0]	validation_0-mlogloss:1.32514
[100]	validation_0-mlogloss:0.73571
[200]	validation_0-mlogloss:0.67866
[299]	validation_0-mlogloss:0.64944

訓練時間: 13.84 秒
準確率: 0.6995
F1分數: 0.7459
平衡準確率: 0.8217

分類報告:
              precision    recall  f1-score   support

  Severity 1       0.26      0.96      0.40     13473
  Severity 2       0.97      0.67      0.79   1230289
  Severity 3       0.49      0.80      0.61    259850
  Severity 4       0.15      0.85      0.26     40702

    accuracy                           0.70   1544314
   macro avg       0.47      0.82      0.52   1544314
weighted avg       0.87      0.70      0.75   1544314



In [12]:
# ===========================
# Cell 12: CatBoost模型
# ===========================

def train_catboost_optimized(X_train, X_test, y_train, y_test):
    """訓練優化的CatBoost模型"""
    print("\n訓練 CatBoost (優化版)...")
    
    # CatBoost參數
    model = CatBoostClassifier(
        iterations=1000,
        depth=8,
        learning_rate=0.05,
        loss_function='MultiClass',
        eval_metric='TotalF1',
        auto_class_weights='Balanced',
        l2_leaf_reg=3,
        random_strength=1,
        bagging_temperature=1,
        od_type='Iter',
        od_wait=50,
        task_type='GPU' if torch.cuda.is_available() else 'CPU',
        devices='0',
        random_state=42,
        verbose=100
    )
    
    # 訓練
    start_time = time.time()
    
    model.fit(
        X_train, y_train,
        eval_set=(X_test, y_test),
        early_stopping_rounds=50,
        plot=False
    )
    
    train_time = time.time() - start_time
    
    # 預測
    y_pred = model.predict(X_test)
    
    # 評估
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    balanced_acc = balanced_accuracy_score(y_test, y_pred)
    
    print(f"\n訓練時間: {train_time:.2f} 秒")
    print(f"準確率: {accuracy:.4f}")
    print(f"F1分數: {f1:.4f}")
    print(f"平衡準確率: {balanced_acc:.4f}")
    
    return model, accuracy, f1, balanced_acc

# 訓練CatBoost
cat_model, cat_acc, cat_f1, cat_balanced_acc = train_catboost_optimized(
    X_train_balanced, X_test, y_train_balanced, y_test
)



訓練 CatBoost (優化版)...




0:	learn: 0.6764956	test: 0.5434086	best: 0.5434086 (0)	total: 26.6ms	remaining: 26.6s
100:	learn: 0.7602058	test: 0.6402475	best: 0.6402475 (100)	total: 2.39s	remaining: 21.3s
200:	learn: 0.7830976	test: 0.6798540	best: 0.6798540 (200)	total: 4.71s	remaining: 18.7s
300:	learn: 0.7940204	test: 0.6984590	best: 0.6984590 (300)	total: 6.95s	remaining: 16.1s
400:	learn: 0.8020022	test: 0.7121693	best: 0.7121693 (400)	total: 9.19s	remaining: 13.7s
500:	learn: 0.8078044	test: 0.7209234	best: 0.7209234 (500)	total: 11.4s	remaining: 11.3s
600:	learn: 0.8124857	test: 0.7284292	best: 0.7284292 (600)	total: 13.6s	remaining: 9.02s
700:	learn: 0.8164636	test: 0.7344848	best: 0.7344848 (700)	total: 15.8s	remaining: 6.74s
800:	learn: 0.8195583	test: 0.7393810	best: 0.7393810 (800)	total: 18s	remaining: 4.48s
900:	learn: 0.8225203	test: 0.7437229	best: 0.7437229 (900)	total: 20.3s	remaining: 2.23s
999:	learn: 0.8247384	test: 0.7471531	best: 0.7471531 (999)	total: 22.5s	remaining: 0us
bestTest = 0.7471

In [13]:
# ===========================
# Cell 13: Balanced Random Forest
# ===========================

def train_balanced_rf(X_train, X_test, y_train, y_test):
    """訓練Balanced Random Forest"""
    print("\n訓練 Balanced Random Forest...")
    
    model = BalancedRandomForestClassifier(
        n_estimators=300,
        max_depth=20,
        min_samples_split=10,
        min_samples_leaf=5,
        max_features='sqrt',
        bootstrap=True,
        oob_score=True,
        class_weight='balanced_subsample',
        n_jobs=-1,
        random_state=42,
        verbose=1
    )
    
    start_time = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - start_time
    
    # 預測
    y_pred = model.predict(X_test)
    
    # 評估
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    balanced_acc = balanced_accuracy_score(y_test, y_pred)
    
    print(f"\n訓練時間: {train_time:.2f} 秒")
    print(f"OOB分數: {model.oob_score_:.4f}")
    print(f"準確率: {accuracy:.4f}")
    print(f"F1分數: {f1:.4f}")
    print(f"平衡準確率: {balanced_acc:.4f}")
    
    # 詳細報告
    print("\n分類報告:")
    print(classification_report(y_test, y_pred, 
                              target_names=[f'Severity {i+1}' for i in range(4)]))
    
    return model, accuracy, f1, balanced_acc

# 訓練Balanced RF
brf_model, brf_acc, brf_f1, brf_balanced_acc = train_balanced_rf(
    X_train, X_test, y_train, y_test  # 使用原始數據，因為模型內部會平衡
)



訓練 Balanced Random Forest...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   12.5s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  2.0min finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.7s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    5.2s
[Parallel(n_jobs=16)]: Done 300 out of 300 | elapsed:    9.2s finished



訓練時間: 126.82 秒
OOB分數: 0.4670
準確率: 0.6434
F1分數: 0.6977
平衡準確率: 0.8005

分類報告:
              precision    recall  f1-score   support

  Severity 1       0.21      0.96      0.35     13473
  Severity 2       0.97      0.60      0.75   1230289
  Severity 3       0.44      0.78      0.56    259850
  Severity 4       0.13      0.86      0.23     40702

    accuracy                           0.64   1544314
   macro avg       0.44      0.80      0.47   1544314
weighted avg       0.85      0.64      0.70   1544314



In [14]:
# ===========================
# 改進的深度學習模型（替換 Cell 14）
# ===========================

class ImprovedNN(nn.Module):
    """改進的神經網路 - 加入更多技巧"""
    def __init__(self, input_size, num_classes=4):
        super(ImprovedNN, self).__init__()
        
        self.fc1 = nn.Linear(input_size, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.dropout1 = nn.Dropout(0.5)
        
        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.dropout2 = nn.Dropout(0.4)
        
        self.fc3 = nn.Linear(256, 128)
        self.bn3 = nn.BatchNorm1d(128)
        self.dropout3 = nn.Dropout(0.3)
        
        self.fc4 = nn.Linear(128, 64)
        self.bn4 = nn.BatchNorm1d(64)
        self.dropout4 = nn.Dropout(0.2)
        
        self.fc5 = nn.Linear(64, num_classes)
        
        # 初始化權重
        self.apply(self._init_weights)
    
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            nn.init.xavier_uniform_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
    
    def forward(self, x):
        x = F.relu(self.bn1(self.fc1(x)))
        x = self.dropout1(x)
        
        x = F.relu(self.bn2(self.fc2(x)))
        x = self.dropout2(x)
        
        x = F.relu(self.bn3(self.fc3(x)))
        x = self.dropout3(x)
        
        x = F.relu(self.bn4(self.fc4(x)))
        x = self.dropout4(x)
        
        x = self.fc5(x)
        return x

def train_improved_nn(X_train, X_test, y_train, y_test, epochs=100):  # epochs在這裡
    """訓練改進的深度學習模型"""
    print("\n訓練改進的神經網路...")
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    # 標準化
    from sklearn.preprocessing import RobustScaler
    scaler = RobustScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # 轉換為張量
    X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
    y_train_tensor = torch.LongTensor(y_train).to(device)
    X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
    
    # DataLoader
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=2048, shuffle=True)
    
    # 創建模型 - 不傳入epochs
    model = ImprovedNN(X_train.shape[1]).to(device)
    
    # 損失函數
    class_weights_tensor = torch.FloatTensor(list(class_weight_dict.values())).to(device)
    criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    # 訓練循環
    start_time = time.time()
    best_balanced_acc = 0
    
    for epoch in range(epochs):  # epochs在這裡使用
        model.train()
        total_loss = 0
        
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        # 每10個epoch評估一次
        if (epoch + 1) % 10 == 0:
            model.eval()
            with torch.no_grad():
                val_outputs = model(X_test_tensor)
                _, predicted = torch.max(val_outputs, 1)
                val_balanced_acc = balanced_accuracy_score(y_test, predicted.cpu().numpy())
            
            print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}, "
                  f"Balanced Acc: {val_balanced_acc:.4f}")
            
            if val_balanced_acc > best_balanced_acc:
                best_balanced_acc = val_balanced_acc
                best_model_state = model.state_dict()
    
    # 載入最佳模型
    if best_balanced_acc > 0:
        model.load_state_dict(best_model_state)
    
    # 最終評估
    model.eval()
    with torch.no_grad():
        outputs = model(X_test_tensor)
        _, predicted = torch.max(outputs, 1)
        y_pred = predicted.cpu().numpy()
    
    train_time = time.time() - start_time
    
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    balanced_acc = balanced_accuracy_score(y_test, y_pred)
    
    print(f"\n訓練時間: {train_time:.2f} 秒")
    print(f"準確率: {accuracy:.4f}")
    print(f"F1分數: {f1:.4f}")
    print(f"平衡準確率: {balanced_acc:.4f}")
    
    return model, scaler, accuracy, f1, balanced_acc

# 執行訓練
nn_model, nn_scaler, nn_acc, nn_f1, nn_balanced_acc = train_improved_nn(
    X_train_balanced, X_test, y_train_balanced, y_test, epochs=50
)



訓練改進的神經網路...
Epoch 10/50, Loss: 0.1497, Balanced Acc: 0.5261
Epoch 20/50, Loss: 0.1412, Balanced Acc: 0.5640
Epoch 30/50, Loss: 0.1374, Balanced Acc: 0.5252
Epoch 40/50, Loss: 0.1349, Balanced Acc: 0.5327
Epoch 50/50, Loss: 0.1331, Balanced Acc: 0.5698

訓練時間: 1021.71 秒
準確率: 0.1202
F1分數: 0.1148
平衡準確率: 0.5698


In [15]:
# ===========================
# Cell 15: 模型比較和集成
# ===========================

# 收集所有結果
results = {
    'LightGBM': {'accuracy': lgb_acc, 'f1': lgb_f1, 'balanced_acc': lgb_balanced_acc},
    'XGBoost': {'accuracy': xgb_acc, 'f1': xgb_f1, 'balanced_acc': xgb_balanced_acc},
    'CatBoost': {'accuracy': cat_acc, 'f1': cat_f1, 'balanced_acc': cat_balanced_acc},
    'Balanced_RF': {'accuracy': brf_acc, 'f1': brf_f1, 'balanced_acc': brf_balanced_acc},
    'Neural_Network': {'accuracy': nn_acc, 'f1': nn_f1, 'balanced_acc': nn_balanced_acc}
}

print("\n" + "="*70)
print("模型性能比較")
print("="*70)
print(f"{'模型':<20} {'準確率':<10} {'F1分數':<10} {'平衡準確率':<10}")
print("-"*70)

for model_name, metrics in sorted(results.items(), key=lambda x: x[1]['balanced_acc'], reverse=True):
    print(f"{model_name:<20} {metrics['accuracy']:<10.4f} {metrics['f1']:<10.4f} {metrics['balanced_acc']:<10.4f}")

# 找出最佳模型
best_model_name = max(results.items(), key=lambda x: x[1]['balanced_acc'])[0]
print(f"\n🏆 最佳模型: {best_model_name}")
print(f"   平衡準確率: {results[best_model_name]['balanced_acc']:.4f}")



模型性能比較
模型                   準確率        F1分數       平衡準確率     
----------------------------------------------------------------------
LightGBM             0.7711     0.8043     0.8533    
CatBoost             0.7011     0.7472     0.8230    
XGBoost              0.6995     0.7459     0.8217    
Balanced_RF          0.6434     0.6977     0.8005    
Neural_Network       0.1202     0.1148     0.5698    

🏆 最佳模型: LightGBM
   平衡準確率: 0.8533


In [16]:
# ===========================
# Cell 16: 保存模型和結果（修正版）
# ===========================

def save_models_and_results(models, results, feature_names, label_encoders):
    """保存所有模型和結果"""
    output_dir = './model_output/'
    os.makedirs(output_dir, exist_ok=True)
    
    # 保存模型
    model_dict = {
        'LightGBM': lgb_model,
        'XGBoost': xgb_model,
        'CatBoost': cat_model,
        'Balanced_RF': brf_model
    }
    
    for name, model in model_dict.items():
        if name == 'LightGBM':
            model.save_model(f'{output_dir}{name.lower()}_model.txt')
        else:
            joblib.dump(model, f'{output_dir}{name.lower()}_model.pkl')
    
    # 保存神經網路
    torch.save(nn_model.state_dict(), f'{output_dir}neural_network_model.pth')
    joblib.dump(nn_scaler, f'{output_dir}nn_scaler.pkl')
    
    # 保存特徵名稱和編碼器
    joblib.dump(feature_names, f'{output_dir}feature_names.pkl')
    joblib.dump(label_encoders, f'{output_dir}label_encoders.pkl')
    
    # 保存結果
    import json
    with open(f'{output_dir}results.json', 'w') as f:
        json.dump(results, f, indent=4)
    
    # 保存訓練信息 - 修正：將numpy類型轉換為Python原生類型
    train_info = {
        'train_size': int(len(X_train)),  # 轉換為int
        'test_size': int(len(X_test)),    # 轉換為int
        'n_features': int(len(feature_names)),  # 轉換為int
        'class_distribution': {int(k): int(v) for k, v in zip(*np.unique(y, return_counts=True))},  # 轉換鍵值
        'best_model': best_model_name,
        'training_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    }
    
    with open(f'{output_dir}train_info.json', 'w') as f:
        json.dump(train_info, f, indent=4)
    
    print(f"\n✅ 所有模型和結果已保存至: {output_dir}")

# 保存
save_models_and_results(
    {'nn_model': nn_model, 'nn_scaler': nn_scaler},
    results,
    feature_names,
    label_encoders
)


✅ 所有模型和結果已保存至: ./model_output/


In [17]:
# ===========================
# Cell 17: 準備時空預測數據
# ===========================

# 首先，我們需要重新載入包含地理位置的原始數據
print("重新載入地理數據...")

# 載入需要的欄位
geo_cols = ['Start_Lat', 'Start_Lng', 'Start_Time', 'Severity', 'State', 'City']
df_geo = pd.read_csv(file_path, usecols=geo_cols, nrows=1000000)  # 先用100萬筆測試

# 處理時間
df_geo['Start_Time'] = pd.to_datetime(df_geo['Start_Time'])
df_geo = df_geo.dropna(subset=['Start_Lat', 'Start_Lng'])

print(f"地理數據大小: {df_geo.shape}")
print(f"數據範圍: Lat [{df_geo['Start_Lat'].min():.2f}, {df_geo['Start_Lat'].max():.2f}], "
      f"Lng [{df_geo['Start_Lng'].min():.2f}, {df_geo['Start_Lng'].max():.2f}]")

# ===========================
# Cell 18: 創建網格化地圖數據
# ===========================

def create_grid_statistics(df_geo):
    """創建網格化的事故統計"""
    print("創建網格統計...")
    
    # 提取時間特徵
    df_geo['Hour'] = df_geo['Start_Time'].dt.hour
    df_geo['DayOfWeek'] = df_geo['Start_Time'].dt.dayofweek
    df_geo['Month'] = df_geo['Start_Time'].dt.month
    
    # 創建地理網格（0.5度 x 0.5度）
    df_geo['lat_grid'] = (df_geo['Start_Lat'] // 0.5) * 0.5
    df_geo['lng_grid'] = (df_geo['Start_Lng'] // 0.5) * 0.5
    
    # 統計每個網格的事故
    grid_stats = df_geo.groupby(['lat_grid', 'lng_grid']).agg({
        'Severity': ['count', 'mean'],
        'Hour': lambda x: x.mode()[0] if len(x) > 0 else 12,
        'DayOfWeek': lambda x: x.mode()[0] if len(x) > 0 else 1
    }).reset_index()
    
    # 簡化列名
    grid_stats.columns = ['lat', 'lng', 'accident_count', 'avg_severity', 'common_hour', 'common_day']
    
    # 只保留有足夠事故的網格
    grid_stats = grid_stats[grid_stats['accident_count'] >= 10]
    
    print(f"網格數量: {len(grid_stats)}")
    
    return grid_stats, df_geo

grid_stats, df_geo_processed = create_grid_statistics(df_geo)

# ===========================
# Cell 19: 創建互動式地圖（使用Plotly）
# ===========================

import plotly.graph_objects as go
import plotly.express as px

def create_interactive_map(grid_stats, selected_hour=None, selected_day=None):
    """創建互動式事故熱力圖"""
    
    # 篩選數據
    data = grid_stats.copy()
    if selected_hour is not None:
        # 篩選相似時間的數據
        data = data[np.abs(data['common_hour'] - selected_hour) <= 3]
    
    # 創建地圖
    fig = go.Figure()
    
    # 添加熱力圖層
    fig.add_trace(go.Scattermapbox(
        lat=data['lat'],
        lon=data['lng'],
        mode='markers',
        marker=dict(
            size=np.log1p(data['accident_count']) * 3,  # 對數縮放
            color=data['avg_severity'],
            colorscale='Reds',
            showscale=True,
            colorbar=dict(title="平均嚴重度"),
            opacity=0.7
        ),
        text=[f"位置: ({lat:.2f}, {lng:.2f})<br>"
              f"事故數: {count}<br>"
              f"平均嚴重度: {sev:.2f}"
              for lat, lng, count, sev in zip(
                  data['lat'], data['lng'], 
                  data['accident_count'], data['avg_severity'])],
        hovertemplate='%{text}<extra></extra>'
    ))
    
    # 設置地圖樣式
    fig.update_layout(
        mapbox=dict(
            style="open-street-map",
            center=dict(lat=39.8283, lon=-98.5795),  # 美國中心
            zoom=3
        ),
        showlegend=False,
        height=600,
        title=f"美國交通事故熱力圖" + 
              (f" - {selected_hour}:00" if selected_hour is not None else "")
    )
    
    return fig

# 創建基礎地圖
base_map = create_interactive_map(grid_stats)
base_map.show()

# ===========================
# Cell 20: 時間動態分析
# ===========================

# 創建按小時的事故分布
hourly_stats = df_geo_processed.groupby('Hour').agg({
    'Severity': ['count', 'mean']
}).reset_index()
hourly_stats.columns = ['Hour', 'Count', 'Avg_Severity']

# 繪製時間分布圖
fig_time = go.Figure()

# 事故數量
fig_time.add_trace(go.Bar(
    x=hourly_stats['Hour'],
    y=hourly_stats['Count'],
    name='事故數量',
    yaxis='y'
))

# 平均嚴重度
fig_time.add_trace(go.Scatter(
    x=hourly_stats['Hour'],
    y=hourly_stats['Avg_Severity'],
    name='平均嚴重度',
    yaxis='y2',
    line=dict(color='red', width=2)
))

fig_time.update_layout(
    title='24小時事故分布',
    xaxis=dict(title='小時'),
    yaxis=dict(title='事故數量', side='left'),
    yaxis2=dict(title='平均嚴重度', side='right', overlaying='y'),
    hovermode='x unified'
)

fig_time.show()

# ===========================
# Cell 21: 預測函數
# ===========================

def predict_accident_risk(lat, lng, hour, day_of_week, model, scaler, feature_template):
    """預測特定位置和時間的事故風險"""
    
    # 創建特徵向量（需要匹配訓練時的特徵）
    # 這裡簡化處理，實際需要完整的特徵工程
    features = np.zeros(len(feature_template))
    
    # 填入基本特徵
    features[0] = hour
    features[1] = day_of_week
    # ... 其他特徵
    
    # 標準化
    features_scaled = scaler.transform(features.reshape(1, -1))
    
    # 預測
    prediction = model.predict(features_scaled)[0]
    probability = model.predict_proba(features_scaled)[0]
    
    return prediction, probability

# ===========================
# Cell 22: 高風險區域識別
# ===========================

def identify_high_risk_areas(grid_stats, threshold_percentile=90):
    """識別高風險區域"""
    
    # 計算風險分數（結合事故數量和嚴重度）
    grid_stats['risk_score'] = (
        grid_stats['accident_count'] * 0.3 + 
        grid_stats['avg_severity'] * 100 * 0.7
    )
    
    # 找出高風險區域
    threshold = np.percentile(grid_stats['risk_score'], threshold_percentile)
    high_risk = grid_stats[grid_stats['risk_score'] >= threshold].copy()
    
    # 排序
    high_risk = high_risk.sort_values('risk_score', ascending=False)
    
    print(f"識別出 {len(high_risk)} 個高風險區域")
    print("\nTop 10 高風險區域:")
    for idx, row in high_risk.head(10).iterrows():
        print(f"  ({row['lat']:.2f}, {row['lng']:.2f}) - "
              f"風險分數: {row['risk_score']:.2f}, "
              f"事故數: {row['accident_count']}")
    
    return high_risk

high_risk_areas = identify_high_risk_areas(grid_stats)

重新載入地理數據...
地理數據大小: (1000000, 6)
數據範圍: Lat [24.55, 49.00], Lng [-124.50, -68.16]
創建網格統計...
網格數量: 1156


識別出 116 個高風險區域

Top 10 高風險區域:
  (34.00, -118.50) - 風險分數: 12674.57, 事故數: 41660.0
  (32.50, -97.00) - 風險分數: 10091.32, 事故數: 33088.0
  (33.50, -118.50) - 風險分數: 9189.00, 事故數: 30028.0
  (29.50, -95.50) - 風險分數: 9114.11, 事故數: 29854.0
  (30.00, -98.00) - 風險分數: 7943.38, 事故數: 25990.0
  (37.50, -122.50) - 風險分數: 7500.09, 事故數: 24411.0
  (33.50, -84.50) - 風險分數: 5995.84, 事故數: 19343.0
  (40.50, -74.00) - 風險分數: 5924.17, 事故數: 19161.0
  (33.50, -118.00) - 風險分數: 5603.35, 事故數: 18139.0
  (40.00, -75.50) - 風險分數: 5503.33, 事故數: 17858.0
