In [None]:
import pandas as pd

# Load the previously saved merged dataset.
# The path below is a placeholder and must point at the real CSV file.
csv_path = r"C:\Users\檔名.csv"
df = pd.read_csv(csv_path)

In [None]:
# Parse e_time once and store it back as a datetime column.
parsed_time = pd.to_datetime(df['e_time'])
df['e_time'] = parsed_time

# Derive the calendar/time features from the parsed timestamps
# (pandas numbers weekdays from Monday=0).
for column, part in (('Hour', 'hour'), ('Minute', 'minute'), ('Weekday', 'weekday')):
    df[column] = getattr(parsed_time.dt, part)

In [None]:
from geopy.distance import geodesic

# Officially published coordinates of the 師大綜合大樓 (NTNU Complex Building) stop.
# Note the order: latitude first, then longitude.
target_lat, target_lon = 25.026622, 121.53006


def _distance_to_target_m(row):
    """Return the geodesic distance in metres from a row's position to the target stop.

    Rows where either coordinate is missing yield NaN. The NaN is produced with
    ``float('nan')`` rather than ``np.nan`` because numpy is only imported in a
    later cell; on a fresh kernel the original expression raised NameError as
    soon as a row with a missing coordinate was reached.
    """
    lat, lon = row['PositionLat'], row['PositionLon']
    if pd.notnull(lat) and pd.notnull(lon):
        return geodesic((lat, lon), (target_lat, target_lon)).meters
    return float('nan')


# Add the distance column (unit: metres).
df['Distance'] = df.apply(_distance_to_target_m, axis=1)

In [None]:
# Assumes the working DataFrame is named df.
# Keep only rows whose Distance is at most 20000; rows with a NaN Distance
# fail the comparison and are dropped as well. The result is an independent copy.
is_within_range = df['Distance'] <= 20000
df_filtered = df[is_within_range].copy()

In [None]:
# Encode the plate number as integer category codes
# (pandas assigns -1 to missing values).
plate_as_category = df_filtered['PlateNumb'].astype('category')
df_filtered['PlateNumb_encoded'] = plate_as_category.cat.codes

In [None]:
# Columns shared by both feature sets.
_shared_feature_cols = [
    'PlateNumb_encoded',
    'Speed',
    'Distance',
    'rt_delay_sec',
    'Hour',
    'Minute',
    'Weekday',
    'peak',
    'daytype',
    'rain',
    'temp',
    'wind',
]

# Feature set including EstimateTime (model A).
feature_cols_A = _shared_feature_cols + ['EstimateTime']

# Feature set without EstimateTime (model B); a separate list object.
feature_cols_B = list(_shared_feature_cols)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Prepare the data for model A.
# Only rows with true_arrival_sec below 5000 are kept.
is_below_limit = df_filtered['true_arrival_sec'] < 5000
df_limited = df_filtered[is_below_limit]

# Feature matrix (with EstimateTime) and regression target.
X, y = df_limited[feature_cols_A], df_limited['true_arrival_sec']

def run_rf_model(X, y, feature_names, model_name, n_runs=20, test_size=0.2, n_estimators=100):
    """Train and evaluate a random forest over repeated random train/test splits.

    For every run ``i`` in ``range(n_runs)`` the data is split with
    ``random_state=i``, a ``RandomForestRegressor`` seeded with the same ``i``
    is fitted on the training part, and MAE, RMSE and R² are computed on the
    held-out part. A summary (mean ± standard deviation of each metric) is
    printed, followed by the feature importances of the run whose three
    metrics are closest to the across-run averages.

    Parameters
    ----------
    X : features, in any form accepted by ``train_test_split`` and ``fit``.
    y : regression target aligned with ``X``.
    feature_names : names paired positionally with ``feature_importances_``.
    model_name : label printed in the report header.
    n_runs : number of repeated splits/fits (default 20, the original value).
    test_size : fraction passed to ``train_test_split`` (default 0.2).
    n_estimators : number of trees per forest (default 100).

    Returns
    -------
    dict
        ``'model'`` (the fitted forest of the selected run), ``'features'``
        (list of feature names), ``'best_index'``, the per-run lists
        ``'maes'``, ``'rmses'``, ``'r2s'`` and the averages ``'avg_mae'``,
        ``'avg_rmse'``, ``'avg_r2'``. The ``'model'`` / ``'features'`` keys are
        the ones the later feature-importance cell reads from ``results``.
        Previously nothing was returned, so the fitted models were lost.

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError(f"n_runs must be at least 1, got {n_runs}")

    maes, rmses, r2s, models = [], [], [], []

    for i in range(n_runs):
        # The run index seeds both the split and the forest, so run i is reproducible.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, shuffle=True, random_state=i
        )

        model = RandomForestRegressor(n_estimators=n_estimators, random_state=i)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        maes.append(float(mean_absolute_error(y_test, y_pred)))
        rmses.append(float(np.sqrt(mean_squared_error(y_test, y_pred))))
        r2s.append(float(r2_score(y_test, y_pred)))
        models.append(model)

    # Averages, and the run closest to them (squared Euclidean distance over
    # the three metrics). Despite the "best" naming this is the most
    # representative run, not the one with the lowest error.
    # NOTE(review): MAE/RMSE are in target units while R² is unitless, so R²
    # probably has little influence on this selection - confirm this is intended.
    avg_mae, avg_rmse, avg_r2 = np.mean(maes), np.mean(rmses), np.mean(r2s)
    distances = [(mae - avg_mae)**2 + (rmse - avg_rmse)**2 + (r2 - avg_r2)**2
                 for mae, rmse, r2 in zip(maes, rmses, r2s)]
    best_index = int(np.argmin(distances))
    best_model = models[best_index]

    # Print the report. NOTE(review): "295" is hard-coded in the header,
    # presumably a route number - confirm.
    print(f"\n🌲 {model_name}（295）")
    print(f"平均 MAE: {avg_mae:.2f} ± {np.std(maes):.2f}")
    print(f"平均 RMSE: {avg_rmse:.2f} ± {np.std(rmses):.2f}")
    print(f"平均 R²: {avg_r2:.4f} ± {np.std(r2s):.4f}")
    print(f"\n🔍 最佳模型特徵重要性（第 {best_index} 次）：")
    for name, importance in zip(feature_names, best_model.feature_importances_):
        print(f"{name}: {importance:.4f}")

    return {
        'model': best_model,
        'features': list(feature_names),
        'best_index': best_index,
        'maes': maes,
        'rmses': rmses,
        'r2s': r2s,
        'avg_mae': float(avg_mae),
        'avg_rmse': float(avg_rmse),
        'avg_r2': float(avg_r2),
    }

# Run the random forest for model A (feature set including EstimateTime).
run_rf_model(
    X=X,
    y=y,
    feature_names=feature_cols_A,
    model_name="Random Forest 模型 A（含 EstimateTime）",
)

In [None]:
# Prepare the data for model B.
# Same row filter as for model A: true_arrival_sec must be below 5000.
is_below_limit = df_filtered['true_arrival_sec'] < 5000
df_limited = df_filtered[is_below_limit]

# Feature matrix (without EstimateTime) and regression target.
X_B, y = df_limited[feature_cols_B], df_limited['true_arrival_sec']

def run_rf_model_B(X, y, feature_names, model_name, n_runs=20, test_size=0.2, n_estimators=100):
    """Train and evaluate a random forest over repeated random train/test splits.

    For every run ``i`` in ``range(n_runs)`` the data is split with
    ``random_state=i``, a ``RandomForestRegressor`` seeded with the same ``i``
    is fitted on the training part, and MAE, RMSE and R² are computed on the
    held-out part. A summary (mean ± standard deviation of each metric) is
    printed, followed by the feature importances of the run whose three
    metrics are closest to the across-run averages.

    NOTE(review): the procedure is the same as in ``run_rf_model``; consider
    keeping a single function and calling it with each feature set.

    Parameters
    ----------
    X : features, in any form accepted by ``train_test_split`` and ``fit``.
    y : regression target aligned with ``X``.
    feature_names : names paired positionally with ``feature_importances_``.
    model_name : label printed in the report header.
    n_runs : number of repeated splits/fits (default 20, the original value).
    test_size : fraction passed to ``train_test_split`` (default 0.2).
    n_estimators : number of trees per forest (default 100).

    Returns
    -------
    dict
        ``'model'`` (the fitted forest of the selected run), ``'features'``
        (list of feature names), ``'best_index'``, the per-run lists
        ``'maes'``, ``'rmses'``, ``'r2s'`` and the averages ``'avg_mae'``,
        ``'avg_rmse'``, ``'avg_r2'``. The ``'model'`` / ``'features'`` keys are
        the ones the later feature-importance cell reads from ``results``.
        Previously nothing was returned, so the fitted models were lost.

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError(f"n_runs must be at least 1, got {n_runs}")

    maes, rmses, r2s, models = [], [], [], []

    for i in range(n_runs):
        # The run index seeds both the split and the forest, so run i is reproducible.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, shuffle=True, random_state=i
        )

        model = RandomForestRegressor(n_estimators=n_estimators, random_state=i)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        maes.append(float(mean_absolute_error(y_test, y_pred)))
        rmses.append(float(np.sqrt(mean_squared_error(y_test, y_pred))))
        r2s.append(float(r2_score(y_test, y_pred)))
        models.append(model)

    # Averages, and the run closest to them (squared Euclidean distance over
    # the three metrics). Despite the "best" naming this is the most
    # representative run, not the one with the lowest error.
    # NOTE(review): MAE/RMSE are in target units while R² is unitless, so R²
    # probably has little influence on this selection - confirm this is intended.
    avg_mae, avg_rmse, avg_r2 = np.mean(maes), np.mean(rmses), np.mean(r2s)
    distances = [(mae - avg_mae)**2 + (rmse - avg_rmse)**2 + (r2 - avg_r2)**2
                 for mae, rmse, r2 in zip(maes, rmses, r2s)]
    best_index = int(np.argmin(distances))
    best_model = models[best_index]

    # Print the report. NOTE(review): "295" is hard-coded in the header,
    # presumably a route number - confirm.
    print(f"\n🌲 {model_name}（295）")
    print(f"平均 MAE: {avg_mae:.2f} ± {np.std(maes):.2f}")
    print(f"平均 RMSE: {avg_rmse:.2f} ± {np.std(rmses):.2f}")
    print(f"平均 R²: {avg_r2:.4f} ± {np.std(r2s):.4f}")
    print(f"\n🔍 最佳模型特徵重要性（第 {best_index} 次）：")
    for name, importance in zip(feature_names, best_model.feature_importances_):
        print(f"{name}: {importance:.4f}")

    return {
        'model': best_model,
        'features': list(feature_names),
        'best_index': best_index,
        'maes': maes,
        'rmses': rmses,
        'r2s': r2s,
        'avg_mae': float(avg_mae),
        'avg_rmse': float(avg_rmse),
        'avg_r2': float(avg_r2),
    }

# Run the random forest for model B (feature set without EstimateTime).
run_rf_model_B(
    X=X_B,
    y=y,
    feature_names=feature_cols_B,
    model_name="Random Forest 模型 B（不含 EstimateTime）",
)


In [None]:
# NOTE(review): `results` is not assigned in any cell visible in this notebook.
# Unless it is defined elsewhere, the lookups below raise NameError on a fresh
# kernel. It is read as a dict keyed by model label whose values provide
# 'model' and 'features'.

# Model A: fetch the fitted model, its feature names and importances.
entry_A = results['模型 A (含 EstimateTime)']
model_A = entry_A['model']
features_A = entry_A['features']
importances_A = model_A.feature_importances_

# Rank features from most to least important and print them.
sorted_idx = np.argsort(importances_A)[::-1]

print("模型 A（含 EstimateTime）特徵重要性排名：\n")
for feat_idx in sorted_idx:
    print(f"{features_A[feat_idx]:<10s}  →  {importances_A[feat_idx]:.4f}")

# Model B: fetch the fitted model, its feature names and importances.
entry_B = results['模型 B (不含 EstimateTime)']
model_B = entry_B['model']
features_B = entry_B['features']
importances_B = model_B.feature_importances_

# Rank features from most to least important and print them.
sorted_idx = np.argsort(importances_B)[::-1]

print("模型 B（不含 EstimateTime）特徵重要性排名：\n")
for feat_idx in sorted_idx:
    print(f"{features_B[feat_idx]:<10s}  →  {importances_B[feat_idx]:.4f}")

In [None]:
import joblib
# Persist each fitted model to its own file. Both calls previously used the
# identical path, so the second dump overwrote model A with model B.
# The paths are placeholders and must be replaced with real destinations.
# joblib files are pickle-based: only load them from trusted sources.
joblib.dump(model_A, r'C:\Users\欲存成檔名_A.pkl')
joblib.dump(model_B, r'C:\Users\欲存成檔名_B.pkl')