# Data Aggregation

In [1]:
import os
import json
import pandas as pd

folder_path = "/Users/HuXiaoyuan/Desktop/Spring 2025 Courses/ORIE 5213 Discrete Optimization/Final Project_Bike Sharing/ORIE-5213-Spring2023/Bike sharing/Dataset"
all_trips = []

for day_idx, fname in enumerate(sorted(os.listdir(folder_path))):
    if fname.endswith('.json') and fname.startswith('simu0'):
        with open(os.path.join(folder_path, fname)) as f:
            trips = json.load(f)
            for trip in trips:
                all_trips.append({
                    'day': day_idx-1,
                    'time_departure': trip[0],
                    'station_departure': trip[1],
                    'time_arrival': trip[2],
                    'station_arrival': trip[3]
                })

df = pd.DataFrame(all_trips)
df


Unnamed: 0,day,time_departure,station_departure,time_arrival,station_arrival
0,1,4.0,1.0,22.0,11.0
1,1,15.0,1.0,37.0,18.0
2,1,19.0,12.0,45.0,8.0
3,1,48.0,11.0,72.0,5.0
4,1,50.0,22.0,73.0,27.0
...,...,...,...,...,...
732534,500,1431.0,16.0,15.0,10.0
732535,500,1434.0,15.0,18.0,14.0
732536,500,1437.0,8.0,2.0,0.0
732537,500,1437.0,8.0,24.0,1.0


In [2]:
# ----- Chunk 0 (改)：从原始 trips 表构造完整的 demand DataFrame -----
import pandas as pd
from itertools import product

# 假设原始骑行记录叫 df，columns=['day','time_departure','station_departure','time_arrival','station_arrival']

# 1) 拆出 departures 和 arrivals
departures = (
    df[['day','time_departure','station_departure']]
    .rename(columns={'time_departure':'time','station_departure':'station'})
)
departures['departure_count'] = 1

arrivals = (
    df[['day','time_arrival','station_arrival']]
    .rename(columns={'time_arrival':'time','station_arrival':'station'})
)
arrivals['arrival_count'] = 1

# 2) 合并并将 NaN 填为 0
events = pd.concat([departures, arrivals], ignore_index=True)
events[['departure_count','arrival_count']] = events[['departure_count','arrival_count']].fillna(0)

# 3) 计算半小时 interval（0–47）
events['interval_30min'] = (events['time'] // 30).astype(int)

# 4) 聚合出已有事件的组合
agg = (
    events
    .groupby(['day','interval_30min','station'])[['departure_count','arrival_count']]
    .sum()
    .reset_index()
)
agg['net_growth'] = agg['arrival_count'] - agg['departure_count']

# 5) 补全所有 day × interval × station 组合，缺失的行填 0
all_days      = df['day'].unique()
all_intervals = list(range(48))
all_stations  = sorted(pd.unique(
    pd.concat([
        df['station_departure'], 
        df['station_arrival']
    ])
))
# 构造完整索引
idx = pd.MultiIndex.from_product(
    [all_days, all_intervals, all_stations],
    names=['day','interval_30min','station']
)
# reindex 填充缺失组合
df_demand = (
    agg
    .set_index(['day','interval_30min','station'])
    .reindex(idx, fill_value=0)
    .reset_index()
)

# 6) 最终 df_demand 包含
#    day, interval_30min, station, departure_count, arrival_count, net_growth
# 行数 = len(all_days) * 48 * len(all_stations)
print(df_demand.shape)  # 应该是 (500*48*30, 6)
df_demand.head()


(720000, 6)


Unnamed: 0,day,interval_30min,station,departure_count,arrival_count,net_growth
0,1,0,0.0,0.0,0.0,0.0
1,1,0,1.0,2.0,0.0,-2.0
2,1,0,2.0,0.0,0.0,0.0
3,1,0,3.0,0.0,0.0,0.0
4,1,0,4.0,0.0,0.0,0.0


In [3]:
df_demand

Unnamed: 0,day,interval_30min,station,departure_count,arrival_count,net_growth
0,1,0,0.0,0.0,0.0,0.0
1,1,0,1.0,2.0,0.0,-2.0
2,1,0,2.0,0.0,0.0,0.0
3,1,0,3.0,0.0,0.0,0.0
4,1,0,4.0,0.0,0.0,0.0
...,...,...,...,...,...,...
719995,500,47,25.0,0.0,0.0,0.0
719996,500,47,26.0,1.0,1.0,0.0
719997,500,47,27.0,1.0,0.0,-1.0
719998,500,47,28.0,0.0,0.0,0.0


# Random Forest

In [4]:
import pandas as pd
import numpy as np

# 假设 df_demand 已经准备好，包含 day, interval_30min, station, departure_count, arrival_count
df = df_demand.copy()

# 时间特征
df['timeslot']   = df['interval_30min']                # 0–47
df['weekday']    = ((df['day'] - 1) % 7) + 1           # 1–7
df['is_weekend'] = df['weekday'].isin([6,7]).astype(int)

# 排序并计算滞后特征
df = df.sort_values(['station','day','timeslot'])
for lag in [1,2,3,48]:
    df[f'dep_lag_{lag}'] = df.groupby('station')['departure_count'].shift(lag).fillna(0)
    df[f'arr_lag_{lag}'] = df.groupby('station')['arrival_count'].shift(lag).fillna(0)

features = [
    'station','timeslot','weekday','is_weekend',
    'dep_lag_1','dep_lag_2','dep_lag_3','dep_lag_48',
    'arr_lag_1','arr_lag_2','arr_lag_3','arr_lag_48'
]


In [5]:
# 以第 1–400 天做训练，401–500 天做测试
train_df = df[df['day'] <= 400]
test_df  = df[df['day'] >  400]

X_train     = train_df[features]
y_dep_train = train_df['departure_count']
y_arr_train = train_df['arrival_count']

X_test      = test_df[features]
y_dep_test  = test_df['departure_count']
y_arr_test  = test_df['arrival_count']


In [6]:
from sklearn.ensemble import RandomForestRegressor

# 用 n_jobs=-1 自动使用所有 CPU 核心
model_dep = RandomForestRegressor(
    n_estimators=100,
    n_jobs=-1,
    random_state=42
)
model_arr = RandomForestRegressor(
    n_estimators=100,
    n_jobs=-1,
    random_state=42
)

# 训练
model_dep.fit(X_train, y_dep_train)
model_arr.fit(X_train, y_arr_train)


In [7]:
pred_dep = model_dep.predict(X_test)
pred_arr = model_arr.predict(X_test)


In [8]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

def mape(y_true, y_pred):
    mask = y_true != 0
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100

metrics = {
    'Metric':   ['MAE_dep','RMSE_dep','MAPE_dep','MAE_arr','RMSE_arr','MAPE_arr'],
    'Value': [
        mean_absolute_error(y_dep_test, pred_dep),
        np.sqrt(mean_squared_error(y_dep_test, pred_dep)),
        mape(y_dep_test, pred_dep),
        mean_absolute_error(y_arr_test, pred_arr),
        np.sqrt(mean_squared_error(y_arr_test, pred_arr)),
        mape(y_arr_test, pred_arr)
    ]
}

metrics_df = pd.DataFrame(metrics)
print(metrics_df)


     Metric      Value
0   MAE_dep   0.517791
1  RMSE_dep   0.755297
2  MAPE_dep  37.947360
3   MAE_arr   0.514194
4  RMSE_arr   0.760041
5  MAPE_arr  37.751693


# XGBoost

In [9]:
import pandas as pd
import numpy as np

# 假设 df_demand 已经准备好
df = df_demand.copy()
df['timeslot']   = df['interval_30min']
df['weekday']    = ((df['day'] - 1) % 7) + 1
df['is_weekend'] = df['weekday'].isin([6,7]).astype(int)

df = df.sort_values(['station','day','timeslot'])
for lag in [1,2,3,48]:
    df[f'dep_lag_{lag}'] = df.groupby('station')['departure_count'].shift(lag).fillna(0)
    df[f'arr_lag_{lag}'] = df.groupby('station')['arrival_count'].shift(lag).fillna(0)

features = [
    'station','timeslot','weekday','is_weekend',
    'dep_lag_1','dep_lag_2','dep_lag_3','dep_lag_48',
    'arr_lag_1','arr_lag_2','arr_lag_3','arr_lag_48'
]


In [10]:
train_df = df[df['day'] <= 400]
test_df  = df[df['day'] >  400]

X_train     = train_df[features]
y_dep_train = train_df['departure_count']
y_arr_train = train_df['arrival_count']

X_test      = test_df[features]
y_dep_test  = test_df['departure_count']
y_arr_test  = test_df['arrival_count']


In [11]:
from xgboost import XGBRegressor

# departure 模型
model_dep = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    n_jobs=-1    # 多核并行
)

# arrival 模型
model_arr = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    n_jobs=-1
)

# 训练
model_dep.fit(X_train, y_dep_train)
model_arr.fit(X_train, y_arr_train)


In [12]:
pred_dep = model_dep.predict(X_test)
pred_arr = model_arr.predict(X_test)

In [13]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

def mape(y_true, y_pred):
    mask = y_true != 0
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100

metrics = {
    'Metric':   ['MAE_dep','RMSE_dep','MAPE_dep','MAE_arr','RMSE_arr','MAPE_arr'],
    'Value': [
        mean_absolute_error(y_dep_test, pred_dep),
        np.sqrt(mean_squared_error(y_dep_test, pred_dep)),
        mape(y_dep_test, pred_dep),
        mean_absolute_error(y_arr_test, pred_arr),
        np.sqrt(mean_squared_error(y_arr_test, pred_arr)),
        mape(y_arr_test, pred_arr)
    ]
}

metrics_df = pd.DataFrame(metrics)
print(metrics_df)


     Metric      Value
0   MAE_dep   0.514922
1  RMSE_dep   0.751205
2  MAPE_dep  35.434004
3   MAE_arr   0.512601
4  RMSE_arr   0.752852
5  MAPE_arr  35.460983


# LightGBM

In [14]:
import pandas as pd
import numpy as np

# 假设 df_demand 已经准备好
df = df_demand.copy()

# 时间特征
df['timeslot']   = df['interval_30min']
df['weekday']    = ((df['day'] - 1) % 7) + 1
df['is_weekend'] = df['weekday'].isin([6,7]).astype(int)

# 排序后构造滞后特征
df = df.sort_values(['station','day','timeslot'])
for lag in [1,2,3,48]:
    df[f'dep_lag_{lag}'] = df.groupby('station')['departure_count'].shift(lag).fillna(0)
    df[f'arr_lag_{lag}'] = df.groupby('station')['arrival_count'].shift(lag).fillna(0)

features = [
    'station','timeslot','weekday','is_weekend',
    'dep_lag_1','dep_lag_2','dep_lag_3','dep_lag_48',
    'arr_lag_1','arr_lag_2','arr_lag_3','arr_lag_48'
]


In [15]:
train_df = df[df['day'] <= 400]
test_df  = df[df['day'] >  400]

X_train     = train_df[features]
y_dep_train = train_df['departure_count']
y_arr_train = train_df['arrival_count']

X_test      = test_df[features]
y_dep_test  = test_df['departure_count']
y_arr_test  = test_df['arrival_count']


In [16]:
import lightgbm as lgb

# departure 模型
model_dep = lgb.LGBMRegressor(
    objective='regression',
    learning_rate=0.1,
    num_leaves=31,
    n_estimators=500,
    n_jobs=-1,
    random_state=42
)
# arrival 模型
model_arr = lgb.LGBMRegressor(
    objective='regression',
    learning_rate=0.1,
    num_leaves=31,
    n_estimators=500,
    n_jobs=-1,
    random_state=42
)

# 用 callbacks 接口做 early stopping
callbacks = [
    lgb.early_stopping(stopping_rounds=50),
    lgb.log_evaluation(period=0)
]

model_dep.fit(
    X_train, y_dep_train,
    eval_set=[(X_test, y_dep_test)],
    eval_metric=['l1','l2'],
    callbacks=callbacks
)
model_arr.fit(
    X_train, y_arr_train,
    eval_set=[(X_test, y_arr_test)],
    eval_metric=['l1','l2'],
    callbacks=callbacks
)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003159 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 224
[LightGBM] [Info] Number of data points in the train set: 576000, number of used features: 12
[LightGBM] [Info] Start training from score 1.017470
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[496]	valid_0's l1: 0.501601	valid_0's l2: 0.531522
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002768 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 224
[LightGBM] [Info] Number of data points in the train set: 576000, number of used features: 12
[LightGBM] [Info] Start training from score 1.017470
Training until validation s

In [17]:
pred_dep = model_dep.predict(X_test)
pred_arr = model_arr.predict(X_test)


In [18]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

def mape(y_true, y_pred):
    mask = y_true != 0
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100

metrics = {
    'Metric':   ['MAE_dep','RMSE_dep','MAPE_dep',
                 'MAE_arr','RMSE_arr','MAPE_arr'],
    'Value': [
        mean_absolute_error(y_dep_test, pred_dep),
        np.sqrt(mean_squared_error(y_dep_test, pred_dep)),
        mape(y_dep_test, pred_dep),
        mean_absolute_error(y_arr_test, pred_arr),
        np.sqrt(mean_squared_error(y_arr_test, pred_arr)),
        mape(y_arr_test, pred_arr)
    ]
}

metrics_df = pd.DataFrame(metrics)
print(metrics_df)


     Metric      Value
0   MAE_dep   0.501601
1  RMSE_dep   0.729056
2  MAPE_dep  34.749083
3   MAE_arr   0.499056
4  RMSE_arr   0.730096
5  MAPE_arr  34.901525


# SARIMA

In [19]:
import os
import pandas as pd

# —— 并行加速 —— 
os.environ["OMP_NUM_THREADS"] = "8"
os.environ["MKL_NUM_THREADS"] = "8"

# —— 构造 station=1 的平坦时序 —— 
# 按 day（1–500）与 interval_30min（0–47）排序取出 departure_count
series = (
    df_demand[df_demand.station == 1]
    .sort_values(['day','interval_30min'])
    ['departure_count']
    .reset_index(drop=True)
)
# series 是长度 500*48 的 pandas Series，索引 0…23999
# 划分训练 (前 400 天 = 400*48) 与测试 (后 100 天 = 100*48)
n_train = 400 * 48
train_ser = series.iloc[:n_train]
test_ser  = series.iloc[n_train:]


In [20]:
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX

# 拟合 SARIMA(1,0,1)x(1,1,1,48)
model = SARIMAX(
    train_ser,
    order=(1,0,1),
    seasonal_order=(1,1,1,48),
    enforce_stationarity=False,
    enforce_invertibility=False
)
res = model.fit(disp=False)

# 预测后 100*48 步
steps  = len(test_ser)
pred   = res.get_forecast(steps=steps)
y_pred = pred.predicted_mean.values
y_true = test_ser.values


  return _core_matmul(x1, x2)
  return _core_matmul(x1, x2)
  return _core_matmul(x1, x2)


In [21]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

def mape(y_true, y_pred):
    mask = y_true != 0
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100

mae   = mean_absolute_error(y_true, y_pred)
rmse  = np.sqrt(mean_squared_error(y_true, y_pred))
mape_ = mape(y_true, y_pred)

print("SARIMA (station=1) on days 401–500:")
print(f" MAE:  {mae:.3f}")
print(f" RMSE: {rmse:.3f}")
print(f" MAPE: {mape_:.2f}%")


SARIMA (station=1) on days 401–500:
 MAE:  0.458
 RMSE: 0.729
 MAPE: 34.61%


# Prophet

In [22]:
# 先升级 pip（可选，但有助于找到新包）
%pip install --upgrade pip --quiet
# 然后安装 Prophet，或者 fbprophet
%pip install prophet --quiet
# 如果上面报错，再试：
%pip install fbprophet --quiet


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mPreparing metadata [0m[1;32m([0m[32mpyproject.toml[0m[1;32m)[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[2 lines of output][0m
  [31m   [0m Cython>=0.22 and NumPy are required.
  [31m   [0m [31m[end of output][0m
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.
Note: you may need to restart the kernel to use updated packages.


In [23]:
from prophet import Prophet   # 或者 from fbprophet import Prophet


Importing plotly failed. Interactive plots will not work.


In [24]:
import pandas as pd

# —— 构造人工时间戳（修正版） —— 
# 以 station=1 为例
base = pd.to_datetime("2000-01-01")
df1 = (
    df_demand[df_demand.station == 1]
    .sort_values(['day','interval_30min'])
    .copy()
)

# 1) 天的偏移
df1['delta_days'] = (df1['day'] - 1)
# 2) 半小时槽对应的分钟偏移
df1['delta_minutes'] = df1['interval_30min'] * 30

# 3) 计算 ds：base + 天偏移 + 分钟偏移
df1['ds'] = (
    base
    + pd.to_timedelta(df1['delta_days'],    unit='D')
    + pd.to_timedelta(df1['delta_minutes'], unit='m')
)

# 划分：前 400 天（1–400）训练，401–500 测试
train_prophet = df1[df1.day <= 400]
test_prophet  = df1[df1.day >  400]

# 构建 Prophet 输入
df_dep_train = train_prophet[['ds','departure_count']].rename(columns={'departure_count':'y'})
df_dep_test  = test_prophet [['ds','departure_count']].rename(columns={'departure_count':'y'})
df_arr_train = train_prophet[['ds','arrival_count']].rename(columns={'arrival_count':'y'})
df_arr_test  = test_prophet [['ds','arrival_count']].rename(columns={'arrival_count':'y'})


In [25]:
from prophet import Prophet

# departure 模型
m_dep = Prophet(daily_seasonality=False, weekly_seasonality=True, seasonality_mode='additive')
m_dep.add_seasonality('half_hour', period=0.5, fourier_order=3)
m_dep.fit(df_dep_train)

# arrival 模型
m_arr = Prophet(daily_seasonality=False, weekly_seasonality=True, seasonality_mode='additive')
m_arr.add_seasonality('half_hour', period=0.5, fourier_order=3)
m_arr.fit(df_arr_train)


17:28:24 - cmdstanpy - INFO - Chain [1] start processing
17:28:25 - cmdstanpy - INFO - Chain [1] done processing
17:28:25 - cmdstanpy - INFO - Chain [1] start processing
17:28:25 - cmdstanpy - INFO - Chain [1] done processing


<prophet.forecaster.Prophet at 0x17880a210>

In [26]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

# 构造 future DF（测试集的 ds 列）
future = test_prophet[['ds']].reset_index(drop=True)

# 预测
fc_dep = m_dep.predict(future)['yhat'].values
fc_arr = m_arr.predict(future)['yhat'].values

y_dep_true = df_dep_test['y'].values
y_arr_true = df_arr_test['y'].values

def mape(y, yhat):
    mask = y != 0
    return np.mean(np.abs((y[mask] - yhat[mask]) / y[mask])) * 100

metrics = {
    'Metric': ['MAE_dep','RMSE_dep','MAPE_dep','MAE_arr','RMSE_arr','MAPE_arr'],
    'Value': [
        mean_absolute_error(y_dep_true, fc_dep),
        np.sqrt(mean_squared_error(y_dep_true, fc_dep)),
        mape(y_dep_true, fc_dep),
        mean_absolute_error(y_arr_true, fc_arr),
        np.sqrt(mean_squared_error(y_arr_true, fc_arr)),
        mape(y_arr_true, fc_arr)
    ]
}
import pandas as pd
print(pd.DataFrame(metrics))


  comp = np.matmul(X, beta_c.transpose())
  comp = np.matmul(X, beta_c.transpose())
  comp = np.matmul(X, beta_c.transpose())
  Xb_a = np.matmul(seasonal_features.values,
  Xb_a = np.matmul(seasonal_features.values,
  Xb_a = np.matmul(seasonal_features.values,
  Xb_m = np.matmul(seasonal_features.values, beta * s_m.values)
  Xb_m = np.matmul(seasonal_features.values, beta * s_m.values)
  Xb_m = np.matmul(seasonal_features.values, beta * s_m.values)
  comp = np.matmul(X, beta_c.transpose())
  comp = np.matmul(X, beta_c.transpose())
  comp = np.matmul(X, beta_c.transpose())
  Xb_a = np.matmul(seasonal_features.values,
  Xb_a = np.matmul(seasonal_features.values,
  Xb_a = np.matmul(seasonal_features.values,
  Xb_m = np.matmul(seasonal_features.values, beta * s_m.values)
  Xb_m = np.matmul(seasonal_features.values, beta * s_m.values)
  Xb_m = np.matmul(seasonal_features.values, beta * s_m.values)


     Metric      Value
0   MAE_dep   1.501250
1  RMSE_dep   2.258210
2  MAPE_dep  53.205064
3   MAE_arr   1.322351
4  RMSE_arr   1.999567
5  MAPE_arr  60.440652


# LSTM

In [None]:
# # 安装 CPU-only 的 PyTorch
# %pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu --quiet


In [28]:
# import torch
# # print(torch.__version__)


2.7.0


In [29]:
# import numpy as np
# import pandas as pd
# import torch
# from sklearn.preprocessing import MinMaxScaler

# # 取 station=1 的 departure/arrival 序列并排序
# ser_dep = (
#     df_demand[df_demand.station == 1]
#     .sort_values(['day','interval_30min'])['departure_count']
#     .values
# )
# ser_arr = (
#     df_demand[df_demand.station == 1]
#     .sort_values(['day','interval_30min'])['arrival_count']
#     .values
# )

# # 划分：前400天（400*48=19200）训练，后100天（4800）测试
# n_train = 400 * 48
# dep_train, dep_test = ser_dep[:n_train], ser_dep[n_train:]
# arr_train, arr_test = ser_arr[:n_train], ser_arr[n_train:]

# # 标准化
# scaler_dep = MinMaxScaler()
# scaler_arr = MinMaxScaler()

# dep_train_s = scaler_dep.fit_transform(dep_train.reshape(-1,1))
# dep_test_s  = scaler_dep.transform(dep_test.reshape(-1,1))
# arr_train_s = scaler_arr.fit_transform(arr_train.reshape(-1,1))
# arr_test_s  = scaler_arr.transform(arr_test.reshape(-1,1))


In [None]:
# def make_seq_strided(data: np.ndarray, window: int = 48):
#     arr = data.reshape(-1) 
#     n   = arr.shape[0] - window
#     # 构造滑动视图 [n,window]
#     X = np.lib.stride_tricks.as_strided(
#         arr,
#         shape=(n, window),
#         strides=(arr.strides[0], arr.strides[0])
#     )
#     y = arr[window:]
#     # 转 torch: [n,window,1], [n]
#     return (
#         torch.from_numpy(X).float().unsqueeze(-1),
#         torch.from_numpy(y).float()
#     )

# window = 48
# # 训练集
# X_dep_tr, y_dep_tr = make_seq_strided(dep_train_s, window)
# X_arr_tr, y_arr_tr = make_seq_strided(arr_train_s, window)
# # 测试集：拼接最后 window 条训练数据确保滑窗连续
# full_dep = np.vstack([dep_train_s[-window:], dep_test_s])
# full_arr = np.vstack([arr_train_s[-window:], arr_test_s])
# X_dep_te, y_dep_te = make_seq_strided(full_dep, window)
# X_arr_te, y_arr_te = make_seq_strided(full_arr, window)


In [None]:
# import torch.nn as nn

# class LSTMForecaster(nn.Module):
#     def __init__(self, hidden_size=64):
#         super().__init__()
#         self.lstm = nn.LSTM(input_size=1, hidden_size=hidden_size, batch_first=True)
#         self.fc   = nn.Linear(hidden_size, 1)
#     def forward(self, x):
#         out, _ = self.lstm(x)         
#         return self.fc(out[:, -1, :])  # 取最后时刻输出

# device     = torch.device('cpu')
# model_dep  = LSTMForecaster().to(device)
# model_arr  = LSTMForecaster().to(device)
# criterion  = nn.MSELoss()
# opt_dep    = torch.optim.Adam(model_dep.parameters(), lr=1e-3)
# opt_arr    = torch.optim.Adam(model_arr.parameters(), lr=1e-3)


In [None]:
# def train(model, opt, X, y, epochs=10, batch_size=64):
#     model.train()
#     for _ in range(epochs):
#         perm = torch.randperm(len(X))
#         for i in range(0, len(X), batch_size):
#             idx = perm[i:i+batch_size]
#             xb, yb = X[idx].to(device), y[idx].to(device).unsqueeze(-1)
#             pred   = model(xb)
#             loss   = criterion(pred, yb)
#             opt.zero_grad()
#             loss.backward()
#             opt.step()
#     return model

# model_dep = train(model_dep, opt_dep, X_dep_tr, y_dep_tr)
# model_arr = train(model_arr, opt_arr, X_arr_tr, y_arr_tr)


In [None]:
# from sklearn.metrics import mean_absolute_error, mean_squared_error

# model_dep.eval(); model_arr.eval()
# with torch.no_grad():
#     pred_dep_s = model_dep(X_dep_te.to(device)).cpu().numpy()
#     pred_arr_s = model_arr(X_arr_te.to(device)).cpu().numpy()

# # 反标准化
# y_dep_pred = scaler_dep.inverse_transform(pred_dep_s).ravel()
# y_arr_pred = scaler_arr.inverse_transform(pred_arr_s).ravel()
# y_dep_true = scaler_dep.inverse_transform(y_dep_te.unsqueeze(-1).numpy()).ravel()
# y_arr_true = scaler_arr.inverse_transform(y_arr_te.unsqueeze(-1).numpy()).ravel()

# def mape(y_true, y_pred):
#     mask = y_true != 0
#     return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100

# metrics = {
#     'Metric': ['MAE_dep','RMSE_dep','MAPE_dep','MAE_arr','RMSE_arr','MAPE_arr'],
#     'Value': [
#         mean_absolute_error(y_dep_true, y_dep_pred),
#         np.sqrt(mean_squared_error(y_dep_true, y_dep_pred)),
#         mape(y_dep_true, y_dep_pred),
#         mean_absolute_error(y_arr_true, y_arr_pred),
#         np.sqrt(mean_squared_error(y_arr_true, y_arr_pred)),
#         mape(y_arr_true, y_arr_pred),
#     ]
# }
# print(pd.DataFrame(metrics))
