# modeling multivariate with nn model

前面我們建立過了baseline, 單變量時間序列模型, 接下來要做的是把nn model的實力發揚光大  
本節將要進行多變數的時間序列預測，透過我們前一節所做的特徵篩選來強化模型  
看能否有機會更進一步提升解釋能力  

在我們前面所選擇的模型中，只有TSMixerx, MLP, MLPMultivariate支援多變數  
因此接下來只會引用這三種模型框架結合變數進行建模

# Lib

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import time
import sys

# 將 hwttp 資料夾加入 Python 搜尋路徑
sys.path.append(os.path.abspath('..'))
import hwttp.hwtoolkit as tk

# neuralnetwork methods
from neuralforecast import NeuralForecast
from neuralforecast.models import (
    TSMixerx,
    MLPMultivariate,
)

# evaluate
from utilsforecast.losses import mse, mae, rmse, smape
from neuralforecast.losses.pytorch import MQLoss, DistributionLoss, GMM, MAE, RMSE # modeling use
from utilsforecast.evaluation import evaluate
from utilsforecast.plotting import plot_series


# extra
import importlib

# 必要
os.environ['NIXTLA_ID_AS_COL'] = '1'

In [2]:
from os import listdir
from os.path import isfile, join

In [3]:
from pathlib import Path

# Load data

## all features

In [4]:
parquet_path = "../data/features/all_features_by_intergrantry/"

p = Path(parquet_path)
all_feature_dfs = dict()

for f in p.glob("*.parquet"):
    all_feature_dfs[f.name.replace('_b_p_c_h_t_r.parquet', '')] = pd.read_parquet(f)

In [5]:
all_feature_dfs['05F0528N-05F0438N'].keys()

Index(['gf_gt', 'GantryFrom', 'GantryTo', 'TimeStamp', 'WeightedAvgTravelTime',
       'TotalTraffic', 'ds_prev_1_WATT', 'ds_prev_2_WATT', 'ds_prev_3_WATT',
       'ds_prev_4_WATT', 'ds_prev_5_WATT', 'congestion_syndrome',
       'holiday_continue', 'holiday_length', 'dayofweek', 'holiday_name_七夕情人節',
       'holiday_name_中元節', 'holiday_name_中秋節', 'holiday_name_二二八紀念日',
       'holiday_name_元宵節', 'holiday_name_兒童與清明節', 'holiday_name_勞動節',
       'holiday_name_國慶日', 'holiday_name_教師節', 'holiday_name_母親節',
       'holiday_name_父親節', 'holiday_name_白色情人節', 'holiday_name_端午節',
       'holiday_name_聖誕節', 'holiday_name_萬聖節', 'holiday_name_西洋情人節',
       'holiday_name_跨年元旦', 'holiday_name_農曆新年', 'accident_mileage',
       'event_occurrence', 'event_exclusion', 'handling_minutes',
       'accident_type', 'death_count', 'injuries_count', 'inner_shoulder_flag',
       'inner_lane_flag', 'middle_inner_lane_flag', 'middle_lane_flag',
       'middle_outer_lane_flag', 'outer_lane_flag', 'outer_should

## selected feature

In [6]:
parquet_path = "../data/features/feature_selection_results/"

p = Path(parquet_path)
fs_results = dict()

for f in p.glob("*.csv"):
    fs_results[f.name.replace('_b_p_c_h_t_r.csv', '')] = pd.read_csv(f)

# Preprocessing

前處理部分，我們設置一個門檻，篩選大於門檻的特徵值出來

In [7]:
threshold = 0.00005

train_df_sets = dict()
common_cols = ['gf_gt', 'GantryFrom', 'GantryTo', 'TimeStamp', 'WeightedAvgTravelTime']

for df_name, df in all_feature_dfs.items():
    # initialize
    use_cols = ['gf_gt', 'GantryFrom', 'GantryTo', 'TimeStamp', 'WeightedAvgTravelTime']
    
    # get selected features, filtered with threshold
    sf_df = fs_results[df_name].copy()
    sf_df = sf_df[sf_df.importance > threshold].copy()
    target_selected_features = list(sf_df['feature'])
    use_cols.extend(target_selected_features)
    
    print(f'{df_name}, {len(use_cols)}')
    
    # assign train df
    train_df_sets[df_name] = df[use_cols].copy()

05F0055N-05F0001N, 38
05F0438N-05FR143N, 29
05F0001N-03F0150N, 25
05F0438N-05F0309N, 32
05F0287N-05F0055N, 37
05F0309N-05F0287N, 32
05F0528N-05F0438N, 34
05F0001N-03F0201S, 25


# Modeling

## common function

In [8]:
def timeit(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"Function '{func.__name__}' executed in {elapsed_time:.4f} seconds")
        return result
    return wrapper

In [9]:
# 這是要轉成modeling可以用的
def p_df_formatter(target_df):
    df = target_df.copy()
    df.rename(columns={'gf_gt': 'unique_id', 
                       'TimeStamp': 'ds',\
                       'WeightedAvgTravelTime': 'y'}, inplace=True)
    df['ds'] = df['ds'].astype('datetime64[ns]')
    df.drop(columns={'GantryFrom', 'GantryTo'}, inplace=True)
    return df

In [10]:
@timeit
def cv_holder(model, df):
    '''
    input model, data to get cv results
    '''
    nf = NeuralForecast(models=[model], freq='15min',)
    cv_df = nf.cross_validation(df, 
                                n_windows=50, 
                                step_size=7*24*4, 
                                refit=True, # refit=-1 stands for no retrain
                                verbose=False
                               )
    return cv_df
    

## var config

In [11]:
def set_aux_list(df_col_list, exclude_cols):
    'modify version'
    # initialize exog list
    hist_exog_list = list()
    futr_exog_list = list()
    
    hist_exog_pools = ['congestion_syndrome',
                       'ds_prev_1_WATT', 
                       'ds_prev_2_WATT',
                       'ds_prev_3_WATT', 
                       'ds_prev_4_WATT', 
                       'ds_prev_5_WATT'
                      ]
    
    futr_exog_pools = ['holiday_continue', 'holiday_length',
                      'dayofweek', 'holiday_name_七夕情人節', 'holiday_name_中元節',
                      'holiday_name_中秋節', 'holiday_name_二二八紀念日', 'holiday_name_元宵節',
                      'holiday_name_兒童與清明節', 'holiday_name_勞動節', 'holiday_name_國慶日',
                      'holiday_name_教師節', 'holiday_name_母親節', 'holiday_name_父親節',
                      'holiday_name_白色情人節', 'holiday_name_端午節', 'holiday_name_聖誕節',
                      'holiday_name_萬聖節', 'holiday_name_西洋情人節', 'holiday_name_跨年元旦',
                      'holiday_name_農曆新年', 
                       'accident_mileage', 'event_occurrence',
                       'event_exclusion', 'handling_minutes', 'accident_type', 'death_count',
                       'injuries_count', 'inner_shoulder_flag', 'inner_lane_flag',
                       'middle_inner_lane_flag', 'middle_lane_flag', 'middle_outer_lane_flag',
                       'outer_lane_flag', 'outer_shoulder_flag', 'ramp_flag',
                       'overturn_accident_flag', 'construction_accident_flag',
                       'hazardous_material_vehicle_flag', 'on_fire_vehicle_flag',
                       'smoking_vehicle_flag', 'mainlane_disruption_flag',
                       'accident_vehicle_count', 'light_truck_count', 'passenger_car_count',
                       'bus_count', 'heavy_truck_count',
                       'road_build', 'total_block_count', 'road_block_count'
                      ]
    
    # assign exog_list
    for col in df_col_list:
        if col in hist_exog_pools:
            hist_exog_list.append(col)
        if col in futr_exog_pools:
            futr_exog_list.append(col)
    
    # check if not, make them equal to None
    if len(hist_exog_list) == 0:
        hist_exog_list = None
    if len(futr_exog_list) == 0:
        futr_exog_list = None

    return hist_exog_list, futr_exog_list

In [12]:
train_df_sets.keys()

dict_keys(['05F0055N-05F0001N', '05F0438N-05FR143N', '05F0001N-03F0150N', '05F0438N-05F0309N', '05F0287N-05F0055N', '05F0309N-05F0287N', '05F0528N-05F0438N', '05F0001N-03F0201S'])

In [12]:
decision = dict()
decision['05F0001N-03F0201S'] = 'MLPMultivariate'#
decision['05F0055N-05F0001N'] = 'TSMixerx'#
decision['05F0528N-05F0438N'] = 'MLP'
decision['05F0438N-05FR143N'] = 'MLPMultivariate' #
decision['05F0001N-03F0150N'] = 'MLP'#
decision['05F0287N-05F0055N'] = 'TSMixerx'#
decision['05F0309N-05F0287N'] = 'MLPMultivariate'#
decision['05F0438N-05F0309N'] = 'TSMixerx'#

## crossvalidation main

In [13]:
# current_file_group = os.listdir('./final_cv_modelling/')
current_file_group = os.listdir('../outputs/multi_w_nn/')

In [None]:
train_steps = 500
horizon = 4*24*7 # 未來一個禮拜，每15分鐘一筆

model_list = ['MLPMultivariate', 'TSMixerx']

for gantry_name, train_df in train_df_sets.items():
    # training format
    p_df = p_df_formatter(train_df)
    
    # set exog_list
    exclude_list = ['unique_id', 'ds', 'y']
    hist_exog_list, futr_exog_list = set_aux_list(list(p_df.keys()), exclude_list)
    # stat_exog_list = None
    
    # pick model from candidate_list
    for model_name in model_list:
        if model_name == 'TSMixerx':
            model = TSMixerx(h=horizon,
                               input_size=horizon*2,
                               n_series=1,
                               n_block=4,
                               ff_dim=4,
                               revin=True,
                               scaler_type='standard',
                               max_steps=train_steps,
                               early_stop_patience_steps=-1,
                               accelerator='cpu',
                               val_check_steps=5,
                               learning_rate=1e-5,
                               loss=RMSE(),
                               valid_loss=RMSE(),
                               batch_size=32,
                               stat_exog_list=None,
                               hist_exog_list=hist_exog_list,
                               futr_exog_list=futr_exog_list
                              )
        elif model_name == 'MLPMultivariate':
            model = MLPMultivariate(h=horizon, 
                                     input_size=horizon*2,
                                     n_series=1,
                                     max_steps=train_steps,
                                     scaler_type='standard',
                                     hidden_size=256,
                                     loss=RMSE(),
                                     accelerator='cpu',
                                     stat_exog_list=None,
                                     hist_exog_list=hist_exog_list,
                                     futr_exog_list=futr_exog_list
                                    )
        else:
            print('Model not supported, please check the model name')
        
        # run model cv
        # 判斷是否已經跑過了
        output_name = f'{gantry_name}-{model_name}.csv'
        if output_name in current_file_group:
            print(f'{output_name} already complete, pass')
            continue

        print(f'Handling {model_name} ......')
        cv_df = cv_holder(model, p_df)
        cv_df.to_csv(f'../outputs/multi_w_nn/{gantry_name}-{model_name}.csv', index=False)
        print(f'Complete, check {gantry_name}-{model_name}.csv')
        
        import gc
        gc.collect()

Seed set to 1


Handling MLPMultivariate ......


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name   | Type          | Params | Mode 
-------------------------------------------------
0 | loss   | RMSE          | 0      | train
1 | padder | ConstantPad1d | 0      | train
2 | scaler | TemporalNorm  | 0      | train
3 | mlp    | ModuleList    | 16.6 M | train
4 | out    | Linear        | 172 K  | train
-------------------------------------------------
16.8 M    Trainable params
0         Non-trainable params
16.8 M    Total params
67.015    Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

In [None]:
train_steps = 500
horizon = 4*24*7 # 未來一個禮拜，每15分鐘一筆

for gantry_name, train_df in train_df_sets.items():
    p_df = p_df_formatter(train_df)
     # set exog_list
    exclude_list = ['unique_id', 'ds', 'y']
    hist_exog_list, futr_exog_list = set_aux_list(list(p_df.keys()), exclude_list)
    # stat_exog_list = None
    
    # set model
    model_name = decision[gantry_name]
    if model_name == 'TSMixerx':
        model = TSMixerx(h=horizon,
                           input_size=horizon*2,
                           n_series=1,
                           n_block=4,
                           ff_dim=4,
                           revin=True,
                           scaler_type='standard',
                           max_steps=train_steps,
                           early_stop_patience_steps=-1,
                           accelerator='cpu',
                           val_check_steps=5,
                           learning_rate=1e-5,
                           loss=RMSE(),
                           valid_loss=RMSE(),
                           batch_size=32,
                           stat_exog_list=None,
                           hist_exog_list=hist_exog_list,
                           futr_exog_list=futr_exog_list
                          )
    elif model_name == 'MLP':
        model = MLP(h=horizon, 
                     input_size=horizon*2,
                     loss=DistributionLoss(distribution='Normal', level=[80, 90]),
                     scaler_type='robust',
                     learning_rate=1e-5,
                     max_steps=train_steps,
                     val_check_steps=10,
                     early_stop_patience_steps=-1,
                     accelerator='cpu',
                     stat_exog_list=None,
                     hist_exog_list=hist_exog_list,
                     futr_exog_list=futr_exog_list
                    )
    elif model_name == 'MLPMultivariate':
        model = MLPMultivariate(h=horizon, 
                                 input_size=horizon*2,
                                 n_series=1,
                                 max_steps=train_steps,
                                 scaler_type='standard',
                                 hidden_size=256,
                                 loss=RMSE(),
                                 accelerator='cpu',
                                 stat_exog_list=None,
                                 hist_exog_list=hist_exog_list,
                                 futr_exog_list=futr_exog_list
                                )
    else:
        print('Model not supported, please check the model name')
    
    # run model cv

    # 判斷是否已經跑過了
    target_name = f'{gantry_name}-{model_name}.csv'
    if target_name in current_file_group:
        print(f'{target_name} already complete, pass')
        continue

    print(f'Handling {model_name} ......')
    cv_df = cv_holder(model, p_df)
    cv_df.to_csv(f'./final_cv_modelling/{gantry_name}-{model_name}.csv', index=False)
    print(f'Complete, check {gantry_name}-{model_name}.csv')
    import gc
    gc.collect()


Seed set to 1
Seed set to 1


05F0001N-03F0150N-MLP.csv already complete, pass
05F0001N-03F0201S-MLPMultivariate.csv already complete, pass


Seed set to 1
Seed set to 1
Seed set to 1


05F0055N-05F0001N-TSMixerx.csv already complete, pass
05F0287N-05F0055N-TSMixerx.csv already complete, pass


Seed set to 1
Seed set to 1


05F0309N-05F0287N-MLPMultivariate.csv already complete, pass
05F0438N-05F0309N-TSMixerx.csv already complete, pass


Seed set to 1


05F0438N-05FR143N-MLPMultivariate.csv already complete, pass
Handling MLP ......


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name         | Type             | Params | Mode 
----------------------------------------------------------
0 | loss         | DistributionLoss | 5      | train
1 | padder_train | ConstantPad1d    | 0      | train
2 | scaler       | TemporalNorm     | 0      | train
3 | mlp          | ModuleList       | 58.2 M | train
4 | out          | Linear           | 1.4 M  | train
----------------------------------------------------------
59.5 M    Trainable params
5         Non-trainable params
59.5 M    Total params
238.171   Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=500` reached.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Predicting: |          | 0/? [00:00<?, ?it/s]

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name         | Type             | Params | Mode 
----------------------------------------------------------
0 | loss         | DistributionLoss | 5      | train
1 | padder_train | ConstantPad1d    | 0      | train
2 | scaler       | TemporalNorm     | 0      | train
3 | mlp          | ModuleList       | 58.2 M | train
4 | out          | Linear           | 1.4 M  | train
----------------------------------------------------------
59.5 M    Trainable params
5         Non-trainable params
59.5 M    Total params
238.171   Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=500` reached.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Predicting: |          | 0/? [00:00<?, ?it/s]

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name         | Type             | Params | Mode 
----------------------------------------------------------
0 | loss         | DistributionLoss | 5      | train
1 | padder_train | ConstantPad1d    | 0      | train
2 | scaler       | TemporalNorm     | 0      | train
3 | mlp          | ModuleList       | 58.2 M | train
4 | out          | Linear           | 1.4 M  | train
----------------------------------------------------------
59.5 M    Trainable params
5         Non-trainable params
59.5 M    Total params
238.171   Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

保存到此

# Evaluation 

In [26]:
mypath = './TSMixerx_cv_results/'

df_list = []        
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
keep_cols = ['ds', 'cutoff', 'y', 'unique_id', 'TSMixerx'] # 使用時需要改動

for file in onlyfiles:
    if ('.csv' in file) & (file.startswith('05')): # 使用時需要改動
        df = pd.read_csv(join(mypath, file))
        df = df[keep_cols].copy()
        df.rename(columns={'TSMixerx':file.replace('.csv', '')}, inplace=True) # 使用時需要改動
        df_list.append(df)
        print(f'read_csv = {file}, shape = {df.shape}')

common_cols = ['ds', 'cutoff', 'y', 'unique_id']
cv_df = pd.DataFrame()
for idx, df in enumerate(df_list):
    if idx==0:
        cv_df = df.copy()
    else:
        temp_df = df.copy()
        temp_df.drop(columns=common_cols, inplace=True)
        cv_df = pd.concat([cv_df, temp_df], axis=1)

read_csv = 05F0001N-03F0150N_b_p_t_r.csv, shape = (8064, 5)
read_csv = 05F0001N-03F0201S_b_p_t.csv, shape = (8064, 5)
read_csv = 05F0055N-05F0001N_b_p_r.csv, shape = (8064, 5)
read_csv = 05F0287N-05F0055N_b_p_c.csv, shape = (8064, 5)
read_csv = 05F0309N-05F0287N_b_p.csv, shape = (8064, 5)
read_csv = 05F0438N-05F0309N_b_p_r.csv, shape = (8064, 5)
read_csv = 05F0438N-05FR143N_b.csv, shape = (8064, 5)
read_csv = 05F0528N-05F0438N_b_t_r.csv, shape = (8064, 5)


In [27]:
# 注意，因為分開建模，evaluate後的unique_id已經不具參考價值了
cv_df.keys()

Index(['ds', 'cutoff', 'y', 'unique_id', '05F0001N-03F0150N_b_p_t_r',
       '05F0001N-03F0201S_b_p_t', '05F0055N-05F0001N_b_p_r',
       '05F0287N-05F0055N_b_p_c', '05F0309N-05F0287N_b_p',
       '05F0438N-05F0309N_b_p_r', '05F0438N-05FR143N_b',
       '05F0528N-05F0438N_b_t_r'],
      dtype='object')

In [28]:
evaluation_df = evaluate(cv_df.loc[:, cv_df.columns != 'cutoff'], 
                         metrics=[mse, mae, rmse, smape])

# evaluation_df['best_model'] = evaluation_df.drop(columns=['metric', 'unique_id']).idxmin(axis=1)
# evaluation_df.to_csv('./cv_results/uni_holi_cong_tra_eval.csv', index=False)

In [29]:
evaluation_df

Unnamed: 0,unique_id,metric,05F0001N-03F0150N_b_p_t_r,05F0001N-03F0201S_b_p_t,05F0055N-05F0001N_b_p_r,05F0287N-05F0055N_b_p_c,05F0309N-05F0287N_b_p,05F0438N-05F0309N_b_p_r,05F0438N-05FR143N_b,05F0528N-05F0438N_b_t_r
0,05F0001N-03F0150N,mse,118.411638,28624.791845,44829.345858,1296254.0,257764.69303,606520.487885,3197.829664,82698.236487
1,05F0001N-03F0150N,mae,2.848443,168.68253,210.097928,1112.471,441.726797,753.270063,56.057632,287.289659
2,05F0001N-03F0150N,rmse,10.881711,169.188628,211.729417,1138.532,507.705321,778.794253,56.549356,287.573011
3,05F0001N-03F0150N,smape,0.023991,0.621339,0.669859,0.9118233,0.767933,0.873988,0.353523,0.736297


# History check

因為目前neuralforecasts有bug，我只能用cross_validation的方式來確認問題在哪  
predict的step_size  
有點糟糕但是只能這樣做，cv會延長整個訓練與驗證的時間  

In [7]:
path = './final_cv_modelling/'
current_file_group = os.listdir(path)
final_cv_results = dict()

for file in current_file_group:
    if '.csv' not in file:
        continue
    file_path = join(path, file)
    print(file_path)
    final_cv_results[file] = pd.read_csv(file_path)

./final_cv_modelling/05F0001N-03F0201S-MLPMultivariate.csv
./final_cv_modelling/05F0309N-05F0287N-MLPMultivariate.csv
./final_cv_modelling/05F0438N-05F0309N-TSMixerx.csv
./final_cv_modelling/05F0001N-03F0150N-MLP.csv
./final_cv_modelling/05F0055N-05F0001N-TSMixerx.csv
./final_cv_modelling/05F0287N-05F0055N-TSMixerx.csv
./final_cv_modelling/05F0438N-05FR143N-MLPMultivariate.csv
./final_cv_modelling/05F0528N-05F0438N-MLP.csv


In [10]:
final_cv_results['05F0001N-03F0201S-MLPMultivariate.csv'].head()

Unnamed: 0,unique_id,ds,cutoff,MLPMultivariate,y
0,05F0001N-03F0201S,2023-01-16 00:00:00,2023-01-15 23:45:00,204.58502,200.2524
1,05F0001N-03F0201S,2023-01-16 00:15:00,2023-01-15 23:45:00,205.03546,195.920635
2,05F0001N-03F0201S,2023-01-16 00:30:00,2023-01-15 23:45:00,166.00372,197.14245
3,05F0001N-03F0201S,2023-01-16 00:45:00,2023-01-15 23:45:00,190.01875,198.444444
4,05F0001N-03F0201S,2023-01-16 01:00:00,2023-01-15 23:45:00,194.5278,210.333333


In [12]:
for file_name, df in final_cv_results.items():
    # append original feautures on df, so we can examine the results at the final evaluation
    gantry_name = file_name[0:17]
    all_feature_df = all_feature_dfs[gantry_name].copy()
    full_cv_results = df.merge(all_feature_df, left_on='ds', right_on='TimeStamp', how='left')
    full_cv_results.to_csv(f'./final_cv_modelling_w_features/{file_name}-w-features.csv', index=False)

05F0001N-03F0201S
05F0309N-05F0287N
05F0438N-05F0309N
05F0001N-03F0150N
05F0055N-05F0001N
05F0287N-05F0055N
05F0438N-05FR143N
05F0528N-05F0438N


In [11]:
all_feature_dfs.keys()

dict_keys(['05F0001N-03F0150N', '05F0001N-03F0201S', '05F0055N-05F0001N', '05F0287N-05F0055N', '05F0309N-05F0287N', '05F0438N-05F0309N', '05F0438N-05FR143N', '05F0528N-05F0438N'])

In [4]:
all_feature_dfs['05F0528N-05F0438N'].keys()

Index(['gf_gt', 'GantryFrom', 'GantryTo', 'TimeStamp', 'WeightedAvgTravelTime',
       'TotalTraffic', 'ds_prev_1_WATT', 'ds_prev_2_WATT', 'ds_prev_3_WATT',
       'ds_prev_4_WATT', 'ds_prev_5_WATT', 'congestion_syndrome',
       'holiday_continue', 'holiday_length', 'dayofweek', 'holiday_name_七夕情人節',
       'holiday_name_中元節', 'holiday_name_中秋節', 'holiday_name_二二八紀念日',
       'holiday_name_元宵節', 'holiday_name_兒童與清明節', 'holiday_name_勞動節',
       'holiday_name_國慶日', 'holiday_name_教師節', 'holiday_name_母親節',
       'holiday_name_父親節', 'holiday_name_白色情人節', 'holiday_name_端午節',
       'holiday_name_聖誕節', 'holiday_name_萬聖節', 'holiday_name_西洋情人節',
       'holiday_name_跨年元旦', 'holiday_name_農曆新年', 'accident_mileage',
       'event_occurrence', 'event_exclusion', 'handling_minutes',
       'accident_type', 'death_count', 'injuries_count', 'inner_shoulder_flag',
       'inner_lane_flag', 'middle_inner_lane_flag', 'middle_lane_flag',
       'middle_outer_lane_flag', 'outer_lane_flag', 'outer_should

# Evaluation not used

In [None]:
以下是要用來做一些研究用的，像是怎麼樣更好視覺化各個因子的表現

In [59]:
import pandas as pd
import numpy as np
import os
from utilsforecast.evaluation import evaluate
from utilsforecast.plotting import plot_series
from utilsforecast.losses import mse, mae, rmse, smape

In [3]:
eval_path = './base_cv_results/'

In [73]:
for file in os.listdir(eval_path):
    if 'cv_results.csv' in file:
        print(file)

uni_holi_cong_cv_results.csv
uni_holi_cong_tra_cv_results.csv
uni_holi_cv_results.csv
uni_congi_cv_results.csv
uni_dsprev_cv_results.csv
uni_base_cv_results.csv
