In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd

# Human-readable dataset label and the identifier used as the parquet file name.
data_set = "Rotterdam"
lau_id = "GM1969"

# Load the records and re-express both timestamp columns in the
# Europe/Amsterdam time zone (tz_convert expects timezone-aware values).
df_parquet = pd.read_parquet(
    f"/data/dev/bus-benchmark-new/data/parquet_data/{lau_id}.parquet"
).assign(
    from_time=lambda frame: frame['from_time'].dt.tz_convert('Europe/Amsterdam'),
    to_time=lambda frame: frame['to_time'].dt.tz_convert('Europe/Amsterdam'),
)
df_parquet

In [None]:
# Work on the single most frequent route; when several routes tie, the first
# entry returned by mode() is used.
most_common_routes = df_parquet['route'].mode()
route = most_common_routes[0]

# Keep only the rows that belong to that route.
df = df_parquet.loc[df_parquet["route"].eq(route)]

# Label combining the dataset name and the selected route.
data_set_name = f"{data_set}_{route}"
data_set_name

In [None]:
import matplotlib.pyplot as plt

def _draw_panel(ax, series, title, ylabel, xlabel=None):
    """Plot `series` on `ax`, then apply title, optional x-label, y-label and grid."""
    series.plot(ax=ax)
    ax.set_title(title)
    if xlabel is not None:
        ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.grid(True)


# Independent frame indexed by `from_time` (column kept) so it can be resampled.
df_time = df.set_index('from_time', drop=False)

# Six stacked panels: weekly/monthly row counts, mean and std of travel_time.
fig, axes = plt.subplots(6, 1, figsize=(15, 30))

weekly_counts = df_time.resample('W').size()
_draw_panel(axes[0], weekly_counts, 'Weekly Number of Data Points', 'Count')

monthly_counts = df_time.resample('M').size()
_draw_panel(axes[1], monthly_counts, 'Monthly Number of Data Points', 'Count', xlabel='Date')

weekly_mean_travel = df_time.resample('W')['travel_time'].mean()
_draw_panel(axes[2], weekly_mean_travel, 'Weekly Mean Travel Time', 'Time (seconds)')

monthly_mean_travel = df_time.resample('M')['travel_time'].mean()
_draw_panel(axes[3], monthly_mean_travel, 'Monthly Mean Travel Time', 'Time (seconds)', xlabel='Date')

weekly_std_travel = df_time.resample('W')['travel_time'].std()
_draw_panel(axes[4], weekly_std_travel, 'Weekly Std Travel Time', 'Time (seconds)')

monthly_std_travel = df_time.resample('M')['travel_time'].std()
_draw_panel(axes[5], monthly_std_travel, 'Monthly Std Travel Time', 'Time (seconds)', xlabel='Date')

# NOTE(review): the figure title only mentions data-point counts although the
# lower four panels show travel-time statistics — consider rewording.
fig.suptitle(f'Number of Data Points Over Time for Route {route} in {data_set}', fontsize=16)
fig.tight_layout()
# Leave headroom at the top so the figure title does not overlap the first panel.
fig.subplots_adjust(top=0.95)
plt.show()

In [None]:
from bus_benchmark.experiments.fixed_interval_dataset import FixedIntervalDataset

# Settings for the exploratory dataset: time features, residuals and
# normalisation are switched off.
# NOTE(review): 3*1.4826 presumably means "3 scaled MADs" (1.4826 being the
# usual MAD-to-sigma factor) — confirm against FixedIntervalDataset.
fixed_interval_kwargs = dict(
    freq="15T",
    drop_ha_below_n_count=100,
    ha_agg_func="median",
    ffill_limit=16,
    mad_thresh=3*1.4826,
    interpolate_ha=True,
    add_time_feat=False,
    calculate_residuals=False,
    normalize=False,
)

# Build the dataset from the selected route's rows and display its repr.
fixed_interval_dataset = FixedIntervalDataset(df, **fixed_interval_kwargs)
fixed_interval_dataset

In [None]:
# Report the size of every train/val/test split and how many travel_time
# values are missing in the validation part.
for split in fixed_interval_dataset.splits:
    train, val, test = split
    print(f'Train: {len(train)}, Val: {len(val)}, Test: {len(test)}')
    print(f'NaN: {val["travel_time"].isna().sum()}')

In [None]:
from bus_benchmark.experiments.multi_step_dataset import MultiStepDataset

# For every prepared split: print frame shapes and index ranges, then wrap each
# partition in a MultiStepDataset (seq_len=16) and print the resulting lengths.
for split_bundle in fixed_interval_dataset.data_set:
    # df_ha, scaler and absolut_matrix_train are unpacked but not used in this cell.
    df_train, df_val, df_test, df_ha, scaler, absolut_matrix_train = split_bundle

    print(f'Train: {df_train.shape}, Val: {df_val.shape}, Test: {df_test.shape}')
    print(f'Train timestamps: {df_train.index[0]} to {df_train.index[-1]}')
    print(f'Val timestamps: {df_val.index[0]} to {df_val.index[-1]}')
    print(f'Test timestamps: {df_test.index[0]} to {df_test.index[-1]}')

    # Constructed in train, val, test order.
    ds_train, ds_val, ds_test = (
        MultiStepDataset(part, seq_len=16)
        for part in (df_train, df_val, df_test)
    )

    print(f'Train MSD: {len(ds_train)}, Val: {len(ds_val)}, Test: {len(ds_test)}')


In [None]:
from bus_benchmark.experiments.model.lstm_model_trainer import LSTMModelTrainer

# Dataset settings for the LSTM run: five splits; calculate_residuals and
# normalize are left at the FixedIntervalDataset defaults.
lstm_dataset_kwargs = dict(
    n_splits=5,
    freq="15T",
    drop_ha_below_n_count=100,
    ha_agg_func="median",
    ffill_limit=16,
    mad_thresh=3*1.4826,
    interpolate_ha=True,
    add_time_feat=False,
)
fixed_interval_dataset = FixedIntervalDataset(df, **lstm_dataset_kwargs)

# Trainer settings: 16-step input sequences, 50 epochs.
lstm_trainer_kwargs = dict(
    base_dataset=fixed_interval_dataset,
    seq_len=16,
    n_epochs=50,
    data_set_name=data_set_name,
)
fixed_interval_model_trainer = LSTMModelTrainer(**lstm_trainer_kwargs)

# NOTE(review): _setup_seed is a private method and is invoked after the
# trainer is constructed — confirm this is early enough for reproducibility.
fixed_interval_model_trainer._setup_seed(42)
fixed_interval_model_trainer.run_experiment()

# Display the metrics collected by the run.
fixed_interval_model_trainer.metrics_manager.final_metrics_list

In [None]:
from bus_benchmark.experiments.model.convlstm_model_trainer import ConvLSTMModelTrainer

# Dataset settings for the ConvLSTM run: five splits; calculate_residuals and
# normalize are left at the FixedIntervalDataset defaults.
convlstm_dataset_kwargs = dict(
    n_splits=5,
    freq="15T",
    drop_ha_below_n_count=100,
    ha_agg_func="median",
    ffill_limit=16,
    mad_thresh=3*1.4826,
    interpolate_ha=True,
    add_time_feat=False,
)
fixed_interval_dataset = FixedIntervalDataset(df, **convlstm_dataset_kwargs)

# Trainer settings: 16-step input sequences, 50 epochs.
convlstm_trainer_kwargs = dict(
    base_dataset=fixed_interval_dataset,
    seq_len=16,
    n_epochs=50,
    data_set_name=data_set_name,
)
fixed_interval_model_trainer = ConvLSTMModelTrainer(**convlstm_trainer_kwargs)

# Seed for reproducibility.
# NOTE(review): _setup_seed is a private method and is invoked after the
# trainer is constructed — confirm this is early enough.
fixed_interval_model_trainer._setup_seed(42)
fixed_interval_model_trainer.run_experiment()

# Display the metrics collected by the run.
fixed_interval_model_trainer.metrics_manager.final_metrics_list