In [1]:
from datetime import time

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from IPython.display import IFrame, Image

from entur_collector.dataanalysis.deviationssampler import refine_deviations

In [2]:
# Convenience function to allow changing between interactive and static images
# Needed to show figures on GitHub notebook preview
plotly_interactive = True


def show_figure(fig: go.Figure):
    """Return ``fig`` in the form this notebook should display.

    Use it as the last expression of a cell so Jupyter renders the result.

    Args:
        fig: The Plotly figure to show.

    Returns:
        ``fig`` itself when ``plotly_interactive`` is true (interactive
        widget); otherwise an ``IPython.display.Image`` wrapping a PNG
        rendered from the figure at 2x scale.
    """
    if plotly_interactive:
        return fig
    # NOTE(review): Plotly's static export normally needs an image engine
    # (e.g. kaleido) installed in the kernel environment -- confirm before
    # flipping `plotly_interactive` to False.
    return Image(data=fig.to_image(format='png', scale=2))

In [3]:
# Load the deviation samples through the project's refine_deviations() helper.
# NOTE(review): presumably returns a pandas DataFrame (later cells call
# .tail() and .groupby() on it) -- confirm the schema against the helper.
df = refine_deviations()

In [4]:
# Show the last rows of the frame as a quick check of its columns and values.
df.tail()

Unnamed: 0,aimed_arrival,timestamp,expected_arrival,expected_delay,day_of_week,time_of_day,month,day_number,day_since_start
1067,2025-10-29 06:34:00+01:00,2025-10-29 05:35:51+01:00,2025-10-29 06:38:22+01:00,0 days 00:04:22,2,06:34:00,10,332,331
1068,2025-10-29 07:04:00+01:00,2025-10-29 06:19:16+01:00,2025-10-29 07:21:32+01:00,0 days 00:17:32,2,07:04:00,10,332,331
1069,2025-10-29 07:34:00+01:00,2025-10-29 06:40:12+01:00,2025-10-29 07:43:04+01:00,0 days 00:09:04,2,07:34:00,10,332,331
1070,2025-10-29 08:04:00+01:00,2025-10-29 07:22:46+01:00,2025-10-29 08:25:58+01:00,0 days 00:21:58,2,08:04:00,10,332,331
1071,2025-10-29 08:34:00+01:00,2025-10-29 07:47:39+01:00,2025-10-29 08:51:35+01:00,0 days 00:17:35,2,08:34:00,10,332,331


In [5]:
# Scatter of every sampled delay over the collection period, coloured by the
# `time_of_day` column.
delay_minutes = df["expected_delay"].dt.total_seconds() / 60  # timedelta -> minutes

fig = px.scatter(df, x="aimed_arrival", y=delay_minutes, color="time_of_day")
fig.update_layout(title_text="Estimated delay throughout time period")

fig.update_yaxes(
    title_text="Estimated Delay (minutes)",
    showgrid=True,
    gridwidth=1,
    gridcolor='lightgray',
)

# One tick per calendar month, anchored on the first of the month. Using
# dtick="M1" instead of a hard-coded list of dates keeps the axis correct when
# the collected data grows past the months that were listed by hand.
fig.update_xaxes(
    title_text="Day",
    showgrid=True,
    gridwidth=1,
    gridcolor='lightgray',
    tick0="2024-12-01",
    dtick="M1",
)

show_figure(fig)

In [6]:
# Mean delay in minutes for every (day_of_week, time_of_day) combination.
# The box plot below works on the raw samples and does not read this table.
mean_delays = (
    df.groupby(["day_of_week", "time_of_day"])["expected_delay"].mean().dt.total_seconds() / 60
).reset_index()

# Distribution of delays per weekday, with one box per time_of_day value.
fig = px.box(
    df,
    x="day_of_week",
    y=df["expected_delay"].dt.total_seconds() / 60,  # timedelta -> minutes
    color="time_of_day",
    labels={'day_of_week': 'Day of Week', 'time_of_day': 'Time of Day'},
    title='Delay by Day of Week and Time of Day',
)

fig.update_yaxes(
    title_text="Estimated Delay (minutes)",
    showgrid=True,
    gridwidth=1,
    gridcolor='lightgray',
    dtick=10,
    zeroline=True,
    zerolinewidth=1,
    zerolinecolor='lightgray',
)

# day_of_week is 0-based starting on Monday (2025-10-29, a Wednesday, is 2 in
# the data). All seven names are listed so weekend samples would be labelled
# too; ticks outside the plotted range are simply not drawn.
day_labels = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
fig.update_xaxes(
    title_text="",
    ticktext=day_labels,
    tickvals=list(range(len(day_labels))),
)

show_figure(fig)

In [7]:
# For a given time of day, plot the delay on a week-by-week basis, with one series per day of week

# Select a specific time of day (you can change this)
# Available times: time(6, 34), time(7, 4), time(7, 34), time(8, 4), time(8, 34)
selected_time = time(8, 4)  # Change this to any time_of_day from the data

# Filter data for the selected time (copy so the new columns below do not
# touch the shared `df`)
df_filtered = df[df['time_of_day'] == selected_time].copy()

# Ensure aimed_arrival is datetime (handle timezone-aware datetimes)
if not pd.api.types.is_datetime64_any_dtype(df_filtered['aimed_arrival']):
    df_filtered['aimed_arrival'] = pd.to_datetime(df_filtered['aimed_arrival'], utc=True)

# Add week number and year-week for grouping.
# Both parts must come from the ISO calendar: pairing the ISO week number with
# the plain calendar year mislabels the days around New Year (2024-12-30 is in
# ISO week 1 of 2025, not "2024-W01") and would split that week in two.
iso_calendar = df_filtered['aimed_arrival'].dt.isocalendar()
df_filtered['week_number'] = iso_calendar['week']
df_filtered['year'] = iso_calendar['year']  # ISO year, matches week_number
df_filtered['year_week'] = (
    df_filtered['year'].astype(str)
    + '-W'
    + df_filtered['week_number'].astype(str).str.zfill(2)
)

# Convert delay to minutes
df_filtered['delay_minutes'] = df_filtered['expected_delay'].dt.total_seconds() / 60

# Group by year-week and day of week, calculate mean delay
weekly_delays = (
    df_filtered.groupby(['year_week', 'day_of_week'])['delay_minutes'].mean().reset_index()
)

# Earliest observed arrival in each week, used as the shared x position for
# every weekday series of that week (not necessarily a Monday if that day has
# no samples)
week_dates = df_filtered.groupby('year_week')['aimed_arrival'].min().reset_index()
week_dates.columns = ['year_week', 'week_start']

# Merge to get dates
weekly_delays = weekly_delays.merge(week_dates, on='year_week')

# Create the plot: one line per day of week, one point per week
fig = go.Figure()

day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
colors = px.colors.qualitative.Plotly

for day in sorted(weekly_delays['day_of_week'].unique()):
    df_day = weekly_delays[weekly_delays['day_of_week'] == day].sort_values('week_start')

    fig.add_trace(go.Scatter(
        x=df_day['week_start'],
        y=df_day['delay_minutes'],
        mode='lines+markers',
        # Fall back to a generic label if the data holds an unexpected day index
        name=day_names[day] if day < len(day_names) else f'Day {day}',
        line=dict(color=colors[day % len(colors)], width=2),
        marker=dict(size=6),
        hovertemplate='Week: %{x|%Y-%m-%d}<br>Delay: %{y:.1f} min<extra></extra>'
    ))

fig.update_layout(
    title=f'Weekly Average Delay by Day of Week (Time: {selected_time.strftime("%H:%M")})',
    xaxis_title='Week',
    yaxis_title='Average Delay (minutes)',
    hovermode='x unified',
    yaxis=dict(
        showgrid=True,
        gridwidth=1,
        gridcolor='lightgray',
        zeroline=True,
        zerolinewidth=1,
        zerolinecolor='gray'
    ),
    xaxis=dict(
        showgrid=True,
        gridwidth=1,
        gridcolor='lightgray'
    ),
    legend=dict(
        title='Day of Week',
        yanchor='top',
        y=0.99,
        xanchor='right',
        x=0.99
    )
)

show_figure(fig)