In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
from sqlalchemy import create_engine, text

# Database connection used by the queries below.
# The URL is taken from the BISON_DATABASE_URL environment variable when it is
# set, so real credentials do not have to be written into the notebook; the
# literal is only the fallback used when the variable is absent.
engine = create_engine(
    os.environ.get(
        "BISON_DATABASE_URL",
        "postgresql+psycopg2://redacted:redacted@db:5432/bison",
    )
)

In [None]:
# Font configuration for exported figures, chosen so that papercept does not
# complain about the fonts embedded in the PDFs.

import matplotlib.font_manager

# Register the TeX Gyre Termes OpenType file with matplotlib's font manager so
# it can be picked up by family name in the serif list below.
matplotlib.font_manager.fontManager.addfont(
    '/usr/share/texmf/fonts/opentype/public/tex-gyre/texgyretermes-regular.otf'
)

plt.rcParams.update({
    # 42 selects Type 42 (TrueType) font embedding for PDF/PS output.
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
    # Serif text; the list is tried in order until an installed family is found.
    'font.family': 'serif',
    'font.serif': ['Times New Roman', 'TeX Gyre Termes', 'Times'],
})

In [None]:
# Travel-time histogram: the bucketing is done in the database, only the
# per-bucket counts are pulled into pandas and plotted.

# Range and resolution of the histogram, in the units of `diff` (seconds).
bins_min = 0
bins_max = 300
bin_width = 5
# The query asks for `bins_count` equal-width buckets over [bins_min, bins_max).
# The edges computed below only line up with those buckets when the range is a
# whole multiple of bin_width, so fail loudly instead of plotting shifted bars.
assert (bins_max - bins_min) % bin_width == 0, \
    "bin_width must evenly divide bins_max - bins_min"
bins_count = (bins_max - bins_min) // bin_width

hist_sql = text("""
    SELECT
        width_bucket(diff, :min, :max, :bins_count) AS bin,
        COUNT(*) AS freq
    FROM (
        SELECT extract(epoch FROM (to_time - from_time)) AS diff
        FROM travel_time_final
    ) sub
    GROUP BY bin
    ORDER BY bin;
""")

hist_df = pd.read_sql_query(
    hist_sql,
    con=engine,
    params={"min": bins_min, "max": bins_max, "bins_count": bins_count}
)
# width_bucket() reports out-of-range values as bucket 0 (below :min) or
# bins_count + 1 (at or above :max); keep only the in-range buckets.
# .copy() gives an independent frame, so the column assignments below do not
# write into a filtered selection of the query result (which can trigger
# pandas' chained-assignment warning).
hist_df = hist_df[(hist_df['bin'] > 0) & (hist_df['bin'] <= bins_count)].copy()

# Bucket numbers are 1-based; convert them to edges on the data axis.
# bins_min is added so the edges stay correct if the range does not start at 0.
hist_df['left_edge'] = bins_min + (hist_df['bin'] - 1) * bin_width
hist_df['right_edge'] = hist_df['left_edge'] + bin_width
hist_df['center'] = hist_df['left_edge'] + bin_width / 2

fig, ax = plt.subplots(figsize=(4, 2))
ax.bar(
    hist_df['center'],
    hist_df['freq'] / 1000000,  # plotted in millions, matching the y label
    width=bin_width,
    align='center',
    color='#4E79A7',
)
ax.set_xlabel('Travel time (s)')
ax.set_ylabel('Frequency (millions)')
ax.ticklabel_format(
    axis='y',
    style='scientific',
    useOffset=False
)
fig.tight_layout(pad=0)
fig.savefig('travel_time_histogram.pdf', dpi=300, bbox_inches="tight", pad_inches=0)
plt.show()

In [None]:
# Dwell-time histogram: the bucketing is done in the database, only the
# per-bucket counts are pulled into pandas and plotted.

# Range and resolution of the histogram, in the units of `diff` (seconds).
bins_min = 0
bins_max = 300
bin_width = 5
# The query asks for `bins_count` equal-width buckets over [bins_min, bins_max).
# The edges computed below only line up with those buckets when the range is a
# whole multiple of bin_width, so fail loudly instead of plotting shifted bars.
assert (bins_max - bins_min) % bin_width == 0, \
    "bin_width must evenly divide bins_max - bins_min"
bins_count = (bins_max - bins_min) // bin_width

# Unlike the travel-time query, rows with a non-positive duration are dropped
# in SQL (WHERE diff > 0) before bucketing.
hist_sql = text("""
    SELECT
        width_bucket(diff, :min, :max, :bins_count) AS bin,
        COUNT(*) AS freq
    FROM (
        SELECT extract(epoch FROM (to_time - from_time)) AS diff
        FROM dwell_time_final
    ) sub
    WHERE diff > 0
    GROUP BY bin
    ORDER BY bin;
""")

hist_df = pd.read_sql_query(
    hist_sql,
    con=engine,
    params={"min": bins_min, "max": bins_max, "bins_count": bins_count}
)
# width_bucket() reports out-of-range values as bucket 0 (below :min) or
# bins_count + 1 (at or above :max); keep only the in-range buckets.
# .copy() gives an independent frame, so the column assignments below do not
# write into a filtered selection of the query result (which can trigger
# pandas' chained-assignment warning).
hist_df = hist_df[(hist_df['bin'] > 0) & (hist_df['bin'] <= bins_count)].copy()

# Bucket numbers are 1-based; convert them to edges on the data axis.
# bins_min is added so the edges stay correct if the range does not start at 0.
hist_df['left_edge'] = bins_min + (hist_df['bin'] - 1) * bin_width
hist_df['right_edge'] = hist_df['left_edge'] + bin_width
hist_df['center'] = hist_df['left_edge'] + bin_width / 2

fig, ax = plt.subplots(figsize=(4, 2))
ax.bar(
    hist_df['center'],
    hist_df['freq'] / 1000000,  # plotted in millions, matching the y label
    width=bin_width,
    align='center',
    color='#4E79A7',
)
ax.set_xlabel('Dwell time (s)')
ax.set_ylabel('Frequency (millions)')
ax.ticklabel_format(axis='y',
                    style='scientific',
                    useOffset=False)
fig.tight_layout(pad=0)
fig.savefig('dwell_time_histogram.pdf', dpi=300, bbox_inches="tight", pad_inches=0)
plt.show()

In [None]:
import datetime as dt
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

def calculate_walk_forward_splits(start_date, end_date, n_splits):
    """Build expanding-window (walk-forward) train/test date ranges.

    The inclusive range ``[start_date, end_date]`` is cut into ``n_splits + 1``
    windows of equal length (whole days, rounded down). Split ``i`` trains on
    everything from ``start_date`` up to the end of window ``i`` and tests on
    window ``i + 1``. The last test window is stretched to ``end_date`` so the
    days left over by the integer division are not dropped.

    Args:
        start_date: First day covered (``datetime.date`` or ``datetime``).
        end_date: Last day covered, inclusive.
        n_splits: Number of train/test splits to produce.

    Returns:
        A list with ``n_splits`` dicts. Each dict has the keys ``'train'`` and
        ``'test'``, both mapping to an inclusive ``(first_day, last_day)``
        tuple. An empty list is returned for ``n_splits == 0``.

    Raises:
        ValueError: If ``n_splits`` is negative, or if the date range is too
            short (or reversed) to give every window at least one day.
    """
    if n_splits < 0:
        raise ValueError(f"n_splits must be non-negative, got {n_splits}")

    total_days = (end_date - start_date).days + 1
    window_size = total_days // (n_splits + 1)
    # A window of zero (or negative) days would put train_end before
    # start_date and produce inverted intervals, so reject it up front.
    if n_splits > 0 and window_size < 1:
        raise ValueError(
            f"date range of {total_days} day(s) is too short for "
            f"{n_splits} split(s)"
        )

    one_day = dt.timedelta(days=1)
    window = dt.timedelta(days=window_size)

    splits = []
    for i in range(1, n_splits + 1):
        test_start = start_date + window * i
        train_end = test_start - one_day
        if i == n_splits:
            # Final split absorbs the remainder of the range.
            test_end = end_date
        else:
            test_end = test_start + window - one_day
        splits.append({
            'train': (start_date, train_end),
            'test':  (test_start, test_end)
        })
    return splits

def plot_splits_no_val(splits, figsize=(6, 4)):
    """Draw train/test date ranges as a horizontal bar chart and save it.

    One row is drawn per split (first split at the top). Each row shows the
    training range and the test range as two bars positioned on a date axis.
    The figure is written to ``time_series_split.pdf`` and then shown.

    Args:
        splits: Sequence of dicts with ``'train'`` and ``'test'`` keys, each
            holding an inclusive ``(first_day, last_day)`` pair of dates.
        figsize: Figure size in inches, passed to ``plt.subplots``.
    """
    # (key in the split dict, bar colour, legend label), in drawing order.
    segments = (
        ('train', '#4E79A7', 'Training'),
        ('test', '#E15759', 'Test'),
    )
    n_rows = len(splits)

    fig, ax = plt.subplots(figsize=figsize)
    for row, split in enumerate(splits):
        for key, colour, legend_label in segments:
            first_day, last_day = split[key]
            ax.barh(
                row,
                (last_day - first_day).days + 1,   # inclusive length in days
                left=mdates.date2num(first_day),
                height=0.8,                        # thick bars, small gap between rows
                color=colour,
                # only the first row carries labels, so the legend has one entry per kind
                label=legend_label if row == 0 else '',
            )

    # Row labels, with the first split at the top.
    ax.set_yticks(range(n_rows))
    ax.set_yticklabels([f"Split {row + 1}" for row in range(n_rows)])
    ax.invert_yaxis()

    # Bars are placed in matplotlib date numbers; tick once per year.
    ax.xaxis.set_major_locator(mdates.YearLocator())
    ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m"))
    plt.xticks(rotation=0, ha='center')

    ax.set_xlabel("Date")
    ax.legend(
        frameon=False,
        loc='upper right',
        handlelength=2,
        handleheight=1
    )

    plt.tight_layout()
    plt.savefig('time_series_split.pdf', dpi=300, bbox_inches="tight", pad_inches=0)
    plt.show()

# Date range 2022-01-01 .. 2024-12-31, cut into five walk-forward splits and
# drawn at the same figure size as the histograms above.
start = dt.date(year=2022, month=1, day=1)
end = dt.date(year=2024, month=12, day=31)
splits = calculate_walk_forward_splits(start_date=start, end_date=end, n_splits=5)
plot_splits_no_val(splits=splits, figsize=(4, 2))
