### Load input data

In [50]:
import pandas as pd

# Names of the datasets to load; each maps to data/data_<name>.csv.bz2.
data = ["est_hourly", "est_daily", "emp_hourly", "emp_daily"]

# Read every dataset into a dict keyed by dataset name, using the first CSV
# column as the DataFrame index.
dfs = dict(
    (name, pd.read_csv(f"data/data_{name}.csv.bz2", index_col=0))
    for name in data
)

### Group message types with negligible contribution to network traffic into 'other' type

- Add `inout` column, corresponding to the sum of the `in` and `out` flows
- Group message types whose maximum `inout` traffic contribution across the measurement period is below a certain threshold
- Add normalized versions of the `in` and `out` columns

In [51]:
def recategorize_by_threshold(df_orig: pd.DataFrame, threshold: float) -> pd.DataFrame:
    """
    Collapse low-traffic message types into a single 'other' message type.

    1. Compute the maximum 'inout' value for each msg_type across the entire dataset.
    2. Keep every msg_type whose maximum is strictly greater than `threshold`.
    3. For the remaining msg_types, group by 'timestamp' and sum 'in', 'out', and
       'inout' into rows labelled 'other'.
    4. Return a new DataFrame with columns [timestamp, msg_type, in, out, inout],
       sorted by timestamp and then msg_type.

    Args:
        df_orig: Frame with 'msg_type', 'in', 'out' and 'inout' columns. A
            'timestamp' column must exist after `reset_index()`, e.g. because
            the index is named 'timestamp'. The frame is not modified.
        threshold: Lower bound (exclusive) on a msg_type's maximum 'inout'
            for it to be kept under its own name.

    Returns:
        A new DataFrame as described in step 4.
    """
    # Operate on the argument rather than on whatever global `df` happens to be
    # bound when the function is called.
    max_inout_by_type = df_orig.groupby("msg_type")["inout"].max()
    keep_types = max_inout_by_type[max_inout_by_type > threshold].index

    # Boolean indexing returns new frames, so the caller's frame stays untouched;
    # reset_index() turns the index into a regular column for grouping/sorting.
    is_kept = df_orig["msg_type"].isin(keep_types)
    df_keep = df_orig[is_kept].reset_index()
    df_other = df_orig[~is_kept].reset_index()

    # One 'other' row per timestamp, summing all sub-threshold msg_types.
    df_other_agg = df_other.groupby("timestamp", as_index=False)[
        ["in", "out", "inout"]
    ].sum()
    df_other_agg["msg_type"] = "other"

    result_df = pd.concat([df_keep, df_other_agg], ignore_index=True)
    result_df = result_df.sort_values(by=["timestamp", "msg_type"])
    return result_df[["timestamp", "msg_type", "in", "out", "inout"]]


# Per-dataset 'inout' cut-off: msg_types whose maximum 'inout' does not exceed
# the value are merged into 'other'. Only the datasets listed here are processed.
thresholds = dict(
    est_daily=50_000_000,
    est_hourly=2_000_000,
)

for name, threshold in thresholds.items():
    # Add the combined in+out flow to the loaded frame before recategorizing.
    df = dfs[name]
    df["inout"] = df["in"] + df["out"]

    df = recategorize_by_threshold(df, threshold)

    # Share of each direction in the combined flow of the same row.
    total = df["inout"]
    df = df.assign(in_norm=df["in"] / total, out_norm=df["out"] / total)

    dfs[name] = df

## TODO:
- [ ] Validate outflow with new `nix-bitcoin-monitor` approach

## Plot daily in (absolute)

In [None]:
import plotly.express as px
import plotly.graph_objects as go

# Stacked area of the per-msg_type 'in' series from the est_daily frame.
fig = px.area(
    dfs["est_daily"],
    x="timestamp",
    y="in",
    color="msg_type",
    title="Stacked Area Chart of daily 'in' over Time by msg_type",
    # Label keys must name the columns actually plotted; the previous
    # 'net_size' / 'hour_groups' keys matched nothing and had no effect.
    labels={"in": "Net Size", "timestamp": "Time", "msg_type": "Message Type"},
)

# Overlay the emp_daily 'in' series as a single black line. emp_daily is not
# recategorized, so its x values are still the index set at load time.
fig.add_trace(
    go.Scatter(
        x=dfs["emp_daily"].index,
        y=dfs["emp_daily"]["in"],
        mode="lines",
        name="emp_daily",
        line=dict(color="black", width=2),
    )
)

fig.update_layout(
    xaxis_title="Time",
    yaxis_title="Net Size",
    legend_title="Message Type",
    # Horizontal legend placed just above the plotting area.
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
    ),
)
# SI-prefixed tick labels with a 'B' suffix (e.g. 50MB).
fig.update_yaxes(tickformat=".2s", ticksuffix="B")
fig.show()

## Plot daily out (absolute)

In [None]:
import plotly.express as px
import plotly.graph_objects as go

# Stacked area of the per-msg_type 'out' series from the est_daily frame.
fig = px.area(
    dfs["est_daily"],
    x="timestamp",
    y="out",
    color="msg_type",
    title="Stacked Area Chart of daily 'out' over Time by msg_type",
    # Label keys must name the columns actually plotted; the previous
    # 'net_size' / 'hour_groups' keys matched nothing and had no effect.
    labels={"out": "Net Size", "timestamp": "Time", "msg_type": "Message Type"},
)

# Overlay the emp_daily 'out' series as a single black line. emp_daily is not
# recategorized, so its x values are still the index set at load time.
fig.add_trace(
    go.Scatter(
        x=dfs["emp_daily"].index,
        y=dfs["emp_daily"]["out"],
        mode="lines",
        name="emp_daily",
        line=dict(color="black", width=2),
    )
)

fig.update_layout(
    xaxis_title="Time",
    yaxis_title="Net Size",
    legend_title="Message Type",
    # Horizontal legend placed just above the plotting area.
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
    ),
)
# SI-prefixed tick labels with a 'B' suffix (e.g. 50MB).
fig.update_yaxes(tickformat=".2s", ticksuffix="B")
fig.show()

## Out hourly (should peak near getrawaddrman?)

In [None]:
import plotly.express as px
import plotly.graph_objects as go

# Stacked area of the per-msg_type 'out' series from the est_hourly frame.
fig = px.area(
    dfs["est_hourly"],
    x="timestamp",
    y="out",
    color="msg_type",
    title="Stacked Area Chart of hourly 'out' over Time by msg_type",
    # Label keys must name the columns actually plotted; the previous
    # 'net_size' / 'hour_groups' keys matched nothing and had no effect.
    labels={"out": "Net Size", "timestamp": "Time", "msg_type": "Message Type"},
)

# Overlay the emp_hourly 'out' series as a single black line. emp_hourly is not
# recategorized, so its x values are still the index set at load time.
fig.add_trace(
    go.Scatter(
        x=dfs["emp_hourly"].index,
        y=dfs["emp_hourly"]["out"],
        mode="lines",
        name="emp_hourly",
        line=dict(color="black", width=2),
    )
)

fig.update_layout(
    xaxis_title="Time",
    yaxis_title="Net Size",
    legend_title="Message Type",
    # Horizontal legend placed just above the plotting area.
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
    ),
)
# SI-prefixed tick labels with a 'B' suffix (e.g. 50MB).
fig.update_yaxes(tickformat=".2s", ticksuffix="B")
fig.show()