# Extract

### Load input data

In [1]:
import pandas as pd

# Both raw CSV files are read the same way: the first column becomes the
# index and pandas attempts to parse it as datetimes.
raw_csv_options = {"index_col": 0, "parse_dates": True}

df_tp = pd.read_csv("tracepoints_raw.csv", **raw_csv_options)
df_sys = pd.read_csv("systemd_raw.csv", **raw_csv_options)

# Transform

First, the data is cropped to the period of continuous measurement: days with fewer than 99.9% of the expected samples (one every 5 seconds) are discarded.

Next, unnecessary data is stripped:

- Tracepoint data includes `peer_id`, `peer_conn_type`, `peer_addr`, `flow`
(traffic direction, i.e. in- or outbound), `msg_type` and `size`. Of these, only
`flow`, `msg_type` and `size` are retained in their original form. A new `ipv6`
column is introduced to indicate whether a message was sent via IPv4 or IPv6,
since the version affects IP header sizes used for the traffic estimate.

- Systemd IP accounting data includes rows for `IPIngressPackets`,
`IPEgressPackets`, `IPIngressBytes`, and `IPEgressBytes`. Packet data is
discarded and byte data is converted from absolute to relative (i.e. from bytes
since measurement was started to bytes since previous row/sample).

### Incorporate below

In [2]:
import numpy as np


def filter_incomplete_days(df, sample_interval_s=5, min_fraction=0.999):
    """Discard calendar days that have too few rows to count as complete.

    A day is kept only if its row count is strictly greater than
    ``min_fraction`` of the rows expected for a full day at one row every
    ``sample_interval_s`` seconds (defaults: 99.9% of 17280 rows).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index exposes ``.date`` (i.e. a ``DatetimeIndex``); rows
        are grouped by that calendar date.
    sample_interval_s : float, default 5
        Expected spacing between consecutive rows, in seconds.
    min_fraction : float, default 0.999
        Fraction of the expected daily row count that a day must exceed.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that fall on sufficiently complete days, in their
        original order. ``df`` itself is not modified.
    """
    seconds_per_day = 24 * 60 * 60
    threshold = (seconds_per_day / sample_interval_s) * min_fraction

    # Compute the per-row calendar date once; it is needed both for the
    # grouping and for the final membership test.
    row_dates = df.index.date
    samples_per_day = df.groupby(row_dates).size()
    complete_days = samples_per_day[samples_per_day > threshold].index

    # np.isin replaces np.in1d, which is deprecated as of NumPy 2.0.
    keep = np.isin(row_dates, complete_days)
    return df[keep]

In [None]:
from pandarallel import pandarallel
import numpy as np

# Set up pandarallel so that pandas objects gain the `.parallel_*` methods
# (e.g. `Series.parallel_apply`); `progress_bar=True` requests progress bars
# while those parallel operations run.
pandarallel.initialize(progress_bar=True)


def filter_incomplete_days(df, sample_interval_s=5, min_fraction=0.999):
    """Discard calendar days that have too few rows to count as complete.

    A day is kept only if its row count is strictly greater than
    ``min_fraction`` of the rows expected for a full day at one row every
    ``sample_interval_s`` seconds (defaults: 99.9% of 17280 rows).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index exposes ``.date`` (i.e. a ``DatetimeIndex``); rows
        are grouped by that calendar date.
    sample_interval_s : float, default 5
        Expected spacing between consecutive rows, in seconds.
    min_fraction : float, default 0.999
        Fraction of the expected daily row count that a day must exceed.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that fall on sufficiently complete days, in their
        original order. ``df`` itself is not modified.
    """
    seconds_per_day = 24 * 60 * 60
    threshold = (seconds_per_day / sample_interval_s) * min_fraction

    # Compute the per-row calendar date once; it is needed both for the
    # grouping and for the final membership test.
    row_dates = df.index.date
    samples_per_day = df.groupby(row_dates).size()
    complete_days = samples_per_day[samples_per_day > threshold].index

    # np.isin replaces np.in1d, which is deprecated as of NumPy 2.0.
    keep = np.isin(row_dates, complete_days)
    return df[keep]


# Keep only the days on which each data set has (nearly) all expected samples.
df_tp_t = filter_incomplete_days(df_tp)
df_sys_t = filter_incomplete_days(df_sys)

# Optional sanity check of the raw counters:
# _ = df_sys.IPIngressPackets.plot()

# --- Tracepoint data ---------------------------------------------------------
# Drop rows with any missing value, flag IPv6 traffic and keep only the
# columns that are used downstream.
df_tp_t = df_tp_t.dropna()
# An address containing "[" is treated as IPv6 (presumably the bracketed
# "[addr]:port" notation - verify against the raw data). The vectorized
# string method performs the same literal substring test as a per-row
# `"[" in x` check, without spawning worker processes.
df_tp_t = df_tp_t.assign(
    ipv6=df_tp_t["peer_addr"].str.contains("[", regex=False)
)
df_tp_t = df_tp_t[["ipv6", "flow", "msg_type", "size"]]

# --- Systemd IP accounting data ----------------------------------------------
# Drop incomplete rows and keep only the byte counters.
df_sys_t = df_sys_t.dropna()
df_sys_t = df_sys_t[["IPIngressBytes", "IPEgressBytes"]]
# Convert the counters to per-sample deltas. The first row has no predecessor,
# so its diff is NaN and it is dropped by position.
# NOTE(review): diff() runs after incomplete days and NaN rows were removed,
# so the first sample following a removed stretch carries the bytes
# accumulated over that whole gap - confirm this is intended.
df_sys_t = df_sys_t.diff().iloc[1:]

# Load

Store the preprocessed data as bz2-compressed CSV files.

In [3]:
# Write each preprocessed frame to its own bz2-compressed CSV file.
for frame, target in (
    (df_tp_t, "tracepoints_preprocessed.csv.bz2"),
    (df_sys_t, "systemd_preprocessed.csv.bz2"),
):
    frame.to_csv(target, compression="bz2")