# Fetch

### Fetch input data

In [1]:
SOURCE_HOST="tau"
SOURCE_DIRS = ["systemd/ip_accounting", "tracepoints/net"]

for dir in SOURCE_DIRS:
    !mkdir -p $SOURCE_HOST/$dir
    !rsync -az --info=progress2 $SOURCE_HOST:/home/nix-bitcoin-monitor/$dir/ $SOURCE_HOST/$dir

      1,592,267   6%    3.41MB/s    0:00:00 (xfr#2, to-chk=0/23)
     85,777,113   4%   72.26MB/s    0:00:01 (xfr#2, to-chk=0/23)


# Extract

### Load input data

In [2]:
import os
import glob
import pandas as pd


def read_csvs(directory: str) -> pd.DataFrame:
    """Load every ``*.csv`` file in ``directory`` into one index-sorted frame.

    Each file is read with its first column as the index and with date parsing
    enabled for that index. The per-file frames are concatenated and the
    result is sorted by index.

    Args:
        directory: Path of the directory to scan (non-recursively) for
            ``*.csv`` files.

    Returns:
        A single DataFrame containing the rows of all files, sorted by index.

    Raises:
        ValueError: If ``directory`` contains no ``*.csv`` files (e.g. because
            the fetch step did not run or failed).
    """
    files = sorted(glob.glob(os.path.join(directory, "*.csv")))
    if not files:
        # pd.concat([]) would fail with an opaque "No objects to concatenate";
        # name the directory so the cause is obvious.
        raise ValueError(f"No CSV files found in {directory!r}")
    frames = [pd.read_csv(file, index_col=0, parse_dates=True) for file in files]
    return pd.concat(frames).sort_index()


# Tracepoint samples fetched from SOURCE_HOST.
df_tp = read_csvs(f"{SOURCE_HOST}/tracepoints/net/")
# Systemd IP accounting samples fetched from SOURCE_HOST.
df_emp = read_csvs(f"{SOURCE_HOST}/systemd/ip_accounting/")

# Transform

First, data is cropped to when continuous measurement began.

Next, unnecessary data is stripped:

- Tracepoint data includes `peer_id`, `peer_conn_type`, `peer_addr`, `flow`
(traffic direction, i.e. in- or outbound), `msg_type` and `size`. Of these, only
`flow`, `msg_type` and `size` are retained in their original form. A new `ipv6`
column is introduced to indicate whether a message was sent via IPv4 or IPv6,
since the version affects IP header sizes used for the traffic estimate.

- Systemd IP accounting data includes columns for `IPIngressPackets`,
`IPEgressPackets`, `IPIngressBytes`, and `IPEgressBytes`. Packet data is
discarded and byte data is converted from absolute to relative (i.e. from bytes
since measurement was started to bytes since previous row/sample).

In [None]:
from pandarallel import pandarallel
import numpy as np

# Initialize pandarallel (required before any `parallel_*` method is called on
# pandas objects), with progress bars enabled.
pandarallel.initialize(progress_bar=True)


#
# TODO: Remove inactive code once visualization with non-clean data works
#
# def filter_incomplete_days(df):
#     """Discard days that have less than 99.9% of expected samples."""
#     THRESHOLD = ((24 * 60 * 60) / 5) * 0.999
#     samples_per_day = df.groupby(df.index.date).size()
#     days_above_threshold = samples_per_day[samples_per_day > THRESHOLD].index
#     filter_mask = np.in1d(df.index.date, days_above_threshold)
#     df = df[filter_mask]
#     return df


# df_tp_t = filter_incomplete_days(df_tp)
# df_emp_t = filter_incomplete_days(df_emp)

# visualize
# _ = df_emp.IPIngressPackets.plot()

# Tracepoint data: drop rows with missing values, derive the `ipv6` flag and
# keep only the columns needed downstream.
# An address containing "[" is treated as IPv6 (presumably the bracketed
# `[host]:port` notation; confirm against the tracepoint output format).
# `.assign` builds a new frame instead of writing a column into the result of
# `dropna()`, which avoids pandas' SettingWithCopyWarning, and the vectorized
# `.str.contains(..., regex=False)` replaces a per-row Python lambda.
df_tp_t = (
    df_tp.dropna()
    .assign(ipv6=lambda frame: frame["peer_addr"].str.contains("[", regex=False))
    .loc[:, ["ipv6", "flow", "msg_type", "size"]]
)

# Systemd IP accounting data: drop rows with missing values, keep only the byte
# counters and convert them from absolute values to per-sample deltas.
# The first row of `diff()` has no predecessor and is therefore NaN; `.iloc[1:]`
# removes it by position.
# NOTE(review): if the counters ever restart from zero, the corresponding delta
# will be negative; verify whether that can occur in the input.
df_emp_t = (
    df_emp.dropna()
    .loc[:, ["IPIngressBytes", "IPEgressBytes"]]
    .diff()
    .iloc[1:]
)

# Load

Store data in bz2-compressed CSV format.

In [5]:
# Write each preprocessed frame to its own bz2-compressed CSV file in the
# current working directory (tracepoints first, then systemd accounting).
for frame, path in (
    (df_tp_t, "tracepoints_preprocessed.csv.bz2"),
    (df_emp_t, "systemd_preprocessed.csv.bz2"),
):
    frame.to_csv(path, compression="bz2")