## Load input data

In [None]:
import pandas as pd

def _load_peer_data(path, peer_map):
    """Read one traffic CSV and attach the peer mapping columns.

    The first CSV column is read as the index with date parsing enabled and is
    then turned back into a regular column so the frame can be merged with
    ``peer_map`` on ``peer_id`` / ``peer_addr`` (left join, so traffic rows
    without a mapping entry are kept). The merged frame is re-indexed by its
    ``timestamp`` column.

    Parameters
    ----------
    path : str
        Location of the (bz2-compressed) CSV file.
    peer_map : pandas.DataFrame
        Mapping table joined on ``peer_id`` and ``peer_addr``.

    Returns
    -------
    pandas.DataFrame
        Traffic rows plus mapping columns, indexed by ``timestamp``.
    """
    traffic = pd.read_csv(path, index_col=0, parse_dates=True).reset_index()
    merged = traffic.merge(peer_map, on=["peer_id", "peer_addr"], how="left")
    # NOTE(review): assumes the first CSV column is named "timestamp" - confirm.
    return merged.set_index("timestamp")


df_map = pd.read_csv("data/id-addr-mapping.csv.bz2", index_col=0)

# Daily and hourly aggregates go through the same loading pipeline.
df_d = _load_peer_data("data/data_daily.csv.bz2", df_map)
df_h = _load_peer_data("data/data_hourly.csv.bz2", df_map)

# Peer traffic

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.dates as mdates
from matplotlib.ticker import EngFormatter
import matplotlib.pyplot as plt

from matplotlib.ticker import FuncFormatter, MultipleLocator

df = df_d.copy()
df = df.reset_index()
df["connection_time"] = pd.to_timedelta(df["connection_time"])

# Retain only rows whose connection time is larger than one hour. The explicit
# .copy() makes the filtered frame independent, so the column assignments below
# do not operate on a slice of the unfiltered frame (SettingWithCopyWarning).
df = df[df["connection_time"] > pd.Timedelta(hours=1)].copy()

df["date"] = pd.to_datetime(df["timestamp"]).dt.date
df["peer_key"] = df["peer_id"].astype(str) + "-" + df["peer_addr"]

# Total TCP/IP bytes per peer and day, tagged with the first connection type
# recorded for that peer.
daily_traffic = df.groupby(["date", "peer_key"])["tcpip_size"].sum().reset_index()
peer_conn = df[["peer_key", "peer_conn_type"]].drop_duplicates(subset="peer_key")
daily_traffic = daily_traffic.merge(peer_conn, on="peer_key", how="left")

# stripplot is a categorical plot: every distinct date is drawn at an integer
# position 0..n-1 rather than at a matplotlib date number. The category order
# is pinned here so tick positions can be translated back into dates.
date_order = sorted(daily_traffic["date"].unique())


def _format_date_tick(pos, _tick_index=None):
    """Return the "%b %d" label of the date drawn at categorical position ``pos``."""
    idx = int(round(pos))
    if 0 <= idx < len(date_order):
        return date_order[idx].strftime("%b %d")
    # Ticks outside the plotted categories get no label.
    return ""


fig, ax = plt.subplots()
fig.suptitle("Daily peer traffic")
ax.set_title(
    "Daily TCP/IP traffic broken down by individual peers and their connection type"
)

sns.stripplot(
    data=daily_traffic,
    x="date",
    y="tcpip_size",
    hue="peer_conn_type",
    order=date_order,
    jitter=True,
    ax=ax,
    size=3,
)

ax.set_yscale("log")
ax.set_ylabel(None)
ax.set_xlabel(None)
ax.set_ylim(10 * 10**3, 400 * 10**9)
formatter = EngFormatter(unit="B")
ax.yaxis.set_major_formatter(formatter)
ax.legend(ncol=4, title="")

# Label every third day and mark every day with a minor tick. Date locators and
# formatters from matplotlib.dates are not used here: on a categorical axis
# they would read the positions 0..n-1 as days since the matplotlib epoch and
# print dates unrelated to the data.
ax.xaxis.set_major_locator(MultipleLocator(3))
ax.xaxis.set_major_formatter(FuncFormatter(_format_date_tick))
ax.xaxis.set_minor_locator(MultipleLocator(1))

plt.tight_layout()
plt.show()
fig.savefig("daily-peer-traffic.png", dpi=300, bbox_inches="tight", facecolor="white")

## Inbound peer message share

In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import seaborn as sns

df = df_d.copy()

# Keep only traffic received ("in" flow) over inbound connections whose
# connection time is at least one hour.
is_inbound = df["peer_conn_type"] == "inbound"
is_received = df["flow"] == "in"
is_long_lived = pd.to_timedelta(df["connection_time"]) >= pd.Timedelta(hours=1)
df = df[is_inbound & is_received & is_long_lived].copy()

# Bytes per peer, one column per message type (missing combinations become 0).
grp = df.groupby(["peer_id", "peer_addr", "msg_type"])["tcpip_size"].sum()
grp = grp.unstack(fill_value=0)
peer_total = grp.sum(axis=1)

# Add one "<msg_type>_share" column per message type: that type's fraction of
# the peer's total bytes.
grp = grp.assign(
    **{f"{msg_type}_share": grp[msg_type] / peer_total for msg_type in grp.columns}
)

# Order peers by pong share, then ping share, both descending.
df_sorted = grp.sort_values(by=["pong_share", "ping_share"], ascending=False)
share_cols = [name for name in df_sorted.columns if name.endswith("_share")]
df_shares = df_sorted[share_cols].reset_index(drop=True)

# Plot only message types whose share exceeds 1% for at least one peer.
valid_cols = [name for name in share_cols if df_shares[name].max() > 0.01]
df_shares = df_shares[valid_cols]

# One tab20 colour per plotted column, cycling after 20 columns.
cmap = plt.get_cmap("tab20")
n_colors = len(valid_cols)
colors = [cmap((idx % 20) / 19.0) for idx in range(n_colors)]

fig, ax = plt.subplots()
fig.suptitle("Peer message analysis")
ax.set_title("Peer share of total traffic by message")

# One stacked band per share column; x is the peer's rank in the sort order.
ax.stackplot(
    df_shares.index,
    *(series for _, series in df_shares.items()),
    labels=df_shares.columns,
    colors=colors,
)
ax.legend(ncol=6, title="", fontsize=10)

# The upper limit sits slightly above 100%, leaving headroom over the stack.
ax.set(ylabel=None, xlabel=None, ylim=(0, 1.09))
ax.yaxis.set_major_formatter(PercentFormatter(xmax=1))
# The x positions are just ranks, so no x ticks are drawn.
ax.xaxis.set_major_locator(plt.NullLocator())

plt.tight_layout()
plt.show()
fig.savefig(
    f"peer-message-analysis.png", dpi=300, bbox_inches="tight", facecolor="white"
)

## Inspect suspicious peers

In [None]:
def _peer_ip(addr):
    """Return the host part of a peer address string.

    If the address contains "[", the text following the first "[" up to the
    next "]" is returned (presumably a bracketed IPv6 literal - confirm).
    Otherwise everything before the first ":" is returned.
    """
    if "[" in addr:
        return addr.split("[")[1].split("]")[0]
    return addr.split(":")[0]


# Peers for which pong messages make up more than half of the received bytes.
df_susp = df_sorted.loc[df_sorted["pong_share"] > 0.5].reset_index()
df_susp["peer_ip"] = df_susp["peer_addr"].apply(_peer_ip)

# The 15 hosts that appear most often among those peers.
df_susp["peer_ip"].value_counts().head(15)