# Description

### Read annotated nodes data

In [None]:
import pandas as pd
import os

ANNOTATED_NODES_CSV = "reachable-nodes-annotated-2023-05-07.csv"

# Stop early, pointing at the notebook that generates the input, if the file is missing.
assert os.path.exists(ANNOTATED_NODES_CSV), "Annotated node data missing! Run 01_prep_input_data.ipynb to create."

df = pd.read_csv(ANNOTATED_NODES_CSV)

# Split the rows by address family once; every statistic below reuses the split.
ipv4_rows = df[df.network == 'ipv4']
ipv6_rows = df[df.network == 'ipv6']

# address statistics: every row is expected to be either ipv4 or ipv6
num_ipv4 = len(ipv4_rows)
num_ipv6 = len(ipv6_rows)
num_ip = len(df)
assert num_ipv4 + num_ipv6 == num_ip, "Inconsistent dichotomy"
print(f'reachable nodes: ipv4={num_ipv4}, ipv6={num_ipv6}, ipv4/ipv6={num_ip}')

# netgroup statistics: number of addresses per netgroup (one array entry per netgroup)
ip_netgroups = df['netgroup'].value_counts().to_numpy()
ipv4_netgroups = ipv4_rows['netgroup'].value_counts().to_numpy()
ipv6_netgroups = ipv6_rows['netgroup'].value_counts().to_numpy()
# The per-family counts and the combined counts each have to add up to the
# number of rows, hence twice the row count in total.
assert ipv4_netgroups.sum() + ipv6_netgroups.sum() + ip_netgroups.sum() == 2 * num_ip, "Inconsistent dichotomy!"
print(f'netgroups: ipv4={len(ipv4_netgroups)}, ipv6={len(ipv6_netgroups)}, ipv4/ipv6={len(ip_netgroups)}, total_addresses={sum(ip_netgroups)}')

# asn statistics: number of addresses per ASN (one array entry per ASN)
ip_asns = df['asn'].value_counts().to_numpy()
ipv4_asns = ipv4_rows['asn'].value_counts().to_numpy()
ipv6_asns = ipv6_rows['asn'].value_counts().to_numpy()
# Same consistency check as for the netgroups.
assert ipv4_asns.sum() + ipv6_asns.sum() + ip_asns.sum() == 2 * num_ip, "Inconsistent dichotomy!"
print(f'asn: ipv4={len(ipv4_asns)}, ipv6={len(ipv6_asns)}, ipv4/ipv6={len(ip_asns)}, total_addresses={sum(ip_asns)}')

# Compare attack costs

### Helper functions

In [None]:
from tqdm import tqdm
from scipy.stats import hypergeom
import numpy as np
import pandas as pd
import seaborn as sns
from itertools import product
tqdm.pandas()

# BiasedUrn R package
import rpy2.robjects.packages as rpackages
from rpy2.robjects.vectors import IntVector, FloatVector
import rpy2.robjects as robjects
biasedurn = rpackages.importr('BiasedUrn')

def p_eclipse_wallenius(good_bucket_sizes: tuple[int, ...], bad_bucket_size: float, num_bad_buckets: int, num_outbound_conns: int = 10) -> float:
    """
    Compute eclipse probability (i.e. all outbound connections to attacker buckets).

    The draw is modelled as an urn with one color per benign bucket plus one
    attacker color and evaluated with ``dMWNCHypergeo`` from the BiasedUrn
    R package.

    Args:
        good_bucket_sizes: Number of nodes in each benign bucket; used as the
            weight of that bucket's single ball. Any iterable is accepted and
            converted to a tuple.
        bad_bucket_size: Number of nodes per attacker bucket; used as the
            weight of every attacker ball.
        num_bad_buckets: Number of attacker buckets (balls of the bad color).
        num_outbound_conns: Number of balls drawn.

    Returns:
        Probability that none of the drawn balls is benign.
    """
    # Normalise first: concatenating a NumPy array with a tuple via `+` would
    # silently add element-wise instead of appending the attacker weight.
    good_bucket_sizes = tuple(good_bucket_sizes)
    num_good_buckets = len(good_bucket_sizes)

    x = IntVector([0] * num_good_buckets + [num_outbound_conns])  # balls sampled per color: zero benign, all draws bad
    m = IntVector([1] * num_good_buckets + [num_bad_buckets])     # initial balls per color: one per benign bucket, num_bad_buckets bad ones
    n = num_outbound_conns                                        # number of balls sampled
    odds = FloatVector([*good_bucket_sizes, bad_bucket_size])     # weight per color, arbitrarily scaled: nodes in the bucket
    p = biasedurn.dMWNCHypergeo(x=x, m=m, n=n, odds=odds)         # R FloatVector that contains one element
    return p[0]

def p_eclipse_wallenius_fast(good_bucket_sizes: tuple[int, ...], bad_bucket_size: float, num_bad_buckets: int, num_outbound_conns: int = 10) -> float:
    """
    Compute eclipse probability with all benign buckets merged into one color.

    Instead of one urn color per benign bucket, a single benign ball carrying
    the summed weight of all benign buckets is used, so the vectors handed to
    ``dMWNCHypergeo`` (BiasedUrn R package) always have two entries.
    NOTE(review): intended as a cheaper stand-in for ``p_eclipse_wallenius``;
    confirm the two agree on a small input before relying on it.

    Args:
        good_bucket_sizes: Number of nodes in each benign bucket; only their
            sum is used.
        bad_bucket_size: Number of nodes per attacker bucket; used as the
            weight of every attacker ball.
        num_bad_buckets: Number of attacker buckets (balls of the bad color).
        num_outbound_conns: Number of balls drawn.

    Returns:
        Probability that the aggregated benign ball is not drawn.
    """
    total_good_size = sum(good_bucket_sizes)

    x = IntVector([0, num_outbound_conns])                  # balls sampled per color: zero benign, all draws bad
    m = IntVector([1, num_bad_buckets])                     # initial balls per color: one aggregated benign ball, num_bad_buckets bad ones
    n = num_outbound_conns                                  # number of balls sampled
    odds = FloatVector([total_good_size, bad_bucket_size])  # weight per color, arbitrarily scaled: all benign nodes vs. nodes per bad bucket
    p = biasedurn.dMWNCHypergeo(x=x, m=m, n=n, odds=odds)   # R FloatVector that contains one element
    return p[0]


def p_eclipse(num_good_nodes: int, num_bad_nodes: int, num_outbound_conns: int = 10) -> float:
    """
    Return the probability that every outbound connection hits an attacker node.

    Connections are modelled as draws without replacement from the combined
    node population, i.e. the hypergeometric probability of drawing
    ``num_outbound_conns`` attacker nodes in ``num_outbound_conns`` draws.
    """
    population_size = num_good_nodes + num_bad_nodes
    # scipy argument order: observed successes, population size,
    # success states in the population, number of draws.
    return hypergeom.pmf(num_outbound_conns, population_size, num_bad_nodes, num_outbound_conns)


def _first_bad_bucket_count(df_nodes, p_column, p):
    """Return num_bad_buckets of the first row whose p_column is >= p, or NaN if none."""
    matches = df_nodes.loc[df_nodes[p_column] >= p, 'num_bad_buckets']
    return matches.iloc[0] if len(matches) else np.nan


def extract(df, ps):
    """
    Extract parameters satisfying minimum required probability.

    For every probability in ``ps`` and every distinct ``num_bad_nodes`` value
    in ``df``, the ``num_bad_buckets`` of the first row (in ``df`` order) whose
    ``p_netgroup`` / ``p_asn`` reaches that probability is recorded; NaN marks
    combinations no row satisfies.

    Args:
        df: DataFrame with columns ``num_bad_nodes``, ``num_bad_buckets``,
            ``p_netgroup`` and ``p_asn``.
        ps: Iterable of minimum required probabilities.

    Returns:
        DataFrame with columns ``num_bad_nodes``, ``p``, ``num_bad_netgroups``
        and ``num_bad_asn``, ordered by ``ps`` first and node count second.
        The columns are present even when there are no result rows.
    """
    columns = ['num_bad_nodes', 'p', 'num_bad_netgroups', 'num_bad_asn']

    # The per-node-count subsets do not depend on p, so build them only once.
    per_node_count = [
        (num_bad, df[df.num_bad_nodes == num_bad])
        for num_bad in df.num_bad_nodes.unique().tolist()
    ]

    data = [
        {
            'num_bad_nodes': num_bad,
            'p': p,
            'num_bad_netgroups': _first_bad_bucket_count(df_nodes, 'p_netgroup', p),
            'num_bad_asn': _first_bad_bucket_count(df_nodes, 'p_asn', p),
        }
        for p in ps
        for num_bad, df_nodes in per_node_count
    ]
    # Passing the columns explicitly keeps the schema stable for empty results.
    return pd.DataFrame(data, columns=columns)

## Cost of IPv4/IPv6 attack

In [None]:
# define parameters: grid of attacker node counts x attacker bucket counts
num_bad_nodes = range(1000, 3000000 + 1, 5000)
num_bad_buckets = range(10, 1000, 5)

parameter_grid = list(product(num_bad_nodes, num_bad_buckets))
df_in = pd.DataFrame(parameter_grid, columns=['num_bad_nodes', 'num_bad_buckets'])
df_in['bad_bucket_size'] = df_in['num_bad_nodes'] / df_in['num_bad_buckets']

# every row carries the same benign bucket sizes (one shared tuple per column)
good_netgroup_sizes = tuple(ip_netgroups)
good_asn_sizes = tuple(ip_asns)
df_in['good_netgroup_sizes'] = [good_netgroup_sizes] * len(df_in)
df_in['good_asn_sizes'] = [good_asn_sizes] * len(df_in)


def _row_eclipse_probability(row, sizes_column):
    """Eclipse probability for one parameter row, using the benign sizes stored in sizes_column."""
    return p_eclipse_wallenius_fast(tuple(row[sizes_column]), row['bad_bucket_size'], row['num_bad_buckets'])


# run parameter studies (row-wise, with a progress bar)
df_in['p_netgroup'] = df_in.progress_apply(lambda row: _row_eclipse_probability(row, 'good_netgroup_sizes'), axis=1)
df_in['p_asn'] = df_in.progress_apply(lambda row: _row_eclipse_probability(row, 'good_asn_sizes'), axis=1)

# extract bucket sizes and plot the ASN bucket counts per targeted probability
df_ip = extract(df_in, ps=[0.1, 0.3, 0.5, 0.75, 0.9])
g = sns.lineplot(
    data=df_ip,
    x='num_bad_nodes',
    y='num_bad_asn',
    hue='p',
    palette='tab10',
    alpha=1.0,
    legend=True,
)
for set_axis_scale in (g.set_xscale, g.set_yscale):
    set_axis_scale('log')
g.set_ylabel('Attacker buckets')
g.set_xlabel('Attacker nodes')

### ASMAP-induced cost trends

In [None]:
trends = []

for p in df_ip.p.unique():
    rows_for_p = df_ip[df_ip.p == p]

    # Baseline: first row (in df_ip order) for this probability that has both
    # a netgroup and an ASN bucket count.
    baseline_rows = rows_for_p.dropna().head(1).to_dict(orient='records')
    if not baseline_rows:
        continue
    baseline = baseline_rows[0]
    assert baseline['num_bad_netgroups'] == baseline['num_bad_asn'], "Internal error!"
    num_bad_nodes_no_bucket = baseline['num_bad_nodes']

    for buckets in [10, 15, 20]:
        # First row for this probability that needs exactly `buckets` attacker netgroups.
        bucket_rows = rows_for_p[rows_for_p.num_bad_netgroups == buckets].dropna().head(1).to_dict(orient='records')
        if not bucket_rows:
            continue
        record = bucket_rows[0]
        assert record['num_bad_netgroups'] == record['num_bad_asn'], "Internal error!"
        # Records are read explicitly instead of via locals().update(), which
        # only works at module scope and overwrote the num_bad_nodes range.
        trends.append({
            'p': p,
            'buckets': buckets,
            'num_bad_nodes': record['num_bad_nodes'],
            # relative increase in attacker nodes compared to the baseline row
            'cost_increase': record['num_bad_nodes'] / num_bad_nodes_no_bucket - 1,
        })

df_trends = pd.DataFrame(trends)
g = sns.barplot(x='p', y='cost_increase', hue='buckets', hue_order=[20, 15, 10], palette=['tab:purple', 'tab:orange', 'tab:blue'], data=df_trends, width=0.7, dodge=True)
g.set_xlabel('Targeted eclipse probability')