# Description

### Read annotated nodes data

In [None]:
import pandas as pd
import os

ANNOTATED_NODES_CSV = "reachable-nodes-annotated-2023-05-07.csv"

# Fail early with a pointer to the notebook that produces the input file.
assert os.path.exists(ANNOTATED_NODES_CSV), "Annotated node data missing! Run 01_prep_input_data.ipynb to create."

df = pd.read_csv(ANNOTATED_NODES_CSV)

# Build the per-network row masks once and reuse them for every statistic.
is_ipv4 = df.network == 'ipv4'
is_ipv6 = df.network == 'ipv6'

# address statistics
# Vectorized .sum() counts the True entries without a Python-level loop.
num_ipv4 = is_ipv4.sum()
num_ipv6 = is_ipv6.sum()
num_ip = len(df)
# Every row must be either ipv4 or ipv6.
assert num_ipv4 + num_ipv6 == num_ip, "Inconsistent dichotomy"
print(f'reachable nodes: ipv4={num_ipv4}, ipv6={num_ipv6}, ipv4/ipv6={num_ip}')

# netgroup statistics
# Each array holds the number of rows per distinct netgroup.
ip_netgroups = df.netgroup.value_counts().values
ipv4_netgroups = df[is_ipv4].netgroup.value_counts().values
ipv6_netgroups = df[is_ipv6].netgroup.value_counts().values
# The per-network counts and the combined counts must each add up to len(df).
assert ipv4_netgroups.sum() + ipv6_netgroups.sum() + ip_netgroups.sum() == 2 * len(df), "Inconsistent dichotomy!"
print(f'netgroups: ipv4={len(ipv4_netgroups)}, ipv6={len(ipv6_netgroups)}, ipv4/ipv6={len(ip_netgroups)}, total_addresses={ip_netgroups.sum()}')

# asn statistics
# Same layout as above, grouped by ASN instead of netgroup.
ip_asns = df.asn.value_counts().values
ipv4_asns = df[is_ipv4].asn.value_counts().values
ipv6_asns = df[is_ipv6].asn.value_counts().values
assert ipv4_asns.sum() + ipv6_asns.sum() + ip_asns.sum() == 2 * len(df), "Inconsistent dichotomy!"
print(f'asn: ipv4={len(ipv4_asns)}, ipv6={len(ipv6_asns)}, ipv4/ipv6={len(ip_asns)}, total_addresses={ip_asns.sum()}')

## Simulation of netgroup-asn mapping

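- Draw ten distinct netgroups at random, weighted by the number of reachable addresses in each netgroup
- Count the number of unique ASNs among the drawn netgroups
- Repeat the draw many times (Monte Carlo simulation) to estimate the distribution of that count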

In [None]:
import numpy as np

from tqdm.notebook import tqdm

# Number of rows (addresses) per netgroup over all networks, as {netgroup: count}.
ip_netgroups = df['netgroup'].value_counts().to_dict()

# Netgroup -> ASN lookup table. When a netgroup occurs in several rows,
# the ASN of the last such row is the one that ends up in the dict.
netgroup_asn_map = df.set_index('netgroup')['asn'].to_dict()

def count_unique_asn(netgroup_list):
    """Return the number of distinct ASNs that the given netgroups map to.

    Every netgroup is resolved through the module-level ``netgroup_asn_map``;
    a netgroup that is not a key of that map raises ``KeyError``.
    """
    seen_asns = set()
    for netgroup in netgroup_list:
        seen_asns.add(netgroup_asn_map[netgroup])
    return len(seen_asns)

def monte_carlo(netgroups, num_draws: int = 10, iterations: int = 10000):
    """Simulate weighted netgroup draws and count the unique ASNs per draw.

    In every iteration ``num_draws`` distinct netgroups are drawn (without
    replacement), each with probability proportional to its size, and the
    number of unique ASNs among them is recorded via ``count_unique_asn``.

    Args:
        netgroups: Mapping of netgroup name to netgroup size (the weight).
        num_draws: Number of netgroups drawn per iteration.
        iterations: Number of Monte Carlo iterations.

    Returns:
        A list with one unique-ASN count per iteration.
    """
    # Convert the population to an array once; passing the plain list would
    # make np.random.choice rebuild this array on every iteration.
    population = np.array(list(netgroups.keys()))
    netgroup_sizes = np.array(list(netgroups.values()))
    # Selection probability of a netgroup is its share of the total size.
    p = netgroup_sizes / netgroup_sizes.sum()
    result = []
    for _ in tqdm(range(iterations)):
        draw = np.random.choice(population, size=num_draws, replace=False, p=p)
        result.append(count_unique_asn(draw))
    return result

def _netgroup_sizes(frame):
    """Return {netgroup: number of rows} for the given subset of nodes."""
    return frame.netgroup.value_counts().to_dict()


# One simulation input per address family, plus both families combined.
params = {
    'IPv4': _netgroup_sizes(df[df.network == 'ipv4']),
    'IPv6': _netgroup_sizes(df[df.network == 'ipv6']),
    'IPv4/IPv6': _netgroup_sizes(df),
}

# Run the simulation with its default settings for every input,
# keeping the results keyed by the same labels.
results = {}
for label, data in params.items():
    simulated_counts = monte_carlo(data)
    results[label] = simulated_counts

### Plot

In [None]:
import seaborn as sns

# Wide table: one column per label in `results`, one row per simulation iteration.
data = pd.DataFrame(results)

# Long format with one (label, count) pair per row, so `label` can drive the hue.
data_melted = data.melt(
    var_name='label',
    value_name='Number of unique ASN',
)

# Dodged bars per label on integer-valued bins; stat='percent' together with
# common_norm=False scales each label's histogram separately.
_ = sns.histplot(
    data=data_melted,
    x='Number of unique ASN',
    hue='label',
    discrete=True,
    shrink=0.4,
    multiple='dodge',
    stat='percent',
    common_norm=False,
    legend=True,
    edgecolor=None,
)