# Visualize empirical data

### Read annotated nodes data

In [None]:
import pandas as pd
import os

ANNOTATED_NODES_CSV = "reachable-nodes-annotated-2023-05-07.csv"

# Fail early with a pointer to the notebook that produces the input file.
assert os.path.exists(ANNOTATED_NODES_CSV), "Annotated node data missing! Run 01_prep_input_data.ipynb to create."

df = pd.read_csv(ANNOTATED_NODES_CSV)

# Row masks per address family, computed once and reused by every statistic below.
is_ipv4 = df.network == 'ipv4'
is_ipv6 = df.network == 'ipv6'

# address statistics
# Vectorized .sum() counts the True entries without a Python-level loop;
# int() keeps the results plain Python ints.
num_ipv4 = int(is_ipv4.sum())
num_ipv6 = int(is_ipv6.sum())
num_ip = len(df)
# Every row must be either ipv4 or ipv6.
assert num_ipv4 + num_ipv6 == num_ip, "Inconsistent dichotomy"
print(f'reachable nodes: ipv4={num_ipv4}, ipv6={num_ipv6}, ipv4/ipv6={num_ip}')

# netgroup statistics
# value_counts() yields the number of addresses per netgroup, largest first.
ip_netgroups = df.netgroup.value_counts().values
ipv4_netgroups = df[is_ipv4].netgroup.value_counts().values
ipv6_netgroups = df[is_ipv6].netgroup.value_counts().values
# Each address is counted once in its family and once in the combined counts,
# hence 2 * len(df). NOTE: value_counts() skips missing values by default, so
# rows without a netgroup also make this check fail.
assert ipv4_netgroups.sum() + ipv6_netgroups.sum() + ip_netgroups.sum() == 2 * len(df), "Inconsistent dichotomy!"
print(f'netgroups: ipv4={len(ipv4_netgroups)}, ipv6={len(ipv6_netgroups)}, ipv4/ipv6={len(ip_netgroups)}, total_addresses={ip_netgroups.sum()}')

# asn statistics
# Same layout as the netgroup statistics, bucketed by ASN instead.
ip_asns = df.asn.value_counts().values
ipv4_asns = df[is_ipv4].asn.value_counts().values
ipv6_asns = df[is_ipv6].asn.value_counts().values
assert ipv4_asns.sum() + ipv6_asns.sum() + ip_asns.sum() == 2 * len(df), "Inconsistent dichotomy!"
print(f'asn: ipv4={len(ipv4_asns)}, ipv6={len(ipv6_asns)}, ipv4/ipv6={len(ip_asns)}, total_addresses={ip_asns.sum()}')

## Analysis

### Top 30 buckets: IPv4 netgroups vs. IPv6 netgroups

In [None]:
import seaborn as sns

# Number of largest buckets to show per address family.
selection = 30

# Build the frame from Series rather than raw arrays: Series are aligned on
# their index, so a family with fewer than `selection` netgroups is padded with
# NaN instead of making the DataFrame constructor raise on unequal lengths.
# A dedicated name keeps `df` (the annotated node data) intact, so the
# statistics cell does not have to be re-run after plotting.
plot_df = pd.DataFrame({
    'IPv4': pd.Series(ipv4_netgroups[:selection]),
    'IPv6': pd.Series(ipv6_netgroups[:selection]),
})
plot_df = plot_df.reset_index().melt('index')  # melt into long format for seaborn

# One pair of bars per bucket rank (x), coloured by address family (hue).
g = sns.barplot(x='index', y='value', hue='variable', data=plot_df, width=0.7, dodge=True)
g.set_ylabel('Number of addresses')

### Top 30 buckets: Netgroup vs ASN

In [None]:
# Number of largest buckets to show per bucketing scheme.
selection = 30

# Build the frame from Series rather than raw arrays: Series are aligned on
# their index, so a scheme with fewer than `selection` buckets is padded with
# NaN instead of making the DataFrame constructor raise on unequal lengths.
# A dedicated name keeps `df` (the annotated node data) intact, so the
# statistics cell does not have to be re-run after plotting.
plot_df = pd.DataFrame({
    'Netgroup': pd.Series(ip_netgroups[:selection]),
    'ASN': pd.Series(ip_asns[:selection]),
})
plot_df = plot_df.reset_index().melt('index')  # melt into long format for seaborn

# One pair of bars per bucket rank (x), coloured by bucketing scheme (hue).
g = sns.barplot(x='index', y='value', hue='variable', data=plot_df, width=0.7, dodge=True)
g.set_ylabel('Number of addresses')

### Reversed Lorenz curve: bucket share vs. cumulative address share

In [None]:
import seaborn as sns
import numpy as np

def get_bucket_share_and_cum_addr_share(counts):
    """Return the x/y coordinates of a cumulative-share curve for ``counts``.

    Args:
        counts: Sequence of per-bucket address counts, in the order the
            buckets should appear along the x axis.

    Returns:
        A ``(positions, cumulative_shares)`` tuple of arrays with one entry
        per bucket. ``positions`` is evenly spaced from 0 to 1, and
        ``cumulative_shares[i]`` is the fraction of the total held by
        buckets ``0..i``.

    Raises:
        AssertionError: If ``counts`` has fewer than two elements.
    """
    n_buckets = len(counts)
    assert n_buckets > 1, "Must have more than one element"

    # Dividing by n_buckets - 1 places the first bucket at 0 and the last at 1.
    bucket_positions = np.arange(n_buckets) / (n_buckets - 1)
    running_total = np.cumsum(counts)
    return bucket_positions, running_total / np.sum(counts)

# `[::1]` is a no-op slice: the counts are passed in their existing order.
# They come from value_counts(), which sorts largest bucket first by default.
bshare1, cashare1 = get_bucket_share_and_cum_addr_share(ip_netgroups[::1])
bshare2, cashare2 = get_bucket_share_and_cum_addr_share(ip_asns[::1])

# Draw both curves on the same Axes so they can be compared directly.
g = sns.lineplot(x=bshare1, y=cashare1, label='Netgroup')
sns.lineplot(x=bshare2, y=cashare2, label='ASN', ax=g)
g.set_xlabel('Buckets share')
g.set_ylabel('Cumulative address share')