# Prepare input data

1. Read reachable node data (including IP address and network type)
2. Annotate netgroups
3. Annotate ASN
4. Store node data with annotations

### Helper functions

In [None]:
import bz2
import ipaddress
import pandas as pd
from tqdm import tqdm

def load_node_data(date: str):
    """Read the reachable-node CSV for ``date`` and print per-network counts.

    Loads ``data/reachable-nodes-{date}.csv``, checks that every row's
    ``network`` value is either ``'ipv4'`` or ``'ipv6'``, prints a one-line
    summary, and returns the DataFrame as read.

    Raises:
        AssertionError: if some row has a ``network`` value other than
            ``'ipv4'`` or ``'ipv6'``.
    """
    nodes = pd.read_csv(f"data/reachable-nodes-{date}.csv")
    networks = nodes.network
    num_ipv4 = (networks == 'ipv4').sum()
    num_ipv6 = (networks == 'ipv6').sum()
    num_ip = len(nodes)
    # The two families must account for every row in the file.
    assert num_ipv4 + num_ipv6 == num_ip, "Inconsistent dichotomy"
    print(f'reachable nodes: ipv4={num_ipv4}, ipv6={num_ipv6}, ipv4/ipv6={num_ip}')
    return nodes

def load_asmap(date: str):
    """Read the bzip2-compressed ASMAP text file for ``date``.

    Each non-blank line of ``data/asmap-kartograf-{date}.txt.bz2`` must hold
    two whitespace-separated tokens: a network in CIDR notation and an ASN
    token. Blank lines are skipped.

    Returns:
        A list of ``(network_address, netmask, asn)`` tuples in file order.
        The first two items are integers; ``asn`` is the token exactly as it
        appears in the file (a string). IPv4 and IPv6 entries share the list.

    Raises:
        ValueError: if a non-blank line does not have exactly two tokens, or
            the network token is not a valid network (e.g. host bits set).
        UnicodeDecodeError: if the file contains non-ASCII bytes.
    """
    asmap = []
    # Text mode lets us stream line by line instead of materialising the
    # whole decompressed file with readlines().
    with bz2.open(f"data/asmap-kartograf-{date}.txt.bz2", "rt", encoding="ascii") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            net, asn = fields
            network = ipaddress.ip_network(net)
            netw = int(network.network_address)
            mask = int(network.netmask)
            asmap.append((netw, mask, asn))
    return asmap


def addr_to_netgroup(addr_str: str):
    """Map an IP address string to the network used as its netgroup.

    The prefix length depends on the address:

    * IPv4 addresses are grouped by their /16;
    * IPv6 addresses whose exploded form starts with ``2001:0470:`` are
      grouped by their /36;
    * all other IPv6 addresses are grouped by their /32.

    Returns:
        The enclosing ``ipaddress`` network object (host bits cleared).

    Raises:
        ValueError: if ``addr_str`` is not a valid IP address.
    """
    parsed = ipaddress.ip_address(addr_str)
    if parsed.version == 4:
        prefix_len = 16
    elif parsed.exploded.startswith("2001:0470:"):
        prefix_len = 36
    else:
        prefix_len = 32
    # strict=False masks off the host bits instead of rejecting them.
    return ipaddress.ip_network(f"{parsed}/{prefix_len}", strict=False)

def addr_to_asn(addr_str: str, asmap):
    """Look up the ASN for an IP address in an ASMAP entry list.

    Args:
        addr_str: IPv4 or IPv6 address in string form.
        asmap: iterable of ``(network_address, netmask, asn)`` tuples with
            integer address and mask, as produced by ``load_asmap``. IPv4
            and IPv6 entries may be mixed.

    Returns:
        The ``asn`` of the first entry (in iteration order) whose network
        contains the address, or ``"unknown"`` if none does.

    Raises:
        ValueError: if ``addr_str`` is not a valid IP address.

    NOTE(review): this is first-match, not longest-prefix-match; if the
    map contains overlapping prefixes the result depends on entry order.
    """
    ip = ipaddress.ip_address(addr_str)
    ip_int = int(ip)
    ip_is_v4 = ip.version == 4
    for netw, mask, asn in asmap:
        # Entries carry no explicit address family, so infer it from the
        # mask: an IPv4 netmask fits in 32 bits, while any non-zero IPv6
        # netmask has its top (128th) bit set. Without this check an IPv6
        # address whose low 32 bits fall inside an IPv4 prefix would be
        # matched against that IPv4 entry. A zero mask (/0) carries no
        # family information and is left to the plain comparison below.
        if mask and (mask.bit_length() <= 32) != ip_is_v4:
            continue
        if (ip_int & mask) == netw:
            return asn
    return "unknown"

In [None]:
# Snapshot dates to process; each needs a node CSV and an ASMAP file under data/.
dates = ["2023-02-06", "2024-01-25"]

# Register the progress_apply method on pandas objects.
tqdm.pandas()
for date in dates:
    print(f"date={date}: annotating netgroups and autonomous system numbers")
    df = load_node_data(date)
    asmap = load_asmap(date)

    # Annotate every node address with its netgroup, then with its ASN.
    df['netgroup'] = df.address.progress_apply(addr_to_netgroup)
    df['asn'] = df.address.progress_apply(lambda addr: addr_to_asn(addr, asmap))

    # Write the annotated table next to the input data.
    target = f"data/reachable-nodes-annotated-{date}.csv"
    print(f"storing results in {target}")
    df.to_csv(target, index=False)