# Prepare input data

1. Read reachable nodes data (including IP address and network type)
2. Annotate netgroups
3. Annotate ASN
4. Store nodes data with annotations

### Read nodes data

In [None]:
import pandas as pd

# Load the snapshot of reachable nodes (one row per node).
df = pd.read_csv("reachable-nodes-2023-05-07.csv")

# Tally nodes per network type. The assert fails if any row carries a
# network value other than 'ipv4' or 'ipv6'.
num_ip = len(df)
num_ipv4 = int(df.network.eq('ipv4').sum())
num_ipv6 = int(df.network.eq('ipv6').sum())
assert num_ip == num_ipv4 + num_ipv6, "Inconsistent dichotomy"

print(f'reachable nodes: ipv4={num_ipv4}, ipv6={num_ipv6}, ipv4/ipv6={num_ip}')

### Annotate netgroups

- For IPv4, always use /16
- For IPv6, use /32 by default; /36 for Hurricane Electric (2001:470::/32)

In [None]:
import ipaddress

def netgroup_from_addr(addr_str):
    """Return the netgroup network that contains the address *addr_str*.

    IPv4 addresses are grouped by /16. IPv6 addresses are grouped by /32,
    except for addresses inside 2001:470::/32, which are grouped by /36.
    The result is an ``ipaddress`` network object.
    """
    parsed = ipaddress.ip_address(addr_str)
    if parsed.version == 4:
        prefix_len = 16
    elif parsed in ipaddress.ip_network("2001:470::/32"):
        prefix_len = 36
    else:
        prefix_len = 32
    # strict=False masks off the host bits instead of rejecting them.
    return ipaddress.ip_network(f"{parsed}/{prefix_len}", strict=False)

# Attach each node's netgroup as a new column, derived from its address.
df['netgroup'] = df['address'].apply(netgroup_from_addr)

### Annotate ASN

- Uses ASMAP created with [Kartograf](https://github.com/fjahr/kartograf)

In [None]:
import bz2
from tqdm import tqdm
tqdm.pandas()

def load_asmap(asmap_file):
    """Load a bz2-compressed ASMAP text file into a list of lookup entries.

    Each non-blank line is expected to hold a network prefix and an ASN,
    separated by whitespace (e.g. ``1.2.0.0/16 AS100``).

    Args:
        asmap_file: Path to the bz2-compressed text file.

    Returns:
        A list of ``(network_int, netmask_int, asn)`` tuples in file order,
        where the first two items are the integer forms of the network
        address and netmask and ``asn`` is the ASN string as written in the
        file.

    Raises:
        ValueError: If a non-blank line does not consist of exactly two
            fields or its prefix is not a valid network.
    """
    asmap = []
    # Text mode lets us stream line by line instead of materialising the
    # whole decompressed file with readlines().
    with bz2.open(asmap_file, "rt", encoding="ascii") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            net, asn = line.split()
            network = ipaddress.ip_network(net)
            asmap.append(
                (int(network.network_address), int(network.netmask), asn)
            )
    return asmap

def asn_from_addr(addr_str):
    """Look up the ASN for *addr_str* in the module-level ``asmap`` table.

    ``asmap`` is scanned in order and the ASN of the first entry whose
    network contains the address is returned. Entries of the other address
    family are skipped, so an IPv6 address is never matched against an IPv4
    prefix (or vice versa).

    Args:
        addr_str: IPv4 or IPv6 address in textual form.

    Returns:
        The ASN stored in the matching entry, or ``"unknown"`` if no entry
        matches.
    """
    ip = ipaddress.ip_address(addr_str)
    ip_int = int(ip)
    is_v6 = ip.version == 6
    ipv4_max = 0xFFFFFFFF
    # NOTE(review): first match wins; if the table can contain overlapping
    # prefixes this is not a longest-prefix match -- confirm.
    for netw, mask, asn in asmap:
        # Entries carry no explicit address family, so infer it from the
        # netmask: an IPv6 netmask with a non-zero prefix length always has
        # bits set above bit 31, an IPv4 netmask never does. Without this
        # check the low 32 bits of an IPv6 address could match an IPv4
        # prefix. A zero mask (/0) matches any address, as before.
        if mask and (mask > ipv4_max) != is_v6:
            continue
        if (ip_int & mask) == netw:
            return asn
    return "unknown"

# Build the lookup table that asn_from_addr reads from the global `asmap`.
asmap_filename = "asmap-kartograf-2023-02-06.txt.bz2"
asmap = load_asmap(asmap_filename)

# Resolve every node address to an ASN, with a progress bar.
df['asn'] = df['address'].progress_apply(asn_from_addr)

# Coverage report: a node is "missing" when no asmap entry matched it.
total = len(df)
missing = int(df['asn'].eq("unknown").sum())
covered = total - missing
print(f"total={total}, covered={covered} ({100*covered/total:.1f}%), missing={missing} ({100*missing/total:.1f}%)")
assert missing/total < 0.002, "Insufficient coverage by asmap"

### Store annotated data

In [None]:
# Persist the annotated nodes table; the DataFrame index is not written.
ANNOTATED_NODES_CSV = "reachable-nodes-annotated-2023-05-07.csv"
df.to_csv(path_or_buf=ANNOTATED_NODES_CSV, index=False)