In [1]:
import aiohttp
import lzma
import io

from datetime import datetime
from os.path import expanduser
from pathlib import Path
from typing import NamedTuple

import pandas as pd
import netaddr

from rpki_analysis.delegated_stats import read_delegated_extended_stats, StatsCombinedAllocations, RirLookup
from rpki_analysis.routinator import read_jsonext

In [2]:
df = await read_jsonext("https://rpki-validator.ripe.net/jsonext")

In [3]:
async with aiohttp.ClientSession() as session:
    async with session.get('https://ftp.ripe.net/pub/stats/ripencc/nro-stats/latest/nro-delegated-stats') as resp:
        df_delext_stats = read_delegated_extended_stats(io.StringIO(await resp.text()))

        rir_lookup = RirLookup(df_delext_stats)
        lookup = StatsCombinedAllocations(df_delext_stats[df_delext_stats.status == 'assigned'])

ClientPayloadError: Response payload is not completed

In [None]:
# Show what `rir_lookup` holds for every distinct prefix without a `containing_rir`.
# NOTE(review): `containing_rir` is only added to `df` by the
# `df.apply(maybe_lookup, ...)` cell further down - that cell has to run first.
for prefix in set(df.loc[df['containing_rir'].isna(), 'prefix']):
    display(rir_lookup[prefix], prefix)

In [None]:
# Rows of the mismatch report that have no containing RIR.
# `== None` is an element-wise comparison in pandas and never matches missing
# values, so the filter uses `isnull()` (as the other cells of this notebook do).
# NOTE(review): `df_mismatch` is created in a later cell - run that cell first.
df_mismatch[df_mismatch.containing_rir.isnull()]

In [None]:
# Distinct values of the `status` column of the delegated stats.
df_delext_stats['status'].unique()

In [None]:
# Rows for which `lookup` has no exact entry; `maybe_lookup` appends to this list.
misses = []

class PrefixDetails(NamedTuple):
    """Triple produced by `maybe_lookup` for the prefix of one row."""

    # RIR of the exact allocation entry, or a synthetic label built from the
    # RIRs of more-specific entries.
    rir: str
    # Opaque id of the exact allocation entry, or the joined ids of the
    # more-specific entries.
    opaque_id: str
    # Result of `rir_lookup.get(prefix)` for the prefix.
    covering_rir: str

def maybe_lookup(row: pd.Series) -> PrefixDetails:
    """Resolve the RIR and opaque id for the prefix of one row.

    Intended for ``df.apply(maybe_lookup, axis=1)``, so ``row`` is a single
    row (a ``pd.Series``) with a ``prefix`` field.

    Resolution order:

    1. ``lookup`` has an exact entry for the prefix: its ``rir`` and
       ``opaque_id`` are returned.
    2. No exact entry, but ``lookup.children(prefix)`` yields entries:
       the RIR label is ``<rirs>-multi-opaque-id`` when the children together
       cover the whole prefix and ``unknown-more-specific-allocs-<rirs>``
       otherwise; the opaque id is the children's ids joined with ``_``.
    3. No exact entry and no children: ``rir`` and ``opaque_id`` are ``None``.

    The third element is always the result of ``rir_lookup.get(prefix)``.

    Side effect: every row without an exact entry is appended to the
    module-level ``misses`` list.
    """
    containing_rir = rir_lookup.get(row.prefix)
    try:
        entry = lookup[row.prefix]
    except KeyError:
        misses.append(row)
    else:
        return PrefixDetails(entry.rir, entry.opaque_id, containing_rir)

    children = list(lookup.children(row.prefix))
    if not children:
        # Return an explicit result instead of falling off the end of the
        # function (implicit None), so the containing RIR is kept for the row.
        return PrefixDetails(None, None, containing_rir)

    child_resources = netaddr.IPSet([c.resource for c in children])
    # Sets have no stable order; sort so the joined labels are reproducible.
    child_rirs = "-".join(sorted({c.rir for c in children}))
    child_opaque_ids = "_".join(sorted({c.opaque_id for c in children}))

    # The labels are joined up front, so the f-strings below need no nested
    # quotes (reusing the outer quote type is a SyntaxError before Python 3.12).
    if child_resources.issuperset(netaddr.IPSet([row.prefix])):
        return PrefixDetails(f"{child_rirs}-multi-opaque-id", child_opaque_ids, containing_rir)

    return PrefixDetails(f"unknown-more-specific-allocs-{child_rirs}", child_opaque_ids, containing_rir)

# Resolve every row and spread the returned triples over three columns.
df[['rir', 'opaque_id', 'containing_rir']] = df.apply(
    maybe_lookup,
    axis=1,
    result_type='expand',
)
# The publication point is the URI with its last path segment removed.
df['publication_point'] = df['uri'].map(lambda uri: "/".join(uri.split("/")[:-1]))
# `maybe_lookup` collected the rows without an exact entry in `misses`.
df_unmatched = pd.DataFrame.from_records(misses)

In [None]:
# Distinct prefixes without an exact entry, then the column labels of `df`.
display(df_unmatched['prefix'].unique())
display(df.columns)

# Check the tal against the RIR according to delegated extended statistics.

The code below detects some misalignment. Many of these cases will be caused by registration artifacts (two contiguous registrations controlled by one entity that are listed under two IDs in the delegated stats).

In [None]:
# Spot check of `rir_lookup` for a single prefix.
# Alternative spot check, kept for reference: rir_lookup['43.236.0.0/16']
rir_lookup.get(
    '91.207.19.0/24'
)

In [None]:
# Collect the rows whose TAL does not match the RIR from the delegated stats,
# write them to a dated spreadsheet under ~/Desktop and show them.
home = Path(expanduser("~"))
now = datetime.now()

df_mismatch = df.loc[
    # "ripe" in the TAL column is rewritten to "ripencc" before comparing with `rir`.
    df['tal'].str.replace("ripe", "ripencc") != df['rir'],
    ['uri', 'tal', 'asn', 'prefix', 'max_length', 'rir', 'opaque_id', 'containing_rir'],
]
df_mismatch.to_excel(
    home / f"Desktop/{now.strftime('%Y%m%d')}-results-results.xlsx"
)
display(df_mismatch)

In [None]:
# Rows for which no containing RIR was found.
df.loc[df['containing_rir'].isna()]

# Duplicate VRPs

Look at which VRPs are duplicated most often, and how this happens.

In [None]:
# Count the rows per (prefix, asn, rir, max_length) combination and keep the
# ten combinations with the highest `not_before` count.
top_10 = (
    df.groupby(["prefix", "asn", "rir", "max_length"])
    .count()
    .nlargest(10, ['not_before'])
)
top_10

# Maximum number of prefixes per ROA
Recall that a ROA has a single AS by definition: the grouping by AS is only there to show which ASes have this number.

In [None]:
# Ten (ROA uri, asn) pairs with the most rows.
# `size()` counts the rows of each group directly, so this cell does not depend
# on an `index` column, which only exists after the `reset_index()` further down.
df.groupby(["uri", "asn"]).size().nlargest(10).to_frame("prefixes")

# Analysis by publication point:
What is the maximum number of ROAs and the total number of VRPs per publication point (~= certificate for most CAs)?

```
$ rsync rsync://rpki.arin.net/repository/arin-rpki-ta/5e4a23ea-e80a-403e-b08c-2171da2157d3/2a246947-2d62-4a6c-ba05-87187f0099b2/4e95a28e-27fe-479a-b086-2cc9809d54f6/ | wc -l
20729
```

In [None]:
# Ten publication points with the highest `uri` count (one row per VRP).
(
    df.groupby(['publication_point'])
    .count()
    .nlargest(10, ['uri'])
)

The total number of files per publication point:

In [None]:
# Keep one row per (publication_point, uri) pair, i.e. one row per file,
# then list the ten publication points with the most files.
(
    df.drop_duplicates(['publication_point', 'uri'])
    .groupby(['publication_point'])
    .count()
    .nlargest(10, ['uri'])
)

Publication points generally contain one ROA per AS; let's check.

# TODO

Now count prefixes per publication point

# Now let's work on unique VRPs

In [None]:
# Keep one row per unique VRP (asn, prefix, max_length).
# `reset_index()` adds the `index` column that later cells rank by. The guard
# keeps this cell re-runnable: a second `reset_index()` would add a `level_0`
# column and a third one would raise.
if 'index' not in df.columns:
    df = df.reset_index()
df = df.drop_duplicates(['asn', 'prefix', 'max_length'])

The ROA with the most prefixes:

```python
```

In [None]:
# Ten ASNs with the highest count in the `index` column.
(
    df.groupby(["asn"])
    .count()
    .nlargest(10, ['index'])
)

Prefix with most ROAs:

In [None]:
# Ten prefixes with the highest count in the `roa` column.
# NOTE(review): no visible cell creates a `roa` column - presumably it comes
# from `read_jsonext`; confirm.
(
    df.groupby(['prefix'])
    .count()
    .nlargest(10, ['roa'])
)