In [1]:
# Put the directory above this notebook at the front of the import path,
# so that modules living there can be imported.
import os,sys,inspect

currentdir = os.path.split(os.path.abspath(inspect.getfile(inspect.currentframe())))[0]
parentdir = os.path.split(currentdir)[0]
sys.path[:0] = [parentdir]

In [2]:
import gzip
import logging

from typing import List
from pathlib import Path

import pandas as pd
import altair as alt

import netaddr

from rpki_analysis.rsynclogs import parse_log

logging.basicConfig()
LOG = logging.getLogger(__name__)

In [3]:
# Every rsync log file that sits exactly one directory level below ../logs/
rsync_logs = [log_path for log_path in Path('../logs/').glob("*/rsync*")]

In [79]:
# Do not print logs in a public workbook
#[f for f in rsync_logs if not f.name.endswith('.gz')][0].open().readlines()[0:10]

In [5]:
def blind_mask(ip) -> "netaddr.IPAddress":
    """Blind an address by zeroing its host part.

    IPv4 addresses are reduced to their /24 network address and IPv6
    addresses to their /64 network address.

    :param ip: any value accepted by ``netaddr.IPAddress`` (e.g. a string).
    :return: the masked address as a ``netaddr.IPAddress``. The previous
        ``-> str`` annotation was wrong: the result of ``&`` is returned
        unchanged, it is never converted to a string.
    """
    addr = netaddr.IPAddress(ip)
    if addr.version == 4:
        return addr & netaddr.IPAddress('255.255.255.0')
    # Everything that is not IPv4 is handled as IPv6, which removes the old
    # implicit ``return None`` fall-through.
    return addr & netaddr.IPAddress('FFFF:FFFF:FFFF:FFFF::')

def parse_all_logs(rsync_logs: List[Path]):
    """Parse rsync log files into one combined DataFrame.

    Files whose name ends in ``.gz`` are read through ``gzip``; all other
    files are opened as plain text. The lines of every file are handed to
    ``parse_log`` and the resulting records are concatenated.

    :param rsync_logs: paths of the log files to read. The elements must be
        ``pathlib.Path``-like: ``.name`` and ``.open`` are used on each one.
    :return: DataFrame with the parsed columns plus ``file`` (name of the
        source file), ``date`` (calendar date of ``timestamp``) and
        ``masked_clientip`` (``blind_mask`` applied to ``clientip``).
        ``pd.concat`` keeps the per-file row index, so index values are not
        guaranteed to be unique across files.
    :raises ValueError: if ``rsync_logs`` contains no files.
    """
    dfs = []
    for file in rsync_logs:
        LOG.info("Reading %s", file)
        # Both openers are text-mode context managers, so the parsing code
        # only has to be written once.
        if file.name.endswith('.gz'):
            handle = gzip.open(file, 'rt')
        else:
            handle = file.open('rt')
        with handle as f:
            lines = parse_log(f.readlines())

        df = pd.DataFrame(lines)
        df['file'] = file.name
        dfs.append(df)

    if not dfs:
        # pd.concat([]) would raise a ValueError as well, but with a message
        # that does not point at the actual problem.
        raise ValueError("No rsync log files to parse; check the log directory and glob pattern")

    df = pd.concat(dfs)
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    for column in ('totalbytes', 'sentbytes', 'receivedbytes'):
        df[column] = df[column].astype(int)

    df['date'] = df['timestamp'].dt.date
    df['masked_clientip'] = df['clientip'].map(blind_mask)
    return df

In [6]:
df = parse_all_logs(rsync_logs)

In [7]:
df.shape

(2135927, 17)

In [26]:
# Get NLNog ring nodes
import json
import requests

# Fetch the NLNOG RING node list. The timeout keeps the cell from hanging
# forever on network trouble, and raise_for_status() turns an HTTP error into
# an explicit exception instead of a confusing JSON/KeyError further down.
ring_response = requests.get("https://api.ring.nlnog.net/1.0/nodes", timeout=30)
ring_response.raise_for_status()
nodes = ring_response.json()
node_df = pd.DataFrame.from_dict(nodes['results']['nodes'])
node_df['ip_type'] = 'nlnog-ring'

# Every IPv4/IPv6 address listed in node_df, as plain strings.
# (The previous expression, frozenset(str(node_df.ipv4.str)), produced the set
# of characters of the accessor's repr rather than a set of addresses.)
node_nodes: frozenset = frozenset(
    pd.concat([node_df.ipv4, node_df.ipv6]).dropna().astype(str)
)
def is_node_node(addr: str) -> bool:
    """Tell whether ``addr`` is one of the addresses listed in ``node_df``.

    The comparison is an exact string match. The earlier ``str.contains``
    version interpreted ``addr`` as a regular expression and matched
    substrings, so e.g. ``1.2.3.4`` also matched ``11.2.3.45``.

    :param addr: address to test; it is converted with ``str()`` first.
    :return: True if the textual address equals a listed IPv4 or IPv6 address.
    """
    return str(addr) in node_nodes
    

In [27]:
# Long-form node table: one row per (node, address family), indexed by the
# address, with the 'ip_type' label and the family name ('ipv4'/'ipv6') in 'ip_af'.
node_long = node_df[['ipv4', 'ipv6', 'ip_type']].melt(
    id_vars=['ip_type'],
    value_vars=['ipv4', 'ipv6'],
)
node_long = node_long.rename(columns={'variable': 'ip_af'})
node_long = node_long.set_index(['value'])

In [28]:
display(node_long.keys())
display(df.keys())
node_long

Index(['ip_type', 'ip_af'], dtype='object')

Index(['timestamp', 'pid', 'logtype', 'sentbytes', 'receivedbytes',
       'totalbytes', 'module', 'directory', 'hostname', 'clientip', 'endtime',
       'geohash', 'city', 'hosthash', 'file', 'date', 'masked_clientip'],
      dtype='object')

Unnamed: 0_level_0,ip_type,ip_af
value,Unnamed: 1_level_1,Unnamed: 2_level_1
46.16.72.7,nlnog-ring,ipv4
37.46.195.238,nlnog-ring,ipv4
94.186.187.3,nlnog-ring,ipv4
196.10.55.133,nlnog-ring,ipv4
52.22.172.27,nlnog-ring,ipv4
...,...,...
2a02:10:0:1::90:10,nlnog-ring,ipv6
2a10:3780:2:52:185:93:175:22,nlnog-ring,ipv6
2a01:8800:0:250::9,nlnog-ring,ipv6
2604:9a00:2100:af05:baaa::1,nlnog-ring,ipv6


In [31]:
# Left-join the node table onto the log rows by client address; rows without
# a matching node address get the label 'other'.
df_ring = pd.merge(df, node_long, how='left', left_on='clientip', right_index=True)
df_ring['ip_type'] = df_ring['ip_type'].fillna('other')


In [33]:
df_ring.keys()

Index(['timestamp', 'pid', 'logtype', 'sentbytes', 'receivedbytes',
       'totalbytes', 'module', 'directory', 'hostname', 'clientip', 'endtime',
       'geohash', 'city', 'hosthash', 'file', 'date', 'masked_clientip',
       'ip_type', 'ip_af'],
      dtype='object')

In [37]:
df_ring.groupby(['ip_type', 'module']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,timestamp,pid,logtype,sentbytes,receivedbytes,totalbytes,directory,hostname,clientip,endtime,geohash,city,hosthash,file,date,masked_clientip,ip_af
ip_type,module,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
nlnog-ring,repository,218347,218347,218347,218347,218347,218347,218347,218347,218347,218347,218347,218347,218347,218347,218347,218347,218347
nlnog-ring,ta,218859,218859,218859,218859,218859,218859,218859,218859,218859,218859,218859,218859,218859,218859,218859,218859,218859
other,repository,180617,180617,180617,180617,180617,180617,180617,180617,180617,180617,180617,180617,180617,180617,180617,180617,0
other,ta,1518104,1518104,1518104,1518104,1518104,1518104,1518104,1518104,1518104,1518104,1518104,1518104,1518104,1518104,1518104,1518104,0


In [66]:
df_ring.module.unique()

array(['ta', 'repository'], dtype=object)

In [72]:
# Number of log rows per rsync module, IP type and hour. After count() the
# 'totalbytes' column holds the number of rows in each group, which is what
# the chart plots on the y axis.
by_time = (
    df_ring[['module', 'timestamp', 'totalbytes', 'ip_type']]
    .groupby(['module', 'ip_type', pd.Grouper(key="timestamp", freq="60min")])
    .count()
    .reset_index()
)

fig = (
    alt.Chart(by_time)
    .transform_joinaggregate(order='count(*)', groupby=['module'])
    .mark_line()
    .encode(
        x=alt.X('timestamp:T', axis=alt.Axis(title='Time')),
        y=alt.Y('totalbytes:Q', axis=alt.Axis(title='Number of clients')),
        color=alt.Color('ip_type:N', legend=alt.Legend(title='IP type')),
        row=alt.Row('module:N', title='rsync module'),
    )
    .properties(title='Number of connecting rsync clients per hour')
)

fig.save('20210615-rsync-clients-type.png', scale=2)
display(fig)

In [78]:
# Get data per unique IP
# Step 1: one row per (module, IP type, client address, hour).
by_time = (
    df_ring[['module', 'timestamp', 'totalbytes', 'ip_type', 'clientip']]
    .groupby(['module', 'ip_type', 'clientip', pd.Grouper(key="timestamp", freq="60min")])
    .count()
    .reset_index()
    .drop_duplicates()
)
# Step 2: count those rows per (module, IP type, hour); afterwards 'totalbytes'
# holds the number of distinct client addresses seen in that hour.
by_time = (
    by_time
    .groupby(['module', 'ip_type', pd.Grouper(key="timestamp", freq="60min")])
    .count()
    .reset_index()
)

fig = (
    alt.Chart(by_time)
    .transform_joinaggregate(order='count(*)', groupby=['module'])
    .mark_line()
    .encode(
        x=alt.X('timestamp:T', axis=alt.Axis(title='Time')),
        y=alt.Y('totalbytes:Q', axis=alt.Axis(title='Number of clients')),
        color=alt.Color('ip_type:N', legend=alt.Legend(title='IP type')),
        row=alt.Row('module:N', title='rsync module'),
    )
    .properties(title='Number of unique IPs of rsync clients per hour')
)

fig.save('20210615-rsync-clients-type-unique-ip.png', scale=2)
display(fig)

# Note: the code below assumes that log files are available for the dates hard-coded in it!

Furthermore, you will likely want to group by a period longer than one hour.

In [11]:
# Distinct client addresses per hour and rsync module.
# The count has to be taken per hourly bucket. De-duplicating on the raw
# timestamp first (as was done before) only removes rows that share the exact
# same timestamp, so an address connecting several times within one hour was
# counted once per connection instead of once per hour.
by_ip = (
    df[['timestamp', 'module', 'clientip']]
    .groupby([pd.Grouper(key="timestamp", freq="60min"), 'module'])['clientip']
    .nunique()
    .to_frame('unique_ips')
)

# Same, but on the blinded (/24 or /64) client address.
by_mask = (
    df[['timestamp', 'module', 'masked_clientip']]
    .groupby([pd.Grouper(key="timestamp", freq="60min"), 'module'])['masked_clientip']
    .nunique()
    .to_frame('per_24_or_64')
)

ips_by_date = pd.concat([by_ip, by_mask], axis=1).reset_index()

In [12]:
def stats_by_week(df, split_date="2021-04-02", days=7):
    """Display per-module averages for the period before and after a date.

    Two windows of ``days`` days are compared: the one ending just before
    ``split_date`` and the one starting on ``split_date``. For each window the
    mean of every numeric column is displayed, grouped by ``module``.

    If the data does not cover these windows the displayed tables are empty.

    :param df: frame with a ``timestamp`` column (datetime) and a ``module``
        column.
    :param split_date: first day of the second window; anything accepted by
        ``pd.Timestamp``. Defaults to the date the notebook originally
        hard-coded (2 April 2021).
    :param days: length of each window in days (default: one week).
    :return: None; the two tables are shown with ``display``.
    """
    LOG.info("This may be broken because it's mid-refactor between date and timestamp")
    # pd.Timestamp is used instead of datetime.datetime: the notebook never
    # imports ``datetime``, so the old code failed on a fresh kernel.
    split = pd.Timestamp(split_date)
    span = pd.Timedelta(days=days)

    first_week = (df.timestamp >= split - span) & (df.timestamp < split)
    last_week = (df.timestamp >= split) & (df.timestamp < split + span)

    # numeric_only=True keeps the averages restricted to numeric columns;
    # without it recent pandas versions raise on text columns such as 'type'.
    display(df[first_week].groupby(['module']).mean(numeric_only=True))
    display(df[last_week].groupby(['module']).mean(numeric_only=True))

In [13]:
stats_by_week(ips_by_date)

2021-05-12 15:32:58,307 - rsyncstats - INFO - This may be broken because it's mid-refactor between date and timestamp


Unnamed: 0_level_0,unique_ips,per_24_or_64
module,Unnamed: 1_level_1,Unnamed: 2_level_1


Unnamed: 0_level_0,unique_ips,per_24_or_64
module,Unnamed: 1_level_1,Unnamed: 2_level_1


Because of the structure of our repository we can split out clients connecting over rsync to retrieve the trust anchor from those connecting to the main repository.

We do see a change on the 2nd of April so I'm providing data both for the week before and after this date.

In the week leading up to the 2nd of April, on average per day we see:
  * 192 unique IPs (from 182 /24s or /64s) creating 8636 connections to /repository
  * 911 unique IPs (from 721 /24s or /64s) creating 81855 connections to /ta

In the week starting on the 2nd of April, on average per day we see:
  * 598 unique IPs (from 582 /24s or /64s) creating 17594 connections to /repository
  * 1301 unique IPs (from 1114 /24s or /64s) creating 89675 connections to /ta
  
We see ~1086 unique IPs accessing the TA certificate over HTTPS per day.

In [14]:
stats_by_week(df[['timestamp', 'module', 'clientip']].groupby([pd.Grouper(key='timestamp', freq='1d'), 'module']).count().reset_index())

2021-05-12 15:32:58,369 - rsyncstats - INFO - This may be broken because it's mid-refactor between date and timestamp


Unnamed: 0_level_0,clientip
module,Unnamed: 1_level_1


Unnamed: 0_level_0,clientip
module,Unnamed: 1_level_1


In [15]:
df.totalbytes.astype(int).sum()

1075693423678

In [16]:
# Sum of received / sent / total bytes per hourly bucket.
traffic_by_day = (
    df[['timestamp', 'receivedbytes', 'sentbytes', 'totalbytes']]
    .groupby([pd.Grouper(key="timestamp", freq="1H")])
    .sum()
    .reset_index()
)
#traffic_by_day = traffic_by_day[traffic_by_day.timestamp <= datetime.datetime(year=2021, month=4, day=13)]

In [17]:
traffic_by_day

Unnamed: 0,timestamp,receivedbytes,sentbytes,totalbytes
0,2021-05-05 03:00:00,52216592,3580178072,25112338250
1,2021-05-05 04:00:00,70188834,4950026547,43663783339
2,2021-05-05 05:00:00,86375488,5617980832,43675446252
3,2021-05-05 06:00:00,100732378,6289180834,44462386201
4,2021-05-05 07:00:00,108864813,6731189202,43227341643
5,2021-05-05 08:00:00,101913718,6133358500,41887338739
6,2021-05-05 09:00:00,132488729,7705188688,46002098268
7,2021-05-05 10:00:00,98290767,5966936213,44458956152
8,2021-05-05 11:00:00,77231371,5451907745,45800438247
9,2021-05-05 12:00:00,79530931,5472386352,44458207220


In [18]:
# Long form: one row per (timestamp, traffic kind) so that every kind becomes
# its own coloured line in the chart.
traffic_by_day_long = traffic_by_day.melt(
    id_vars='timestamp',
    value_vars=['receivedbytes', 'sentbytes', 'totalbytes'],
    var_name='traffic',
)

alt.Chart(traffic_by_day_long).mark_line().encode(x='timestamp:T', y='value:Q', color='traffic:N')

# Start of some code that works with riswhois
## TODO: extract into library!

In [19]:
import bz2
import json
import io
import ipaddress
import logging

import altair as alt
import pandas as pd
import pytricia
import requests

from typing import Generator, NamedTuple, Optional, Set

from pandas.api.types import CategoricalDtype
from pandas.core.series import Series

LOG = logging.getLogger(__name__)
LOG.setLevel(logging.DEBUG)

In [20]:
class RouteOriginAuthorization(NamedTuple):
    """Immutable record with the fields of one route origin authorization.

    NOTE(review): no code visible in this notebook creates or reads this type.
    """
    # presumably the authorised origin AS number — TODO confirm
    asn: int
    # presumably the prefix in textual CIDR form — TODO confirm
    prefix: str
    max_length: int

    # Optional; None when not supplied.
    prefix_length: Optional[int] = None
        
class ExpandedRisEntry(NamedTuple):
    """Immutable record for one row of a RIS whois DataFrame.

    RisWhoisLookup builds one of these per row, copying the row's columns of
    the same names, and stores them in sets inside its trie.
    """
    origin: str
    prefix: str
    seen_by_peers: int
    prefix_length: int
    roa_validity: str

In [21]:
# https://www.ris.ripe.net/dumps/riswhoisdump.IPv4.gz
# https://www.ris.ripe.net/dumps/riswhoisdump.IPv6.gz
# NOTE(review): these two requests keep the raw response bytes in memory, but no
# code visible in this notebook reads ris_v4_gz / ris_v6_gz afterwards
# (read_ris_dump is handed the same URLs directly) — confirm whether these
# downloads are still needed.
ris_v4_gz = requests.get("https://www.ris.ripe.net/dumps/riswhoisdump.IPv4.gz").content
ris_v6_gz = requests.get("https://www.ris.ripe.net/dumps/riswhoisdump.IPv6.gz").content

def read_ris_dump(url: str) -> pd.DataFrame:
    """Load a gzip-compressed, tab-separated RIS whois dump.

    Lines starting with ``%`` are treated as comments and dropped, as are
    rows that have no prefix. An error is logged (but the rows are kept) when
    an origin starts with ``{``, i.e. is an AS_SET.

    :param url: URL or path of the gzip-compressed dump; passed to
        ``pd.read_csv``.
    :return: DataFrame with the columns ``origin`` (text), ``prefix``,
        ``seen_by_peers`` and ``prefix_length`` (length of ``prefix``).
    """
    df = pd.read_csv(
        url,
        compression='gzip',
        sep="\t",
        names=["origin", "prefix", "seen_by_peers"],
        # Always read origins as text. Left to type inference, a chunk that
        # contains only plain AS numbers may be parsed as integers, after
        # which the .str checks below yield NaN and ``~`` fails.
        dtype={"origin": str},
    )

    # Keep only real data rows: no '%' comment lines, no rows without a
    # prefix. na=False makes the mask a proper boolean even for missing origins.
    is_comment = df.origin.str.startswith('%', na=False)
    df = df[~is_comment & df.prefix.notna()].copy()

    if df.origin.str.startswith('{', na=False).any():
        LOG.error("RIS dump contains row(s) with AS_SET! These will never be RPKI valid (https://tools.ietf.org/html/rfc6907#section-7.1.8)")

    # separate prefix length
    df['prefix_length'] = df.prefix.map(lambda p: ipaddress.ip_network(p).prefixlen)

    return df

In [22]:
ris_v4 = read_ris_dump("https://www.ris.ripe.net/dumps/riswhoisdump.IPv4.gz")
ris_v6 = read_ris_dump("https://www.ris.ripe.net/dumps/riswhoisdump.IPv6.gz")

RIS dump contains row(s) with AS_SET! These will never be RPKI valid (https://tools.ietf.org/html/rfc6907#section-7.1.8)
RIS dump contains row(s) with AS_SET! These will never be RPKI valid (https://tools.ietf.org/html/rfc6907#section-7.1.8)


In [23]:
class RisWhoisLookup:
    """Prefix lookup over RIS whois rows, backed by a pytricia trie.

    Each trie key is a prefix; the stored value is a set of ExpandedRisEntry
    records, one per inserted row with that prefix.
    """
    trie: pytricia.PyTricia
        
    def __init__(self, data: pd.DataFrame, visibility_threshold : int = 10) -> None:
        """Build the trie from ``data``.

        :param data: frame with the columns ``prefix``, ``origin``,
            ``seen_by_peers``, ``prefix_length`` and ``roa_validity``. All
            prefixes must belong to a single address family (asserted).
        :param visibility_threshold: rows whose ``seen_by_peers`` is below
            this value are not inserted.
        """
        # Address family (4 or 6) of every prefix; exactly one family is allowed.
        af = data.prefix.apply(lambda p: ipaddress.ip_network(p).version)
        assert af.nunique() == 1
        # 128 for IPv6, 32 otherwise; passed to the PyTricia constructor
        # (presumably the maximum prefix length in bits — see pytricia docs).
        length = 128 if af.unique()[0] == 6 else 32
        
        self.trie = pytricia.PyTricia(length)
        # apply() is used only for its side effect of inserting each row.
        data[data.seen_by_peers >= visibility_threshold].apply(self.__build_trie, axis=1)
        
    def __build_trie(self, row: Series) -> None:
        """Insert one DataFrame row into the trie."""
        # NOTE(review): has_key is presumably chosen over ``in`` on purpose;
        # per the pytricia docs has_key is an exact-prefix test while ``in``
        # does longest-prefix matching — confirm before changing.
        if not self.trie.has_key(row.prefix):
            # Add entry
            self.trie[row.prefix] = set()
            
        self.trie[row.prefix].add(
            ExpandedRisEntry(row.origin, row.prefix, row.seen_by_peers, row.prefix_length, row.roa_validity)
        )
       
    def lookup(self, prefix) -> Generator[ExpandedRisEntry, None, None]:
        """Yield the entries stored for ``prefix`` and for its ancestors.

        Starts at the key returned by ``trie.get_key(prefix)`` and follows
        ``trie.parent`` until it returns None, yielding every entry stored at
        each key on the way. Yields nothing if ``get_key`` returns None.
        """
        key = self.trie.get_key(prefix)
        while key is not None:
            yield from self.trie[key]
            key = self.trie.parent(key)
            
    def __getitem__(self, prefix) -> Set[ExpandedRisEntry]:
        """Return everything ``lookup(prefix)`` yields, collected into a set."""
        return set(self.lookup(prefix))

In [24]:
# And build patricia trie
# RisWhoisLookup reads row.roa_validity from every row it inserts, so the
# column has to exist; it is filled with an empty string here.
ris_v4['roa_validity'] = ''
ris_v6['roa_validity'] = ''

# Build one lookup trie per address family (default visibility threshold).
ris_v4_lookup = RisWhoisLookup(ris_v4)
ris_v6_lookup = RisWhoisLookup(ris_v6)

In [25]:
def lookup_afi(ip_str) -> Generator[ExpandedRisEntry, None, None]:
    """Look ``ip_str`` up in the RIS table of the matching address family.

    :param ip_str: IPv4 or IPv6 address.
    :return: generator over the entries found by ``RisWhoisLookup.lookup``
        in ``ris_v4_lookup`` (IPv4) or ``ris_v6_lookup`` (anything else).
    """
    ip = netaddr.IPAddress(ip_str)
    table = ris_v4_lookup if ip.version == 4 else ris_v6_lookup
    return table.lookup(ip_str)


def _most_specific_entry(ip_str) -> Optional[ExpandedRisEntry]:
    """Return the entry that best describes ``ip_str``, or None.

    The entry with the longest prefix wins. Among several entries for that
    prefix, the one seen by the most peers is chosen, and the textual origin
    breaks any remaining tie so the result is deterministic.

    The earlier code took the first element of ``sorted(entries)``, i.e. the
    smallest tuple, which picks the lexicographically smallest origin among
    all covering prefixes rather than the most specific announcement.
    """
    try:
        entries = list(lookup_afi(ip_str))
    except Exception:
        # Best effort, as before: an unparsable address simply has no match.
        # ``Exception`` (not a bare except) so Ctrl-C still interrupts a long
        # ``Series.map`` run.
        return None
    if not entries:
        return None
    return min(entries, key=lambda e: (-e.prefix_length, -e.seen_by_peers, str(e.origin)))


def origin_as(ip_str):
    """Origin AS of the most specific RIS prefix covering ``ip_str``.

    :return: the entry's ``origin``, or ``-1`` when no prefix covers the
        address or the address cannot be looked up.
    """
    entry = _most_specific_entry(ip_str)
    return -1 if entry is None else entry.origin


def origin_prefix(ip_str):
    """Most specific RIS prefix covering ``ip_str``.

    :return: the entry's ``prefix``, or ``-1`` when no prefix covers the
        address or the address cannot be looked up.
    """
    entry = _most_specific_entry(ip_str)
    return -1 if entry is None else entry.prefix

In [26]:
df['origin_as'] = df.clientip.map(origin_as)
df['origin_prefix'] = df.clientip.map(origin_prefix)


In [27]:
# Two-step count: first collapse to one row per (hour, module, origin),
# then count those rows per (hour, module) to get the number of distinct
# origin ASes / origin prefixes.
by_origin_as = (
    df[['timestamp', 'module', 'origin_as']]
    .groupby([pd.Grouper(key="timestamp", freq="1H"), 'module', 'origin_as'])
    .count()
    .reset_index()
    .drop_duplicates()
    .groupby([pd.Grouper(key="timestamp", freq="1H"), 'module'])
    .count()
)

by_origin_prefix = (
    df[['timestamp', 'module', 'origin_prefix']]
    .groupby([pd.Grouper(key="timestamp", freq="1H"), 'module', 'origin_prefix'])
    .count()
    .reset_index()
    .drop_duplicates()
    .groupby([pd.Grouper(key="timestamp", freq="1H"), 'module'])
    .count()
)

# Combine all four per-hour/per-module measures side by side.
ips_by_date = pd.concat(
    [by_ip, by_mask, by_origin_as, by_origin_prefix],
    axis=1,
).reset_index()

In [28]:
# Long form: one row per (timestamp, module, measure) so that every measure
# is drawn as its own coloured line, with one chart row per rsync module.
ips_by_date_long = ips_by_date.melt(
    id_vars=['timestamp', 'module'],
    value_vars=['unique_ips', 'per_24_or_64', 'origin_as', 'origin_prefix'],
    var_name='type',
)

alt.Chart(ips_by_date_long).mark_line().encode(x='timestamp:T', y='value:Q', color='type:N', row='module')

In [29]:
stats_by_week(ips_by_date_long)

Unnamed: 0_level_0,value
module,Unnamed: 1_level_1


Unnamed: 0_level_0,value
module,Unnamed: 1_level_1
