# Parse CloudFront logs and create RP statistics

This workbook provides a sample for working with CloudFront logs from an RRDP repository.

The directory containing the logs should be in the `LOG_PATH` environment variable.
```
$ LOG_PATH=.../logs pipenv run jupyter lab
```

In [1]:
# Make the project importable from this notebook: prepend the directory named
# by the PWD environment variable to the module search path, so that the
# `rpki_analysis` imports in the next cell resolve.
# NOTE(review): PWD is presumably the directory `jupyter lab` was launched
# from (the project root) rather than the notebook's own directory - confirm.
# The lookup raises KeyError when PWD is not set in the environment.
import os,sys

sys.path.insert(0,os.environ['PWD'])

In [2]:
from rpki_analysis.cloudfrontlogs import *
from rpki_analysis.rpsoftware import *

import altair as alt
import numpy as np

import datetime

import altair_saver
# Pass scaleFactor=2 as an embed option to the altair renderers.
# NOTE(review): presumably this doubles the resolution of images exported from
# a rendered chart's action menu - confirm against the vega-embed options.
alt.renderers.set_embed_options(scaleFactor=2)

RendererRegistry.enable('default')

In [3]:
# Directory containing the CloudFront logs, taken from the LOG_PATH environment
# variable (see the launch command at the top of this notebook). Raises
# KeyError when the variable is not set.
LOG_PATH = os.environ['LOG_PATH']

In [4]:
# Columns kept (together with 'cs-uri-stem', which is used for counting) when
# notification.xml requests are summarised in notification_xml_client below.
FIELDS = ['datetime', 'c-ip', 'rp_software', 'rp_software_version']

def notification_xml_client(df):
    """Summarise requests for `/notification.xml` per day, client and RP software.

    Rows of `df` whose 'cs-uri-stem' equals '/notification.xml' are kept. The
    'cs(User-Agent)' value of each kept row is mapped through `group_clients`
    and `detailed_group_clients` to fill the 'rp_software' and
    'rp_software_version' columns.

    Returns a frame with one row per (24h bucket of 'datetime', 'c-ip',
    'rp_software', 'rp_software_version') and a 'count' column holding the
    number of matching requests in that group.
    """
    is_notification = df['cs-uri-stem'] == '/notification.xml'
    # Copy so that adding columns does not touch the caller's frame.
    notifications = df[is_notification].copy()

    user_agents = notifications['cs(User-Agent)']
    notifications['rp_software'] = user_agents.map(group_clients)
    notifications['rp_software_version'] = user_agents.map(detailed_group_clients)

    group_keys = [
        pd.Grouper(key="datetime", freq="24h"),
        "c-ip",
        "rp_software",
        "rp_software_version",
    ]
    # 'cs-uri-stem' is the only non-key column left, so its count is the
    # number of requests in each group.
    per_group = notifications[FIELDS + ['cs-uri-stem']].groupby(group_keys).count()
    return per_group.rename(columns={'cs-uri-stem': 'count'}).reset_index()


def second_group(df):
    """Merge per-file summaries into one row per client, software and timestamp.

    Groups `df` by 'datetime', 'c-ip', 'rp_software' and
    'rp_software_version', sums the remaining columns within each group, and
    returns the result with the group keys restored as ordinary columns.
    """
    group_keys = ['datetime', 'c-ip', 'rp_software', 'rp_software_version']
    totals = df.groupby(group_keys).sum()
    return totals.reset_index()

Load all log files and summarize their data. This process _will_ take a long time when processing a large number of log files.

Data is summarized by grouping twice:
  * once per file that is read
  * and once when joining files.
  
This reduces memory usage a lot.

In [5]:
# Load and summarise the `.gz` log files under LOG_PATH whose names start with
# `E12X42CFS5M88P.2021-08-01`.
# NOTE(review): going by the description above, read_from_glob presumably
# applies notification_xml_client to each file as it is read and second_group
# when the per-file results are joined; the list of names is presumably the
# set of log columns to load - confirm against rpki_analysis.cloudfrontlogs.
df_full = read_from_glob(
    [f"{LOG_PATH}/*/E12X42CFS5M88P.2021-08-01*.gz"],
    ['date', 'time', 'c-ip', 'cs(User-Agent)', 'cs-uri-stem'],
    notification_xml_client,
    second_group
)

INFO:rpki_analysis.cloudfrontlogs:Read 336/336 files
INFO:rpki_analysis.cloudfrontlogs:rows: 229731, columns: 5
INFO:rpki_analysis.cloudfrontlogs:Index                     128
datetime                17976
c-ip                   166159
rp_software            149337
rp_software_version    148312
count                   17976
dtype: int64
INFO:rpki_analysis.cloudfrontlogs:datetime               datetime64[ns]
c-ip                           object
rp_software                    object
rp_software_version            object
count                           int64
dtype: object


In [6]:
# Count the rows of df_full per (rp_software, rp_software_version).
# One additional column (`datetime`) is selected and sacrificed so that
# `.count()` has something to count: in the result, the `datetime` column
# holds the number of rows in each group, not a timestamp.
df_full[['rp_software', 'rp_software_version', 'datetime']]\
    .groupby(['rp_software', 'rp_software_version'])\
    .count()\
    .reset_index()

Unnamed: 0,rp_software,rp_software_version,datetime
0,blackbox-exporter,2.0,1
1,fort,1.2.0,26
2,fort,1.2.1,20
3,fort,1.3.0,5
4,fort,1.4.0,4
...,...,...,...
63,validator3,3.2-2021.03.01.14.58.34,1
64,validator3,3.2-2021.03.02.15.08,7
65,validator3,3.2-2021.04.07.12.55,66
66,validator3,3.2-2021.06.15.05.49.38,1


The following code needs a lot of data (for all days in your chart) to work, but it may get you started with an analysis similar to the one shared on Discord:

In [7]:
def _rp_usage_chart(data, mark, y_max, color_range, title):
    """Build one chart of 'c-ip' over 'datetime', coloured by 'rp_software'.

    Args:
        data: frame to plot; must provide the 'datetime', 'c-ip' and
            'rp_software' fields referenced by the encodings.
        mark: 'area' for an area chart, anything else for a line chart.
        y_max: largest expected y value; the axis runs from 0 to 1.1 * y_max.
        color_range: colour scale domain (ordered list of rp_software values).
        title: chart title.
    """
    # The joinaggregate adds the per-software total of 'c-ip' as 'order', which
    # the `order` encoding below sorts on (descending).
    base = alt.Chart(data).transform_joinaggregate(
        order='sum(c-ip)',
        groupby=['rp_software']
    )
    marked = base.mark_area() if mark == 'area' else base.mark_line()
    return marked.encode(
        x=alt.X('datetime:T', axis=alt.Axis(title='Time')),
        y=alt.Y("c-ip:Q",
          scale=alt.Scale(domain=(0, 1.1 * y_max)),
          axis=alt.Axis(title='Number of clients')
        ),
        color=alt.Color('rp_software:N',
          scale=alt.Scale(domain=color_range),
          legend=alt.Legend(title='Relying Party implementation')
        ),
        order=alt.Order('order:Q', sort='descending')
    ).properties(title=title)


def _save_and_display(chart, path_stem):
    """Write `chart` to `<path_stem>.png` (2x scale) and `<path_stem>.svg`, then display it."""
    chart.save(f'{path_stem}.png', scale_factor=2.0)
    chart.save(f'{path_stem}.svg')
    display(chart)


def build_charts(df, base_name=None, prefix="", color_range=None):
    """Render, save and display line and area charts of clients per RP software.

    For the full data set and for the last 180 and 365 days, two charts are
    built (a line chart and an area chart), written as PNG and SVG below
    `outputs/`, and displayed in the notebook.

    Args:
        df: frame with 'datetime', 'rp_software' and a quantitative 'c-ip'
            column. NOTE(review): 'c-ip' is plotted as "Number of clients", so
            it is presumably a per-day client count rather than raw addresses
            - confirm against the caller.
        base_name: required, non-empty; first component of every output file
            name.
        prefix: text inserted into chart titles and output file names.
        color_range: colour scale domain, e.g. the list returned by
            calc_color_range.

    Raises:
        ValueError: if `base_name` is missing or empty.
    """
    # Explicit check rather than `assert`, which is stripped under `python -O`.
    if not base_name:
        raise ValueError("base_name is required to name the output files")

    # Chart.save does not create missing directories.
    os.makedirs('outputs', exist_ok=True)

    # Both y-axis limits are taken from the complete data set, so every period
    # is drawn on the same scale; they do not depend on the period and are
    # therefore computed once. Only the 'c-ip' column is summed, which avoids
    # aggregating the string columns.
    line_y_max = df['c-ip'].max()
    area_y_max = df.groupby('datetime')['c-ip'].sum().max()

    for period in [None, 180, 365]:
        if period:
            cutoff = datetime.datetime.now() - datetime.timedelta(days=period)
            period_data = df[df.datetime >= cutoff]
            period_title = f", {period} days"
            period_file_postfix = f"{period}d"
        else:
            period_data = df
            period_title = ""
            period_file_postfix = ""

        title = f'Number of unique IPs by RP software ({prefix}{period_title})'
        file_suffix = f'{prefix}{period_file_postfix}'

        overall = _rp_usage_chart(period_data, 'line', line_y_max, color_range, title)
        _save_and_display(overall, f'outputs/{base_name}-ip-by-rp-{file_suffix}')

        overall_area = _rp_usage_chart(period_data, 'area', area_y_max, color_range, title)
        _save_and_display(overall_area, f'outputs/{base_name}-ip-by-rp-area-{file_suffix}')

In [8]:
def calc_color_range(df, field='rp_software', last_n_days=90):
    if last_n_days is not None and last_n_days > 0:
        rel_data = df[df.datetime > datetime.datetime.now() - datetime.timedelta(days=last_n_days)]
    else:
        LOG.info("color range: selecting all data")
        rel_data = df
    by_rp = rel_data[[field, 'c-ip', 'datetime']].drop_duplicates().groupby([field]).count().rename(columns={'c-ip': 'count'})

    return list(
        by_rp.sort_values('count', ascending=False).index
    )

# Rank the RP implementations seen in the last 180 days, most frequently
# observed first, and show the resulting order (used as a colour scale domain
# by build_charts).
# NOTE(review): the 180-day window is measured from the current time, so
# re-running this on old logs presumably yields an empty list - confirm, and
# consider last_n_days=None to rank over all data.
color_range = calc_color_range(df_full.reset_index(), last_n_days=180)
display(color_range)

['routinator',
 'validator3',
 'octorpki',
 'fort',
 'rpki-client',
 'unknown',
 'rpki-prover',
 'rpstir2',
 'blackbox-exporter',
 'rpki-monitoring',
 'validator2']