# Parse CloudFront logs and create RP statistics

This notebook provides a sample for working with CloudFront logs from an RRDP repository.

The directory containing the logs should be in the `LOG_PATH` environment variable.
```
$ LOG_PATH=.../logs pipenv run jupyter lab
```

In [1]:
# Add parent dir to path
import os
import sys
import logging

# Configure logging with the library defaults; LOG is this notebook's own logger.
logging.basicConfig()
LOG = logging.getLogger(__name__)

# Put the directory named by $PWD at the front of the import path.
# NOTE(review): presumably $PWD is the directory `jupyter lab` was started from and
# holds the `rpki_analysis` package - confirm. Raises KeyError when PWD is not set.
sys.path.insert(0,os.environ['PWD'])

In [2]:
from rpki_analysis.cloudfrontlogs import read_from_glob
from rpki_analysis.rpsoftware import group_clients, detailed_group_clients

import altair as alt
import pandas as pd

import datetime

# Pass `scaleFactor=2` as an embed option to the Altair renderers.
# NOTE(review): presumably this doubles the resolution of images exported from a
# rendered chart - confirm against the vega-embed options.
alt.renderers.set_embed_options(scaleFactor=2)

RendererRegistry.enable('default')

In [3]:
# Directory containing the CloudFront logs; raises KeyError when LOG_PATH is not set.
LOG_PATH = os.environ['LOG_PATH']

In [4]:
FIELDS = ['datetime', 'c-ip', 'rp_software', 'rp_software_version']


def notification_xml_client(df):
    """Summarise the `/notification.xml` requests found in one log frame.

    Only rows whose `cs-uri-stem` equals `/notification.xml` are kept. The
    user agent of each row is mapped through `group_clients` and
    `detailed_group_clients` to get `rp_software` and `rp_software_version`,
    and the rows are then counted per 24h bucket, client IP, software and
    version.

    Returns a frame with the `FIELDS` columns plus a `count` column.
    """
    is_notification = df['cs-uri-stem'] == '/notification.xml'
    notifications = df[is_notification].copy()

    user_agents = notifications['cs(User-Agent)']
    notifications['rp_software'] = user_agents.map(group_clients)
    notifications['rp_software_version'] = user_agents.map(detailed_group_clients)

    group_keys = [
        pd.Grouper(key="datetime", freq="24h"),
        "c-ip",
        "rp_software",
        "rp_software_version",
    ]
    # `cs-uri-stem` is the only non-key column left, so its count is the row count.
    per_group = notifications[FIELDS + ['cs-uri-stem']].groupby(group_keys).count()
    return per_group.rename(columns={'cs-uri-stem': 'count'}).reset_index()


def second_group(df):
    """Merge per-file summaries by summing rows that share the same key.

    Rows are grouped on `datetime`, `c-ip`, `rp_software` and
    `rp_software_version`; the remaining columns are summed and the keys are
    returned as ordinary columns again.
    """
    merge_keys = ['datetime', 'c-ip', 'rp_software', 'rp_software_version']
    totals = df.groupby(merge_keys).sum()
    return totals.reset_index()

Load all log files and summarize their data. This process _will_ take a long time when processing a large number of log files.

Data is summarized by grouping twice:
  * once per file that is read
  * and once when joining files.
  
This reduces memory usage a lot.

In [5]:
# Read every log file matching the glob into one frame.
# NOTE(review): with the three arguments below commented out, the logged column list
# shows the raw log columns; later cells that read `df_full['count']` and
# `df_full['rp_software']` look like they need the summarised frame those arguments
# would produce - confirm which variant each cell is meant to run against.
df_full = read_from_glob(
    [f"{LOG_PATH}/*/E12X42CFS5M88P.2021-08-05*.gz"],
#    ['date', 'time', 'c-ip', 'cs(User-Agent)', 'cs-uri-stem'],
#    notification_xml_client,
#    second_group
)

INFO:rpki_analysis.cloudfrontlogs:Read 168/168 files
INFO:rpki_analysis.cloudfrontlogs:rows: 431422, columns: 34
INFO:rpki_analysis.cloudfrontlogs:Index                           3451376
date                           28905274
time                           28042430
x-edge-location                27955740
sc-bytes                        3451376
c-ip                           31356971
cs-method                      25886779
cs(Host)                       37102292
cs-uri-stem                    39246180
sc-status                       3451376
cs(Referer)                    25049486
cs(User-Agent)                 35575854
cs-uri-query                   25022549
cs(Cookie)                     25022476
x-edge-result-type             25973008
x-edge-request-id              48750686
x-host-header                  30200292
cs-protocol                    26748131
cs-bytes                        3451376
time-taken                      3451376
x-forwarded-for                25022736
ssl-protocol 

In [7]:
#df_full

In [10]:
!host testbed-1.rpki.ripe.net

testbed-1.rpki.ripe.net has address 193.0.19.107
testbed-1.rpki.ripe.net has IPv6 address 2001:67c:2e8:11::c100:136b


In [11]:
# List the columns available in the loaded frame.
df_full.columns

Index(['date', 'time', 'x-edge-location', 'sc-bytes', 'c-ip', 'cs-method',
       'cs(Host)', 'cs-uri-stem', 'sc-status', 'cs(Referer)', 'cs(User-Agent)',
       'cs-uri-query', 'cs(Cookie)', 'x-edge-result-type', 'x-edge-request-id',
       'x-host-header', 'cs-protocol', 'cs-bytes', 'time-taken',
       'x-forwarded-for', 'ssl-protocol', 'ssl-cipher',
       'x-edge-response-result-type', 'cs-protocol-version', 'fle-status',
       'fle-encrypted-fields', 'c-port', 'time-to-first-byte',
       'x-edge-detailed-result-type', 'sc-content-type', 'sc-content-len',
       'sc-range-start', 'sc-range-end', 'datetime'],
      dtype='object')

In [22]:
# Requests from 193.0.19.107 (testbed-1.rpki.ripe.net) that were answered with
# anything other than 200 or 304.
df_full.loc[
    df_full['c-ip'].eq('193.0.19.107')
    & ~df_full['sc-status'].isin([200, 304])
]

Unnamed: 0,date,time,x-edge-location,sc-bytes,c-ip,cs-method,cs(Host),cs-uri-stem,sc-status,cs(Referer),...,fle-status,fle-encrypted-fields,c-port,time-to-first-byte,x-edge-detailed-result-type,sc-content-type,sc-content-len,sc-range-start,sc-range-end,datetime


In [26]:
# Distinct request paths that were answered with anything other than 200 or 304.
df_full.loc[~df_full['sc-status'].isin([200, 304]), 'cs-uri-stem'].unique()

array(['/', '/robots.txt',
       '/c85a5e87-ad1a-4b5a-b73f-8325877826fd/5714/delta.xml',
       '/c85a5e87-ad1a-4b5a-b73f-8325877826fd/5378/delta.xml',
       '/90ed241f-125b-492a-b99b-63bc61cad9aa/1087/snapshot.xml',
       '/90ed241f-125b-492a-b99b-63bc61cad9aa/960/delta.xml',
       '/90ed241f-125b-492a-b99b-63bc61cad9aa/958/delta.xml',
       '/90ed241f-125b-492a-b99b-63bc61cad9aa/868/delta.xml',
       '/90ed241f-125b-492a-b99b-63bc61cad9aa/891/delta.xml',
       '/90ed241f-125b-492a-b99b-63bc61cad9aa/854/delta.xml',
       '/c85a5e87-ad1a-4b5a-b73f-8325877826fd/5852/delta.xml',
       '/1c834cf6-ad06-42ba-a82f-68bae98a30d6/1343/delta.xml',
       '/90ed241f-125b-492a-b99b-63bc61cad9aa/257/delta.xml',
       '/90ed241f-125b-492a-b99b-63bc61cad9aa/572/delta.xml',
       '/c85a5e87-ad1a-4b5a-b73f-8325877826fd/5180/delta.xml',
       '/90ed241f-125b-492a-b99b-63bc61cad9aa/269/delta.xml',
       '/90ed241f-125b-492a-b99b-63bc61cad9aa/1053/snapshot.xml',
       '/c85a5e87-ad1a-4b5a-b7

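Get the active instances by keeping only the (ip, rp_software, rp_software_version) tuples above a minimum number of requests per day. Note that the code below filters on `count > 10`, while the labels in the output mention 20 requests.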

In [19]:
# Keep the rows with more than 10 requests.
# NOTE(review): the summary cell labels this selection "20 requests/day" while the
# threshold here is 10 - confirm which one is intended.
df_active = df_full.loc[df_full['count'].gt(10)]
df_active.head()

Unnamed: 0,datetime,c-ip,rp_software,rp_software_version,count
1,2021-08-01,101.98.14.225,routinator,0.9.0,24
2,2021-08-01,101.98.14.230,routinator,0.9.0,24
3,2021-08-01,101.98.14.233,routinator,0.9.0,48
4,2021-08-01,102.118.33.141,routinator,0.9.0,42
5,2021-08-01,102.118.49.201,routinator,0.9.0,93


In [20]:
# Rows per RP implementation: first for the active subset, then for all rows.
display("20 requests/day:", df_active.groupby('rp_software').count())
display("All:", df_full.groupby('rp_software').count())

'20 requests/day:'

Unnamed: 0_level_0,datetime,c-ip,rp_software_version,count
rp_software,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
blackbox-exporter,1,1,1,1
fort,115,115,115,115
octorpki,195,195,195,195
routinator,1324,1324,1324,1324
rpki-client,34,34,34,34
rpki-monitoring,1,1,1,1
rpki-prover,2,2,2,2
rpstir2,3,3,3,3
unknown,5,5,5,5
validator2,1,1,1,1


'All:'

Unnamed: 0_level_0,datetime,c-ip,rp_software_version,count
rp_software,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
blackbox-exporter,1,1,1,1
fort,136,136,136,136
octorpki,201,201,201,201
routinator,1545,1545,1545,1545
rpki-client,38,38,38,38
rpki-monitoring,1,1,1,1
rpki-prover,4,4,4,4
rpstir2,3,3,3,3
unknown,13,13,13,13
validator2,1,1,1,1


In [6]:
# Count rows per (rp_software, rp_software_version). `datetime` is selected only
# so that `.count()` has a column left to report the count in.
(
    df_full[['rp_software', 'rp_software_version', 'datetime']]
    .groupby(['rp_software', 'rp_software_version'])
    .count()
    .reset_index()
)

Unnamed: 0,rp_software,rp_software_version,datetime
0,blackbox-exporter,2.0,1
1,fort,1.2.0,26
2,fort,1.2.1,20
3,fort,1.3.0,5
4,fort,1.4.0,4
...,...,...,...
63,validator3,3.2-2021.03.01.14.58.34,1
64,validator3,3.2-2021.03.02.15.08,7
65,validator3,3.2-2021.04.07.12.55,66
66,validator3,3.2-2021.06.15.05.49.38,1


The following code needs a lot of data (covering every day shown in your chart) to work, but it may get you started with an analysis similar to the one shared on Discord:

In [7]:
def _rp_software_chart(data, mark, y_max, color_range, title):
    """Build one "clients per RP implementation over time" chart.

    :param data: frame handed to `alt.Chart`; the chart reads its `datetime`,
        `c-ip` and `rp_software` fields.
    :param mark: name of the mark method to call on the chart, `'mark_line'`
        or `'mark_area'`.
    :param y_max: upper bound of the y axis (the lower bound is 0).
    :param color_range: domain of the colour scale.
    :param title: chart title.
    :return: the configured chart.
    """
    base = alt.Chart(data).transform_joinaggregate(
        order='sum(c-ip)',
        groupby=['rp_software']
    )
    return getattr(base, mark)().encode(
        x=alt.X('datetime:T', axis=alt.Axis(title='Time')),
        y=alt.Y("c-ip:Q",
          scale=alt.Scale(domain=(0, y_max)),
          axis=alt.Axis(title='Number of clients')
        ),
        color=alt.Color('rp_software:N',
          scale=alt.Scale(domain=color_range),
          legend=alt.Legend(title='Relying Party implementation')
        ),
        order=alt.Order('order:Q', sort='descending')
    ).properties(title=title)


def build_charts(df, base_name=None, prefix="", color_range=None):
    """Render, save and display line and area charts of clients per RP software.

    For each period (all data, the last 180 days, the last 365 days) a line
    chart and an area chart are built, written to `outputs/` as PNG and SVG,
    and displayed.

    :param df: frame with `datetime`, `c-ip` and `rp_software` columns; `c-ip`
        is read as a quantity (it is summed and plotted on the y axis).
    :param base_name: required stem of every output file name.
    :param prefix: text added to the chart titles and the file names.
    :param color_range: domain of the colour scale.
    :raises AssertionError: when `base_name` is empty or missing.
    """
    # Local import so the function does not depend on another cell's imports.
    import os

    assert base_name, "base_name is required: it is the stem of every output file name"

    # Saving fails when the target directory is missing, so create it up front.
    os.makedirs('outputs', exist_ok=True)

    # Axis limits are taken from the whole frame rather than the period subset,
    # so the charts of every period share one scale. They do not change between
    # periods and are computed once.
    line_y_max = 1.1 * df['c-ip'].max()
    # The area chart's limit is the largest per-datetime total; only `c-ip` is summed.
    area_y_max = 1.1 * df.groupby('datetime')['c-ip'].sum().max()

    for period in [None, 180, 365]:
        if period:
            then = datetime.datetime.now() - datetime.timedelta(days=period)
            period_data = df[df.datetime >= then]
            period_title = f", {period} days"
            period_file_postfix = f"{period}d"
        else:
            period_data = df
            period_title = ""
            period_file_postfix = ""

        title = f'Number of unique IPs by RP software ({prefix}{period_title})'

        overall = _rp_software_chart(period_data, 'mark_line', line_y_max, color_range, title)
        overall.save(f'outputs/{base_name}-ip-by-rp-{prefix}{period_file_postfix}.png',
                     scale_factor=2.0)
        overall.save(f'outputs/{base_name}-ip-by-rp-{prefix}{period_file_postfix}.svg')
        display(overall)

        overall_area = _rp_software_chart(period_data, 'mark_area', area_y_max, color_range, title)
        overall_area.save(f'outputs/{base_name}-ip-by-rp-area-{prefix}{period_file_postfix}.png',
                          scale_factor=2.0)
        overall_area.save(f'outputs/{base_name}-ip-by-rp-area-{prefix}{period_file_postfix}.svg')
        display(overall_area)

In [8]:
def calc_color_range(df, field='rp_software', last_n_days=90):
    if last_n_days is not None and last_n_days > 0:
        rel_data = df[df.datetime > datetime.datetime.now() - datetime.timedelta(days=last_n_days)]
    else:
        LOG.info("color range: selecting all data")
        rel_data = df
    by_rp = rel_data[[field, 'c-ip', 'datetime']].drop_duplicates().groupby([field]).count().rename(columns={'c-ip': 'count'})

    return list(
        by_rp.sort_values('count', ascending=False).index
    )

# Colour order for the charts: RP implementations ranked over the last 180 days.
color_range = calc_color_range(df_full.reset_index(), last_n_days=180)
display(color_range)

['routinator',
 'validator3',
 'octorpki',
 'fort',
 'rpki-client',
 'unknown',
 'rpki-prover',
 'rpstir2',
 'blackbox-exporter',
 'rpki-monitoring',
 'validator2']