*This notebook reads the raw RIPE Atlas last-hop latencies for each (probeid, timestamp) tuple from the `ra_latencies_last_hop` table, aggregates the last-hop latencies across the 3 queries into a 5-tuple (min, q1, median, q2, max), and then inserts the entire dataframe into the `ra_latencies_last_hop_agg` table.*
- - - 

In [27]:
import ipaddress
import sqlite3
import time

import numpy as np
import pandas as pd
import requests

In [28]:
# SQLite database file that holds both the source and the destination table.
DB_LOCATION = "lastmile.db"

# Source table: one row of raw last-hop latencies per (probeid, timestamp).
RA_LATENCIES_LAST_HOP = "ra_latencies_last_hop"

# Destination table: the aggregated rows are appended here.
RA_LATENCIES_LAST_HOP_AGG = "ra_latencies_last_hop_agg"

In [29]:
# Open a connection to the SQLite database at DB_LOCATION. The same connection
# is used for the read, the final insert, and is closed in the last cell.
con = sqlite3.connect(DB_LOCATION)

In [30]:
# Load every raw last-hop measurement from the source table into a DataFrame.
# The table name comes from a notebook constant, not from user input.
query = '''SELECT    probeid
                   , timestamp
                   , last_hop
                   , last_hop_latencies
           FROM      %s
        ''' % (RA_LATENCIES_LAST_HOP,)

df = pd.read_sql(sql=query, con=con)

In [31]:
# Non-null count per column; a column with a lower count has missing values.
df.count()

probeid               137017
timestamp             137017
last_hop              137017
last_hop_latencies    135815
dtype: int64

In [32]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [33]:
# Preview the first rows after dropping rows with missing values.
df.head()

Unnamed: 0,probeid,timestamp,last_hop,last_hop_latencies
0,10006,1406548034,18,"64.929, 65.337, 63.992"
1,10006,1406562432,18,"64.001, 64.404, 64.194"
2,10006,1406576835,18,"64.473, 64.215, 64.613"
3,10006,1406591230,18,"64.46, 65.16, 63.902"
4,10006,1406605631,18,"64.059, 63.728, 64.401"


In [34]:
def percentile(n):
    """Build a function that computes the ``n``-th percentile of its input.

    The returned callable takes a sequence of numbers and returns
    ``numpy.percentile`` of it at ``n``. Its ``__name__`` is set to
    ``'percentile_<n>'`` so each percentile gets a distinct function name.
    """
    import numpy as np

    def percentile_(values):
        """Return the percentile of ``values`` at the enclosing ``n``."""
        return np.percentile(values, q=n)

    # Rename the closure so different percentiles are distinguishable by name.
    percentile_.__name__ = 'percentile_%s' % n
    return percentile_

In [35]:
def get_min(latencies):
    """Return the minimum (0th percentile) of a comma-separated latency string.

    Parameters
    ----------
    latencies : str or None
        Comma-separated latency samples, e.g. ``"64.929, 65.337, 63.992"``.

    Returns
    -------
    float or None
        The smallest sample, or ``None`` when ``latencies`` is ``None``,
        blank, or the literal string ``'None'``.
    """
    # Check for None before calling .strip(); stripping first would raise
    # AttributeError on None and the guard could never fire.
    if latencies is None:
        return None
    latencies = latencies.strip()
    if not latencies or latencies == 'None':
        return None
    samples = [float(latency) for latency in latencies.split(',')]
    return np.percentile(samples, 0)

In [36]:
def get_q1(latencies):
    """Return the first quartile (25th percentile) of a comma-separated latency string.

    Parameters
    ----------
    latencies : str or None
        Comma-separated latency samples, e.g. ``"64.929, 65.337, 63.992"``.

    Returns
    -------
    float or None
        The 25th percentile of the samples, or ``None`` when ``latencies``
        is ``None``, blank, or the literal string ``'None'``.
    """
    # Check for None before calling .strip(); stripping first would raise
    # AttributeError on None and the guard could never fire.
    if latencies is None:
        return None
    latencies = latencies.strip()
    if not latencies or latencies == 'None':
        return None
    samples = [float(latency) for latency in latencies.split(',')]
    return np.percentile(samples, 25)

In [37]:
def get_median(latencies):
    """Return the median (50th percentile) of a comma-separated latency string.

    Parameters
    ----------
    latencies : str or None
        Comma-separated latency samples, e.g. ``"64.929, 65.337, 63.992"``.

    Returns
    -------
    float or None
        The median of the samples, or ``None`` when ``latencies`` is
        ``None``, blank, or the literal string ``'None'``.
    """
    # Check for None before calling .strip(); stripping first would raise
    # AttributeError on None and the guard could never fire.
    if latencies is None:
        return None
    latencies = latencies.strip()
    if not latencies or latencies == 'None':
        return None
    samples = [float(latency) for latency in latencies.split(',')]
    return np.percentile(samples, 50)

In [38]:
def get_q2(latencies):
    """Return the 75th percentile of a comma-separated latency string.

    Despite the ``q2`` name, this is the upper (third) quartile. The name is
    kept because the result is stored in the ``last_hop_q2`` column.

    Parameters
    ----------
    latencies : str or None
        Comma-separated latency samples, e.g. ``"64.929, 65.337, 63.992"``.

    Returns
    -------
    float or None
        The 75th percentile of the samples, or ``None`` when ``latencies``
        is ``None``, blank, or the literal string ``'None'``.
    """
    # Check for None before calling .strip(); stripping first would raise
    # AttributeError on None and the guard could never fire.
    if latencies is None:
        return None
    latencies = latencies.strip()
    if not latencies or latencies == 'None':
        return None
    samples = [float(latency) for latency in latencies.split(',')]
    return np.percentile(samples, 75)

In [39]:
def get_max(latencies):
    """Return the maximum (100th percentile) of a comma-separated latency string.

    Parameters
    ----------
    latencies : str or None
        Comma-separated latency samples, e.g. ``"64.929, 65.337, 63.992"``.

    Returns
    -------
    float or None
        The largest sample, or ``None`` when ``latencies`` is ``None``,
        blank, or the literal string ``'None'``.
    """
    # Check for None before calling .strip(); stripping first would raise
    # AttributeError on None and the guard could never fire.
    if latencies is None:
        return None
    latencies = latencies.strip()
    if not latencies or latencies == 'None':
        return None
    samples = [float(latency) for latency in latencies.split(',')]
    return np.percentile(samples, 100)

In [40]:
# Derive the five summary columns from the raw latency string. The list order
# fixes the order in which the new columns are added to the frame.
aggregators = [
    ('last_hop_min', get_min),
    ('last_hop_q1', get_q1),
    ('last_hop_median', get_median),
    ('last_hop_q2', get_q2),
    ('last_hop_max', get_max),
]
for column, aggregate in aggregators:
    df[column] = df['last_hop_latencies'].apply(aggregate)

In [41]:
# Preview the frame with the five aggregate columns added.
df.head()

Unnamed: 0,probeid,timestamp,last_hop,last_hop_latencies,last_hop_min,last_hop_q1,last_hop_median,last_hop_q2,last_hop_max
0,10006,1406548034,18,"64.929, 65.337, 63.992",63.992,64.4605,64.929,65.133,65.337
1,10006,1406562432,18,"64.001, 64.404, 64.194",64.001,64.0975,64.194,64.299,64.404
2,10006,1406576835,18,"64.473, 64.215, 64.613",64.215,64.344,64.473,64.543,64.613
3,10006,1406591230,18,"64.46, 65.16, 63.902",63.902,64.181,64.46,64.81,65.16
4,10006,1406605631,18,"64.059, 63.728, 64.401",63.728,63.8935,64.059,64.23,64.401


In [42]:
# The raw latency string is no longer needed once the aggregates exist.
df = df.drop('last_hop_latencies', axis=1)

In [43]:
# Confirm the raw latency string column is gone.
df.head()

Unnamed: 0,probeid,timestamp,last_hop,last_hop_min,last_hop_q1,last_hop_median,last_hop_q2,last_hop_max
0,10006,1406548034,18,63.992,64.4605,64.929,65.133,65.337
1,10006,1406562432,18,64.001,64.0975,64.194,64.299,64.404
2,10006,1406576835,18,64.215,64.344,64.473,64.543,64.613
3,10006,1406591230,18,63.902,64.181,64.46,64.81,65.16
4,10006,1406605631,18,63.728,63.8935,64.059,64.23,64.401


In [44]:
# Non-null count per column after aggregation.
df.count()

probeid            135815
timestamp          135815
last_hop           135815
last_hop_min       135815
last_hop_q1        135815
last_hop_median    135815
last_hop_q2        135815
last_hop_max       135815
dtype: int64

In [45]:
# Ask SQLite to enforce foreign-key constraints on this connection before the
# insert. The returned cursor is not used again in the cells shown here.
cur = con.execute('pragma foreign_keys=ON')

In [46]:
# Index the frame by (probeid, timestamp); the same labels are passed to
# to_sql when the frame is written out.
index_label = ['probeid', 'timestamp']
df = df.set_index(keys=index_label)

In [47]:
# Preview the frame now indexed by (probeid, timestamp).
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,last_hop,last_hop_min,last_hop_q1,last_hop_median,last_hop_q2,last_hop_max
probeid,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10006,1406548034,18,63.992,64.4605,64.929,65.133,65.337
10006,1406562432,18,64.001,64.0975,64.194,64.299,64.404
10006,1406576835,18,64.215,64.344,64.473,64.543,64.613
10006,1406591230,18,63.902,64.181,64.46,64.81,65.16
10006,1406605631,18,63.728,63.8935,64.059,64.23,64.401


In [48]:
# Append the aggregated rows to the destination table, writing the
# (probeid, timestamp) index as columns named by index_label.
# The `flavor` argument is omitted on purpose: pandas deprecated and later
# removed it, so passing it raises TypeError on current versions. A sqlite3
# connection is treated as SQLite without it.
df.to_sql(
    RA_LATENCIES_LAST_HOP_AGG,
    con,
    if_exists='append',
    index_label=index_label,
)

In [49]:
# Commit the pending transaction, then close the database connection.
con.commit()
con.close()