*This notebook reads the raw RIPE Atlas h1 and h2 latencies for each (probeid, timestamp) tuple from the `ra_latencies` table, aggregates the `h1_latencies` and `h2_latencies` across the 3 queries into a 5-tuple (min, 25th percentile, median, 75th percentile, max), and then inserts the entire dataframe into the `ra_latencies_agg` table.*
- - - 

In [1]:
import ipaddress
import sqlite3
import time

import numpy as np
import pandas as pd
import requests

In [3]:
# SQLite database file that this notebook connects to.
DB_LOCATION = 'lastmile.db'

# Source table (raw latency strings) and destination table (aggregates).
RA_LATENCIES, RA_LATENCIES_AGG = 'ra_latencies', 'ra_latencies_agg'

In [4]:
# Single connection shared by the read and the final insert.
con = sqlite3.connect(database=DB_LOCATION)

In [5]:
# Fetch the raw latency strings for every (probeid, timestamp) row;
# the table name is filled in from the RA_LATENCIES constant.
query = '''SELECT    probeid
                   , timestamp
                   , h1_latencies
                   , h2_latencies
           FROM      %s
        ''' % RA_LATENCIES

df = pd.read_sql(sql=query, con=con)

In [6]:
# Preview the first rows of the raw frame.
df.head(n=5)

Unnamed: 0,probeid,timestamp,h1_latencies,h2_latencies
0,10006,1406548034,"0.559, 0.429, 0.412","20.49, 27.571, 17.716"
1,10006,1406562432,"0.579, 0.431, 0.423","17.7, 17.281, 51.434"
2,10006,1406576835,"0.58, 0.438, 0.42","34.678, 17.776, 17.473"
3,10006,1406591230,"0.769, 0.453, 0.478","17.577, 18.21, 17.533"
4,10006,1406605631,"0.534, 0.422, 0.416","17.442, 37.561, 17.526"


In [45]:
def percentile(n):
    """Build a function that computes the ``n``-th percentile of its input.

    Parameters
    ----------
    n : int or float
        Percentile to compute, passed straight to ``np.percentile``.

    Returns
    -------
    callable
        A one-argument function ``f(x)`` returning ``np.percentile(x, n)``.
        Its ``__name__`` is set to ``'percentile_<n>'`` so the function
        carries a distinguishable name.
    """
    def percentile_(x):
        return np.percentile(x, n)
    percentile_.__name__ = 'percentile_%s' % n
    return percentile_

In [57]:
def get_min(latencies):
    """Return the minimum (0th percentile) of a comma-separated latency string.

    Parameters
    ----------
    latencies : str or None
        Comma-separated numeric values, e.g. ``"0.559, 0.429, 0.412"``.

    Returns
    -------
    float or None
        The smallest value, or ``None`` when the input is ``None``, blank,
        or the literal string ``'None'``.

    Raises
    ------
    ValueError
        If any comma-separated token cannot be parsed as a float.
    """
    # Check for None *before* calling .strip(); the reverse order raises
    # AttributeError on a real None value.
    if latencies is None:
        return None
    latencies = latencies.strip()
    if not latencies or latencies == 'None':
        return None
    values = [float(latency) for latency in latencies.split(',')]
    return np.percentile(values, 0)

In [67]:
def get_q1(latencies):
    """Return the 25th percentile of a comma-separated latency string.

    Parameters
    ----------
    latencies : str or None
        Comma-separated numeric values, e.g. ``"0.559, 0.429, 0.412"``.

    Returns
    -------
    float or None
        The 25th percentile as computed by ``np.percentile``, or ``None``
        when the input is ``None``, blank, or the literal string ``'None'``.

    Raises
    ------
    ValueError
        If any comma-separated token cannot be parsed as a float.
    """
    # Check for None *before* calling .strip(); the reverse order raises
    # AttributeError on a real None value.
    if latencies is None:
        return None
    latencies = latencies.strip()
    if not latencies or latencies == 'None':
        return None
    values = [float(latency) for latency in latencies.split(',')]
    return np.percentile(values, 25)

In [65]:
def get_median(latencies):
    """Return the median (50th percentile) of a comma-separated latency string.

    Parameters
    ----------
    latencies : str or None
        Comma-separated numeric values, e.g. ``"0.559, 0.429, 0.412"``.

    Returns
    -------
    float or None
        The median as computed by ``np.percentile``, or ``None`` when the
        input is ``None``, blank, or the literal string ``'None'``.

    Raises
    ------
    ValueError
        If any comma-separated token cannot be parsed as a float.
    """
    # Check for None *before* calling .strip(); the reverse order raises
    # AttributeError on a real None value.
    if latencies is None:
        return None
    latencies = latencies.strip()
    if not latencies or latencies == 'None':
        return None
    values = [float(latency) for latency in latencies.split(',')]
    return np.percentile(values, 50)

In [69]:
def get_q2(latencies):
    """Return the 75th percentile of a comma-separated latency string.

    NOTE: despite the ``q2`` name, this computes the 75th percentile, which
    is conventionally called the third quartile. The name is kept unchanged
    so existing callers keep working.

    Parameters
    ----------
    latencies : str or None
        Comma-separated numeric values, e.g. ``"0.559, 0.429, 0.412"``.

    Returns
    -------
    float or None
        The 75th percentile as computed by ``np.percentile``, or ``None``
        when the input is ``None``, blank, or the literal string ``'None'``.

    Raises
    ------
    ValueError
        If any comma-separated token cannot be parsed as a float.
    """
    # Check for None *before* calling .strip(); the reverse order raises
    # AttributeError on a real None value.
    if latencies is None:
        return None
    latencies = latencies.strip()
    if not latencies or latencies == 'None':
        return None
    values = [float(latency) for latency in latencies.split(',')]
    return np.percentile(values, 75)

In [61]:
def get_max(latencies):
    """Return the maximum (100th percentile) of a comma-separated latency string.

    Parameters
    ----------
    latencies : str or None
        Comma-separated numeric values, e.g. ``"0.559, 0.429, 0.412"``.

    Returns
    -------
    float or None
        The largest value, or ``None`` when the input is ``None``, blank,
        or the literal string ``'None'``.

    Raises
    ------
    ValueError
        If any comma-separated token cannot be parsed as a float.
    """
    # Check for None *before* calling .strip(); the reverse order raises
    # AttributeError on a real None value.
    if latencies is None:
        return None
    latencies = latencies.strip()
    if not latencies or latencies == 'None':
        return None
    values = [float(latency) for latency in latencies.split(',')]
    return np.percentile(values, 100)

In [74]:
# Per-row minimum latency for each of the two latency columns.
df['h1_min'], df['h2_min'] = (
    df['h1_latencies'].apply(get_min),
    df['h2_latencies'].apply(get_min),
)

In [75]:
# Per-row 25th percentile for each of the two latency columns.
df['h1_q1'], df['h2_q1'] = (
    df['h1_latencies'].apply(get_q1),
    df['h2_latencies'].apply(get_q1),
)

In [76]:
# Per-row median for each of the two latency columns.
df['h1_median'], df['h2_median'] = (
    df['h1_latencies'].apply(get_median),
    df['h2_latencies'].apply(get_median),
)

In [77]:
# Per-row get_q2 result for each of the two latency columns.
df['h1_q2'], df['h2_q2'] = (
    df['h1_latencies'].apply(get_q2),
    df['h2_latencies'].apply(get_q2),
)

In [78]:
# Per-row maximum latency for each of the two latency columns.
df['h1_max'], df['h2_max'] = (
    df['h1_latencies'].apply(get_max),
    df['h2_latencies'].apply(get_max),
)

In [79]:
# Preview the frame with the derived aggregate columns.
df.head(n=5)

Unnamed: 0,probeid,timestamp,h1_latencies,h2_latencies,h1_min,h1_max,h2_min,h2_max,h1_q1,h2_q1,h1_median,h2_median,h1_q2,h2_q2
0,10006,1406548034,"0.559, 0.429, 0.412","20.49, 27.571, 17.716",0.412,0.559,17.716,27.571,0.4205,19.103,0.429,20.49,0.494,24.0305
1,10006,1406562432,"0.579, 0.431, 0.423","17.7, 17.281, 51.434",0.423,0.579,17.281,51.434,0.427,17.4905,0.431,17.7,0.505,34.567
2,10006,1406576835,"0.58, 0.438, 0.42","34.678, 17.776, 17.473",0.42,0.58,17.473,34.678,0.429,17.6245,0.438,17.776,0.509,26.227
3,10006,1406591230,"0.769, 0.453, 0.478","17.577, 18.21, 17.533",0.453,0.769,17.533,18.21,0.4655,17.555,0.478,17.577,0.6235,17.8935
4,10006,1406605631,"0.534, 0.422, 0.416","17.442, 37.561, 17.526",0.416,0.534,17.442,37.561,0.419,17.484,0.422,17.526,0.478,27.5435


In [80]:
# The raw latency strings are no longer needed once the aggregates exist.
df.drop(['h1_latencies', 'h2_latencies'], axis=1, inplace=True)

In [81]:
# Preview the frame after the raw latency columns were removed.
df.head(n=5)

Unnamed: 0,probeid,timestamp,h1_min,h1_max,h2_min,h2_max,h1_q1,h2_q1,h1_median,h2_median,h1_q2,h2_q2
0,10006,1406548034,0.412,0.559,17.716,27.571,0.4205,19.103,0.429,20.49,0.494,24.0305
1,10006,1406562432,0.423,0.579,17.281,51.434,0.427,17.4905,0.431,17.7,0.505,34.567
2,10006,1406576835,0.42,0.58,17.473,34.678,0.429,17.6245,0.438,17.776,0.509,26.227
3,10006,1406591230,0.453,0.769,17.533,18.21,0.4655,17.555,0.478,17.577,0.6235,17.8935
4,10006,1406605631,0.416,0.534,17.442,37.561,0.419,17.484,0.422,17.526,0.478,27.5435


In [86]:
# Non-null count per column; aggregate columns fall short of the row total
# wherever the helper functions returned None.
df.count(axis=0)

probeid      137017
timestamp    137017
h1_min       136180
h1_max       136180
h2_min       132742
h2_max       132742
h1_q1        136180
h2_q1        132742
h1_median    136180
h2_median    132742
h1_q2        136180
h2_q2        132742
dtype: int64

In [87]:
# Turn on foreign-key enforcement for this connection.
cur = con.cursor()
cur.execute('pragma foreign_keys=ON')

In [88]:
# Use (probeid, timestamp) as the frame's index; the same labels are reused
# as index column names when the frame is written out.
index_label = ['probeid', 'timestamp']
df = df.set_index(keys=index_label, drop=True)

In [89]:
# Preview the frame now indexed by (probeid, timestamp).
df.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,h1_min,h1_max,h2_min,h2_max,h1_q1,h2_q1,h1_median,h2_median,h1_q2,h2_q2
probeid,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
10006,1406548034,0.412,0.559,17.716,27.571,0.4205,19.103,0.429,20.49,0.494,24.0305
10006,1406562432,0.423,0.579,17.281,51.434,0.427,17.4905,0.431,17.7,0.505,34.567
10006,1406576835,0.42,0.58,17.473,34.678,0.429,17.6245,0.438,17.776,0.509,26.227
10006,1406591230,0.453,0.769,17.533,18.21,0.4655,17.555,0.478,17.577,0.6235,17.8935
10006,1406605631,0.416,0.534,17.442,37.561,0.419,17.484,0.422,17.526,0.478,27.5435


In [90]:
# Append the aggregated rows to the destination table.
# The `flavor` argument was deprecated and later removed from
# DataFrame.to_sql, so passing it raises TypeError on current pandas;
# a sqlite3 connection is accepted without it.
# NOTE: with if_exists='append', re-running this cell inserts the same rows again.
df.to_sql(
    RA_LATENCIES_AGG,
    con,
    if_exists='append',
    index_label=index_label,
)

In [91]:
# Commit any pending transaction first, then release the database handle.
con.commit()
con.close()