# Notebook for Processing and Aggregation of Measurement Data

Resulting dataframes are saved to a separate database for later plotting/analysis.

In [1]:
import pandas as pd
import sqlite3

# `netflix` test

In [2]:
# unitid of probes with known faulty behavior --> remove from data
faulty_probes = [525884, 658929, 19602, 632406, 660076]

In [3]:
conn = sqlite3.connect('../data/netflix-data.db')

# only load successful measurements for now
netflix_df = pd.read_sql_query('select unit_id, dtime, address, \
                                connect_time, \
                                bytes_sec \
                                from netflix \
                                where successes != 0', con=conn, parse_dates=['dtime'])

conn.close()

In [4]:
netflix_df = netflix_df[~netflix_df['unit_id'].isin(faulty_probes)]  # remove faulty probes

netflix_df.drop_duplicates(inplace=True)

netflix_df['dtime'] = netflix_df['dtime'].dt.floor('h')  # floor to hours for later grouping of v4/v6 pairs

In [5]:
netflix_df.dtypes

unit_id                  int64
dtime           datetime64[ns]
address                 object
connect_time             int64
bytes_sec                int64
dtype: object

In [6]:
netflix_df

Unnamed: 0,unit_id,dtime,address,connect_time,bytes_sec
42,62712,2016-07-22 02:00:00,198.38.117.146,9046,11640296
43,62712,2016-07-22 03:00:00,198.38.117.146,8340,11753569
44,62712,2016-07-22 04:00:00,198.38.117.153,8239,11638482
45,62712,2016-07-22 05:00:00,198.38.116.153,8457,11752382
46,62712,2016-07-22 06:00:00,198.38.117.153,8300,11653446
47,62712,2016-07-22 07:00:00,198.38.116.148,8334,11641541
48,62712,2016-07-22 09:00:00,198.38.116.148,8267,11751035
49,62712,2016-07-22 10:00:00,198.38.117.147,8351,11540112
50,62712,2016-07-22 11:00:00,198.38.117.152,8358,11548673
51,62712,2016-07-22 12:00:00,198.38.117.149,8413,11610108


In [7]:
# split v4 and v6 based on address
v4 = netflix_df[~netflix_df['address'].str.contains(':')]
v6 = netflix_df[netflix_df['address'].str.contains(':')]

In [8]:
v4

Unnamed: 0,unit_id,dtime,address,connect_time,bytes_sec
42,62712,2016-07-22 02:00:00,198.38.117.146,9046,11640296
43,62712,2016-07-22 03:00:00,198.38.117.146,8340,11753569
44,62712,2016-07-22 04:00:00,198.38.117.153,8239,11638482
45,62712,2016-07-22 05:00:00,198.38.116.153,8457,11752382
46,62712,2016-07-22 06:00:00,198.38.117.153,8300,11653446
47,62712,2016-07-22 07:00:00,198.38.116.148,8334,11641541
48,62712,2016-07-22 09:00:00,198.38.116.148,8267,11751035
49,62712,2016-07-22 10:00:00,198.38.117.147,8351,11540112
50,62712,2016-07-22 11:00:00,198.38.117.152,8358,11548673
51,62712,2016-07-22 12:00:00,198.38.117.149,8413,11610108


In [9]:
v6

Unnamed: 0,unit_id,dtime,address,connect_time,bytes_sec
63,123256,2016-07-22 00:00:00,2a00:86c0:119:119::145,16803,10674265
65,123256,2016-07-22 02:00:00,2a00:86c0:119:119::142,17350,10958991
68,123256,2016-07-22 04:00:00,2a00:86c0:116:116::145,18747,9191069
70,123256,2016-07-22 05:00:00,2a00:86c0:117:117::141,23625,8737111
71,123256,2016-07-22 10:00:00,2a00:86c0:116:116::148,25121,8356098
74,123256,2016-07-22 13:00:00,2a00:86c0:118:118::143,14076,10857188
76,123256,2016-07-22 14:00:00,2a00:86c0:118:118::149,14320,10756554
78,123256,2016-07-22 15:00:00,2a00:86c0:119:119::142,17494,10573729
80,123256,2016-07-22 17:00:00,2a00:86c0:116:116::148,22151,9316159
82,123256,2016-07-22 19:00:00,2a00:86c0:119:119::148,14430,10861696


## General Metrics

In [10]:
# merge v4 and v6 for comparison and calculation of deltas
netflix_merged = v4.merge(v6, on=['unit_id', 'dtime'], suffixes=('_v4', '_v6'))

In [11]:
netflix_merged

Unnamed: 0,unit_id,dtime,address_v4,connect_time_v4,bytes_sec_v4,address_v6,connect_time_v6,bytes_sec_v6
0,123256,2016-07-22 00:00:00,198.38.119.140,20515,10371717,2a00:86c0:119:119::145,16803,10674265
1,123256,2016-07-22 02:00:00,198.38.118.140,20570,10751181,2a00:86c0:119:119::142,17350,10958991
2,123256,2016-07-22 04:00:00,198.38.119.149,15614,10388970,2a00:86c0:116:116::145,18747,9191069
3,123256,2016-07-22 10:00:00,198.38.119.149,18534,9844976,2a00:86c0:116:116::148,25121,8356098
4,123256,2016-07-22 13:00:00,198.38.117.153,23514,8260807,2a00:86c0:118:118::143,14076,10857188
5,123256,2016-07-22 14:00:00,198.38.118.141,19945,10729881,2a00:86c0:118:118::149,14320,10756554
6,123256,2016-07-22 15:00:00,198.38.118.141,18305,8935313,2a00:86c0:119:119::142,17494,10573729
7,123256,2016-07-22 17:00:00,198.38.119.145,21400,10431608,2a00:86c0:116:116::148,22151,9316159
8,123256,2016-07-22 19:00:00,198.38.118.143,20945,10121108,2a00:86c0:119:119::148,14430,10861696
9,123256,2016-07-22 23:00:00,198.38.116.148,25450,8773578,2a00:86c0:119:119::146,18686,10890393


In [12]:
# v4 - v6, i.e., positive values means v6 is lower
netflix_merged['delta_connect_time'] = netflix_merged['connect_time_v4'] - netflix_merged['connect_time_v6']
netflix_merged['delta_bytes_sec'] = netflix_merged['bytes_sec_v4'] - netflix_merged['bytes_sec_v6']
netflix_merged

Unnamed: 0,unit_id,dtime,address_v4,connect_time_v4,bytes_sec_v4,address_v6,connect_time_v6,bytes_sec_v6,delta_connect_time,delta_bytes_sec
0,123256,2016-07-22 00:00:00,198.38.119.140,20515,10371717,2a00:86c0:119:119::145,16803,10674265,3712,-302548
1,123256,2016-07-22 02:00:00,198.38.118.140,20570,10751181,2a00:86c0:119:119::142,17350,10958991,3220,-207810
2,123256,2016-07-22 04:00:00,198.38.119.149,15614,10388970,2a00:86c0:116:116::145,18747,9191069,-3133,1197901
3,123256,2016-07-22 10:00:00,198.38.119.149,18534,9844976,2a00:86c0:116:116::148,25121,8356098,-6587,1488878
4,123256,2016-07-22 13:00:00,198.38.117.153,23514,8260807,2a00:86c0:118:118::143,14076,10857188,9438,-2596381
5,123256,2016-07-22 14:00:00,198.38.118.141,19945,10729881,2a00:86c0:118:118::149,14320,10756554,5625,-26673
6,123256,2016-07-22 15:00:00,198.38.118.141,18305,8935313,2a00:86c0:119:119::142,17494,10573729,811,-1638416
7,123256,2016-07-22 17:00:00,198.38.119.145,21400,10431608,2a00:86c0:116:116::148,22151,9316159,-751,1115449
8,123256,2016-07-22 19:00:00,198.38.118.143,20945,10121108,2a00:86c0:119:119::148,14430,10861696,6515,-740588
9,123256,2016-07-22 23:00:00,198.38.116.148,25450,8773578,2a00:86c0:119:119::146,18686,10890393,6764,-2116815


In [13]:
conn = sqlite3.connect('../data/netflix-data-aggregated.db')
netflix_merged.to_sql(name = 'netflix', con=conn, index=False, if_exists='replace')
conn.close()

## Including meta data information

In [14]:
# AS Metadata of probes
probe_as = pd.read_csv('../metadata/probes_asns.csv')
probe_as

Unnamed: 0,unit_id,src_asn_v4,src_asn_v6,src_holder_v4,src_holder_v6
0,62712,680,680,DFN - Verein zur Foerderung eines Deutschen Fo...,DFN - Verein zur Foerderung eines Deutschen Fo...
1,201338,680,680,DFN - Verein zur Foerderung eines Deutschen Fo...,DFN - Verein zur Foerderung eines Deutschen Fo...
2,123256,8767,8767,MNET-AS - M-net Telekommunikations GmbH,MNET-AS - M-net Telekommunikations GmbH
3,148644,24956,24956,GDS-1 - Gaertner Datensysteme GmbH & Co. KG,GDS-1 - Gaertner Datensysteme GmbH & Co. KG
4,148650,3320,3320,DTAG - Deutsche Telekom AG,DTAG - Deutsche Telekom AG
5,660160,3320,3320,DTAG - Deutsche Telekom AG,DTAG - Deutsche Telekom AG
6,950210,3320,3320,DTAG - Deutsche Telekom AG,DTAG - Deutsche Telekom AG
7,167808,5607,5607,BSKYB-BROADBAND-AS - Sky UK Limited,BSKYB-BROADBAND-AS - Sky UK Limited
8,658891,5607,5607,BSKYB-BROADBAND-AS - Sky UK Limited,BSKYB-BROADBAND-AS - Sky UK Limited
9,950230,5607,5607,BSKYB-BROADBAND-AS - Sky UK Limited,BSKYB-BROADBAND-AS - Sky UK Limited


In [15]:
# looked up AS information of all destinations that were connected to
endpoints = pd.read_csv('../metadata/netflix_endpoint_to_asn.csv', sep=';')
endpoints

Unnamed: 0,ip,asn,holder
0,2620:10c:7005:10a9::73,2906,AS-SSI - Netflix Streaming Services Inc.
1,108.175.35.201,2906,AS-SSI - Netflix Streaming Services Inc.
2,2620:10c:700f:4487::154,2906,AS-SSI - Netflix Streaming Services Inc.
3,108.175.34.216,2906,AS-SSI - Netflix Streaming Services Inc.
4,108.175.35.209,2906,AS-SSI - Netflix Streaming Services Inc.
5,2620:10c:7005:10a9::109,2906,AS-SSI - Netflix Streaming Services Inc.
6,108.175.35.185,2906,AS-SSI - Netflix Streaming Services Inc.
7,2620:10c:700f:4487::187,2906,AS-SSI - Netflix Streaming Services Inc.
8,2a00:86c0:1047:1047::135,2906,AS-SSI - Netflix Streaming Services Inc.
9,108.175.35.206,2906,AS-SSI - Netflix Streaming Services Inc.


In [16]:
# join data with metadata information
netflix_meta = netflix_merged.merge(probe_as, on=['unit_id'])
netflix_meta = netflix_meta.merge(endpoints, left_on='address_v4', right_on='ip')
netflix_meta = netflix_meta.merge(endpoints, left_on='address_v6', right_on='ip', suffixes=('_v4', '_v6'))

In [17]:
netflix_meta.rename(columns={'asn_v4' : 'dst_asn_v4',
                             'holder_v4' : 'dst_holder_v4',
                             'asn_v6' : 'dst_asn_v6',
                             'holder_v6' : 'dst_holder_v6'}, inplace=True)

In [18]:
netflix_meta = netflix_meta[['unit_id', 'dtime',
                             'src_asn_v4', 'src_holder_v4', 'address_v4', 'dst_asn_v4', 'dst_holder_v4', 'connect_time_v4', 'bytes_sec_v4', 
                             'src_asn_v6', 'src_holder_v6', 'address_v6', 'dst_asn_v6', 'dst_holder_v6', 'connect_time_v6', 'bytes_sec_v6', 
                             'delta_connect_time', 'delta_bytes_sec']
                           ]

In [19]:
netflix_meta

Unnamed: 0,unit_id,dtime,src_asn_v4,src_holder_v4,address_v4,dst_asn_v4,dst_holder_v4,connect_time_v4,bytes_sec_v4,src_asn_v6,src_holder_v6,address_v6,dst_asn_v6,dst_holder_v6,connect_time_v6,bytes_sec_v6,delta_connect_time,delta_bytes_sec
0,123256,2016-07-22 00:00:00,8767,MNET-AS - M-net Telekommunikations GmbH,198.38.119.140,2906,AS-SSI - Netflix Streaming Services Inc.,20515,10371717,8767,MNET-AS - M-net Telekommunikations GmbH,2a00:86c0:119:119::145,2906,AS-SSI - Netflix Streaming Services Inc.,16803,10674265,3712,-302548
1,123256,2016-07-25 00:00:00,8767,MNET-AS - M-net Telekommunikations GmbH,198.38.119.140,2906,AS-SSI - Netflix Streaming Services Inc.,16457,10272516,8767,MNET-AS - M-net Telekommunikations GmbH,2a00:86c0:119:119::145,2906,AS-SSI - Netflix Streaming Services Inc.,17265,10867458,-808,-594942
2,123256,2016-12-15 23:00:00,8767,MNET-AS - M-net Telekommunikations GmbH,198.38.119.140,2906,AS-SSI - Netflix Streaming Services Inc.,18057,9869233,8767,MNET-AS - M-net Telekommunikations GmbH,2a00:86c0:119:119::145,2906,AS-SSI - Netflix Streaming Services Inc.,17221,10550408,836,-681175
3,148650,2016-10-04 18:00:00,3320,DTAG - Deutsche Telekom AG,198.38.119.140,2906,AS-SSI - Netflix Streaming Services Inc.,26762,4514214,3320,DTAG - Deutsche Telekom AG,2a00:86c0:119:119::145,2906,AS-SSI - Netflix Streaming Services Inc.,26208,4415574,554,98640
4,950210,2018-03-08 20:00:00,3320,DTAG - Deutsche Telekom AG,198.38.119.140,2906,AS-SSI - Netflix Streaming Services Inc.,26690,1774382,3320,DTAG - Deutsche Telekom AG,2a00:86c0:119:119::145,2906,AS-SSI - Netflix Streaming Services Inc.,26973,1756487,-283,17895
5,201338,2018-05-31 19:00:00,680,DFN - Verein zur Foerderung eines Deutschen Fo...,198.38.119.140,2906,AS-SSI - Netflix Streaming Services Inc.,7992,11569479,680,DFN - Verein zur Foerderung eines Deutschen Fo...,2a00:86c0:119:119::145,2906,AS-SSI - Netflix Streaming Services Inc.,8167,13820282,-175,-2250803
6,201338,2018-07-18 12:00:00,680,DFN - Verein zur Foerderung eines Deutschen Fo...,198.38.119.140,2906,AS-SSI - Netflix Streaming Services Inc.,7951,21483897,680,DFN - Verein zur Foerderung eines Deutschen Fo...,2a00:86c0:119:119::145,2906,AS-SSI - Netflix Streaming Services Inc.,8120,26046413,-169,-4562516
7,148650,2016-08-02 13:00:00,3320,DTAG - Deutsche Telekom AG,198.38.118.140,2906,AS-SSI - Netflix Streaming Services Inc.,25404,4912246,3320,DTAG - Deutsche Telekom AG,2a00:86c0:119:119::145,2906,AS-SSI - Netflix Streaming Services Inc.,25934,4791703,-530,120543
8,148650,2016-08-17 01:00:00,3320,DTAG - Deutsche Telekom AG,198.38.118.140,2906,AS-SSI - Netflix Streaming Services Inc.,24967,5850606,3320,DTAG - Deutsche Telekom AG,2a00:86c0:119:119::145,2906,AS-SSI - Netflix Streaming Services Inc.,25814,5760262,-847,90344
9,148650,2016-08-17 01:00:00,3320,DTAG - Deutsche Telekom AG,198.38.118.140,2906,AS-SSI - Netflix Streaming Services Inc.,25350,5828530,3320,DTAG - Deutsche Telekom AG,2a00:86c0:119:119::145,2906,AS-SSI - Netflix Streaming Services Inc.,25814,5760262,-464,68268


In [20]:
conn = sqlite3.connect('../data/netflix-data-aggregated.db')
conn.text_factory = str  # required since some AS holders have special characters in their names
netflix_meta.to_sql(name = 'netflix_meta', con=conn, index=False, if_exists='replace')
conn.close()

## Success Rates

In [21]:
# load all measurements, excluding stall-related ones
conn = sqlite3.connect('../data/netflix-data.db')
success_rate = pd.read_sql_query('select unit_id, dtime, address, target, \
                                  error, error_msg, successes, failures \
                                  from netflix \
                                  where error not like "%STALL%"', con=conn, parse_dates=['dtime'])
conn.close()

In [22]:
success_rate = success_rate[~success_rate['unit_id'].isin(faulty_probes)]
success_rate.drop_duplicates(inplace=True)
success_rate['dtime'] = success_rate['dtime'].dt.floor('h')
success_rate['is_v6'] = success_rate['address'].str.contains(':')

In [23]:
success_rate[(success_rate['is_v6'] == False)]['target'].value_counts()

72.46.180.6                                  20162
109.105.98.161                               19274
88.217.233.230                               13271
194.109.76.68                                12541
194.109.76.76                                12533
194.109.76.67                                12520
194.109.76.75                                12337
82.197.168.214                               12073
82.197.168.210                               11500
195.18.252.214                               10281
195.18.252.218                                9915
195.18.252.210                                9640
194.109.76.66                                 9056
195.18.252.222                                8866
90.223.190.195                                8139
208.76.186.58                                 8108
45.57.16.133                                  8070
45.57.16.135                                  8065
208.76.186.62                                 7538
45.57.17.134                   

In [24]:
success_rate[(success_rate['is_v6'] == True)]['target'].value_counts()

ipv6_1.lagg0.c001.bma002.nordunetas.isp.nflxvideo.net    19153
ipv6_1.lagg0.c001.fsd001.premierol.isp.nflxvideo.net     18348
ipv6_1.lagg0.c001.zmu001.mnet.isp.nflxvideo.net          13132
ipv6_1.lagg0.c001.zrh002.init7.isp.nflxvideo.net         11802
ipv6_1.lagg0.c001.zrh001.init7.isp.nflxvideo.net         11436
ipv6_1.lagg0.c001.xsp001.m1.isp.nflxvideo.net            10828
ipv6_1.lagg0.c006.ams001.xs4all.isp.nflxvideo.net        10158
ipv6_1.lagg0.c007.ams001.xs4all.isp.nflxvideo.net        10007
ipv6_1.lagg0.c005.ams001.xs4all.isp.nflxvideo.net         9909
ipv6_1.lagg0.c004.ams001.xs4all.isp.nflxvideo.net         9877
ipv6_1.mce0.c007.vie001.ix.nflxvideo.net                  7315
ipv6_1.lagg0.c001.ams001.xs4all.isp.nflxvideo.net         7303
ipv6_1.lagg0.c004.lhr009.sky.isp.nflxvideo.net            7283
ipv6_1.mce0.c009.vie001.ix.nflxvideo.net                  7267
ipv6_1.lagg0.c001.phl001.kinber.isp.nflxvideo.net         7180
ipv6_1.cxl0.c026.tyo001.ix.nflxvideo.net               

#### => v4 mostly has IP addresses in 'target', v6 mostly 'ipv6...'; use this to filter below

In [25]:
# get errors that could neither be identified as IPv4 or IPv6 due to "None" in address
success_rate[success_rate['is_v6'].isnull()]

Unnamed: 0,unit_id,dtime,address,target,error,error_msg,successes,failures,is_v6
48,62712,2016-07-22 00:00:00,,,DNS_RESOLUTION_API_ERROR,curl_easy_perform: Couldn't resolve host 'api-...,0,1,
49,62712,2016-07-22 01:00:00,,,DNS_RESOLUTION_API_ERROR,curl_easy_perform: Couldn't resolve host 'api-...,0,1,
50,62712,2016-07-22 02:00:00,,,CONNECTION_API_ERROR,curl_easy_perform: Failed to connect to 2a01:5...,0,1,
52,62712,2016-07-22 03:00:00,,,CONNECTION_API_ERROR,curl_easy_perform: Failed to connect to 2a01:5...,0,1,
54,62712,2016-07-22 04:00:00,,,CONNECTION_API_ERROR,curl_easy_perform: Failed to connect to 2a01:5...,0,1,
56,62712,2016-07-22 05:00:00,,,CONNECTION_API_ERROR,curl_easy_perform: Failed to connect to 2a01:5...,0,1,
58,62712,2016-07-22 06:00:00,,,CONNECTION_API_ERROR,curl_easy_perform: Failed to connect to 2a01:5...,0,1,
60,62712,2016-07-22 07:00:00,,,CONNECTION_API_ERROR,curl_easy_perform: Failed to connect to 2a01:5...,0,1,
62,62712,2016-07-22 09:00:00,,,CONNECTION_API_ERROR,curl_easy_perform: Failed to connect to 2a01:5...,0,1,
64,62712,2016-07-22 10:00:00,,,CONNECTION_API_ERROR,curl_easy_perform: Failed to connect to 2a01:5...,0,1,


To identify this in more detail:
- target contains "ipv6" -> is v6
- error message contains IPv6 address -> is v6

In [26]:
# create temporary copies below

# a) identification by target column: contains IPv4 address vs "ipv6"
x = success_rate[success_rate['is_v6'].isnull()]
y = x.copy()

In [27]:
y['is_v6'] = x['target'].apply(lambda x: ('.' not in x) if x else None) # IPv4 measurements have a regular IP address here (see above)
y['is_v6'] = x['target'].str.contains('ipv6')  # IPv6 in targetname

In [28]:
x.update(y)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[col] = expressions.where(mask, this, that)


In [29]:
x

Unnamed: 0,unit_id,dtime,address,target,error,error_msg,successes,failures,is_v6
48,62712,2016-07-22 00:00:00,,,DNS_RESOLUTION_API_ERROR,curl_easy_perform: Couldn't resolve host 'api-...,0,1,
49,62712,2016-07-22 01:00:00,,,DNS_RESOLUTION_API_ERROR,curl_easy_perform: Couldn't resolve host 'api-...,0,1,
50,62712,2016-07-22 02:00:00,,,CONNECTION_API_ERROR,curl_easy_perform: Failed to connect to 2a01:5...,0,1,
52,62712,2016-07-22 03:00:00,,,CONNECTION_API_ERROR,curl_easy_perform: Failed to connect to 2a01:5...,0,1,
54,62712,2016-07-22 04:00:00,,,CONNECTION_API_ERROR,curl_easy_perform: Failed to connect to 2a01:5...,0,1,
56,62712,2016-07-22 05:00:00,,,CONNECTION_API_ERROR,curl_easy_perform: Failed to connect to 2a01:5...,0,1,
58,62712,2016-07-22 06:00:00,,,CONNECTION_API_ERROR,curl_easy_perform: Failed to connect to 2a01:5...,0,1,
60,62712,2016-07-22 07:00:00,,,CONNECTION_API_ERROR,curl_easy_perform: Failed to connect to 2a01:5...,0,1,
62,62712,2016-07-22 09:00:00,,,CONNECTION_API_ERROR,curl_easy_perform: Failed to connect to 2a01:5...,0,1,
64,62712,2016-07-22 10:00:00,,,CONNECTION_API_ERROR,curl_easy_perform: Failed to connect to 2a01:5...,0,1,


In [30]:
# overwrite original frame
success_rate.update(x)

In [31]:
# b) error message contains v6 address
x = x[x['is_v6'].isnull()]
y = x.copy()

In [32]:
# check error message for IPv6 address
y['is_v6'] = y['error_msg'].str.split('IP:').str[-1].str.contains(':')

In [33]:
y['error_msg'].str.split('IP:').str[-1].str.contains(':')
y['is_v6'].replace({ False : None}, inplace=True)

In [34]:
x.update(y)

In [35]:
# overwrite original frame
success_rate.update(x)

In [36]:
# unidentifiable (w.r.t. IP version) rows
success_rate[success_rate['is_v6'].isnull()]

Unnamed: 0,unit_id,dtime,address,target,error,error_msg,successes,failures,is_v6
48,62712.0,2016-07-22 00:00:00,,,DNS_RESOLUTION_API_ERROR,curl_easy_perform: Couldn't resolve host 'api-...,0.0,1.0,
49,62712.0,2016-07-22 01:00:00,,,DNS_RESOLUTION_API_ERROR,curl_easy_perform: Couldn't resolve host 'api-...,0.0,1.0,
80,62712.0,2016-07-22 18:00:00,,,DNS_RESOLUTION_API_ERROR,curl_easy_perform: Couldn't resolve host 'api-...,0.0,1.0,
242,148220.0,2016-07-22 02:00:00,,,NETWORK_API_ERROR,curl_easy_perform: The requested URL returned ...,0.0,1.0,
318,148644.0,2016-07-22 16:00:00,,,NETWORK_API_ERROR,curl_easy_perform: The requested URL returned ...,0.0,1.0,
391,148648.0,2016-07-22 04:00:00,,,NETWORK_API_ERROR,curl_easy_perform: The requested URL returned ...,0.0,1.0,
556,181536.0,2016-07-22 02:00:00,,,DNS_RESOLUTION_API_ERROR,curl_easy_perform: name lookup timed out (IP: ),0.0,1.0,
557,181536.0,2016-07-22 02:00:00,,,DNS_RESOLUTION_API_ERROR,curl_easy_perform: name lookup timed out (IP: ),0.0,1.0,
622,201338.0,2016-07-22 08:00:00,,,DNS_RESOLUTION_API_ERROR,curl_easy_perform: Couldn't resolve host 'api-...,0.0,1.0,
667,241743.0,2016-07-22 06:00:00,,,NETWORK_API_ERROR,curl_easy_perform: The requested URL returned ...,0.0,1.0,


In [37]:
# drop all failures that cannot be assigned to either v4 or v6 for success rate calculation
success_rate = success_rate[~success_rate['is_v6'].isnull()]

In [38]:
success_rate

Unnamed: 0,unit_id,dtime,address,target,error,error_msg,successes,failures,is_v6
50,62712.0,2016-07-22 02:00:00,,,CONNECTION_API_ERROR,curl_easy_perform: Failed to connect to 2a01:5...,0.0,1.0,True
51,62712.0,2016-07-22 02:00:00,198.38.117.146,198.38.117.146,NO_ERROR,,1.0,0.0,False
52,62712.0,2016-07-22 03:00:00,,,CONNECTION_API_ERROR,curl_easy_perform: Failed to connect to 2a01:5...,0.0,1.0,True
53,62712.0,2016-07-22 03:00:00,198.38.117.146,198.38.117.146,NO_ERROR,,1.0,0.0,False
54,62712.0,2016-07-22 04:00:00,,,CONNECTION_API_ERROR,curl_easy_perform: Failed to connect to 2a01:5...,0.0,1.0,True
55,62712.0,2016-07-22 04:00:00,198.38.117.153,198.38.117.153,NO_ERROR,,1.0,0.0,False
56,62712.0,2016-07-22 05:00:00,,,CONNECTION_API_ERROR,curl_easy_perform: Failed to connect to 2a01:5...,0.0,1.0,True
57,62712.0,2016-07-22 05:00:00,198.38.116.153,198.38.116.153,NO_ERROR,,1.0,0.0,False
58,62712.0,2016-07-22 06:00:00,,,CONNECTION_API_ERROR,curl_easy_perform: Failed to connect to 2a01:5...,0.0,1.0,True
59,62712.0,2016-07-22 06:00:00,198.38.117.153,198.38.117.153,NO_ERROR,,1.0,0.0,False


In [39]:
# aggregation: daily medians

success_rate['is_v6'] = success_rate['is_v6'].astype(bool)
success_rate['dtime'] = success_rate['dtime'].dt.floor('D')  # day intervals

success_rate_v4 = success_rate[~success_rate['is_v6']
                              ].groupby(['unit_id', 'dtime'], as_index=False
                                       ).agg({'successes' : sum, 'failures' : sum})

success_rate_v6 = success_rate[success_rate['is_v6']
                              ].groupby(['unit_id', 'dtime'], as_index=False
                                       ).agg({'successes' : sum, 'failures' : sum})

In [40]:
success_rate_v4['total'] = success_rate_v4['successes'] + success_rate_v4['failures']
success_rate_v6['total'] = success_rate_v6['successes'] + success_rate_v6['failures']

In [41]:
success_rate_v4['success_rate'] = success_rate_v4['successes'] / success_rate_v4['total']
success_rate_v6['success_rate'] = success_rate_v6['successes'] / success_rate_v6['total']

In [42]:
success_rate_v4

Unnamed: 0,unit_id,dtime,successes,failures,total,success_rate
0,62712.0,2016-07-22,21.0,0.0,21.0,1.000000
1,62712.0,2016-07-23,21.0,0.0,21.0,1.000000
2,62712.0,2016-07-24,23.0,0.0,23.0,1.000000
3,62712.0,2016-07-25,22.0,0.0,22.0,1.000000
4,62712.0,2016-07-26,23.0,0.0,23.0,1.000000
5,62712.0,2016-07-27,21.0,0.0,21.0,1.000000
6,62712.0,2016-07-28,24.0,0.0,24.0,1.000000
7,62712.0,2016-07-29,22.0,0.0,22.0,1.000000
8,62712.0,2016-07-30,23.0,0.0,23.0,1.000000
9,62712.0,2016-07-31,23.0,0.0,23.0,1.000000


In [43]:
success_rate_v6

Unnamed: 0,unit_id,dtime,successes,failures,total,success_rate
0,62712.0,2016-07-22,0.0,20.0,20.0,0.000000
1,62712.0,2016-07-23,0.0,16.0,16.0,0.000000
2,62712.0,2016-07-24,0.0,19.0,19.0,0.000000
3,62712.0,2016-07-25,0.0,23.0,23.0,0.000000
4,62712.0,2016-07-26,0.0,22.0,22.0,0.000000
5,62712.0,2016-07-27,0.0,24.0,24.0,0.000000
6,62712.0,2016-07-28,0.0,23.0,23.0,0.000000
7,62712.0,2016-07-29,0.0,20.0,20.0,0.000000
8,62712.0,2016-07-30,0.0,21.0,21.0,0.000000
9,62712.0,2016-07-31,0.0,21.0,21.0,0.000000


In [44]:
conn = sqlite3.connect('../data/netflix-data-aggregated.db')
conn.text_factory = str
success_rate_v4.to_sql(name = 'success_rate_v4', con=conn, index=False, if_exists='replace')
success_rate_v6.to_sql(name = 'success_rate_v6', con=conn, index=False, if_exists='replace')
conn.close()

## Failures

In [45]:
# Failures = all measurements with errors (EXCLUDING stall events)

conn = sqlite3.connect('../data/netflix-data.db')
failures = pd.read_sql_query('select unit_id, dtime, address, \
                              error, error_msg \
                              from netflix \
                              where error not like "%STALL%"', con=conn, parse_dates=['dtime'])
conn.close()

failures = failures[~failures['unit_id'].isin(faulty_probes)]
failures.drop_duplicates(inplace=True)

In [46]:
failures

Unnamed: 0,unit_id,dtime,address,error,error_msg
48,62712,2016-07-22 00:34:37,,DNS_RESOLUTION_API_ERROR,curl_easy_perform: Couldn't resolve host 'api-...
49,62712,2016-07-22 01:35:44,,DNS_RESOLUTION_API_ERROR,curl_easy_perform: Couldn't resolve host 'api-...
50,62712,2016-07-22 02:33:51,,CONNECTION_API_ERROR,curl_easy_perform: Failed to connect to 2a01:5...
51,62712,2016-07-22 02:34:01,198.38.117.146,NO_ERROR,
52,62712,2016-07-22 03:34:30,,CONNECTION_API_ERROR,curl_easy_perform: Failed to connect to 2a01:5...
53,62712,2016-07-22 03:34:41,198.38.117.146,NO_ERROR,
54,62712,2016-07-22 04:32:58,,CONNECTION_API_ERROR,curl_easy_perform: Failed to connect to 2a01:5...
55,62712,2016-07-22 04:33:09,198.38.117.153,NO_ERROR,
56,62712,2016-07-22 05:34:19,,CONNECTION_API_ERROR,curl_easy_perform: Failed to connect to 2a01:5...
57,62712,2016-07-22 05:34:30,198.38.116.153,NO_ERROR,


## Failure Overview

In [47]:
failures['error'].value_counts()

NO_ERROR                        2133118
CONNECTION_API_ERROR             150732
NETWORK_API_ERROR                 67290
NETWORK_CONTENT_ERROR             43358
DNS_RESOLUTION_API_ERROR           7626
DNS_RESOLUTION_CONTENT_ERROR       6673
CONNECTION_CONTENT_ERROR           6063
Name: error, dtype: int64

In [48]:
failures['error'].value_counts(normalize=True)

NO_ERROR                        0.883330
CONNECTION_API_ERROR            0.062419
NETWORK_API_ERROR               0.027865
NETWORK_CONTENT_ERROR           0.017955
DNS_RESOLUTION_API_ERROR        0.003158
DNS_RESOLUTION_CONTENT_ERROR    0.002763
CONNECTION_CONTENT_ERROR        0.002511
Name: error, dtype: float64

## IPv6 Preference

In [49]:
# Happy Eyeballs

he_timer = 250000  # 250~ms

def preferred_version(row):
    if (row['connect_time_v6'] < he_timer) or (row['connect_time_v6']-he_timer <= row['connect_time_v4']):
        return 'v6'
    else:
        return 'v4'

In [50]:
# determine whether v4 or v6 would have been preferred based on Happy Eyeballs algorithm
netflix_merged['pref_version'] = netflix_merged.apply(preferred_version, axis=1)
netflix_merged

Unnamed: 0,unit_id,dtime,address_v4,connect_time_v4,bytes_sec_v4,address_v6,connect_time_v6,bytes_sec_v6,delta_connect_time,delta_bytes_sec,pref_version
0,123256,2016-07-22 00:00:00,198.38.119.140,20515,10371717,2a00:86c0:119:119::145,16803,10674265,3712,-302548,v6
1,123256,2016-07-22 02:00:00,198.38.118.140,20570,10751181,2a00:86c0:119:119::142,17350,10958991,3220,-207810,v6
2,123256,2016-07-22 04:00:00,198.38.119.149,15614,10388970,2a00:86c0:116:116::145,18747,9191069,-3133,1197901,v6
3,123256,2016-07-22 10:00:00,198.38.119.149,18534,9844976,2a00:86c0:116:116::148,25121,8356098,-6587,1488878,v6
4,123256,2016-07-22 13:00:00,198.38.117.153,23514,8260807,2a00:86c0:118:118::143,14076,10857188,9438,-2596381,v6
5,123256,2016-07-22 14:00:00,198.38.118.141,19945,10729881,2a00:86c0:118:118::149,14320,10756554,5625,-26673,v6
6,123256,2016-07-22 15:00:00,198.38.118.141,18305,8935313,2a00:86c0:119:119::142,17494,10573729,811,-1638416,v6
7,123256,2016-07-22 17:00:00,198.38.119.145,21400,10431608,2a00:86c0:116:116::148,22151,9316159,-751,1115449,v6
8,123256,2016-07-22 19:00:00,198.38.118.143,20945,10121108,2a00:86c0:119:119::148,14430,10861696,6515,-740588,v6
9,123256,2016-07-22 23:00:00,198.38.116.148,25450,8773578,2a00:86c0:119:119::146,18686,10890393,6764,-2116815,v6


In [51]:
netflix_merged['pref_version'].value_counts()

v6    963925
v4      5583
Name: pref_version, dtype: int64

In [52]:
# splitting dtime into separate columns for more flexibility
netflix_merged['year'] = netflix_merged['dtime'].dt.year 
netflix_merged['month'] = netflix_merged['dtime'].dt.month
netflix_merged['day'] = netflix_merged['dtime'].dt.day
netflix_merged['hour'] = netflix_merged['dtime'].dt.hour

### Considering Local Time of Probe

In [53]:
local_time = netflix_merged.copy()
probe_tz = pd.read_csv('../metadata/probes-timezones.csv', sep=';')

In [54]:
probe_tz.head()

Unnamed: 0,unit_id,msmid,location,timezone
0,62712,28f784568afa11e2b54500e08147c934,Bremen,Europe/Berlin
1,123256,bdf8bc9a8af611e2a74900e08147c934,Munich,Europe/Berlin
2,148644,635cd9468afc11e2901a00e08147c934,Braunschweig,Europe/Berlin
3,148650,915c1bf6b4a311e2a0d300e08147c934,Oldenburg,Europe/Berlin
4,167808,28ea168ace7111e28ebf00e08147c934,London,Europe/London


In [55]:
# join with location; interpret original time as UTC
local_time = local_time.merge(probe_tz, on=['unit_id'])
local_time['dtime'] = local_time['dtime'].dt.tz_localize('utc')

In [56]:
local_time.head()

Unnamed: 0,unit_id,dtime,address_v4,connect_time_v4,bytes_sec_v4,address_v6,connect_time_v6,bytes_sec_v6,delta_connect_time,delta_bytes_sec,pref_version,year,month,day,hour,msmid,location,timezone
0,123256,2016-07-22 00:00:00+00:00,198.38.119.140,20515,10371717,2a00:86c0:119:119::145,16803,10674265,3712,-302548,v6,2016,7,22,0,bdf8bc9a8af611e2a74900e08147c934,Munich,Europe/Berlin
1,123256,2016-07-22 02:00:00+00:00,198.38.118.140,20570,10751181,2a00:86c0:119:119::142,17350,10958991,3220,-207810,v6,2016,7,22,2,bdf8bc9a8af611e2a74900e08147c934,Munich,Europe/Berlin
2,123256,2016-07-22 04:00:00+00:00,198.38.119.149,15614,10388970,2a00:86c0:116:116::145,18747,9191069,-3133,1197901,v6,2016,7,22,4,bdf8bc9a8af611e2a74900e08147c934,Munich,Europe/Berlin
3,123256,2016-07-22 10:00:00+00:00,198.38.119.149,18534,9844976,2a00:86c0:116:116::148,25121,8356098,-6587,1488878,v6,2016,7,22,10,bdf8bc9a8af611e2a74900e08147c934,Munich,Europe/Berlin
4,123256,2016-07-22 13:00:00+00:00,198.38.117.153,23514,8260807,2a00:86c0:118:118::143,14076,10857188,9438,-2596381,v6,2016,7,22,13,bdf8bc9a8af611e2a74900e08147c934,Munich,Europe/Berlin


In [57]:
# convert from UTC to local time, based on location
def utc_to_local(time, zone):
    return time.tz_convert(zone)

In [58]:
local_time['local_time'] = local_time.apply(lambda x: utc_to_local(x['dtime'], x['timezone']), axis=1)

In [59]:
# determine local hour
local_time['local_hour'] = local_time['local_time'].apply(lambda x: x.hour)

In [60]:
local_time[['dtime', 'local_time', 'local_hour']].head()

Unnamed: 0,dtime,local_time,local_hour
0,2016-07-22 00:00:00+00:00,2016-07-22 02:00:00+02:00,2
1,2016-07-22 02:00:00+00:00,2016-07-22 04:00:00+02:00,4
2,2016-07-22 04:00:00+00:00,2016-07-22 06:00:00+02:00,6
3,2016-07-22 10:00:00+00:00,2016-07-22 12:00:00+02:00,12
4,2016-07-22 13:00:00+00:00,2016-07-22 15:00:00+02:00,15


In [61]:
# group by local probe and local hour to determine frequency of v6 preference

hourly_pref_grouped = local_time.groupby(['unit_id', 'local_hour'])
hourly_pref = pd.DataFrame(columns=['unit_id', 'local_hour', 'v6_pref'])

for name, grp in hourly_pref_grouped:
    pref = ((grp['pref_version'].values == 'v6').sum()*1.0)/len(grp)
    date = grp['local_hour'].iloc[0]
    unit_id = grp['unit_id'].iloc[0]
    
    hourly_pref = hourly_pref.append({'unit_id' : unit_id, 'local_hour' : date, 'v6_pref' : pref},
                                     ignore_index=True)
hourly_pref

Unnamed: 0,unit_id,local_hour,v6_pref
0,62712.0,0.0,1.000000
1,62712.0,1.0,1.000000
2,62712.0,2.0,1.000000
3,62712.0,3.0,1.000000
4,62712.0,4.0,1.000000
5,62712.0,5.0,1.000000
6,62712.0,6.0,0.998485
7,62712.0,7.0,1.000000
8,62712.0,8.0,0.989346
9,62712.0,9.0,0.998494


In [62]:
hourly_pref['unit_id'] = hourly_pref['unit_id'].astype(int)
hourly_pref['local_hour'] = hourly_pref['local_hour'].astype(int)

In [63]:
conn = sqlite3.connect('../data/netflix-data-aggregated.db')
conn.text_factory = str
hourly_pref.to_sql(name = 'v6_pref_probe_hour', con=conn, index=False, if_exists='replace')
conn.close()

# `traceroute` to Netflix servers

In [64]:
# only take completed traceroute measurements into account
conn = sqlite3.connect('../data/netflix-data.db')
traceroute_df = pd.read_sql_query('select unit_id, dtime, \
                                   destination, \
                                   ttl, \
                                   rtt \
                                   from traceroute \
                                   where status == "COMPLETED" and endpoint == destination',
                                   con=conn, parse_dates=['dtime'])
conn.close()

In [65]:
# pre-process data as above
traceroute_df = traceroute_df[~traceroute_df['unit_id'].isin(faulty_probes)]
traceroute_df.drop_duplicates(inplace=True)
traceroute_df['dtime'] = traceroute_df['dtime'].dt.floor('h')
traceroute_df['rtt'] = traceroute_df['rtt'].astype(float)

In [66]:
traceroute_df

Unnamed: 0,unit_id,dtime,destination,ttl,rtt
42,62712,2016-07-22 02:00:00,198.38.117.146,8,8.810
43,62712,2016-07-22 03:00:00,198.38.117.146,8,8.252
44,62712,2016-07-22 04:00:00,198.38.117.153,8,7.883
45,62712,2016-07-22 05:00:00,198.38.116.153,8,8.001
46,62712,2016-07-22 06:00:00,198.38.117.153,8,7.934
47,62712,2016-07-22 07:00:00,198.38.116.148,8,8.147
48,62712,2016-07-22 09:00:00,198.38.116.148,8,7.985
49,62712,2016-07-22 10:00:00,198.38.117.147,8,7.877
50,62712,2016-07-22 11:00:00,198.38.117.152,8,8.029
51,62712,2016-07-22 12:00:00,198.38.117.149,8,8.697


In [67]:
# split by v4 and v6
traceroute_v4 = traceroute_df[~traceroute_df['destination'].str.contains(':')]
traceroute_v6 = traceroute_df[traceroute_df['destination'].str.contains(':')]

In [68]:
traceroute_v4

Unnamed: 0,unit_id,dtime,destination,ttl,rtt
42,62712,2016-07-22 02:00:00,198.38.117.146,8,8.810
43,62712,2016-07-22 03:00:00,198.38.117.146,8,8.252
44,62712,2016-07-22 04:00:00,198.38.117.153,8,7.883
45,62712,2016-07-22 05:00:00,198.38.116.153,8,8.001
46,62712,2016-07-22 06:00:00,198.38.117.153,8,7.934
47,62712,2016-07-22 07:00:00,198.38.116.148,8,8.147
48,62712,2016-07-22 09:00:00,198.38.116.148,8,7.985
49,62712,2016-07-22 10:00:00,198.38.117.147,8,7.877
50,62712,2016-07-22 11:00:00,198.38.117.152,8,8.029
51,62712,2016-07-22 12:00:00,198.38.117.149,8,8.697


In [69]:
traceroute_v6

Unnamed: 0,unit_id,dtime,destination,ttl,rtt
63,123256,2016-07-22 00:00:00,2a00:86c0:119:119::145,9,14.370
66,123256,2016-07-22 02:00:00,2a00:86c0:119:119::142,9,15.826
69,123256,2016-07-22 04:00:00,2a00:86c0:116:116::145,11,20.654
70,123256,2016-07-22 05:00:00,2a00:86c0:117:117::141,11,18.596
71,123256,2016-07-22 10:00:00,2a00:86c0:116:116::148,11,23.037
75,123256,2016-07-22 13:00:00,2a00:86c0:118:118::143,9,17.366
77,123256,2016-07-22 14:00:00,2a00:86c0:118:118::149,9,22.139
79,123256,2016-07-22 15:00:00,2a00:86c0:119:119::142,9,17.588
81,123256,2016-07-22 17:00:00,2a00:86c0:116:116::148,11,24.586
82,123256,2016-07-22 19:00:00,2a00:86c0:119:119::148,9,19.373


In [70]:
# join v4 and v6 pairs to calculate deltas
traceroute_merged = traceroute_v4.merge(traceroute_v6, on=['unit_id', 'dtime'], suffixes=('_v4', '_v6'))
traceroute_merged

Unnamed: 0,unit_id,dtime,destination_v4,ttl_v4,rtt_v4,destination_v6,ttl_v6,rtt_v6
0,123256,2016-07-22 00:00:00,198.38.119.140,8,16.989,2a00:86c0:119:119::145,9,14.370
1,123256,2016-07-22 02:00:00,198.38.118.140,8,16.136,2a00:86c0:119:119::142,9,15.826
2,123256,2016-07-22 04:00:00,198.38.119.149,8,22.194,2a00:86c0:116:116::145,11,20.654
3,123256,2016-07-22 10:00:00,198.38.119.149,8,20.222,2a00:86c0:116:116::148,11,23.037
4,123256,2016-07-22 13:00:00,198.38.117.153,10,23.100,2a00:86c0:118:118::143,9,17.366
5,123256,2016-07-22 14:00:00,198.38.118.141,8,20.608,2a00:86c0:118:118::149,9,22.139
6,123256,2016-07-22 15:00:00,198.38.118.141,8,17.799,2a00:86c0:119:119::142,9,17.588
7,123256,2016-07-22 17:00:00,198.38.119.145,8,20.275,2a00:86c0:116:116::148,11,24.586
8,123256,2016-07-22 19:00:00,198.38.118.143,8,19.567,2a00:86c0:119:119::148,9,19.373
9,123256,2016-07-22 23:00:00,198.38.116.148,10,23.248,2a00:86c0:119:119::146,9,16.535


In [71]:
traceroute_merged['delta_ttl'] = traceroute_merged['ttl_v4'] - traceroute_merged['ttl_v6']
traceroute_merged['delta_rtt'] = traceroute_merged['rtt_v4'] - traceroute_merged['rtt_v6']
traceroute_merged

Unnamed: 0,unit_id,dtime,destination_v4,ttl_v4,rtt_v4,destination_v6,ttl_v6,rtt_v6,delta_ttl,delta_rtt
0,123256,2016-07-22 00:00:00,198.38.119.140,8,16.989,2a00:86c0:119:119::145,9,14.370,-1,2.619
1,123256,2016-07-22 02:00:00,198.38.118.140,8,16.136,2a00:86c0:119:119::142,9,15.826,-1,0.310
2,123256,2016-07-22 04:00:00,198.38.119.149,8,22.194,2a00:86c0:116:116::145,11,20.654,-3,1.540
3,123256,2016-07-22 10:00:00,198.38.119.149,8,20.222,2a00:86c0:116:116::148,11,23.037,-3,-2.815
4,123256,2016-07-22 13:00:00,198.38.117.153,10,23.100,2a00:86c0:118:118::143,9,17.366,1,5.734
5,123256,2016-07-22 14:00:00,198.38.118.141,8,20.608,2a00:86c0:118:118::149,9,22.139,-1,-1.531
6,123256,2016-07-22 15:00:00,198.38.118.141,8,17.799,2a00:86c0:119:119::142,9,17.588,-1,0.211
7,123256,2016-07-22 17:00:00,198.38.119.145,8,20.275,2a00:86c0:116:116::148,11,24.586,-3,-4.311
8,123256,2016-07-22 19:00:00,198.38.118.143,8,19.567,2a00:86c0:119:119::148,9,19.373,-1,0.194
9,123256,2016-07-22 23:00:00,198.38.116.148,10,23.248,2a00:86c0:119:119::146,9,16.535,1,6.713


In [72]:
conn = sqlite3.connect('../data/netflix-data-aggregated.db')
conn.text_factory = str
traceroute_merged.to_sql(name = 'traceroute', con=conn, index=False, if_exists='replace')
conn.close()

## Including meta data information

In [73]:
traceroute_meta = traceroute_merged.merge(probe_as, on=['unit_id'])
traceroute_meta = traceroute_meta.merge(endpoints, left_on='destination_v4', right_on='ip')
traceroute_meta = traceroute_meta.merge(endpoints, left_on='destination_v6', right_on='ip', suffixes=('_v4', '_v6'))

In [74]:
traceroute_meta.rename(columns={'asn_v4' : 'dst_asn_v4',
                                'holder_v4' : 'dst_holder_v4',
                                'asn_v6' : 'dst_asn_v6',
                                'holder_v6' : 'dst_holder_v6'}, inplace=True)

In [75]:
traceroute_meta = traceroute_meta[['unit_id', 'dtime',
                                   'src_asn_v4', 'src_holder_v4', 'destination_v4', 'dst_asn_v4', 'dst_holder_v4', 'ttl_v4', 'rtt_v4',
                                   'src_asn_v6', 'src_holder_v6', 'destination_v6', 'dst_asn_v6', 'dst_holder_v6', 'ttl_v6', 'rtt_v6', 
                                   'delta_ttl', 'delta_rtt']
                                 ]

In [76]:
traceroute_meta

Unnamed: 0,unit_id,dtime,src_asn_v4,src_holder_v4,destination_v4,dst_asn_v4,dst_holder_v4,ttl_v4,rtt_v4,src_asn_v6,src_holder_v6,destination_v6,dst_asn_v6,dst_holder_v6,ttl_v6,rtt_v6,delta_ttl,delta_rtt
0,123256,2016-07-22 00:00:00,8767,MNET-AS - M-net Telekommunikations GmbH,198.38.119.140,2906,AS-SSI - Netflix Streaming Services Inc.,8,16.989,8767,MNET-AS - M-net Telekommunikations GmbH,2a00:86c0:119:119::145,2906,AS-SSI - Netflix Streaming Services Inc.,9,14.370,-1,2.619
1,123256,2016-07-25 00:00:00,8767,MNET-AS - M-net Telekommunikations GmbH,198.38.119.140,2906,AS-SSI - Netflix Streaming Services Inc.,8,13.484,8767,MNET-AS - M-net Telekommunikations GmbH,2a00:86c0:119:119::145,2906,AS-SSI - Netflix Streaming Services Inc.,9,14.786,-1,-1.302
2,123256,2016-12-15 23:00:00,8767,MNET-AS - M-net Telekommunikations GmbH,198.38.119.140,2906,AS-SSI - Netflix Streaming Services Inc.,8,15.452,8767,MNET-AS - M-net Telekommunikations GmbH,2a00:86c0:119:119::145,2906,AS-SSI - Netflix Streaming Services Inc.,9,13.122,-1,2.330
3,148650,2016-10-04 18:00:00,3320,DTAG - Deutsche Telekom AG,198.38.119.140,2906,AS-SSI - Netflix Streaming Services Inc.,6,26.841,3320,DTAG - Deutsche Telekom AG,2a00:86c0:119:119::145,2906,AS-SSI - Netflix Streaming Services Inc.,6,25.611,0,1.230
4,950210,2018-03-08 20:00:00,3320,DTAG - Deutsche Telekom AG,198.38.119.140,2906,AS-SSI - Netflix Streaming Services Inc.,5,24.924,3320,DTAG - Deutsche Telekom AG,2a00:86c0:119:119::145,2906,AS-SSI - Netflix Streaming Services Inc.,5,25.451,0,-0.527
5,201338,2018-05-31 19:00:00,680,DFN - Verein zur Foerderung eines Deutschen Fo...,198.38.119.140,2906,AS-SSI - Netflix Streaming Services Inc.,7,16.885,680,DFN - Verein zur Foerderung eines Deutschen Fo...,2a00:86c0:119:119::145,2906,AS-SSI - Netflix Streaming Services Inc.,6,7.933,1,8.952
6,201338,2018-07-18 12:00:00,680,DFN - Verein zur Foerderung eines Deutschen Fo...,198.38.119.140,2906,AS-SSI - Netflix Streaming Services Inc.,7,7.692,680,DFN - Verein zur Foerderung eines Deutschen Fo...,2a00:86c0:119:119::145,2906,AS-SSI - Netflix Streaming Services Inc.,6,7.840,1,-0.148
7,148650,2016-08-17 01:00:00,3320,DTAG - Deutsche Telekom AG,198.38.118.140,2906,AS-SSI - Netflix Streaming Services Inc.,6,24.253,3320,DTAG - Deutsche Telekom AG,2a00:86c0:119:119::145,2906,AS-SSI - Netflix Streaming Services Inc.,6,24.878,0,-0.625
8,148650,2016-08-17 01:00:00,3320,DTAG - Deutsche Telekom AG,198.38.118.140,2906,AS-SSI - Netflix Streaming Services Inc.,6,24.709,3320,DTAG - Deutsche Telekom AG,2a00:86c0:119:119::145,2906,AS-SSI - Netflix Streaming Services Inc.,6,24.878,0,-0.169
9,148650,2016-08-30 09:00:00,3320,DTAG - Deutsche Telekom AG,198.38.118.140,2906,AS-SSI - Netflix Streaming Services Inc.,6,24.719,3320,DTAG - Deutsche Telekom AG,2a00:86c0:119:119::145,2906,AS-SSI - Netflix Streaming Services Inc.,6,26.519,0,-1.800


In [77]:
conn = sqlite3.connect('../data/netflix-data-aggregated.db')
conn.text_factory = str
traceroute_meta.to_sql(name = 'traceroute_meta', con=conn, index=False, if_exists='replace')
conn.close()

# MLab measurements

Only used for rough comparison of throughput (MLab speedtest vs Netflix); data over v6 not fully complete; pairs not fully complementary

In [78]:
# load throughput measurements performed toward MLab servers

conn = sqlite3.connect('../data/mlab-data.db')
speedtest_v4 = pd.read_sql_query('select unit_id, dtime, address, \
                                  bytes_sec \
                                  from httpgetmt \
                                  where successes != 0',
                                  con=conn, parse_dates=['dtime'])

speedtest_v6 = pd.read_sql_query('select unit_id, dtime, address, \
                                  bytes_sec \
                                  from httpgetmt6 \
                                  where successes != 0',
                                  con=conn, parse_dates=['dtime'])
conn.close()

In [79]:
speedtest_v4['dtime'] = speedtest_v4['dtime'].dt.floor('h')
speedtest_v6['dtime'] = speedtest_v6['dtime'].dt.floor('h')

In [80]:
speedtest_df = speedtest_v4.merge(speedtest_v6, on=['unit_id', 'dtime'], suffixes=('_v4', '_v6'))

In [81]:
speedtest_df['delta_bytes_sec'] = speedtest_df['bytes_sec_v4'] - speedtest_df['bytes_sec_v6']

In [82]:
speedtest_df

Unnamed: 0,unit_id,dtime,address_v4,bytes_sec_v4,address_v6,bytes_sec_v6,delta_bytes_sec
0,19602,2014-09-05 12:00:00,80.239.168.226,167469,2001:2030:0:1a::226,191698,-24229
1,19602,2014-09-05 18:00:00,80.239.168.226,195406,2001:2030:0:1a::226,185876,9530
2,19602,2014-09-05 19:00:00,80.239.168.226,167903,2001:2030:0:1a::226,188858,-20955
3,19602,2014-09-05 20:00:00,80.239.168.226,195262,2001:2030:0:1a::226,192124,3138
4,19602,2014-09-05 21:00:00,80.239.168.226,161566,2001:2030:0:1a::226,190988,-29422
5,19602,2014-09-05 22:00:00,80.239.168.226,195117,2001:2030:0:1a::226,193260,1857
6,62712,2014-09-05 18:00:00,213.244.128.162,11598048,2001:4c08:2003:2::162,11532943,65105
7,62712,2014-09-05 19:00:00,213.244.128.162,11607398,2001:4c08:2003:2::162,11257064,350334
8,62712,2014-09-05 20:00:00,213.244.128.162,11638454,2001:4c08:2003:2::162,11296059,342395
9,62712,2014-09-05 21:00:00,213.244.128.162,11645509,2001:4c08:2003:2::162,11407647,237862


In [83]:
conn = sqlite3.connect('../data/mlab-data-aggregated.db')
speedtest_df.to_sql(name = 'speedtest', con=conn, index=False, if_exists='replace')
conn.close()