In [5]:
import pandas as pd
import numpy as np
import sqlite3
import csv
from datetime import timedelta, date, datetime
import json
from ripe.atlas.cousteau import Measurement

In [2]:
# Load the complete `data` table of the raw measurement dump into memory.
db = sqlite3.connect("data/ripe-dns-all-probes-with-abuf.db")
try:
    df = pd.read_sql_query("SELECT * from data", db)
finally:
    # Release the file handle even when the query fails.
    db.close()

In [4]:
# Rows for which no target name was recorded (empty string).
df.loc[df['target_name'].eq('')]

Unnamed: 0,msm_id,prb_id,timestamp,resolver_address,address_family,rt,error,local_resolver,abuf,target_name,rcode,response_type,response_address,ttl
4,27186736,10092,1600188634,2001:4860:4860::8888,4,,"{""socket"": ""connect failed Network is unreacha...",True,,,,,,
5,27186736,10092,1600188634,78.90.221.193,4,,"{""timeout"": 5000}",True,,,,,,
10,27186736,10101,1600188633,2001:b88:1002::10,6,,"{""timeout"": 5000}",True,,,,,,
13,27186736,10134,1600188636,10.0.12.148,4,,"{""timeout"": 5000}",True,,,,,,
87,27186736,11108,1600188642,2001:1418:10:2::2,6,,"{""timeout"": 5000}",True,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13251908,27187241,10099,1601312239,2a07:a8c0::,6,26.223,"{""timeout"": 5000}",False,"error, no abuf",,,,,
13251952,27187241,28907,1601312234,2a07:a8c0::,6,10.568,"{""timeout"": 5000}",False,"error, no abuf",,,,,
13251964,27187241,32123,1601312234,2a07:a8c0::,6,20.429,"{""timeout"": 5000}",False,"error, no abuf",,,,,
13251974,27187241,35600,1601312239,2a07:a8c0::,6,9.277,"{""timeout"": 5000}",False,"error, no abuf",,,,,


In [10]:
# Build a lookup table msm_id -> target name. The name is read from the
# description of each measurement, fetched with one Measurement(id=...) call
# per id listed in the CSV file.
msm_ids_location = 'metadata/msm_id_list.csv'
measurements = pd.read_csv(msm_ids_location, header=None)[0].unique()

# Position of the target name among the space-separated words of the description.
TARGET_NAME_WORD_INDEX = 5

msm_target_records = []
for msm in measurements:
    measurement = Measurement(id=msm)
    descr = measurement.description.split(" ")
    if len(descr) <= TARGET_NAME_WORD_INDEX:
        # Fail with the offending id instead of an anonymous IndexError.
        raise ValueError(
            "description of measurement %s has fewer than %d words: %r"
            % (msm, TARGET_NAME_WORD_INDEX + 1, measurement.description)
        )
    msm_target_records.append((msm, descr[TARGET_NAME_WORD_INDEX]))

# Create the frame in one go rather than enlarging it row by row with .loc.
# The columns are kept as object dtype, matching the frame the row-wise
# construction produced, so the later merge yields the same dtypes.
df_msm_id_to_target_name = pd.DataFrame(
    msm_target_records, columns=['msm_id', 'target_name_meta']
).astype(object)
    


In [12]:
# Attach the target name taken from the measurement metadata to every result row.
# - Dropping an already merged `target_name_meta` column keeps the cell re-runnable;
#   without it a second run would produce target_name_meta_x / target_name_meta_y.
# - validate='many_to_one' raises if the lookup table ever contains a msm_id twice,
#   which would otherwise silently duplicate result rows.
df = pd.merge(
    df.drop(columns=['target_name_meta'], errors='ignore'),
    df_msm_id_to_target_name,
    on='msm_id',
    how='left',
    validate='many_to_one',
)

In [13]:
# Resolver IP address -> name of the resolver service.
# IPv4 addresses come first, followed by the IPv6 addresses in the same service order.
resolvers_dict = {
    # IPv4
    '185.228.168.168': 'CleanBrowsing',
    '1.1.1.1': 'Cloudflare',
    '8.8.8.8': 'Google',
    '208.67.222.123': 'OpenDNS',
    '185.121.177.177': 'OpenNIC',
    '9.9.9.9': 'Quad9',
    '64.6.64.6': 'VeriSign',
    '77.88.8.8': 'Yandex',
    '156.154.70.1': 'Neustar UltraRecursive',
    '45.90.28.0': 'NextDNS',
    # IPv6
    '2a0d:2a00:1::1': 'CleanBrowsing',
    '2606:4700:4700::1111': 'Cloudflare',
    '2001:4860:4860::8888': 'Google',
    '2620:0:ccc::2': 'OpenDNS',
    '2a05:dfc7:5::5353': 'OpenNIC',
    '2620:fe::9': 'Quad9',
    '2620:74:1b::1:1': 'VeriSign',
    '2a02:6b8::feed:ff': 'Yandex',
    '2610:a1:1018::1': 'Neustar UltraRecursive',
    '2a07:a8c0::': 'NextDNS',
}

In [14]:
# Normalise the column types of the result frame.

# Epoch seconds -> datetime64.
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')

# Drop a single trailing dot from the queried name, then lower-case it.
df['target_name'] = np.where(df['target_name'].str.endswith('.'), df['target_name'].str[:-1], df['target_name'])
df['target_name'] = df['target_name'].str.lower()

df['address_family'] = df['address_family'].astype(int)

# Empty strings stand for missing measurements; turn them into NaN before casting.
df['rt'] = df['rt'].replace({'' : np.nan}).astype(float)
df['ttl'] = df['ttl'].replace({'' : np.nan}).astype(float)

# Turn the textual 'True'/'False' flag into a boolean column.
# Comparing an already boolean column with the string 'True' is False for every
# row, so the conversion is skipped once the column is boolean (e.g. on a re-run).
if not pd.api.types.is_bool_dtype(df['local_resolver']):
    df['local_resolver'] = df['local_resolver'] == 'True'

In [16]:
# Rows whose metadata-derived target name is an empty string.
df.loc[df['target_name_meta'].eq('')]

Unnamed: 0,msm_id,prb_id,timestamp,resolver_address,address_family,rt,error,local_resolver,abuf,target_name,rcode,response_type,response_address,ttl,target_name_meta


In [18]:
# For rows without a recorded target name, count how often each metadata target name occurs.
df.loc[df['target_name'].eq(''), 'target_name_meta'].value_counts()

$r.google.com                       30162
www.t-mobile.com                    23885
www.youtube.com                     23561
www.protect-your-privacy-now.com    23052
www.publicinvasion.com              22897
www.libero.it                       22813
www.ifccenter.com                   22791
www.tohotheater.jp                  22777
www.uol.com.br                      22774
www.txxx.com                        22708
www.chaturbate.com                  22615
www.microsoft.com                   22578
www.thedailystar.net                22544
www.live.com                        22538
www.apple.com                       22522
www.cnn.com                         22511
www.znak.com                        22511
www.office.com                      22492
www.fbcdn.net                       22489
www.wikia.com                       22449
www.instagram.com                   22273
www.google.com                      22237
www.wikipedia.org                   22211
Name: target_name_meta, dtype: int64

In [22]:
# Inspect the current state of df; the rendered table is truncated to its first and last rows.
df

Unnamed: 0,msm_id,prb_id,timestamp,resolver_address,address_family,rt,error,local_resolver,abuf,target_name,rcode,response_type,response_address,ttl,target_name_meta
0,27186736,10001,2020-09-15 16:50:40,fd00::eadf:70ff:fea3:7a26,6,20.312,,True,vFGBgwABAAAAAQAAEGQyZGFlNzhiMTk4NTMxNjUGZ29vZ2...,d2dae78b19853165.google.com,NXDOMAIN,,,,$r.google.com
1,27186736,10001,2020-09-15 16:50:40,192.168.178.1,4,15.416,,True,ti2BgwABAAAAAQAAEGRmNDgyYjYwY2JkNjA0M2UGZ29vZ2...,df482b60cbd6043e.google.com,NXDOMAIN,,,,$r.google.com
2,27186736,10006,2020-09-15 16:50:37,8.8.8.8,4,29.835,,True,hEKBgwABAAAAAQAAEGZlM2M3YzAxN2RhMzYwZWUGZ29vZ2...,fe3c7c017da360ee.google.com,NXDOMAIN,,,,$r.google.com
3,27186736,10022,2020-09-15 16:50:44,10.71.6.12,4,10.700,,True,JK2BgwABAAAAAQAAEDdkZTIzYTQwZjQ0ZjQxMTcGZ29vZ2...,7de23a40f44f4117.google.com,NXDOMAIN,,,,$r.google.com
4,27186736,10092,2020-09-15 16:50:34,2001:4860:4860::8888,4,,"{""socket"": ""connect failed Network is unreacha...",True,,,,,,,$r.google.com
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13251991,27187241,54984,2020-09-28 16:57:09,2a07:a8c0::,6,27.065,,False,BWeAgAABAAAADQAAA3d3dwR6bmFrA2NvbQAAHAABwBUAAg...,www.znak.com,NOERROR,,,,www.znak.com
13251992,27187241,55590,2020-09-28 16:57:25,2a07:a8c0::,6,36.250,,False,BE+AgAABAAAADQAAA3d3dwR6bmFrA2NvbQAAHAABwBUAAg...,www.znak.com,NOERROR,,,,www.znak.com
13251993,27187241,55660,2020-09-28 16:57:21,2a07:a8c0::,6,222.830,,False,WwGAgAABAAAADQAAA3d3dwR6bmFrA2NvbQAAHAABwBUAAg...,www.znak.com,NOERROR,,,,www.znak.com
13251994,27187241,55692,2020-09-28 16:57:11,2a07:a8c0::,6,14.162,,False,ffiAgAABAAAAAgAEA3d3dwR6bmFrA2NvbQAAHAABwBAAAg...,www.znak.com,NOERROR,,,,www.znak.com


In [29]:
def get_address_family(ip_addr_tmp):
    """Return the IP version (4 or 6) of a textual address.

    The address is not validated: any string that contains a colon is
    classified as IPv6, everything else -- including the empty string -- as 4.

    The colon may be at any position, so addresses that start with one
    (for example '::1' or '::') are classified as IPv6 as well.

    Parameters:
        ip_addr_tmp: the address as a string.

    Returns:
        6 if the string contains ':', otherwise 4.
    """
    return 6 if ':' in ip_addr_tmp else 4


In [30]:
# Address family of the returned address. Only one column is needed, so map
# over that Series directly; DataFrame.apply(axis=1) would build a full row
# object for every row of the frame just to read a single field.
# Rows with an empty response_address are classified as 4 by get_address_family.
df['response_af'] = df['response_address'].map(get_address_family)

In [49]:
# Target names with at least one non-empty answer whose address family
# differs from the address_family recorded for the row.
df.loc[
    df['address_family'].ne(df['response_af']) & df['response_address'].ne(''),
    'target_name_meta',
].unique()

array(['$r.google.com', 'www.microsoft.com', 'www.apple.com',
       'www.uol.com.br', 'www.libero.it',
       'www.protect-your-privacy-now.com', 'www.publicinvasion.com',
       'www.txxx.com', 'www.chaturbate.com', 'www.instagram.com',
       'www.fbcdn.net', 'www.wikia.com', 'www.cnn.com', 'www.google.com',
       'www.youtube.com', 'www.t-mobile.com', 'www.thedailystar.net',
       'www.tohotheater.jp', 'www.ifccenter.com', 'www.live.com',
       'www.office.com', 'www.wikipedia.org', 'www.znak.com'],
      dtype=object)

In [53]:
# Number of distinct probes with at least one non-empty answer whose address
# family differs from the address_family recorded for the row.
df.loc[
    df['address_family'].ne(df['response_af']) & df['response_address'].ne(''),
    'prb_id',
].nunique()

1147

In [56]:
# Rows recorded with address_family 4 that carry a non-empty answer of a different family.
df.loc[
    df['address_family'].ne(df['response_af'])
    & df['response_address'].ne('')
    & df['address_family'].eq(4)
]

Unnamed: 0,msm_id,prb_id,timestamp,resolver_address,address_family,rt,error,local_resolver,abuf,target_name,rcode,response_type,response_address,ttl,target_name_meta,response_af,resolver_af
1111746,27186769,10568,2020-09-15 16:50:54,192.168.5.1,4,0.589,,True,paSBgAABAAgAAAAAA3d3dwltaWNyb3NvZnQDY29tAAAcAA...,www.microsoft.com,NOERROR,AAAA,2600:1408:24:488:0:0:0:356e,18.0,www.microsoft.com,6,4
1111747,27186769,11513,2020-09-15 16:50:42,96.76.225.35,4,13.792,,True,iDqBgAABAAUACAAFA3d3dwltaWNyb3NvZnQDY29tAAAcAA...,www.microsoft.com,NOERROR,AAAA,2600:1407:f800:488:0:0:0:356e,20.0,www.microsoft.com,6,4
1111748,27186769,12014,2020-09-15 16:50:54,192.168.33.1,4,56.392,,True,FZOBgAABAAYAAAAAA3d3dwltaWNyb3NvZnQDY29tAAAcAA...,www.microsoft.com,NOERROR,AAAA,2a02:26f0:6b:5b6:0:0:0:356e,4.0,www.microsoft.com,6,4
1111750,27186769,12020,2020-09-15 16:50:40,91.121.161.184,4,31.303,,True,WO6BgAABAAUAAAAAA3d3dwltaWNyb3NvZnQDY29tAAAcAA...,www.microsoft.com,NOERROR,AAAA,2a02:26f0:1700:493:0:0:0:356e,20.0,www.microsoft.com,6,4
1111753,27186769,12922,2020-09-15 16:50:41,10.0.2.1,4,0.593,,True,rByBgAABAAgAAAAAA3d3dwltaWNyb3NvZnQDY29tAAAcAA...,www.microsoft.com,NOERROR,AAAA,2a02:26f0:1200:2a9:0:0:0:356e,7.0,www.microsoft.com,6,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13106851,27187231,54704,2020-09-28 16:57:19,192.168.0.3,4,121.663,,True,1AGBgAABAAIAAAAAA3d3dwR6bmFrA2NvbQAAHAABwAwABQ...,www.znak.com,NOERROR,AAAA,2001:41d0:303:aed2:7a6e:616b:0:138,3600.0,www.znak.com,6,4
13106853,27187231,54879,2020-09-28 16:57:50,192.168.1.1,4,1.432,,True,+wOBgAABAAIAAAAAA3d3dwR6bmFrA2NvbQAAHAABwAwABQ...,www.znak.com,NOERROR,AAAA,2001:41d0:303:aed2:7a6e:616b:0:138,3587.0,www.znak.com,6,4
13106854,27187231,55364,2020-09-28 16:57:36,1.1.1.1,4,18.073,,True,poKBgAABAAIAAAAAA3d3dwR6bmFrA2NvbQAAHAABwAwABQ...,www.znak.com,NOERROR,AAAA,2001:41d0:303:aed2:7a6e:616b:0:138,3327.0,www.znak.com,6,4
13106856,27187231,55364,2020-09-28 16:57:36,1.0.0.1,4,18.852,,True,UxKBgAABAAIAAAAAA3d3dwR6bmFrA2NvbQAAHAABwAwABQ...,www.znak.com,NOERROR,AAAA,2001:41d0:303:aed2:7a6e:616b:0:138,3326.0,www.znak.com,6,4


In [32]:
# Address family of the resolver address. Only one column is needed, so map
# over that Series directly; DataFrame.apply(axis=1) would build a full row
# object for every row of the frame just to read a single field.
df['resolver_af'] = df['resolver_address'].map(get_address_family)

In [33]:
# Rows where the recorded address_family disagrees with the family of the resolver address.
df.loc[df['address_family'].ne(df['resolver_af'])]

Unnamed: 0,msm_id,prb_id,timestamp,resolver_address,address_family,rt,error,local_resolver,abuf,target_name,rcode,response_type,response_address,ttl,target_name_meta,response_af,resolver_af
4,27186736,10092,2020-09-15 16:50:34,2001:4860:4860::8888,4,,"{""socket"": ""connect failed Network is unreacha...",True,,,,,,,$r.google.com,4,6
128,27186736,11702,2020-09-15 16:50:32,2a02:1812:2c13:c530:280f:c6ff:fef3:727b,4,,"{""socket"": ""connect failed Network is unreacha...",True,,,,,,,$r.google.com,4,6
163,27186736,11978,2020-09-15 16:50:39,2401:4800::123:108:224:6,4,,"{""socket"": ""connect failed Network is unreacha...",True,,,,,,,$r.google.com,4,6
193,27186736,12151,2020-09-15 16:50:40,2606:4700:4700::1111,4,,"{""socket"": ""connect failed Network is unreacha...",True,,,,,,,$r.google.com,4,6
285,27186736,13213,2020-09-15 16:50:35,2001:4860:4860::8888,4,,"{""socket"": ""connect failed Network is unreacha...",True,,,,,,,$r.google.com,4,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13103446,27187231,29359,2020-09-27 16:54:06,8.8.8.8,6,,"{""socket"": ""connect failed Network is unreacha...",True,,,,,,,www.znak.com,4,4
13103914,27187231,51703,2020-09-27 16:55:25,2001:470:e347:1::1,4,,"{""socket"": ""connect failed Permission denied""}",True,,,,,,,www.znak.com,4,6
13105620,27187231,29359,2020-09-28 16:54:15,8.8.8.8,6,,"{""socket"": ""connect failed Network is unreacha...",True,,,,,,,www.znak.com,4,4
13106091,27187231,51703,2020-09-28 16:55:16,2001:470:e347:1::1,4,,"{""socket"": ""connect failed Permission denied""}",True,,,,,,,www.znak.com,4,6


In [40]:
# Family mismatches between address_family and the resolver address, limited to
# rows where local_resolver is False.
df.loc[
    df['address_family'].ne(df['resolver_af'])
    & df['local_resolver'].eq(False)
]

Unnamed: 0,msm_id,prb_id,timestamp,resolver_address,address_family,rt,error,local_resolver,abuf,target_name,rcode,response_type,response_address,ttl,target_name_meta,response_af,resolver_af
470515,27186748,26736,2020-10-04 16:50:59,2606:4700:4700::1111,4,13.279,"{""socket"": ""connect failed Network is unreacha...",False,"error, no abuf",,,,,,$r.google.com,4,6
471005,27186748,19835,2020-10-04 16:51:08,2606:4700:4700::1111,4,23.074,"{""socket"": ""connect failed Network is unreacha...",False,"error, no abuf",,,,,,$r.google.com,4,6
471530,27186748,26736,2020-10-05 16:50:59,2606:4700:4700::1111,4,10.303,"{""socket"": ""connect failed Network is unreacha...",False,"error, no abuf",,,,,,$r.google.com,4,6
473080,27186748,26736,2020-10-06 16:51:01,2606:4700:4700::1111,4,26.537,"{""socket"": ""connect failed Network is unreacha...",False,"error, no abuf",,,,,,$r.google.com,4,6
497716,27186749,19835,2020-10-04 16:51:08,2620:fe::9,4,22.422,"{""socket"": ""connect failed Network is unreacha...",False,"error, no abuf",,,,,,$r.google.com,4,6
497760,27186749,26736,2020-10-04 16:51:03,2620:fe::9,4,38.743,"{""socket"": ""connect failed Network is unreacha...",False,"error, no abuf",,,,,,$r.google.com,4,6
498779,27186749,26736,2020-10-05 16:51:01,2620:fe::9,4,33.496,"{""socket"": ""connect failed Network is unreacha...",False,"error, no abuf",,,,,,$r.google.com,4,6
499802,27186749,26736,2020-10-06 16:51:01,2620:fe::9,4,33.292,"{""socket"": ""connect failed Network is unreacha...",False,"error, no abuf",,,,,,$r.google.com,4,6
519312,27186750,21566,2020-09-29 16:51:16,2a0d:2a00:1::1,4,20.927,"{""socket"": ""connect failed Network is unreacha...",False,"error, no abuf",,,,,,$r.google.com,4,6
523872,27186750,19835,2020-10-04 16:50:47,2a0d:2a00:1::1,4,47.241,"{""socket"": ""connect failed Network is unreacha...",False,"error, no abuf",,,,,,$r.google.com,4,6


In [44]:
# Family mismatches for rows with local_resolver True that recorded no error.
df.loc[
    df['address_family'].ne(df['resolver_af'])
    & df['local_resolver'].eq(True)
    & df['error'].eq('')
]

Unnamed: 0,msm_id,prb_id,timestamp,resolver_address,address_family,rt,error,local_resolver,abuf,target_name,rcode,response_type,response_address,ttl,target_name_meta,response_af,resolver_af


In [46]:
# Family mismatches for rows with local_resolver False that recorded no error.
df.loc[
    df['address_family'].ne(df['resolver_af'])
    & df['local_resolver'].eq(False)
    & df['error'].eq('')
]

Unnamed: 0,msm_id,prb_id,timestamp,resolver_address,address_family,rt,error,local_resolver,abuf,target_name,rcode,response_type,response_address,ttl,target_name_meta,response_af,resolver_af


In [34]:
# List the dtype of every column of df.
df.dtypes

msm_id                      object
prb_id                       int64
timestamp           datetime64[ns]
resolver_address            object
address_family               int64
rt                         float64
error                       object
local_resolver              object
abuf                        object
target_name                 object
rcode                       object
response_type               object
response_address            object
ttl                        float64
target_name_meta            object
response_af                  int64
resolver_af                  int64
dtype: object

In [35]:
# Turn the textual 'True'/'False' flag into a boolean column.
# The same comparison is also part of the earlier type-normalisation cell, so
# on a top-to-bottom run the column is already boolean here. Comparing a
# boolean column with the string 'True' would set every row to False, hence
# the conversion only runs while the column is not boolean yet.
if not pd.api.types.is_bool_dtype(df['local_resolver']):
    df['local_resolver'] = df['local_resolver'] == 'True'

In [36]:
# List the column dtypes again to check the local_resolver conversion above.
df.dtypes

msm_id                      object
prb_id                       int64
timestamp           datetime64[ns]
resolver_address            object
address_family               int64
rt                         float64
error                       object
local_resolver                bool
abuf                        object
target_name                 object
rcode                       object
response_type               object
response_address            object
ttl                        float64
target_name_meta            object
response_af                  int64
resolver_af                  int64
dtype: object

In [37]:
# Persist the enriched frame as table `data` in a new SQLite file,
# replacing the table if it already exists.
conn = sqlite3.connect('data/dns.db')
try:
    df.to_sql('data', con=conn, index=False, if_exists='replace')
finally:
    # Release the file handle even when writing fails.
    conn.close()