*This notebook reads the reverse-DNS entries from the `sk_one_off_extensions` table, searches them for sample keywords that identify DSL, cable and fibre deployments, and writes a temporary file `temp.sql` containing UPDATE queries that can be run against the `sk_one_off_extensions` table.*
- - - 

In [1]:
import ipaddress
import re
import sqlite3
import time

import pandas as pd
import requests

In [2]:
# Path of the SQLite database file opened by this notebook.
DB_LOCATION = 'lastmile.db'
# Names of the tables that the SQL queries in this notebook read from.
SK_ONE_OFF_EXTENSIONS = 'sk_one_off_extensions'
SK_ASN = 'sk_asn'
METADATA_SNAPSHOT_EXTENSIONS = 'metadata_snapshot_extensions'

In [3]:
# Keywords searched for among the tokens of a reverse-DNS name, grouped by
# the access technology they indicate.  The classification loop stops at the
# first technology whose keywords match, so the entry order is significant.
d_fav_rdns = {
    'CABLE': ['hsd1', 'cable', 'hfc'],
    'DSL': ['adsl', 'dsl', 'dslb', 'xdsl', 'luxdsl'],
    'FIBRE': ['fios', 'ftth', 'fiber7', 'fttb'],
}

In [4]:
# Open the SQLite database; `con` is the connection handed to pd.read_sql.
# NOTE(review): the connection is never closed in the visible notebook.
con = sqlite3.connect(DB_LOCATION)

In [5]:
# Load the holder name of every row of the ASN table into `df1`.  The table
# name is interpolated from a notebook constant, not from external input.
query = '''SELECT    asn_holder_name
           FROM      %s
        '''%(SK_ASN)

df1 = pd.read_sql(query, con)

In [6]:
# Preview the first rows of the holder-name frame.
df1.head()

Unnamed: 0,asn_holder_name
0,"COGENT-174 - Cogent Communications,US"
1,"CW Cable and Wireless Worldwide plc,GB"
2,DEMON-INTERNET Now maintained by Cable & Wirel...
3,"BT-UK-AS BT Public Internet Service,GB"
4,"NTL Virgin Media Limited,GB"


In [7]:
# Flatten the single-column frame into a plain list of holder names.
holder_names = [record[0] for record in df1.values]

In [8]:
# Join every probe's reverse-DNS entry with the access type recorded for its
# ASN.  `access_technology` is selected as an empty string so that the
# keyword classification further down can fill it in.
# NOTE(review): the implicit join yields one row per matching
# (probe, snapshot, asn) combination -- confirm probeid is unique in the
# snapshot table, otherwise probes appear more than once.
query = '''SELECT    ai.probeid, a.asn_access_type, oe.reverse_dns, '' as access_technology
           FROM      %s as oe, %s as ai, %s as a
           WHERE     oe.probeid = ai.probeid
           AND       ai.asn =  a.asn
        '''%(SK_ONE_OFF_EXTENSIONS, METADATA_SNAPSHOT_EXTENSIONS, SK_ASN)

df = pd.read_sql(query, con)

In [9]:
# Preview the joined probe / reverse-DNS frame.
df.head()

Unnamed: 0,probeid,asn_access_type,reverse_dns,access_technology
0,14190,,178-78-87-102.static.kc.net.uk,
1,14266,,95.145.135.27,
2,14851,DSL,host86-153-18-233.range86-153.btcentralplus.com,
3,14880,CABLE,97e619d1.skybroadband.com,
4,14905,DSL,host31-53-231-115.range31-53.btcentralplus.com,


In [10]:
def create_set_of_tlds(zone_text=None,
                       url='http://www.internic.net/domain/root.zone',
                       timeout=30):
    """Return the set of owner names listed in a DNS root zone file.

    The first whitespace-separated field of every record is taken and its
    trailing dots are removed, so ``com.`` becomes ``com``.  The root label
    ``.`` therefore collapses to the empty string, which is kept in the
    result.  Any other owner name present in the file is included unchanged.

    Parameters
    ----------
    zone_text : str, optional
        Zone-file text to parse.  When omitted, the text is downloaded
        from ``url``.
    url : str, optional
        Location of the root zone file; only used when ``zone_text`` is None.
    timeout : float, optional
        Seconds to wait for the download before giving up.

    Returns
    -------
    set of str
        The distinct owner names with trailing dots stripped.

    Raises
    ------
    requests.RequestException
        If the download fails or the server answers with an error status.
    """
    if zone_text is None:
        # Let download errors propagate: continuing without the zone data
        # would only fail later with an unrelated-looking error.
        res = requests.get(url, timeout=timeout)
        res.raise_for_status()
        zone_text = res.text

    tld_set = set()
    for line in zone_text.splitlines():
        fields = line.split()
        # Skip blank lines and zone-file comments (which start with ';').
        if not fields or fields[0].startswith(';'):
            continue
        tld_set.add(fields[0].rstrip('.'))

    return tld_set

In [11]:
# Build the name set once; it is used below to drop TLD tokens from the
# reverse-DNS word lists.
tlds = create_set_of_tlds()

.


In [12]:
def split_rdns_by_word(rdns):
    """Split a reverse-DNS name into its '.'- and '-'-separated tokens.

    Parameters
    ----------
    rdns : str
        Reverse-DNS name, e.g. ``'host86-153.range86.example.com'``.

    Returns
    -------
    list of str
        The tokens in their original order.  An empty string yields
        ``['']``; a value that is not text (such as ``None`` or NaN)
        yields an empty list.
    """
    try:
        # Raw string: '.' and '-' are both literal inside the character class.
        return re.split(r'[.-]', rdns)
    except TypeError:
        # Missing values are not text and therefore carry no tokens.
        return []
# Tokenise every reverse-DNS name and keep the token list next to its probe.
df['rdns_list'] = df['reverse_dns'].apply(split_rdns_by_word)

In [13]:
# Index the probes by the tokens of their reverse-DNS name: `d` maps each
# token to the list of probe ids whose name contains it.  Tokens found in
# `tlds` and purely numeric tokens are ignored.
d = {}
for probeid, word_list in zip(df['probeid'], df['rdns_list']):
    for word in word_list:
        if word in tlds:
            continue
        try:
            int(word)
        except ValueError:
            # Not a number, so the token is worth counting.
            d.setdefault(word, []).append(probeid)

In [14]:
from collections import OrderedDict

# Rank the tokens from the most to the fewest matching probes.
pairs_by_count = sorted(d.items(), key=lambda pair: len(pair[1]), reverse=True)
ordered_d = OrderedDict(pairs_by_count)

In [21]:
# List every token that occurs in the reverse-DNS name of at least 100 probes.
for word, probe_ids in ordered_d.items():
    probe_count = len(probe_ids)
    if probe_count < 100:
        continue
    print('%s: %d'%(word, probe_count))

btcentralplus: 529
virginm: 422
cable: 417
host86: 314
range86: 314
skybroadband: 176
as13285: 161
dyn: 133


In [22]:
# Tag each probe with the first access technology in `d_fav_rdns` that has a
# keyword among the probe's reverse-DNS tokens; probes without a match keep
# their current `access_technology` value.
# The keyword sets do not depend on the row, so build them once up front.
keyword_sets = [(technology, set(keywords))
                for technology, keywords in d_fav_rdns.items()]

for i, rdns_list in zip(df.index, df['rdns_list']):
    rdns_set = set(rdns_list)
    for technology, keywords in keyword_sets:
        if not keywords.isdisjoint(rdns_set):
            # `.loc` replaces the `.ix` indexer, which pandas has removed.
            df.loc[i, 'access_technology'] = technology
            break

In [23]:
# Spot-check the first rows after the keyword classification.
df.head()

Unnamed: 0,probeid,asn_access_type,reverse_dns,access_technology,rdns_list
0,14190,,178-78-87-102.static.kc.net.uk,,"[178, 78, 87, 102, static, kc, net, uk]"
1,14266,,95.145.135.27,,"[95, 145, 135, 27]"
2,14851,DSL,host86-153-18-233.range86-153.btcentralplus.com,,"[host86, 153, 18, 233, range86, 153, btcentral..."
3,14880,CABLE,97e619d1.skybroadband.com,,"[97e619d1, skybroadband, com]"
4,14905,DSL,host31-53-231-115.range31-53.btcentralplus.com,,"[host31, 53, 231, 115, range31, 53, btcentralp..."


In [24]:
# Only `probeid` and `access_technology` are needed from here on; discard
# the helper columns.
df = df.drop(['asn_access_type', 'reverse_dns', 'rdns_list'], axis=1)

In [26]:
# Inspect the last rows of the trimmed frame.
df.tail()

Unnamed: 0,probeid,access_technology
2204,584164,
2205,584166,
2206,589646,CABLE
2207,601700,CABLE
2208,608580,


In [28]:
# Write one UPDATE statement per classified probe to temp.sql, wrapped in a
# single transaction.  Probes with an empty `access_technology` are skipped.
# The `with` block closes the file even if a row fails to format.
with open('temp.sql', 'w') as fsock:
    fsock.write('BEGIN TRANSACTION;\n')

    for probeid, at in zip(df['probeid'], df['access_technology']):
        if at == '':
            continue
        # SQL string literals take single quotes (double quotes denote
        # identifiers); an embedded single quote is escaped by doubling it.
        query = "UPDATE %s SET access_type_technology = '%s' WHERE probeid = %d;" % (
            SK_ONE_OFF_EXTENSIONS, at.replace("'", "''"), probeid)
        fsock.write('%s\n' % query)

    fsock.write('END TRANSACTION;\n')