In [None]:
import csv
import json
import re
import urllib.parse as urlparse

import tldextract
from tqdm.notebook import tqdm

In [None]:
# Root folder of the project checkout; both paths below are built from it.
base_directory = 'khaleesi/'

# The `*` in the paths below is a placeholder: replace it with the HTTP or JS
# request chains file name before running the following cells.
json_chains_dir = ''.join((base_directory, 'data/crawl-*-labeled.json'))
features_dir = ''.join((base_directory, 'features/*.csv'))

In [None]:
# Load the labeled request chains into `data`.
# The encoding is given explicitly because JSON documents are UTF-8, while
# open() would otherwise fall back to the platform's default encoding.
with open(json_chains_dir, encoding='utf-8') as chains_file:
    data = json.load(chains_file)

In [None]:
# Keywords checked against each lower-cased URL with keyword_in_url_test(),
# i.e. they only count when they sit next to a non-alphanumeric character.
# Order and contents (including the repeated "pix") are kept as-is.
ad_keywords = [
    "click", "measurement", "measure", "promoted", "pagead", "hit",
    "banner", "2mdn", "adsystem", "adsense", "ptracking", "beacon",
    "openx", "aralego", "usermatch", "appnexus", "popunder", "punder",
    "metrics", "tpid", "pixel", "idsync", "uuid", "uid",
    "advertising", "adsync", "dspid", "dpid", "dpuuid", "tracking",
    "ad", "delivery", "pid", "id_sync", "pxl", "1x1",
    "px", "pix", "analytics", "csync", "cksync", "adserver",
    "bidder", "ads", "adform", "advert", "iframe", "googlead",
    "advertise", "track", "prebid", "bid", "zoneid", "siteid",
    "pageid", "viewid", "zone_id", "google_afc", "google_afs", "google_gid",
    "google_cver", "pix", "rtb", "ssp", "dsp", "dmt",
    "sync", "doubleclick", "match", "tid", "google_nid", "google_dbm",
    "google_cm", "google_sc",
]

# Keywords that count as a hit when they appear anywhere in the lower-cased
# URL (plain substring test, no boundary requirement).
ad_keywords_plain = {
    "pagead", "measure", "promote", "banner", "2mdn",
    "adsystem", "adsense", "beacon", "openx", "aralego",
    "usermatch", "metrics", "appnexus", "popunder", "punder",
    "tpid", "pixel", "uuid", "advertising", "dspid",
    "dpid", "dpuuid", "tracking", "adserver", "1x1",
    "analytics", "adform", "advert", "iframe", "googlead",
    "advertise", "track", "prebid", "zoneid", "siteid",
    "pageid", "viewid", "zone_id", "google_afc", "google_afs",
    "google_gid", "google_cver", "sync", "doubleclick", "match",
    "google_nid", "google_dbm", "google_cm", "google_sc",
}

def keyword_in_url_test(word, url):
    """Tell whether `word` appears in `url` next to a non-alphanumeric character.

    A hit requires the keyword to be either directly preceded or directly
    followed by a character outside ``[0-9a-zA-Z]``. A keyword surrounded by
    letters/digits on both sides (e.g. "ad" inside "roadmap") does not count.

    Args:
        word: Keyword to look for. It is matched literally: regex
            metacharacters in it are escaped.
        url: String to search. Matching is case-sensitive, so callers that
            want case-insensitive behavior must normalize the case first.

    Returns:
        True if the keyword is found with such a boundary, otherwise False.
    """
    # Escape the keyword so characters such as '.' or '+' cannot change the
    # meaning of the pattern.
    keyword = re.escape(word)
    boundary = r'[^0-9a-zA-Z]'
    pattern = re.compile(boundary + keyword + '|' + keyword + boundary)
    return pattern.search(url) is not None

# Canonical textual UUID layout: 8-4-4-4-12 hexadecimal digits.
_UUID_PATTERN = re.compile(
    r'[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}'
)


def has_uuid(url):
    """Tell whether `url` contains a UUID-shaped token.

    Only hexadecimal digits (either case) in the 8-4-4-4-12 hyphenated layout
    are accepted. The previous wildcard-based pattern accepted any characters
    in that layout, so ordinary dash-separated path segments were reported as
    UUIDs.

    Args:
        url: String to search.

    Returns:
        True if a UUID-shaped substring is present, otherwise False.
    """
    return _UUID_PATTERN.search(url) is not None

# Two to four digits, an 'x' or 'X', then two to four digits (e.g. "300x250").
_DIMENSIONS_PATTERN = re.compile(r'\d{2,4}[xX]\d{2,4}')


def dimensions_in_url(url):
    """Tell whether `url` contains a WIDTHxHEIGHT-style token such as "300x250".

    The earlier pattern doubled the backslashes inside a raw string, which
    made it look for a literal backslash followed by the letter "d" instead of
    digits, so dimension strings were never detected.

    Args:
        url: String to search.

    Returns:
        True if such a token is present, otherwise False.
    """
    return _DIMENSIONS_PATTERN.search(url) is not None

In [None]:
# Column order of the output CSV; the header and every row written below
# follow this single definition.
FEATURE_COLUMNS = [
    'identifier', 'length_of_url', 'request_method', 'response_status',
    'etag_in_header', 'p3p_in_header', 'has_subdomains',
    'subdomain_of_top_level_domain_check', 'resource_type', 'url_has_uuid',
    'url_has_dimensions', 'response_sets_cookie', 'third_party_domain',
    'num_non_alphanumeric_chars_in_query_string', 'top_domain_in_query_string',
    'num_request_cookies', 'semi_colons_in_url', 'response_type',
    'response_subtype', 'content_length', 'query_string_length',
    'keyword_in_url_re', 'keyword_in_url', 'redirect_to_new_domain',
    'length_of_chain', 'num_unique_domains', 'num_request_headers',
    'num_response_headers', 'target',
]

# (substring, code) pairs tested in order against the Content-Type value; the
# first substring found decides the code, and '0' is used when none matches.
RESPONSE_TYPE_CODES = (
    ('application', '1'), ('audio', '2'), ('image', '3'), ('text', '4'),
    ('video', '5'), ('font', '6'), ('model', '7'),
)
RESPONSE_SUBTYPE_CODES = (
    ('html', '1'), ('css', '2'), ('javascript', '3'), ('gif', '4'),
    ('png', '5'), ('jpeg', '6'), ('plain', '7'), ('json', '8'),
)


def _flag(condition):
    """Encode a truthy/falsy value as the '1'/'0' strings used in the CSV."""
    return '1' if condition else '0'


def _first_matching_code(text, code_table):
    """Return the code of the first substring in `code_table` found in `text`, else '0'."""
    for token, code in code_table:
        if token in text:
            return code
    return '0'


def _registered_domain(extracted):
    """Join the domain and suffix parts of a tldextract result with a dot."""
    return extracted.domain + '.' + extracted.suffix


def _full_hostname(extracted):
    """Join the subdomain, domain and suffix parts of a tldextract result with dots."""
    return extracted.subdomain + '.' + extracted.domain + '.' + extracted.suffix


# Maps each chain key to the set of domains seen so far in that chain.
domains = {}

# Both the output file and the progress bar are context-managed so they are
# closed even if a malformed record raises part-way through.
# newline='' follows the csv module's guidance for files handed to csv.writer;
# lineterminator='\n' keeps the line ending that the rows were written with before.
with open(features_dir, 'w', newline='', encoding='utf-8') as out, \
        tqdm(total=len(data), position=0, leave=True) as pbar:
    # csv.writer quotes any field that contains a comma or quote, so such a
    # value can no longer shift the columns of its row.
    writer = csv.writer(out, lineterminator='\n')
    writer.writerow(FEATURE_COLUMNS)

    for key in data:
        pbar.update(1)

        top_url = data[key]['top_url']
        if top_url is None:
            top_url = ''
        top_url_extracted = tldextract.extract(top_url)
        top_domain = _registered_domain(top_url_extracted)
        top_hostname = _full_hostname(top_url_extracted)

        chain = data[key]['content']
        for i, request in enumerate(chain):
            identifier = key + '|' + str(request['redirect_id'])
            url = request['url']
            url_lower = url.lower()  # computed once for all keyword checks
            url_extracted = tldextract.extract(url)
            domain = _registered_domain(url_extracted)
            hostname = _full_hostname(url_extracted)

            request_headers = request['request_headers']
            resource_type = request['resource_type']
            request_method = request['method']

            # Running count: distinct domains among requests 0..i of this chain.
            domains.setdefault(key, set()).add(domain)
            num_unique_domains = str(len(domains[key]))

            # 1-based position of this request within its chain.
            length_of_chain = str(i + 1)

            query_string = urlparse.urlparse(url).query
            query_string_length = str(len(query_string))
            num_non_alphanumeric_chars_in_query_string = str(
                len(re.findall(r'[^0-9a-zA-Z]', query_string)))

            # Same registered domain as the top-level page but a different host.
            subdomain_of_top_level_domain_check = _flag(
                top_domain == domain and top_hostname != hostname)
            top_domain_in_query_string = _flag(top_domain in query_string)
            semi_colons_in_url = _flag(';' in query_string)

            keyword_in_url_re = _flag(
                any(keyword_in_url_test(keyword, url_lower) for keyword in ad_keywords))
            keyword_in_url = _flag(
                any(keyword in url_lower for keyword in ad_keywords_plain))

            url_has_uuid = _flag(has_uuid(url))
            url_has_dimensions = _flag(dimensions_in_url(url))
            length_of_url = str(len(url))

            # An empty or plain 'www' subdomain does not count as a subdomain.
            has_subdomains = _flag(url_extracted.subdomain not in ('', 'www'))
            third_party_domain = _flag(top_domain != domain)

            num_request_headers = str(len(request_headers))

            # Headers are indexed as [0] = name, [1] = value. Cookies are
            # counted by the "; " separators in the Cookie header value.
            num_request_cookies = '0'
            for header in request_headers:
                if header[0].lower() == 'cookie':
                    try:
                        num_request_cookies = str(header[1].count("; ") + 1)
                    except Exception:
                        # Best effort: the header exists but its value could
                        # not be counted, so record a single cookie.
                        num_request_cookies = '1'

            target = str(request.get('ground_truth', False))

            # The first request of a chain has no preceding response, so its
            # response-derived features keep the '?' placeholder.
            num_response_headers = etag_in_header = response_sets_cookie = '?'
            p3p_in_header = content_length = response_type = '?'
            response_subtype = response_status = redirect_to_new_domain = '?'
            if i != 0:
                # Response features are read from the previous request in the
                # chain (index i - 1).
                previous = chain[i - 1]
                previous_domain = _registered_domain(
                    tldextract.extract(previous['url']))
                response_headers = previous['response_headers']
                response_status = str(previous['response_status'])
                num_response_headers = str(len(response_headers))
                redirect_to_new_domain = _flag(domain != previous_domain)

                response_type_field = ''
                etag_in_header = response_sets_cookie = '0'
                p3p_in_header = content_length = '0'
                for header in response_headers:
                    header_name = header[0].lower()
                    if header_name == 'content-type':
                        response_type_field = header[1]
                    elif header_name == 'content-length':
                        content_length = str(header[1])
                    elif header_name == 'etag':
                        etag_in_header = '1'
                    elif header_name == 'p3p':
                        p3p_in_header = '1'
                    elif header_name == 'set-cookie':
                        response_sets_cookie = '1'

                response_type = _first_matching_code(
                    response_type_field, RESPONSE_TYPE_CODES)
                response_subtype = _first_matching_code(
                    response_type_field, RESPONSE_SUBTYPE_CODES)

            # Field order must match FEATURE_COLUMNS.
            writer.writerow([
                identifier, length_of_url, request_method, response_status,
                etag_in_header, p3p_in_header, has_subdomains,
                subdomain_of_top_level_domain_check, resource_type, url_has_uuid,
                url_has_dimensions, response_sets_cookie, third_party_domain,
                num_non_alphanumeric_chars_in_query_string, top_domain_in_query_string,
                num_request_cookies, semi_colons_in_url, response_type,
                response_subtype, content_length, query_string_length,
                keyword_in_url_re, keyword_in_url, redirect_to_new_domain,
                length_of_chain, num_unique_domains, num_request_headers,
                num_response_headers, target,
            ])