In [None]:
import json
from adblockparser import AdblockRules
import utilities
from openwpm_utils import domain as du
from tqdm.notebook import tqdm

from multiprocessing import Pool as ThreadPool
import multiprocessing

In [None]:
# Root of the project checkout; every path below is built from it.
base_directory = 'khaleesi/'
data_directory = base_directory + 'data/'
ground_truth_directory = base_directory + 'ground_truth/'

# Replace * with the HTTP or JS request chains file name below.
# Input: the crawl's request chains. Output: the same chains with labels added.
json_representation_dir = data_directory + 'crawl-*.json'
json_representation_dir_labeled = data_directory + 'crawl-*-labeled.json'

# Filter lists used as ground truth.
easylist_dir = ground_truth_directory + 'easylist.txt'
easyprivacy_dir = ground_truth_directory + 'easyprivacy.txt'

In [None]:
# Load the EasyList (el_rules) and EasyPrivacy (ep_rules) filter lists from the
# paths configured in easylist_dir / easyprivacy_dir.
# NOTE(review): presumably each call returns a list of rule lines with the
# trailing newline removed (going by the helper's name) -- confirm in utilities.
el_rules = utilities.read_file_newline_stripped(easylist_dir)
ep_rules = utilities.read_file_newline_stripped(easyprivacy_dir)

## Initialize EasyList (EL) and EasyPrivacy (EP) rules

In [None]:
# max_mem value (1 GiB) passed to every EasyList AdblockRules instance.
_EL_MAX_MEM = 1024 * 1024 * 1024


def _build_el_rules(*options):
    """Build an EasyList matcher supporting `options` plus 'domain' and 'subdocument'.

    One matcher is built per option combination; the lookup code later picks
    the matcher whose supported options match the options it passes in.
    """
    return AdblockRules(
        el_rules,
        use_re2=True,
        max_mem=_EL_MAX_MEM,
        supported_options=list(options) + ['domain', 'subdocument'],
        skip_unsupported_rules=False,
    )


# Per content type: a first-party variant and a third-party variant.
adblock_el_rules_script = _build_el_rules('script')
adblock_el_rules_script_third = _build_el_rules('third-party', 'script')

adblock_el_rules_image = _build_el_rules('image')
adblock_el_rules_image_third = _build_el_rules('third-party', 'image')

adblock_el_rules_css = _build_el_rules('stylesheet')
adblock_el_rules_css_third = _build_el_rules('third-party', 'stylesheet')

adblock_el_rules_xmlhttp = _build_el_rules('xmlhttprequest')
adblock_el_rules_xmlhttp_third = _build_el_rules('third-party', 'xmlhttprequest')

# Fallbacks for every other resource type.
adblock_el_rules_third = _build_el_rules('third-party')
adblock_el_rules_domain = _build_el_rules()

In [None]:
# max_mem value (1 GiB) passed to every EasyPrivacy AdblockRules instance.
_EP_MAX_MEM = 1024 * 1024 * 1024


def _build_ep_rules(*options):
    """Build an EasyPrivacy matcher supporting `options` plus 'domain' and 'subdocument'.

    One matcher is built per option combination; the lookup code later picks
    the matcher whose supported options match the options it passes in.
    """
    return AdblockRules(
        ep_rules,
        use_re2=True,
        max_mem=_EP_MAX_MEM,
        supported_options=list(options) + ['domain', 'subdocument'],
        skip_unsupported_rules=False,
    )


# Per content type: a first-party variant and a third-party variant.
adblock_ep_rules_script = _build_ep_rules('script')
adblock_ep_rules_script_third = _build_ep_rules('third-party', 'script')

adblock_ep_rules_image = _build_ep_rules('image')
adblock_ep_rules_image_third = _build_ep_rules('third-party', 'image')

adblock_ep_rules_css = _build_ep_rules('stylesheet')
adblock_ep_rules_css_third = _build_ep_rules('third-party', 'stylesheet')

adblock_ep_rules_xmlhttp = _build_ep_rules('xmlhttprequest')
adblock_ep_rules_xmlhttp_third = _build_ep_rules('third-party', 'xmlhttprequest')

# Fallbacks for every other resource type.
adblock_ep_rules_third = _build_ep_rules('third-party')
adblock_ep_rules_domain = _build_ep_rules()

### Helper functions

In [None]:
# Resource types that have a dedicated rule set, mapped to the adblock
# content-type option they are matched under. Any other resource type
# (including 'sub_frame') falls back to the generic rule sets.
_TYPE_OPTION_BY_RESOURCE = {
    'script': 'script',
    'image': 'image',
    'imageset': 'image',
    'stylesheet': 'stylesheet',
    'xmlhttprequest': 'xmlhttprequest',
}


def _should_block(rule_sets, top_level_url, current_url, resource_type):
    """Select the rule set for one request and ask it whether to block.

    Args:
        rule_sets: dict mapping ``(type_option or None, is_third_party)`` to
            the rule-set object to query for that combination.
        top_level_url: URL of the page that issued the request.
        current_url: URL of the request being checked.
        resource_type: resource type string of the request.

    Returns:
        Whatever ``should_block`` of the selected rule set returns.

    Raises:
        Any error from ``du.get_ps_plus_1`` or ``should_block``; the public
        wrappers decide what to swallow.
    """
    domain_top_level = du.get_ps_plus_1(top_level_url)
    # A request is third-party when its domain differs from the page's.
    is_third_party = domain_top_level != du.get_ps_plus_1(current_url)
    type_option = _TYPE_OPTION_BY_RESOURCE.get(resource_type)

    # Pass exactly the option keys the selected rule set was built to support.
    options = {
        'domain': domain_top_level,
        'subdocument': resource_type == 'sub_frame',
    }
    if type_option is not None:
        options[type_option] = True
    if is_third_party:
        options['third-party'] = True

    return rule_sets[(type_option, is_third_party)].should_block(current_url, options)


def match_url_el(top_level_url, current_url, resource_type):
    """Return whether the EasyList rules block ``current_url``.

    Args:
        top_level_url: URL of the page that issued the request.
        current_url: URL of the request being checked.
        resource_type: resource type string of the request.

    Returns:
        The ``should_block`` result of the matching EasyList rule set, or
        ``False`` if domain extraction or rule matching raises an ordinary
        exception (best-effort labeling).
    """
    try:
        # Built inside the function so the module-level rule sets are looked
        # up at call time, as in a notebook where cells may be re-run.
        rule_sets = {
            ('script', False): adblock_el_rules_script,
            ('script', True): adblock_el_rules_script_third,
            ('image', False): adblock_el_rules_image,
            ('image', True): adblock_el_rules_image_third,
            ('stylesheet', False): adblock_el_rules_css,
            ('stylesheet', True): adblock_el_rules_css_third,
            ('xmlhttprequest', False): adblock_el_rules_xmlhttp,
            ('xmlhttprequest', True): adblock_el_rules_xmlhttp_third,
            (None, False): adblock_el_rules_domain,
            (None, True): adblock_el_rules_third,
        }
        return _should_block(rule_sets, top_level_url, current_url, resource_type)
    except Exception:
        # Not a bare ``except``: KeyboardInterrupt/SystemExit must propagate
        # so a running worker can still be interrupted.
        return False


def match_url_ep(top_level_url, current_url, resource_type):
    """Return whether the EasyPrivacy rules block ``current_url``.

    Args:
        top_level_url: URL of the page that issued the request.
        current_url: URL of the request being checked.
        resource_type: resource type string of the request.

    Returns:
        The ``should_block`` result of the matching EasyPrivacy rule set, or
        ``False`` if domain extraction or rule matching raises an ordinary
        exception (best-effort labeling).
    """
    try:
        # Built inside the function so the module-level rule sets are looked
        # up at call time, as in a notebook where cells may be re-run.
        rule_sets = {
            ('script', False): adblock_ep_rules_script,
            ('script', True): adblock_ep_rules_script_third,
            ('image', False): adblock_ep_rules_image,
            ('image', True): adblock_ep_rules_image_third,
            ('stylesheet', False): adblock_ep_rules_css,
            ('stylesheet', True): adblock_ep_rules_css_third,
            ('xmlhttprequest', False): adblock_ep_rules_xmlhttp,
            ('xmlhttprequest', True): adblock_ep_rules_xmlhttp_third,
            (None, False): adblock_ep_rules_domain,
            (None, True): adblock_ep_rules_third,
        }
        return _should_block(rule_sets, top_level_url, current_url, resource_type)
    except Exception:
        # Not a bare ``except``: KeyboardInterrupt/SystemExit must propagate
        # so a running worker can still be interrupted.
        return False

In [None]:
# Load the crawl's request chains. Later cells iterate this as a mapping whose
# values expose 'top_url' and a 'content' list of requests (each with 'url',
# 'resource_type' and 'redirect_id').
json_representation = utilities.read_json(json_representation_dir)

## Label ads/trackers

In [None]:
def match_with_fl(item):
    """Label one serialized request against the EasyPrivacy and EasyList rules.

    Args:
        item: string of the form
            ``<key>|-|-|<top_url>|-|-|<current_url>|-|-|<resource_type>``.

    Returns:
        ``'<key>|True'`` if either filter list blocks the request,
        ``'<key>|False'`` otherwise, or the sentinel ``'INVALID'`` when the
        item cannot be parsed or matching raises an ordinary exception.
    """
    try:
        # Only the first four fields are used; fewer than four raises here
        # and is reported as 'INVALID'.
        key, top_url, current_url, resource_type = item.split('|-|-|')[:4]

        # EasyPrivacy is consulted first; EasyList only if it did not match.
        is_ad_or_tracker = bool(
            match_url_ep(top_url, current_url, resource_type)
            or match_url_el(top_url, current_url, resource_type)
        )
        return key + '|' + str(is_ad_or_tracker)
    except Exception:
        # Not a bare ``except``: KeyboardInterrupt/SystemExit must propagate
        # so a running worker can still be interrupted.
        return 'INVALID'

In [None]:
# Flatten every request of every chain into one string per request:
#   <key>|<redirect_id>|-|-|<top_url>|-|-|<url>|-|-|<resource_type>
# so each request can be handed to a worker as a single value.
key_map = []
for key, chain in json_representation.items():
    top_url = chain['top_url']
    if top_url is None:
        # A missing top-level URL is serialized as an empty string.
        top_url = ''

    for request in chain['content']:
        request_id = key + '|' + str(request['redirect_id'])
        key_map.append('|-|-|'.join((
            request_id,
            top_url,
            request['url'],
            request['resource_type'],
        )))

In [None]:
# Number of CPUs to leave free for the rest of the system.
cpu_to_relax = 1

# Pool rejects a process count below 1, which cpu_count() - cpu_to_relax
# reaches on a single-core machine; always keep at least one worker.
worker_count = max(1, multiprocessing.cpu_count() - cpu_to_relax)

# Despite the alias, ThreadPool is multiprocessing.Pool (a process pool).
pool = ThreadPool(processes=worker_count)
try:
    # map() preserves input order: results[i] is the label for key_map[i].
    results = pool.map(match_with_fl, key_map)
finally:
    # Release the workers even if map() raised.
    pool.close()
    pool.join()

In [None]:
# Start every request chain with a negative chain-level label; the labeling
# results are applied on top of this default afterwards.
for chain in json_representation.values():
    chain['ground_truth'] = False

In [None]:
# Write the labels back onto the request chains.
# Each well-formed result has the form '<key>|<redirect_id>|<True/False>'.
for r_item in results:
    # match_with_fl returns the bare sentinel 'INVALID' for items it could not
    # process. It carries no key or redirect id, so there is nothing to label;
    # trying to parse it would raise IndexError and abort the whole pass.
    if r_item == 'INVALID':
        continue

    # Split from the right so a key that itself contains '|' stays intact.
    key_and_redirect, label_text = r_item.rsplit('|', 1)
    key, redirect_id = key_and_redirect.rsplit('|', 1)
    label = label_text == 'True'
    target_redirect_id = int(redirect_id)

    chain = json_representation[key]
    # NOTE(review): the chain-level label is overwritten by every result, so it
    # ends up as the label of the last request processed for this chain --
    # confirm this is intended rather than "any request matched".
    chain['ground_truth'] = label

    for request in chain['content']:
        if int(request['redirect_id']) == target_redirect_id:
            request['ground_truth'] = label

In [None]:
# Persist the request chains, now carrying 'ground_truth' labels, to the
# "-labeled" output path.
utilities.write_json(json_representation_dir_labeled, json_representation)