In [1]:

import argparse
import json
from urllib.parse import urlparse
from tld import get_fld
from adblockparser import AdblockRules

In [2]:
# Code is from https://gist.github.com/tomatohater/8853161
def parse_har(harfile_path):
    """Extract the request URL and response MIME type of every HAR entry.

    Args:
        harfile_path: Path to a HAR (HTTP Archive) JSON file on disk.

    Returns:
        A list of ``(url, mimetype)`` tuples, one per item of
        ``log.entries`` and in the same order. ``mimetype`` is the string
        ``'unknown'`` when the entry's response content has no ``mimeType``
        key.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
        KeyError: If ``log.entries``, ``request.url`` or ``response.content``
            is missing from the document.
    """
    # HAR files are UTF-8 and may start with a byte-order mark; "utf-8-sig"
    # decodes both forms. The context manager guarantees the handle is closed
    # even when JSON parsing fails.
    with open(harfile_path, encoding="utf-8-sig") as harfile:
        harfile_json = json.load(harfile)

    urls = []
    for entry in harfile_json['log']['entries']:
        url = entry['request']['url']
        # Only the URL and MIME type are consumed by callers; the body size is
        # deliberately not read so entries without it still parse.
        mimetype = entry['response']['content'].get('mimeType', 'unknown')
        urls.append((url, mimetype))
    return urls

In [3]:
# Read the filter list one rule per line (trailing whitespace and the newline
# removed) and hand the rule strings to AdblockRules to build the matcher.
with open('./easylist.txt') as f:
    easylist = [line.rstrip() for line in f]
rules = AdblockRules(easylist)

In [11]:
# Tally, per first-level domain, how many requests recorded in the CNN HAR
# capture were made and how many of them the filter rules would block.
cnn_urls = parse_har('./www.cnn.com.har')

sites = {}
domain = 'www.cnn.com'  # page the requests belong to; identical for every URL

for url, mimetype in cnn_urls:
    is_script, is_image = 'script' in mimetype, 'image' in mimetype
    try:
        url_domain = get_fld(url)
    except Exception:
        # get_fld raised for this URL: report it and leave it out of the tally.
        print(f"invalid domain in {url}")
        continue

    options = {
        'script': is_script,
        'domain': domain,
        'image': is_image,
        'third-party': url_domain != 'cnn.com',
    }

    # Create the counters on first sight of a domain, then reuse them.
    counts = sites.setdefault(
        url_domain,
        {"number_requests": 0, "number_requests_blocked": 0},
    )
    counts["number_requests"] += 1
    if rules.should_block(url, options):
        counts["number_requests_blocked"] += 1

        
#print(f"CNN had {number_requests} total requests, and {number_requests_blocked} requests blocked")

invalid domain in https://todo/?google_gid=CAESECD2O5xN3IZm4NHYL1az6ek&google_cver=1&google_push=AeyLB5c6ogjcdHvaAWC4o_iMmkq40-60yoAsz03koX6iroBLlxgzqJZV31BTJ4nlpedPWGjVgN1j-xohR4NRYH1NO00fviA6Deim
{'cnn.com': {'number_requests': 69, 'number_requests_blocked': 5}, 'googletagservices.com': {'number_requests': 8, 'number_requests_blocked': 2}, 'amazon-adsystem.com': {'number_requests': 10, 'number_requests_blocked': 10}, 'criteo.net': {'number_requests': 3, 'number_requests_blocked': 2}, 'cookielaw.org': {'number_requests': 12, 'number_requests_blocked': 0}, 'outbrain.com': {'number_requests': 16, 'number_requests_blocked': 0}, 'optimizely.com': {'number_requests': 4, 'number_requests_blocked': 0}, 'doubleclick.net': {'number_requests': 44, 'number_requests_blocked': 44}, 'jsdelivr.net': {'number_requests': 1, 'number_requests_blocked': 1}, 'beemray.com': {'number_requests': 1, 'number_requests_blocked': 0}, 'ugdturner.com': {'number_requests': 1, 'number_requests_blocked': 0}, 'adsafepr

In [12]:
# Same tally for the Macy's HAR capture. The counters are added to the
# existing `sites` dict rather than a fresh one, so the totals combine both
# captures -- and re-running only this cell counts these requests again.
macys_urls = parse_har('./www.macys.com.har')

domain = 'www.macys.com'  # page the requests belong to; identical for every URL

for url, mimetype in macys_urls:
    is_script, is_image = 'script' in mimetype, 'image' in mimetype
    try:
        url_domain = get_fld(url)
    except Exception:
        # get_fld raised for this URL: report it and leave it out of the tally.
        print(f"invalid domain in {url}")
        continue

    options = {
        'script': is_script,
        'domain': domain,
        'image': is_image,
        'third-party': url_domain != 'macys.com',
    }

    # Create the counters on first sight of a domain, then reuse them.
    counts = sites.setdefault(
        url_domain,
        {"number_requests": 0, "number_requests_blocked": 0},
    )
    counts["number_requests"] += 1
    if rules.should_block(url, options):
        counts["number_requests_blocked"] += 1


In [13]:
# Print the tallies as comma-separated rows under a header line, one row per
# domain in the order the domains were first inserted into `sites`.
print("Site, # HTTP requests, # HTTP requests blocked")
for site, counts in sites.items():
    row = (site, counts['number_requests'], counts['number_requests_blocked'])
    print(*row, sep=", ")

Site, # HTTP requests, # HTTP requests blocked
cnn.com, 69, 5
googletagservices.com, 8, 2
amazon-adsystem.com, 10, 10
criteo.net, 6, 4
cookielaw.org, 12, 0
outbrain.com, 19, 0
optimizely.com, 4, 0
doubleclick.net, 48, 46
jsdelivr.net, 1, 1
beemray.com, 1, 0
ugdturner.com, 1, 0
adsafeprotected.com, 31, 31
indexww.com, 1, 1
krxd.net, 10, 0
chartbeat.com, 3, 0
bing.com, 5, 0
bounceexchange.com, 7, 0
ads-twitter.com, 2, 2
tru.am, 2, 0
boomtrain.com, 3, 0
segment.com, 2, 0
demdex.net, 9, 0
scorecardresearch.com, 8, 0
tree.com, 3, 0
cloudflare.com, 2, 0
google.com, 5, 4
bleacherreport.net, 1, 0
bootstrapcdn.com, 1, 0
googletagmanager.com, 2, 0
google-analytics.com, 4, 0
lendingtree.com, 1, 0
rlcdn.com, 6, 0
rkdms.com, 1, 0
adsrvr.org, 6, 6
rubiconproject.com, 19, 19
adnxs.com, 10, 10
t.co, 2, 0
usabilla.com, 1, 0
everesttech.net, 4, 1
segment.io, 2, 0
yieldmo.com, 2, 2
outbrainimg.com, 2, 0
3lift.com, 3, 3
casalemedia.com, 10, 10
facebook.net, 4, 0
googlesyndication.com, 74, 74
imrworldwide.