# Identification of Tracking Calls in HTTP Requests

We identified tracking behviour based on the HTTP Request URLs. Therefore we used EasyList and EasyPrivacy to decide whether a request is related to tracking or not.

## Data Preparation

### Mount Google Drive

In [0]:
#mount google drive
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


### Import HTTP-Request Crawl Data

In [0]:
import pandas as pd

#load data
http_requests_df = pd.read_csv("/content/drive/My Drive/IT_Security/http_requests.csv")

print(len(http_requests_df))

855804


In [0]:
#rename columns
http_requests_df.columns = ['visit_id', 'url', 'top_level_url']

http_requests_df[15:20]

Unnamed: 0,visit_id,url,top_level_url
15,5,https://d1wigddrwdtsce.cloudfront.net/26540_1c...,https://www.linguee.de/
16,5,https://d1wigddrwdtsce.cloudfront.net/26540_1c...,https://www.linguee.de/
17,5,https://d1wigddrwdtsce.cloudfront.net/26540_1c...,https://www.linguee.de/
18,5,https://d1wigddrwdtsce.cloudfront.net/26540_1c...,https://www.linguee.de/
19,5,https://d1wigddrwdtsce.cloudfront.net/26540_1c...,https://www.linguee.de/


## Import Tracking Lists

### EasyList

In [0]:
#identify tracking functions/requsts (1)
#EasyList - Advertisement related Tracking

#import package for regular expressions
import re

#read EasyList file
easylist = open(r"/content/drive/My Drive/IT_Security/easylist_new.csv", encoding = "ISO-8859-1")
#easylist = open(r"tracking_lists/easylist_new.csv", encoding = "ISO-8859-1")

#read lines in file
easylist = easylist.readlines()

#remove line break operators
easylist = [re.sub(r'\n', '', x) for x in easylist]

#remove first element
easylist.pop(0)

print(easylist[:10])
print(len(easylist))

['&act=ads_', '&ad.vid=$~xmlhttprequest', '&ad_block=', '&ad_box_', '&ad_channel=', '&ad_classid=', '&ad_code=', '&ad_height=', '&ad_ids=', '&ad_keyword=']
72952


### EasyPrivacy

In [0]:
#identify tracking functions/requsts (2)
#EasyPrivacy - non Advertisement related Tracking

#import package for regular expressions
import re

#read EasyPrivacy file
easyprivacy = open(r"/content/drive/My Drive/IT_Security/easyprivacy_new.csv", encoding = "ISO-8859-1")
#easyprivacy = open(r"tracking_lists/easyprivacy_new.csv", encoding = "ISO-8859-1")

#read lines in file
easyprivacy = easyprivacy.readlines()

#remove line break operators
easyprivacy = [re.sub(r'\n', '', x) for x in easyprivacy]

#to prevent not iterable error
easyprivacy = [str(i) for i in easyprivacy]

#remove first element
easyprivacy.pop(0)

print(easyprivacy[:10])
print(len(easyprivacy))

['&action=js_stats&', '&action=js_stats_', '&callback=hitStats_', '&ctxId=*&pubId=*&clientDT=', '&ctxId=*&pubId=*&objId=', '&event=view&', '&funnel_state=', '&http_referer=$script,xmlhttprequest', '&idsite=*&send_image=$image', '&pageReferrer=']
17246


## Identification of Tracking Requests

In [0]:
!pip install adblockparser
from adblockparser import AdblockRules



In [0]:
#convert elements to string
easylist = [str(i) for i in easylist]
easyprivacy = [str(i) for i in easyprivacy]

In [0]:
#convert http request urls to string to prevent type errors
http_requests_df.url = http_requests_df.url.apply(lambda x: str(x))

### Advertisement-Related Tracking

In [0]:
#setting rules to identify tracking
rules = AdblockRules(easylist)

In [0]:
#apply rule-checking function to all request urls
ad_tracking_df = http_requests_df[http_requests_df.url.apply(lambda x: rules.should_block(x))]

### Non Advertisement-Related Tracking

In [0]:
#setting rules to identify tracking
rules = AdblockRules(easyprivacy)

In [0]:
#apply rule-checking function to all request urls
non_ad_tracking_df = http_requests_df[http_requests_df.url.apply(lambda x: rules.should_block(x))]

## Identify Tracking Domains

### Import List of Tracking Host Domains

In [0]:
#import package for regular expressions
import re

#read tracking host domains file
tracking_host_list = open(r"/content/drive/My Drive/IT_Security/tracking_host_list.csv", encoding = "ISO-8859-1")
#tracking_host_list = open(r"tracking_lists/tracking_host_list.csv", encoding = "ISO-8859-1")

#read lines in file
tracking_host_list = tracking_host_list.readlines()

#remove line break operators
tracking_host_list = [re.sub(r'\n', '', x) for x in tracking_host_list]

#convert to string to prevent not iterable error
tracking_host_list = [str(i) for i in tracking_host_list]

print(tracking_host_list[:10])
print(len(tracking_host_list))

['google-analytics.com', 'gstatic.com', 'google.com', 'fonts.googleapis.com', 'googletagmanager.com', 'doubleclick.net', 'facebook.net', 'googletagservices.com', 'facebook.com', 'googleadservices.com']
2183


In [0]:
print('' in tracking_host_list)
print(tracking_host_list.index(''))
print(len(tracking_host_list))
#last element is empty string -> fallback option for not finding a tracking domain

True
2182
2183


In [0]:
#check presence of top 20 trackers from open WPM paper
print('google-analytics.com' in tracking_host_list)
print('gstatic.com' in tracking_host_list)
print('doubleclick.net' in tracking_host_list)
print('google.com' in tracking_host_list)
print('fonts.googleapis.com' in tracking_host_list)
print('facebook.com' in tracking_host_list)
print('facebook.net' in tracking_host_list)
print('ajax.googleapis.com' in tracking_host_list)
print('googlesyndication.com' in tracking_host_list)
print('fbcdn.net' in tracking_host_list)
print('twitter.com' in tracking_host_list)
print('googleadservices.com' in tracking_host_list)
print('adnxs.com' in tracking_host_list)
print('googleusercontent.com' in tracking_host_list)
print('bluekai.com' in tracking_host_list)
print('mathtag.com' in tracking_host_list)
print('youtube.com' in tracking_host_list)
print('ytimg.com' in tracking_host_list)
print('googletagmanager.com' in tracking_host_list)
print('yahoo.com'in tracking_host_list)

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


### Check for Presence of Tracking Host Domain in identified Tracking URL

In [0]:
#function, that checks wether a substring (here: tracking domain) is contained in an url
def check_result(substr_ls, url_ls):
    
    result_url = list()
    tracking_call_list = list()
    
    #select url to check
    for url in url_ls:
        #select a domain as substring
        for substr in substr_ls:
            #check if domain is included in tracking call
            if substr in url:
                result_url.append(url)
                tracking_call_list.append(substr)
                break

    #create dataframe of results
    result_df = pd.DataFrame({'tracking_url': result_url,
                              'host_domain': tracking_call_list})
    
    #return the resulting dataframe               
    return result_df

#apply function to both tracking identification dataframes      
ad_tracking_hosts_df = check_result(tracking_host_list, list(ad_tracking_df.url))
non_ad_tracking_hosts_df = check_result(tracking_host_list, list(non_ad_tracking_df.url))

print(ad_tracking_hosts_df.head())
print(len(ad_tracking_hosts_df))
print(len(ad_tracking_df))
print(set(ad_tracking_hosts_df.host_domain))

print(non_ad_tracking_hosts_df.head())
print(len(non_ad_tracking_hosts_df))
print(len(non_ad_tracking_df))
print(set(non_ad_tracking_hosts_df.host_domain))

                                        tracking_url      host_domain
0  https://securepubads.g.doubleclick.net/tag/js/...  doubleclick.net
1  https://securepubads.g.doubleclick.net/gpt/pub...  doubleclick.net
2  https://securepubads.g.doubleclick.net/gpt/pub...  doubleclick.net
3  https://securepubads.g.doubleclick.net/gampad/...  doubleclick.net
4  https://securepubads.g.doubleclick.net/gpt/pub...  doubleclick.net
854
854
{'', 'doubleclick.net', 'awin1.com', 'jsdelivr.net', 'storage.googleapis.com', 'adition.com', 'adform.net', 'nuggad.net', 'mpnrs.com', 'd2tycn7nnoiglw.cloudfront.net', 'ampproject.org', 'tribalfusion.com', 'theadex.com', 'h-bid.com', 'adventori.com', 'imasdk.googleapis.com', 'travel-assets.com', 'adspirit.de', 'criteo.net', 'go.com', 'akamaihd.net', 'mlsat02.de', 'googlesyndication.com', 'google.com', 'serving-sys.com', 'pubmatic.com', 'yieldlab.net', 'glotgrx.com', 'adtech.de', 'googleadservices.com', 'dyntrk.com', 'lijit.com', 'openx.net', 'pagefair.net', 'f11-ads

In [0]:
#show elements where a tracking behaviour was detected but no tracker could be identified
non_ad_tracking_hosts_df[non_ad_tracking_hosts_df.host_domain == '']

Unnamed: 0,tracking_url,host_domain
5,https://smetrics.dhl.de/id?d_visid_ver=2.5.0&d...,
6,https://smetrics.dhl.de/b/ss/deutschepostdhlpa...,
18,https://cdn.duden.de/public_files/google_tag/g...,
133,https://www.tripadvisor.de/CookiePingback?earl...,
138,https://www.daserste.de/mediasrc/js/atinternet...,
...,...,...
875,https://www.expedia.de/cl/1x1.gif?gcoAgent=fal...,
880,https://www.expedia.de/cl/data/omg-udo.json?st...,
881,https://www.expedia.de/gc/model.json?skipSite=...,
884,https://www.expedia.de/userHistory/count?&guid...,


In [0]:
#merge together tracking calls and corresponding domains
if(len(ad_tracking_df) == len(ad_tracking_hosts_df)):
    ad_tracking_df['host_domain'] = list(ad_tracking_hosts_df['host_domain'])
    print(ad_tracking_df[:5])
else:
    print('unequal length of dataframes')
    print(len(ad_tracking_df))
    print(len(ad_tracking_hosts_df))
    
if(len(non_ad_tracking_df) == len(non_ad_tracking_hosts_df)):
    non_ad_tracking_df['host_domain'] = list(non_ad_tracking_hosts_df['host_domain'])
    print(non_ad_tracking_df[:5])
else:
    print('unequal length of dataframes')
    print(len(non_ad_tracking_df))
    print(len(non_ad_tracking_hosts_df))

     visit_id  ...      host_domain
35          1  ...  doubleclick.net
106         1  ...  doubleclick.net
120         5  ...  doubleclick.net
133         5  ...  doubleclick.net
134         5  ...  doubleclick.net

[5 rows x 4 columns]
     visit_id  ...           host_domain
82          3  ...            demdex.net
87          5  ...               ioam.de
100         5  ...  google-analytics.com
107         5  ...               ioam.de
108         1  ...            criteo.net

[5 rows x 4 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


### Saving Results of Tracking Call Identification

In [0]:
#save ad_tracking_df
ad_tracking_df.to_csv("ad_tracking_df.csv", index=False)

#save non_ad_tracking_df
non_ad_tracking_df.to_csv("non_ad_tracking_df.csv", index=False)