## Local vs Remote Experiment

This notebook contains the data analysis for the experiment mentioned in the limitations section of our [methodology](https://themarkup.org/blacklight/2020/09/22/how-we-built-a-real-time-privacy-inspector#limitations).

We took a random sample of 1,000 sites from the top websites on the Tranco List that we had already run through Blacklight on AWS. We then ran this sample through the Blacklight software locally on our own computer, at a residential IP address in New York City, and compared the results of the tests.

In [1]:
import os
import json
import pandas as pd
import numpy as np
import time

In [2]:
def get_summary_col_pct(df, sample_size, col, t):
    """Return the share of the sample flagged in ``col`` as a whole-number percentage.

    Args:
        df: Frame with an ``origin_domain`` column and the flag column ``col``.
        sample_size: Denominator used for the percentage.
        col: Name of the flag column; rows where it equals ``True`` are counted.
        t: Label stored under the ``"type"`` key of the result.

    Returns:
        A dict with the key ``"<col> (pct)"`` holding the rounded percentage
        and ``"type"`` holding ``t``.
    """
    # A domain counts once no matter how many of its rows are flagged.
    flagged_domains = set(df.loc[df[col] == True, "origin_domain"])
    # Rounded to two decimals first, then to a whole number.
    pct = round(len(flagged_domains) / sample_size * 100, 2)
    return {f"{col} (pct)": round(pct), "type": t}

def get_sites_with_canvas_fp(df, sample_size, t):
    """Return the share of the sample with any canvas fingerprinter, as a whole-number percentage.

    Args:
        df: Frame with ``origin_domain``,
            ``has_third_party_canvas_fingerprinters`` and
            ``has_first_party_canvas_fingerprinters`` columns.
        sample_size: Denominator used for the percentage.
        t: Label stored under the ``"type"`` key of the result.

    Returns:
        A dict with ``"canvas_fingerprinting (pct)"`` holding the rounded
        percentage and ``"type"`` holding ``t``.
    """
    third_party = df["has_third_party_canvas_fingerprinters"] == True
    first_party = df["has_first_party_canvas_fingerprinters"] == True
    # Either kind of fingerprinter qualifies; each domain is counted once.
    fingerprinted_domains = set(df.loc[third_party | first_party, "origin_domain"])
    # Rounded to two decimals first, then to a whole number.
    pct = round(len(fingerprinted_domains) / sample_size * 100, 2)
    return {"canvas_fingerprinting (pct)": round(pct), "type": t}

def get_mean_tpt(df, t):
    """Return the median number of distinct tracker script domains per site.

    Despite the function name, the statistic reported is the median, as the
    result key states.

    Args:
        df: Frame with ``origin_domain`` and ``script_domain`` columns.
        t: Label stored under the ``"type"`` key of the result.

    Returns:
        A dict with ``"third_party_trackers (median)"`` holding the rounded
        median and ``"type"`` holding ``t``.
    """
    # One count of unique script domains per origin domain. Grouping by the
    # column name directly avoids rebinding the ``df`` parameter inside a loop
    # and the tuple keys produced by grouping on a one-element list.
    trackers_per_site = df.groupby("origin_domain")["script_domain"].nunique()
    return {'third_party_trackers (median)': round(trackers_per_site.median()), "type": t}

def get_mean_tpc(df, t):
    """Return the median number of distinct third-party cookie domains per site.

    Despite the function name, the statistic reported is the median, as the
    result key states.

    Args:
        df: Frame with ``origin_domain``, ``cookie_domain`` and
            ``cookie_is_third_party`` columns.
        t: Label stored under the ``"type"`` key of the result.

    Returns:
        A dict with ``"third_party_cookies (median)"`` holding the rounded
        median and ``"type"`` holding ``t``.
    """
    # Only rows flagged as third-party contribute; a site with no such rows
    # does not appear in the grouping at all.
    third_party_rows = df.loc[df["cookie_is_third_party"] == True]
    domains_per_site = (
        third_party_rows.groupby("origin_domain")["cookie_domain"]
        .nunique()
        .sort_values(ascending=False)
    )
    median_domains = domains_per_site.median()
    return {"third_party_cookies (median)": round(median_domains), "type": t}


In [4]:
# Directories holding the exported reports for each environment.
local_data_input_dir = "data/local-urls-for-experiment-2020-07-29"
remote_data_input_dir = "data/remote-urls-for-experiment-2020-07-29"

# Upper bound on the number of domains compared.
SAMPLE_SIZE = 1000

# Load the reports: summary, cookie and third-party tracker tables for each run.
local_summary = pd.read_csv(os.path.join(local_data_input_dir, "summary.csv"))
l_cookie_df = pd.read_csv(os.path.join(local_data_input_dir, "cookies.csv"))
l_tpt_df = pd.read_csv(os.path.join(local_data_input_dir, "third_party_trackers.csv"))

remote_summary = pd.read_csv(os.path.join(remote_data_input_dir, "summary.csv"))
r_cookie_df = pd.read_csv(os.path.join(remote_data_input_dir, "cookies.csv"))
r_tpt_df = pd.read_csv(os.path.join(remote_data_input_dir, "third_party_trackers.csv"))

# Distinct, non-null domains present in each summary.
local_domains = local_summary["origin_domain"].dropna().unique()
remote_domains = remote_summary["origin_domain"].dropna().unique()

# Keep only domains present in both runs. np.intersect1d returns sorted unique
# values, so the slice keeps the first SAMPLE_SIZE domains in sorted order.
# NOTE(review): if more than SAMPLE_SIZE domains overlap this is not a random
# draw -- confirm the input directories already contain only the sampled sites.
sample = np.intersect1d(local_domains, remote_domains)[:SAMPLE_SIZE]
sample_size = len(sample)

print(f"Size of sample for experiment {sample_size}")

Size of sample for experiment 1000


In [5]:
# Only consider domains that were in both sections: every table is cut down
# to the rows whose origin_domain belongs to the shared sample.
(
    local_summary,
    remote_summary,
    l_cookie_df,
    r_cookie_df,
    l_tpt_df,
    r_tpt_df,
) = (
    frame[frame["origin_domain"].isin(sample)]
    for frame in (
        local_summary,
        remote_summary,
        l_cookie_df,
        r_cookie_df,
        l_tpt_df,
        r_tpt_df,
    )
)

In [6]:
# Compute the same five metrics for each environment, in the same order, so
# the local and remote figures are directly comparable.
results = []
for label, summary_df, tpt_df, cookie_df in (
    ("local", local_summary, l_tpt_df, l_cookie_df),
    ("remote", remote_summary, r_tpt_df, r_cookie_df),
):
    results.append(get_sites_with_canvas_fp(summary_df, sample_size, label))
    results.append(get_summary_col_pct(summary_df, sample_size, "has_session_recorders", label))
    results.append(get_summary_col_pct(summary_df, sample_size, "has_key_loggers", label))
    results.append(get_mean_tpt(tpt_df, label))
    results.append(get_mean_tpc(cookie_df, label))

# Each metric dict fills a single column and leaves the others empty, so
# pivoting on "type" collapses the rows into one line per environment.
results_df = pd.DataFrame(results)
pd.pivot_table(results_df, index="type")



Unnamed: 0_level_0,canvas_fingerprinting (pct),has_key_loggers (pct),has_session_recorders (pct),third_party_cookies (median),third_party_trackers (median)
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
local,8.0,4.0,18.0,4.0,7.0
remote,10.0,6.0,19.0,5.0,8.0
