In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import ttest_ind, mannwhitneyu, kstest

In [68]:
# Processed Chrome train/test splits from the 08_12_2022 crawl directory.
data_train = pd.read_parquet(
    "../data/processed/chrome/08_12_2022/train_set_01_processed.parquet.gzip"
)

data_test = pd.read_parquet(
    "../data/processed/chrome/08_12_2022/test_set_01_processed.parquet.gzip"
)

In [2]:
# Merged Chrome data from two crawl directories (08_12_2022 and 03_29_2023),
# read with the pyarrow engine and pyarrow-backed dtypes.
chrome_old = pd.read_parquet(
    "../data/processed/chrome/08_12_2022/merged_data.parquet.gzip", engine="pyarrow",
            dtype_backend="pyarrow"
)

chrome_new = pd.read_parquet('../data/processed/chrome/03_29_2023/merged_data.parquet.gzip', engine="pyarrow", dtype_backend="pyarrow")

In [5]:
# Featurized ("_BE") Chrome train/test splits.
chrome_train = pd.read_parquet(
    "../data/processed/chrome/08_12_2022/train_set_featurized_BE.parquet.gzip"
)

chrome_test = pd.read_parquet(
    "../data/processed/chrome/08_12_2022/test_set_featurized_BE.parquet.gzip"
)

In [10]:
chrome_train

In [9]:
# How many columns there are of each dtype.
chrome_test.dtypes.value_counts()

In [None]:
# NOTE(review): this cell has no execution count and `elem` is not defined in
# any visible cell; `data` is only assigned in a later cell. It looks like a
# leftover template for the 0/1 presence encoding below — confirm or delete.
data[f"{elem}_binary"] = np.where(data[elem].isnull(), 0, 1)

In [10]:
# Overwrite every column from position 156 onward with a presence flag:
# 0 where the value is null, 1 otherwise.
# NOTE(review): 156 is a hard-coded column offset — presumably where the
# `*_binary` columns start; confirm against the featurized schema.
chrome_test.iloc[:, 156:] = np.where(chrome_test.iloc[:, 156:].isnull(), 0, 1)

In [8]:
# Cast the whole frame to uint8.
chrome_test = chrome_test.astype("uint8")

In [26]:
chrome_test['x-b3-sampled_binary'] = chrome_test['x-b3-sampled_binary'].astype("uint8")

In [29]:
def label_as_last_column(dataset: pd.DataFrame, label: str = "tracker") -> list:
    """Return the column names of ``dataset`` reordered so the label comes last.

    Only the column order is computed; ``dataset`` itself is not modified.

    Args:
        dataset: Frame whose columns should be reordered.
        label: Name of the label column to move to the end. Defaults to
            ``"tracker"``.

    Returns:
        A list with every non-label column in its original relative order,
        followed by the label column. If the label name occurs more than
        once, every occurrence is kept and moved to the end.

    Raises:
        KeyError: If ``label`` is not a column of ``dataset``.
    """
    columns = dataset.columns.tolist()
    if label not in columns:
        raise KeyError(label)
    features = [name for name in columns if name != label]
    # Re-append the label as many times as it occurred so no column is lost
    # when the frame carries duplicate label columns.
    return features + [label] * (len(columns) - len(features))

In [31]:
# Column order of chrome_test with the "tracker" label moved to the end.
# NOTE(review): `col_order` is not applied to any frame in the visible cells.
col_order = label_as_last_column(chrome_test)

In [8]:
other_columns = chrome_train.columns.values.tolist()

In [9]:
# Select (and order) chrome_test's columns by chrome_train's column list.
chrome_test = chrome_test[other_columns]

In [15]:
other_columns = chrome_train.columns.values.tolist()
test_columns = chrome_test.columns.values.tolist()
train_column_set, test_column_set = set(other_columns), set(test_columns)

# NOTE: the names read the wrong way round and are kept only because other
# cells use them:
#   cols_not_in_train_chrome -> columns of chrome_train that chrome_test lacks
#   cols_not_in_test_chrome  -> columns of chrome_test that chrome_train lacks
# Both lists follow the column order of the frame they come from, so the
# result is the same on every run (a plain set difference has no fixed order).
cols_not_in_train_chrome = list(
    dict.fromkeys(col for col in other_columns if col not in test_column_set)
)
cols_not_in_test_chrome = list(
    dict.fromkeys(col for col in test_columns if col not in train_column_set)
)

In [9]:
# Despite the name: columns of chrome_train that are missing from chrome_test.
cols_not_in_train_chrome

In [16]:
# Despite the name: columns of chrome_test that are missing from chrome_train.
cols_not_in_test_chrome

In [21]:
# Remove the columns that exist only in chrome_test. Rebinding instead of
# `inplace=True` leaves any frame displayed by an earlier cell untouched.
chrome_test = chrome_test.drop(columns=cols_not_in_test_chrome)

In [22]:
chrome_test

In [25]:
# Append the columns listed in cols_not_in_train_chrome to chrome_test.
# Columns introduced by `reindex` hold missing values.
chrome_test = chrome_test.reindex(columns=chrome_test.columns.tolist() + cols_not_in_train_chrome)

In [24]:
chrome_test

In [10]:
# NOTE: written to the current working directory, not to the
# ../data/processed/... directory the frame was read from.
chrome_test.to_parquet("test_set_featurized_BE.parquet.gzip", compression="gzip")

In [14]:
chrome_train['x-ttl_binary']

In [5]:
# Frequency of each distinct `etag` value.
chrome_old[['etag']].value_counts()

In [3]:
# Merged Firefox and Brave data from the 08_12_2022 crawl directory.
firefox = pd.read_parquet('../data/processed/firefox/08_12_2022/merged_data.parquet.gzip', engine="pyarrow",
            dtype_backend="pyarrow")

brave = pd.read_parquet('../data/processed/brave/08_12_2022/merged_data.parquet.gzip', engine="pyarrow",
            dtype_backend="pyarrow")

In [69]:
# Stack the processed train and test splits into one frame with a fresh index.
data = pd.concat([data_train, data_test], ignore_index=True)

In [3]:
# Draw on an explicit Axes. `sns.displot` is a figure-level function that opens
# its own figure, so the former `plt.figure(figsize=(20, 10))` only produced an
# empty extra figure and the requested size never reached the ECDF.
fig, ax = plt.subplots(figsize=(20, 10))

# Rows with both a content-length and a tracker label, as nullable integers.
cl_values = chrome_old[["content-length", "tracker"]].dropna().astype('Int32')

# Restrict to responses with a content-length below 1000.
sns.ecdfplot(data=cl_values[cl_values['content-length'] < 1000], x="content-length", hue='tracker',
             legend=False, ax=ax)

ax.set_title('Empirical CDF of the Content-Length Header', fontsize=16)
ax.set_xlabel('Content-Length Value', fontsize=14)
ax.set_ylabel('Cumulative Probability', fontsize=14)
# NOTE(review): the labels are matched to the drawn lines by position, not by
# hue value — confirm that 'Tracker' ends up on the tracker == 1 curve.
ax.legend(title="Classification", labels=['Tracker', 'Non-Tracker'], fontsize=10)
ax.tick_params(axis='both', labelsize=12)
plt.show()

In [11]:
# Side-by-side ECDFs of the content-length header (values below 1000) for the
# two Chrome crawls, split by tracker label.
fig, axes = plt.subplots(1, 2, figsize=(6, 3))

# First dataset (chrome_old)
cl_values_chrome = chrome_old[["content-length", "tracker"]].dropna().astype('Int32')
sns.ecdfplot(data=cl_values_chrome[cl_values_chrome['content-length'] < 1000], x="content-length", hue='tracker', ax=axes[0])
axes[0].set_title('$Chrome_{22}$: ECDF of Content-Length Header', fontsize=9)
axes[0].set_xlabel('Content-Length Value', fontsize=9)
axes[0].set_ylabel('Cumulative Probability', fontsize=9)
axes[0].tick_params(labelsize=8)
# NOTE(review): labels are matched to the drawn lines by position — confirm
# that 'Tracker' ends up on the tracker == 1 curve.
axes[0].legend(title="Classification", labels=['Tracker', 'Non-Tracker'], fontsize=8)

# Second dataset (chrome_new). NOTE: the variable is called `cl_values_firefox`
# but it holds Chrome data; the name is kept because later cells read it.
cl_values_firefox = chrome_new[["content-length", "tracker"]].dropna().astype('Int32')
sns.ecdfplot(data=cl_values_firefox[cl_values_firefox['content-length'] < 1000], x="content-length", hue='tracker', ax=axes[1])
axes[1].set_title('$Chrome_{23}$: ECDF of Content-Length Header', fontsize=9)
axes[1].set_xlabel('Content-Length Value', fontsize=9)
axes[1].set_ylabel('')
# The right panel hides its y tick labels and y-axis label.
axes[1].tick_params(labelsize=8, labelleft=False)
axes[1].legend(title="Classification", labels=['Tracker', 'Non-Tracker'], fontsize=8)

plt.tight_layout()

# Save the figure as a PDF
plt.savefig('content_length_comparison.pdf', format='pdf', bbox_inches='tight')
plt.show()

In [5]:
# Four-panel figure:
#   row (A): content-length ECDFs per tracker label for the two Chrome crawls
#   row (B): ECDFs of one header's values, Chrome vs. Firefox and Chrome vs. Brave
feature = 'x-xss-protection'

# Non-null values of the header in each browser's data set.
chrome_old_xss = chrome_old[feature].dropna().astype('category')
firefox_xss = firefox[feature].dropna().astype('category')
brave_xss = brave[feature].dropna().astype('category')

# Long-format frames: one row per observed header value, tagged with its browser.
df = pd.DataFrame({
    feature: np.concatenate((chrome_old_xss, firefox_xss)),
    'set': ['Chrome'] * len(chrome_old_xss) + ['Firefox'] * len(firefox_xss)
})

df2 = pd.DataFrame({
    feature: np.concatenate((chrome_old_xss, brave_xss)),
    'set': ['Chrome'] * len(chrome_old_xss) + ['Brave'] * len(brave_xss)
})

chrome_color = '#009E73'
firefox_color = '#E69F00'
brave_color = "#0072B2"

palette = {'Chrome': chrome_color, 'Firefox': firefox_color, 'Brave': brave_color}
palette2 = {'Chrome': chrome_color, 'Brave': brave_color}

fig, axes = plt.subplots(2, 2, figsize=(6, 6))
fig.subplots_adjust(hspace=0.5)

# Row markers to the left of the grid.
fig.text(-0.02, 0.75, '(A)', ha='center', va='center', fontsize=12)
fig.text(-0.02, 0.25, '(B)', ha='center', va='center', fontsize=12)

# --- Row (A): content-length ECDFs --------------------------------------
# NOTE: `cl_values_firefox` holds the newer Chrome crawl (chrome_new), not
# Firefox data; the name is kept because other cells read it.
cl_values_chrome = chrome_old[["content-length", "tracker"]].dropna().astype('Int32')
cl_values_firefox = chrome_new[["content-length", "tracker"]].dropna().astype('Int32')

max_content_length = 10000  # only values below this bound are plotted
content_length_panels = [
    (axes[0, 0], cl_values_chrome, '$Chrome_{22}$', 'Cumulative Probability'),
    (axes[0, 1], cl_values_firefox, '$Chrome_{23}$', ''),
]
for ax, values, crawl, ylabel in content_length_panels:
    sns.ecdfplot(data=values[values['content-length'] < max_content_length],
                 x="content-length", hue='tracker', ax=ax)
    ax.set_title(f'{crawl}: ECDF of Content-Length Header', fontsize=9)
    ax.set_xlabel('Content-Length Value', fontsize=9)
    ax.set_ylabel(ylabel, fontsize=9)
    ax.tick_params(labelsize=8)
    # NOTE(review): labels are matched to the drawn lines by position —
    # confirm that 'Tracker' ends up on the tracker == 1 curve.
    ax.legend(title="Classification", labels=['Tracker', 'Non-Tracker'], fontsize=8)
# The right panel hides its y tick labels.
axes[0, 1].tick_params(labelleft=False)

# --- Row (B): header-value ECDFs per browser ----------------------------
browser_panels = [
    (axes[1, 0], df, palette, 'Firefox', 'Cumulative Probability'),
    (axes[1, 1], df2, palette2, 'Brave', ''),
]
for ax, frame, colors, other_browser, ylabel in browser_panels:
    sns.ecdfplot(data=frame, x=feature, hue='set', log_scale=False, palette=colors, ax=ax)
    ax.set_title(
        f'Feature Similarity between $Chrome_{{22}}$ and ${other_browser}_{{22}}$',
        fontsize=8,
    )
    ax.set_xlabel('Feature Values', fontsize=9)
    ax.set_ylabel(ylabel, fontsize=9)
    ax.tick_params(labelsize=8)
    # Build the legend entries from the palette so each label is tied to its
    # own colour. Passing only `labels=` pairs them with the lines in the
    # order they were drawn, which can attach a label to the wrong curve.
    browsers = ('Chrome', other_browser)
    browser_handles = [plt.Line2D([], [], color=colors[name]) for name in browsers]
    ax.legend(
        handles=browser_handles,
        labels=[f'${name}_{{22}}$' for name in browsers],
        loc='lower right',
        fontsize=8,
        title="Browser",
    )
    ax.set_xticklabels([])  # Clear x-tick labels

# Adjust layout
plt.tight_layout()
plt.savefig('content_length_comparison.svg', format='svg', bbox_inches='tight')
plt.show()

In [75]:
# Median content-length of rows labelled tracker == 1.
# NOTE: despite its name, `cl_values_firefox` is built from `chrome_new`.
cl_values_firefox.loc[cl_values_firefox['tracker'] == 1]['content-length'].median()

In [69]:
# Rows of the older Chrome crawl labelled tracker == 1.
cl_values_chrome.loc[cl_values_chrome['tracker'] == 1]

In [99]:
def create_value_comparison(header_field, n_values, dataset=None):
    """Count the most frequent values of a header separately per tracker label.

    The ``n_values`` most frequent (value, tracker) combinations select the
    header values to compare. Both returned frames then cover exactly those
    values, in the same sorted order; a value that never occurs with one of
    the labels gets a frequency of 0 there. This keeps the two frames the
    same length so they can be plotted side by side.

    Args:
        header_field: Name of the header column to analyse.
        n_values: Number of top (value, tracker) combinations used to pick
            the header values.
        dataset: Frame with ``header_field`` and a ``tracker`` column.
            Defaults to the notebook-level ``brave`` frame.

    Returns:
        ``(non_tracker, tracker)``: two frames with the columns
        ``header_field``, ``"tracker"`` and ``"frequency"``, for
        ``tracker == 0`` and ``tracker == 1`` respectively.
    """
    source = brave if dataset is None else dataset
    counts = (
        source[[header_field, "tracker"]]
        .value_counts()
        .rename("frequency")
        .reset_index()
        .sort_values(by="frequency", ascending=False, kind="stable")
    )
    top_values = pd.Index(counts[header_field].head(n_values).unique()).sort_values()

    def _frequencies_for(label):
        # Frequencies of the selected values for one tracker label, 0 if absent.
        per_value = counts.loc[counts["tracker"] == label].set_index(header_field)["frequency"]
        return pd.DataFrame(
            {
                header_field: top_values,
                "tracker": label,
                "frequency": per_value.reindex(top_values, fill_value=0).to_numpy(),
            }
        )

    return _frequencies_for(0), _frequencies_for(1)


def create_value_comparison_plot(non_tracker, tracker, n, header_field):
    """Draw grouped log-scale bars of header-value frequencies per tracker label.

    The two frames are aligned on their ``header_field`` values before
    plotting. A value that appears in only one frame is drawn with a
    frequency of 0 for the other label, so frames of different length no
    longer cause a shape mismatch and every tick label matches both bars.

    Args:
        non_tracker: Frame with ``header_field`` and ``"frequency"`` columns
            for the non-tracker class.
        tracker: Same layout for the tracker class.
        n: Maximum number of header values (bar groups) to draw.
        header_field: Name of the header column used for the tick labels.
    """
    # Sum per header value, so repeated values collapse into one bar.
    non_tracker_freq = non_tracker.groupby(header_field, observed=True)["frequency"].sum()
    tracker_freq = tracker.groupby(header_field, observed=True)["frequency"].sum()

    # Sorted union of the values seen in either class, limited to n groups.
    values = non_tracker_freq.index.union(tracker_freq.index)[:n]
    positions = np.arange(len(values))
    width = 0.25

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.bar(
        positions,
        non_tracker_freq.reindex(values, fill_value=0).to_numpy(),
        log=True,
        label="Non-Tracker",
        width=width,
    )
    ax.bar(
        positions + width,
        tracker_freq.reindex(values, fill_value=0).to_numpy(),
        log=True,
        label="Tracker",
        width=width,
    )
    # Put each tick between the two bars of its group.
    ax.set_xticks(positions + width / 2)
    ax.set_xticklabels(values.tolist(), fontsize=12, fontweight="bold", ha="right")
    ax.legend()

In [107]:
# Compare the `server` header between the two tracker classes, based on the
# 20 most frequent (value, tracker) combinations.
# NOTE(review): the `_cd` suffix presumably dates from a content-disposition
# analysis; the header analysed here is `server`.
non_tracker_cd, tracker_cd = create_value_comparison("server", 20)
create_value_comparison_plot(non_tracker_cd, tracker_cd, len(tracker_cd), 'server')

In [105]:
# Frequency of every (content-type, tracker) combination.
# (`value_count` is not a DataFrame attribute; the method is `value_counts()`.)
data[['content-type', 'tracker']].value_counts()

In [44]:
clCT = data[['content-length', 'tracker', 'content-type']]

In [47]:
clCT.dropna(inplace=True)

In [53]:
# Frequency of every (content-length, tracker) combination.
data[['content-length', 'tracker']].value_counts()

In [21]:
# Chrome train and test splits ("_01"), stacked into one frame.
chrome = pd.read_parquet("../data/processed/chrome/08_12_2022/train_set_01.parquet.gzip")
chrome2 = pd.read_parquet("../data/processed/chrome/08_12_2022/test_set_01.parquet.gzip")

chromee = pd.concat([chrome, chrome2], ignore_index=True)

In [54]:
# NOTE: rebinds `firefox`, which another cell loads from merged_data.parquet.gzip;
# from here on the name refers to the test split.
firefox = pd.read_parquet("../data/processed/firefox/08_12_2022/test_set.parquet.gzip")

In [2]:
# NOTE: rebinds `brave` as well (merged data elsewhere, test split here).
brave = pd.read_parquet("../data/processed/brave/08_12_2022/test_set_0123.parquet.gzip")

In [3]:
# Drop the first six columns. Not idempotent: re-running this cell removes six
# more columns each time.
# NOTE(review): presumably the first six columns are request metadata — confirm.
brave = brave.iloc[:, 6:]

In [4]:
# TODO change var name, a bit misleading here
# Despite the name, `na_per_row` holds the number of NON-null cells per row
# (column 0), together with the row's tracker label.
# NOTE(review): the count runs over every column of `brave`, including the
# `tracker` column itself — confirm whether the label should be excluded.
na_per_row = brave.notnull().sum(axis=1).to_frame().assign(tracker=brave["tracker"])

In [6]:
# Box plot of the per-response count (first column of na_per_row), one box per
# tracker label.
fig, ax = plt.subplots(figsize=(10, 6))
count_column = na_per_row.columns.values[0]
g = sns.boxplot(data=na_per_row, y=count_column, x='tracker', ax=ax)

ax.set_title('Number of HTTP/S Headers per Response in Brave', fontsize=18)
ax.set_xlabel('Classification', fontsize=16)
ax.set_ylabel('HTTP/S header count', fontsize=16)
ax.tick_params(axis='both', labelsize=14)
# Replace the raw tracker values on the x axis with readable class names.
g.set_xticklabels(['Non-Tracker', 'Tracker'], fontsize=14)
plt.show()

In [76]:
# Rows of the stacked Chrome frame per tracker label, without the first six
# columns.
nt = chromee[chromee['tracker'] == 0].iloc[:, 6:]
t = chromee[chromee['tracker'] == 1].iloc[:, 6:]

In [69]:
# Release the test split. Re-running this cell raises NameError once the name
# is gone.
del chrome2

In [19]:
# First 40 columns of `brave`.
brave.iloc[:, :40]

In [57]:
prevalent_header_chrome = chromee.iloc[:, 6:46]

In [58]:
prevalent_header_chrome['tracker'] = chromee[['tracker']]

In [59]:
# Number of rows per tracker label.
nt_length = len(prevalent_header_chrome[prevalent_header_chrome["tracker"] == 0])
t_length = len(prevalent_header_chrome[prevalent_header_chrome["tracker"] ==1])

# Share of rows (in percent) in which each header is present, per tracker label.
# The previous aggregation divided every class's count by BOTH class sizes, so
# one of the two numbers printed per cell was a count from one class divided by
# the size of the other. Averaging the presence mask within each class gives
# the intended per-class percentage.
(
    prevalent_header_chrome.drop(columns='tracker')
    .notnull()
    .groupby(prevalent_header_chrome['tracker'])
    .mean()
    .mul(100)
)

In [47]:
# First 40 columns of `brave`, copied so that adding the label column does not
# write into a slice of `brave` (SettingWithCopyWarning).
prevalent_header_brave = brave.iloc[:, :40].copy()
prevalent_header_brave['tracker'] = brave['tracker']

In [50]:
# Number of rows per tracker label.
nt_lengthb = len(prevalent_header_brave[prevalent_header_brave["tracker"] == 0])
t_lengthb = len(prevalent_header_brave[prevalent_header_brave["tracker"] ==1])

# Share of rows (in percent) in which each header is present, per tracker label.
# The previous aggregation divided every class's count by BOTH class sizes, so
# one of the two numbers printed per cell was a count from one class divided by
# the size of the other. Averaging the presence mask within each class gives
# the intended per-class percentage.
(
    prevalent_header_brave.drop(columns='tracker')
    .notnull()
    .groupby(prevalent_header_brave['tracker'])
    .mean()
    .mul(100)
)

In [55]:
# Columns 6..39 of the Firefox frame, copied so that adding the label column
# does not write into a slice of `firefox` (SettingWithCopyWarning).
# NOTE(review): this slice ends at column 40, while the Chrome cell uses 6:46 —
# confirm the difference is intended.
prevalent_header_firefox = firefox.iloc[:, 6:40].copy()
prevalent_header_firefox['tracker'] = firefox['tracker']

# Number of rows per tracker label.
nt_lengthf = len(prevalent_header_firefox[prevalent_header_firefox["tracker"] == 0])
t_lengthf = len(prevalent_header_firefox[prevalent_header_firefox["tracker"] == 1])

# Share of rows (in percent) in which each header is present, per tracker label.
# The previous aggregation divided every class's count by BOTH class sizes, so
# one of the two numbers printed per cell was a count from one class divided by
# the size of the other. Averaging the presence mask within each class gives
# the intended per-class percentage.
(
    prevalent_header_firefox.drop(columns='tracker')
    .notnull()
    .groupby(prevalent_header_firefox['tracker'])
    .mean()
    .mul(100)
)

In [65]:
# Frequency of every (tracker, content-encoding) combination in the Firefox frame.
firefox[['tracker', 'content-encoding']].value_counts()

In [108]:
# NOTE: rebinds `data_train`, which another cell loads from the "_processed"
# file; here it is the "_featurized" train split.
data_train = pd.read_parquet(
    "../data/processed/chrome/08_12_2022/train_set_01_featurized.parquet.gzip"
)


In [110]:
# Frequency of each content-type value in the featurized train split.
data_train['content-type'].value_counts()