In [1]:
import gzip
import json
import pathlib
import sys
from os import listdir, makedirs
from os.path import join
from typing import Dict, List, Any, Union
from urllib.parse import urlparse
import os
import pandas as pd
from alive_progress import alive_bar
from json_stream import load
from json_stream.dump import JSONStreamEncoder
from src.pipeline_functions.data_preprocessing_functions import *

In [2]:
def generate_url(r: Dict[str, Any]) -> Dict[str, Union[str, List[str]]]:
    """
    Split the URL of a request record into its individual components.

    Parameters
    ----------
    r : Dict[str, Any]
        Request record; only its 'url' entry is read.

    Returns
    -------
    Dict[str, Union[str, List[str]]]
        Mapping with the keys 'hostname', 'pathname', 'filetype',
        'filename_one', 'protocol' and 'query'. 'filename_one' is the last
        '/'-separated segment of the path; 'filetype' is the text after the
        last '.' of that segment (the whole segment when it contains no '.').
    """
    url_parts = urlparse(r["url"])
    path = url_parts.path
    # Last path segment, e.g. "app.min.js" for "/static/app.min.js".
    last_segment = path.rsplit("/", 1)[-1]
    # Text after the final dot; identical to the segment when there is no dot.
    extension = last_segment.rsplit(".", 1)[-1]
    return {
        "hostname": url_parts.netloc,
        "pathname": path,
        "filetype": extension,
        "filename_one": last_segment,
        "protocol": url_parts.scheme,
        "query": url_parts.query,
    }


def generate_response_headers(r: Dict[str, Any]) -> List[List[str]]:
    """
    Collect the headers of a request record as ``[name, value]`` pairs.

    Note that, despite the function name, the headers are read from the
    'requestHeaders' entry of the record.

    Parameters
    ----------
    r : Dict[str, Any]
        The input request dictionary containing the 'requestHeaders' key,
        a list of dictionaries with the keys 'name' and 'value'.

    Returns
    -------
    List[List[str]]
        A list of lists, where each inner list contains a header name and its
        value. Headers lacking 'name' or 'value' are skipped.

    Raises
    ------
    KeyError
        If the record has no 'requestHeaders' entry.
    """
    transformed = []

    for header in r["requestHeaders"]:
        try:
            transformed.append([header["name"], header["value"]])
        except KeyError:
            # TODO: decide how malformed headers should be handled.
            # Report the skipped header without indexing 'name' directly:
            # when 'name' itself is the missing key, a plain lookup would
            # raise a second KeyError from inside this handler.
            print(header.get("name", header))
    return transformed

In [3]:
# Path (relative to the notebook) of one gzip-compressed raw JSON file.
data_path = "../data/raw/chrome/03_29_2023/http-11.json.gz"

In [4]:
# Decompress and parse the whole file at once; the parsed JSON is kept in
# `data_raw` for the cells below.
with gzip.open(data_path, 'r') as file:
    data_raw = json.load(file)

In [5]:
# Display the first element to inspect what a single raw entry looks like
# (assumes the top-level JSON value is a list -- TODO confirm).
data_raw[0]

In [27]:
# Build a condensed view of the first raw record: parsed URL parts, labels,
# request method/initiator and the request headers as [name, value] pairs.
# The record is taken from `data_raw` (loaded above). The name `data` is not
# defined at this point when the notebook is run top to bottom, and it is
# later bound to a DataFrame, for which `data[0]` would be a column lookup.
# NOTE(review): assumes raw records carry the keys 'labels', 'method' and
# 'initiator' -- confirm against the output of `data_raw[0]`.
a = {
    "url": generate_url(data_raw[0]),
    "labels": data_raw[0]["labels"],
    "request": {
        "method": data_raw[0]["method"],
        "initiator": data_raw[0]["initiator"],
        },
    "requestHeaders": generate_response_headers(data_raw[0]),
}

In [2]:
def read_json_file(
    name: str, target_file_name: str, target_data_dir: str, compression_alg: str
) -> pd.DataFrame:
    """
    Read a compressed JSON file of records and return a pandas DataFrame.

    The file is looked up at
    ``../data/{target_data_dir}/{target_file_name}/{name}.json.{compression_alg}``.

    Parameters
    ----------
    name : str
        Name of the file to read from, without the ``.json.<ext>`` suffix.
    target_file_name : str
        Name of the directory (below ``target_data_dir``) that holds the file.
    target_data_dir : str
        Directory below ``../data/``, e.g. ``"merged/chrome"``.
    compression_alg : str
        File extension of the compression format, e.g. ``"gz"``. It is also
        used to select the decompression codec; unknown extensions fall back
        to gzip, which was the only codec supported before.

    Returns
    -------
    pd.DataFrame
        The parsed records, one row per record.
    """
    path = f"../data/{target_data_dir}/{target_file_name}/{name}.json.{compression_alg}"
    print(f"\nDEBUG: File exists? {os.path.isfile(path)}\n")

    # Translate the file extension into the codec name pandas expects, so the
    # reader honours `compression_alg` instead of always assuming gzip.
    codec_by_extension = {
        "gz": "gzip",
        "gzip": "gzip",
        "bz2": "bz2",
        "zip": "zip",
        "xz": "xz",
        "zst": "zstd",
    }
    compression = codec_by_extension.get(str(compression_alg).lower(), "gzip")

    return pd.read_json(path, orient="records", compression=compression)

In [2]:
def prepare_initial_dataset(
    file_name: str, target_file: str, target_data_dir, compression_alg
) -> pd.DataFrame:
    """
    Load a dataset and keep only complete rows with non-empty response headers.

    Parameters
    ----------
    file_name : str
        Forwarded to ``read_json_file`` as ``name``.
    target_file : str
        Forwarded to ``read_json_file`` as ``target_file_name``.
    target_data_dir
        Forwarded to ``read_json_file``.
    compression_alg
        Forwarded to ``read_json_file``.

    Returns
    -------
    pd.DataFrame
        The loaded frame without rows containing missing values and without
        rows whose 'responseHeaders' entry has length 0, re-indexed from 0.
    """
    loaded = read_json_file(file_name, target_file, target_data_dir, compression_alg)
    # Remove every row that has at least one missing value.
    complete_rows = loaded.dropna().reset_index(drop=True)
    # True for rows whose 'responseHeaders' container is not empty.
    has_headers = complete_rows["responseHeaders"].map(len) != 0
    return complete_rows.loc[has_headers].reset_index(drop=True)

In [17]:
# Load the merged dataset as a DataFrame; the arguments resolve to
# ../data/merged/chrome/03_29_2023/merged_data_response.json.gz
data = read_json_file("merged_data_response", "03_29_2023", "merged/chrome", "gz")

In [19]:
# Expand the entries of the 'response' column into their own frame and show
# it as a NumPy array.
pd.DataFrame.from_records(data['response']).to_numpy()

In [15]:
# Same expansion for 'request', restricted to the column at position 2.
pd.DataFrame.from_records(data['request']).iloc[:,2].to_numpy()

In [16]:
# Third item obtained by unpacking the first 'request' entry
# (a key if the entry is a dict -- TODO confirm the entry type).
[*data["request"][0]][2]

In [2]:
# Load the processed request test set; pyarrow is used both as the parquet
# engine and as the dtype backend of the resulting frame.
request_data = pd.read_parquet('../data/processed/chrome/08_12_2022/test_set_request.parquet.gzip', engine='pyarrow', dtype_backend='pyarrow')

In [5]:
# Spot-check individual columns.
request_data['httpMessageId']

In [8]:
request_data['tracker']

In [9]:
# First 10 rows, columns from position 8094 onwards.
# NOTE(review): 8094 is a magic number tied to this particular file --
# presumably chosen to show the trailing columns; confirm.
request_data.iloc[:10, 8094:]

In [12]:
# First 10 rows, columns at positions 6 to 9.
request_data.iloc[:10, 6:10]

In [13]:
# Column names from position 6 up to, but not including, the last two columns.
# NOTE(review): presumably the first six columns are metadata and the last two
# are label columns -- confirm.
data_column_values = request_data.columns.values[6:-2].tolist()

In [14]:
data_column_values

In [10]:
def label_as_last_column(dataset: pd.DataFrame) -> List[str]:
    """
    Compute a column order in which the "tracker" column comes last.

    The DataFrame itself is not modified; only the list of names is returned.

    Parameters
    ----------
    dataset : pd.DataFrame
        Frame that contains a "tracker" column.

    Returns
    -------
    List[str]
        All column names in their original relative order, except that
        "tracker" is moved to the final position.
    """
    names = dataset.columns.tolist()
    label_pos = dataset.columns.get_loc("tracker")
    # Everything before and after the label keeps its relative order.
    before_label = names[:label_pos]
    after_label = names[label_pos + 1:]
    return [*before_label, *after_label, *names[label_pos: label_pos + 1]]

In [11]:
# Column order with "tracker" moved to the end (see label_as_last_column).
reordered_cols = label_as_last_column(request_data)

In [18]:
# Spot-check a single entry of the new order.
reordered_cols[1755]

In [15]:
# Total number of columns.
len(reordered_cols)

In [None]:
# Apply the new column order.
# NOTE(review): this cell has no execution count, i.e. it was not run in the
# saved session.
request_data = request_data[reordered_cols]

In [4]:
# First 10 rows and first 10 columns.
request_data.iloc[:10, :10]

In [5]:
# Drop every column that contains only missing values, then renumber the rows.
# `.all()` already yields a boolean, so no comparison against True is needed;
# the frame is rebound instead of being mutated in place.
empty_columns = [col for col in request_data.columns if request_data[col].isnull().all()]
request_data = request_data.drop(columns=empty_columns).reset_index(drop=True)

In [6]:
request_data

In [7]:
# Recompute the candidate column names on the current frame
# (position 6 up to, but not including, the last two columns).
data_column_values = request_data.columns.values[6:-2].tolist()

In [10]:
# Call the matcher for each column name together with all names that come
# after it, so every pair of names is considered once; the last name has
# nothing after it and is skipped.
# NOTE(review): new_fuzzy_string_matching_for_column is not defined in this
# notebook -- presumably provided by the star import. It appears to return a
# DataFrame per column (the results are concatenated below) -- confirm.
match = [
            new_fuzzy_string_matching_for_column(j, data_column_values[i + 1 :])
            for i, j in enumerate(data_column_values)
            if i != len(data_column_values) - 1
        ]

In [11]:
match

In [12]:
# Stack the per-column results into a single frame with a fresh index.
match2 = pd.concat(match, ignore_index=True)

In [13]:
match2

In [15]:
# For every (fuzzy_match, col_name) pair in match2, call
# find_cols_with_similar_values on request_data.
# NOTE(review): the helper is not defined in this notebook; its result may be
# None (such entries are filtered out below) -- semantics to be confirmed.
result = [
            find_cols_with_similar_values(col, col2, request_data)
            for col, col2 in zip(match2["fuzzy_match"], match2["col_name"])
        ]

In [17]:
# Renumber the rows in place.
request_data.reset_index(drop=True, inplace=True)

In [18]:
# Keep only the non-None results; the first two elements of each result are
# passed to select_similar_columns together with match2.
similar_values = [
            select_similar_columns(col[0], col[1], match2)
            for col in result
            if col is not None
        ]

In [19]:
similar_values

In [20]:
# Stack the selected pairs into a single frame with a fresh index.
similar_values_train = pd.concat(similar_values, ignore_index=True)

In [21]:
similar_values_train

In [23]:
# Call merge_similar_columns once per row with that row's
# (fuzzy_match, col_name) pair.
# NOTE(review): the return value is only displayed, so the helper presumably
# modifies request_data in place -- confirm.
similar_values_train.apply(
            lambda x: merge_similar_columns(x["fuzzy_match"], x["col_name"], request_data),
            axis=1,
        )

In [24]:
# Drop every distinct column named in the fuzzy_match column of
# similar_values_train.
columns_to_remove = list(set(similar_values_train.fuzzy_match.values.tolist()))
request_data.drop(columns_to_remove, axis=1, inplace=True)

In [25]:
request_data

In [26]:
# Discard the first six columns.
# NOTE: this cell is not idempotent -- re-running it removes six more columns.
request_data = request_data.iloc[:, 6:]
# NOTE(review): create_categories_list is not defined in this notebook; its
# result is passed to astype, so it is presumably a column -> dtype mapping --
# confirm.
list_of_dtypes = create_categories_list(request_data)
request_data = request_data.astype(list_of_dtypes)

In [27]:
request_data

In [29]:
# Build the summary over all columns except the last two.
summary_table = create_summary_table(request_data.iloc[:, :-2])

In [30]:
# Header names whose na_ratio is exactly 1.
remove_headers_with_one_na_ratio = summary_table[
            summary_table["na_ratio"] == 1
        ].header_name.values.tolist()
# Header names with at most one unique value and an na_ratio other than 1.
remove_headers_with_one_value = summary_table[
            (summary_table["unique_values"] <= 1) & (summary_table["na_ratio"] != 1)
        ].header_name.values.tolist()

In [31]:
summary_table

In [32]:
# Drop both groups of headers from the frame in place.
request_data.drop(remove_headers_with_one_na_ratio, axis=1, inplace=True)
request_data.drop(remove_headers_with_one_value, axis=1, inplace=True)

In [33]:
request_data

In [34]:
# Second summary, computed on the pruned frame.
# NOTE(review): create_summary_table_2 is not defined in this notebook; the
# layout of its result is not visible here.
summary_table2 = create_summary_table_2(request_data)

In [35]:
summary_table2

In [39]:
# Continue on a deep copy of request_data.
# Note: this rebinds `data`, which earlier in the notebook referred to the
# merged JSON frame.
data = request_data.copy(deep=True)

In [37]:
# NOTE(review): the execution count (37) is lower than that of the copy cell
# above (39), so in the saved session this call ran before that copy was made.
# The helper is not defined in this notebook; whether it modifies `data` is
# not visible here.
concise_information_wrapper(data, summary_table2)

In [40]:
# Initialise the two combined columns with 0 for every row.
data["comb_col_non_tracker"] = 0
data["comb_col_tracker"] = 0

In [41]:
data

In [44]:
# Two collections derived from summary_table2.
# NOTE(review): going by the names, presumably columns seen only for
# non-trackers and only for trackers respectively -- confirm in the helper.
only_non_tracker_cols, only_tracker_cols = find_cols_to_combine(summary_table2)

In [45]:
only_non_tracker_cols

In [46]:
only_tracker_cols

In [47]:
# Second deep copy, used as the target of the two update calls below.
data2 = data.copy(deep=True)

In [48]:
# NOTE(review): update_combined_columns is not defined in this notebook; the
# meaning of the third argument (1 here, 0 below) must be checked there.
update_combined_columns(data2, only_tracker_cols, 1, "comb_col_tracker")

In [49]:
update_combined_columns(data2, only_non_tracker_cols, 0, "comb_col_non_tracker")

In [2]:
# Load several processed parquet files (pyarrow engine and dtype backend) to
# compare their columns.
old_data = pd.read_parquet('../data/processed/chrome/08_12_2022/train_set_featurized_BE.parquet.gzip', engine='pyarrow', dtype_backend='pyarrow')

In [3]:
old_data

In [2]:
new_data_response = pd.read_parquet('../data/processed/chrome/08_12_2022/train_set_processed_response.parquet.gzip', engine='pyarrow', dtype_backend='pyarrow')

In [3]:
new_data_response

In [7]:
test_response = pd.read_parquet('../data/processed/firefox/08_12_2022/merged_data_processed_response.parquet.gzip', engine='pyarrow', dtype_backend='pyarrow')

In [8]:
test_response

In [10]:
# Columns present in new_data_response but absent from test_response
# (set difference, so the order of the result is arbitrary).
list(set(new_data_response.columns.tolist()).difference(test_response.columns.tolist()))

In [28]:
new_data_request = pd.read_parquet('../data/processed/chrome/08_12_2022/train_set_featurized_request_BE.parquet.gzip', engine='pyarrow', dtype_backend='pyarrow')

In [29]:
new_data_request

In [42]:
new_data_request_2 = pd.read_parquet('../data/processed/brave/08_12_2022/merged_data_featurized_request_BE.parquet.gzip', engine='pyarrow', dtype_backend='pyarrow')

In [43]:
# Columns present in new_data_request but absent from new_data_request_2.
list(set(new_data_request.columns.tolist()).difference(new_data_request_2.columns.tolist()))

In [44]:
# The reverse direction: columns only present in new_data_request_2.
list(set(new_data_request_2.columns.tolist()).difference(new_data_request.columns.tolist()))

In [45]:
new_data_request_2