In [1]:
import pandas as pd

import matplotlib.pyplot as plt
import numpy as np
from typing import Dict, List, Union, Optional

import os
from typing import List, Tuple, Union, Any
from collections import Counter
import random
import string
from rapidfuzz import process

In [10]:
# NOTE(review): the recorded run of this cell ended in "ValueError: Expected object or value".
# The cause is not visible from the notebook -- TODO confirm that the file exists and
# really is gzip-compressed, line-delimited JSON.
pd.read_json(
    "../../data/merged/chrome/03_29_2023/merged_data copy.json.gzip",
    compression="gzip",
    lines=True,
    encoding="utf-8-sig",
)

ValueError: Expected object or value

In [51]:
def test_new_categories_update(
    element: str, dataset: pd.DataFrame
) -> Optional[Dict[str, str]]:
    """
    Test if the given column of the dataset can be converted to the Int64 data type. If so, return a dictionary
    with the column name as the key and 'Int64' as the value. Otherwise, return None.

    Parameters
    ----------
    element : str
        The column name to be tested.
    dataset : pd.DataFrame
        The DataFrame containing the column.

    Returns
    -------
    Optional[Dict[str, str]]
        A dictionary with the column name and 'Int64' if the column can be converted to the Int64 data type,
        None otherwise.
    """
    categories = dataset[element].astype("category").cat.categories.values.tolist()
    try:
        np.array(categories, dtype="int64")
        return {element: "Int64"}
    except (ValueError, OverflowError):
        return None


def create_categories_list(dataset: pd.DataFrame) -> Dict[str, Union[str, None]]:
    """
    Build a ``column name -> dtype name`` mapping for every column except the last one.

    Each column starts out as 'category'. Columns for which
    ``test_new_categories_update`` returns a dictionary are switched to 'Int64'.

    Parameters
    ----------
    dataset : pd.DataFrame
        The input DataFrame. Its last column is left out of the mapping.

    Returns
    -------
    Dict[str, Union[str, None]]
        A dictionary of column names and their corresponding data types.
    """
    dtype_by_column = {name: "category" for name in dataset.columns.values[:-1]}

    for column in dataset.columns.values[:-1].tolist():
        verdict = test_new_categories_update(column, dataset)
        # Only dictionary results carry an override; None means "keep 'category'".
        if type(verdict) is dict:
            dtype_by_column.update(verdict)

    return dtype_by_column

In [12]:
def generate_large_dataframe(
    num_rows: int,
    num_columns: int,
    num_int_columns: int,
    max_cardinality: int,
    max_length: int,
) -> pd.DataFrame:
    """
    Build a random test DataFrame made of text columns, integer columns and two fixed
    columns named "query" and "protocol".

    Parameters
    ----------
    num_rows : int
        The number of rows in the DataFrame.
    num_columns : int
        Number of generated columns (text plus integer). The "query" and "protocol"
        columns are added on top of this.
    num_int_columns : int
        How many of the generated columns hold random integers in [0, 100).
    max_cardinality : int
        Upper bound for the number of labels drawn for each text column.
    max_length : int
        Upper bound for the length of each generated label.

    Returns
    -------
    pd.DataFrame
        Columns in order: "query", "protocol", "col_1".."col_k", "int_col_1".."int_col_m".
    """

    def random_label_pool(pool_size: int, longest: int) -> List[str]:
        # One random ASCII-letter label per slot, each 1..longest characters long.
        pool = []
        for _ in range(pool_size):
            label_length = random.randint(1, longest)
            pool.append("".join(random.choices(string.ascii_letters, k=label_length)))
        return pool

    # Draw order matters for seeded runs: text columns, then integer columns,
    # then "query" and "protocol".
    text_columns = {}
    for position in range(1, num_columns - num_int_columns + 1):
        label_pool = random_label_pool(random.randint(1, max_cardinality), max_length)
        text_columns[f"col_{position}"] = np.random.choice(label_pool, size=num_rows)

    integer_columns = {}
    for position in range(1, num_int_columns + 1):
        integer_columns[f"int_col_{position}"] = np.random.randint(
            0, 100, size=num_rows
        )

    frame_data = {
        "query": np.random.choice(list("1234"), size=num_rows),
        "protocol": np.random.choice(list("5678"), size=num_rows),
        **text_columns,
        **integer_columns,
    }
    return pd.DataFrame(frame_data)

In [19]:
# Load the training split (file named *.parquet.gzip) with the pyarrow engine;
# dtype_backend="pyarrow" requests Arrow-backed column dtypes.
data = pd.read_parquet(
    "../../data/processed/chrome/08_12_2022/train_set_01.parquet.gzip",
    engine="pyarrow",
    dtype_backend="pyarrow",
)

In [11]:
data.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Index: 272004 entries, 256683 to 157680
Columns: 5316 entries, hostname to tracker
dtypes: int32[pyarrow](1), list<item: list<item: string>>[pyarrow](1), null[pyarrow](200), string[pyarrow](5114)
memory usage: 5.5 GB


In [25]:
# Names of the columns that hold nothing but missing values.
empty_columns = [name for name in data.columns if data[name].isna().all()]

In [26]:
data.drop(empty_columns, axis=1, inplace=True)

In [27]:
# Column names used for fuzzy matching: everything except the first six columns and the last one.
# NOTE(review): presumably the first six are the URL-derived fields (hostname ... query) and
# the last one is the `tracker` label, as in the frame displayed further down -- confirm.
data_column_values = data.columns.values[6:-1].tolist()

In [28]:
def new_fuzzy_string_matching_for_column(
    col_name: str,
    col_values: List[str],
    score_cutoff: float = 80,
    limit: Optional[int] = 100,
) -> pd.DataFrame:
    """
    Find fuzzy matches for a given column name within a list of candidate names.

    The comparison is delegated to ``rapidfuzz.process.extract`` with ``processor=None``,
    i.e. the strings are compared as they are, without preprocessing.

    Parameters
    ----------
    col_name : str
        The column name for which fuzzy matches should be found.
    col_values : List[str]
        The candidate names to compare with `col_name`.
    score_cutoff : float, optional
        Minimum score a candidate needs to be reported. Defaults to 80.
    limit : Optional[int], optional
        Maximum number of matches to report. Defaults to 100.

    Returns
    -------
    pd.DataFrame
        One row per match with the columns "fuzzy_match" (the candidate), "w_ratio"
        (its score), "index" (its position in `col_values`) and "col_name" (the name
        that was searched for). Empty if no candidate reaches `score_cutoff`.
    """
    matches = process.extract(
        col_name,
        col_values,
        processor=None,
        score_cutoff=score_cutoff,
        limit=limit,
    )
    fuzzy_result = pd.DataFrame(matches, columns=["fuzzy_match", "w_ratio", "index"])
    fuzzy_result["col_name"] = col_name
    return fuzzy_result


def find_cols_with_similar_values(
    fuzzy_match: str, column: str, dataset: pd.DataFrame, threshold: float = 0.5
) -> Optional[Tuple[str, str]]:
    """
    Return both column names if most distinct values of `fuzzy_match` also occur in `column`.

    The share is measured against the distinct non-null values of `fuzzy_match` only,
    so the check is not symmetric in its two column arguments.

    Parameters
    ----------
    fuzzy_match : str
        The name of the first column; its distinct values form the denominator.
    column : str
        The name of the second column.
    dataset : pd.DataFrame
        The DataFrame containing both columns.
    threshold : float, optional
        The share of shared distinct values that must be exceeded. Defaults to 0.5.

    Returns
    -------
    Optional[Tuple[str, str]]
        ``(fuzzy_match, column)`` if the share of shared values is strictly greater than
        `threshold`, otherwise None. None is also returned when `fuzzy_match` has no
        non-null values at all.
    """
    value_fuzzy = set(dataset[fuzzy_match].dropna().values)
    if not value_fuzzy:
        # Nothing to compare; also avoids dividing by zero below.
        return None
    value_column = set(dataset[column].dropna().values)

    shared_share = len(value_fuzzy & value_column) / len(value_fuzzy)
    if shared_share > threshold:
        return fuzzy_match, column
    return None


def select_similar_columns(
    fuzzy_match: str, column: str, match_df: pd.DataFrame
) -> pd.DataFrame:
    """
    Select the rows of `match_df` for a pair of column names and remove the first of
    them from `match_df`.

    Parameters
    ----------
    fuzzy_match : str
        Value to look for in the "fuzzy_match" column.
    column : str
        Value to look for in the "col_name" column.
    match_df : pd.DataFrame
        Table of fuzzy matches. It is modified in place: the first matching row is dropped.

    Returns
    -------
    pd.DataFrame
        All rows that matched before the drop. If nothing matches, an empty DataFrame
        is returned and `match_df` is left untouched.
    """
    selected = match_df.loc[
        (match_df["fuzzy_match"] == fuzzy_match) & (match_df["col_name"] == column)
    ]
    if selected.empty:
        # No such pair: there is no row to drop (indexing [0] would raise IndexError).
        return selected
    match_df.drop(selected.index[0], inplace=True)
    return selected


def merge_similar_columns(fuzzy_match: str, col_name: str, df: pd.DataFrame) -> None:
    """
    Copy every non-null value of `fuzzy_match` into the same row of `col_name`.

    Rows where `fuzzy_match` is null keep their current `col_name` value. Rows where
    `fuzzy_match` holds a value are overwritten, whether or not `col_name` was null
    there. Rows are addressed by position, so the result does not depend on the index
    of `df`.

    Parameters
    ----------
    fuzzy_match : str
        The name of the column whose values are copied.
    col_name : str
        The name of the column that receives the values.
    df : pd.DataFrame
        The DataFrame to process. Its `col_name` column is replaced in place.

    Returns
    -------
    None
    """
    source = df[fuzzy_match]
    has_value = source.notnull().to_numpy()

    # Work on a private copy: to_numpy() may otherwise hand back a view of the frame's
    # data, possibly a read-only one.
    merged = df[col_name].to_numpy(copy=True)
    merged[has_value] = source.to_numpy()[has_value]
    df[col_name] = merged

In [29]:
%%time
# Compare each name only with the names that come after it, so every pair is scored once;
# the last name has nothing after it and is left out.
match = [
    new_fuzzy_string_matching_for_column(
        column_name, data_column_values[position + 1 :]
    )
    for position, column_name in enumerate(data_column_values[:-1])
]

CPU times: user 19.9 s, sys: 164 ms, total: 20 s
Wall time: 20.1 s


In [30]:
match2 = pd.concat(match, ignore_index=True)

In [31]:
match2

Unnamed: 0,fuzzy_match,w_ratio,index,col_name
0,pragrma,92.307692,3532,pragma
1,x-akamai-pragma-client-ip,90.000000,390,pragma
2,pragma-directive,90.000000,2534,pragma
3,pramga,83.333333,1041,pragma
4,x-content-type,92.307692,722,content-type
...,...,...,...,...
25933,x-width,90.000000,8,wid
25934,nrk-application-version,90.000000,0,nrk-application
25935,xc-version,80.000000,12,nrk-application-version
25936,x-adstyle,82.352941,2,x-adtype


In [32]:
%%time
# One entry per row of match2: the (fuzzy_match, col_name) pair or None.
result = [
    find_cols_with_similar_values(candidate, target, data)
    for candidate, target in match2[["fuzzy_match", "col_name"]].itertuples(
        index=False, name=None
    )
]

CPU times: user 1min 50s, sys: 945 ms, total: 1min 51s
Wall time: 1min 51s


In [26]:
result

[('pragrma', 'pragma'),
 None,
 ('pragma-directive', 'pragma'),
 ('pramga', 'pragma'),
 None,
 ('content_type', 'content-type'),
 ('contetn-type', 'content-type'),
 None,
 None,
 None,
 ('x-tb-optimization-original-content-type', 'content-type'),
 None,
 ('x-hs-alternate-content-type', 'content-type'),
 ('x-upyun-content-type', 'content-type'),
 ('x-amz-meta-content-type', 'content-type'),
 ('x-tb-oa-originalcontenttype', 'content-type'),
 None,
 ('content-style-type', 'content-type'),
 None,
 ('x-nyt-data-last-modified', 'last-modified'),
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 ('x-accept-ranges', 'accept-ranges'),
 None,
 ('x-amz-meta-accept-ranges', 'accept-ranges'),
 None,
 None,
 None,
 None,
 None,
 None,
 ('x-xss-protections', 'x-xss-protection'),
 None,
 None,
 ('x-content-type-option', 'x-content-type-options'),
 None,
 None,
 None,
 ('x-content-options', 'x-content-type-options'),
 ('x-control-type-options', 'x-content-type-options'),
 None,
 No

In [33]:
# Replace the index with 0..n-1 (the old index is discarded) so that index labels
# coincide with row positions.
data.reset_index(drop=True, inplace=True)

In [34]:
# Fetch the match2 row(s) for every pair that passed the value-overlap check.
similar_values = [
    select_similar_columns(*pair, match2) for pair in result if pair is not None
]

In [35]:
similar_values

[  fuzzy_match    w_ratio index col_name
 0     pragrma  92.307692  3532   pragma,
         fuzzy_match  w_ratio index col_name
 2  pragma-directive     90.0  2534   pragma,
   fuzzy_match    w_ratio index col_name
 3      pramga  83.333333  1041   pragma,
     fuzzy_match    w_ratio index      col_name
 5  content_type  91.666667  1449  content-type,
     fuzzy_match    w_ratio index      col_name
 6  contetn-type  91.666667  4032  content-type,
                                 fuzzy_match  w_ratio index      col_name
 10  x-tb-optimization-original-content-type     90.0  1513  content-type,
                     fuzzy_match  w_ratio index      col_name
 12  x-hs-alternate-content-type     90.0  1753  content-type,
              fuzzy_match  w_ratio index      col_name
 13  x-upyun-content-type     90.0  2151  content-type,
                 fuzzy_match  w_ratio index      col_name
 14  x-amz-meta-content-type     90.0  4339  content-type,
                     fuzzy_match    w_ratio ind

In [36]:
similar_values = pd.concat(similar_values, ignore_index=True)

In [37]:
similar_values

Unnamed: 0,fuzzy_match,w_ratio,index,col_name
0,pragrma,92.307692,3532,pragma
1,pragma-directive,90.000000,2534,pragma
2,pramga,83.333333,1041,pragma
3,content_type,91.666667,1449,content-type
4,contetn-type,91.666667,4032,content-type
...,...,...,...,...
1634,x-cache-nx,80.000000,3,x-cache-vf
1635,x-cache-cms-status,80.000000,151,x-cache-vf
1636,x-cache-nx,80.000000,0,x-cache-ve
1637,x-cache-cms-status,80.000000,148,x-cache-ve


In [38]:
data[data["pragrma"].notna()][["pragma", "pragrma"]]

Unnamed: 0,pragma,pragrma
267313,,no-cache


In [39]:
similar_values.apply(
    lambda x: merge_similar_columns(x["fuzzy_match"], x["col_name"], data), axis=1
)

0       None
1       None
2       None
3       None
4       None
        ... 
1634    None
1635    None
1636    None
1637    None
1638    None
Length: 1639, dtype: object

In [40]:
data[data["pragrma"].notna()][["pragma", "pragrma"]]

Unnamed: 0,pragma,pragrma
267313,no-cache,no-cache


In [41]:
# Every column that was merged into another one is dropped (each name once).
columns_to_remove = list(set(similar_values["fuzzy_match"].tolist()))
data.drop(columns=columns_to_remove, inplace=True)

In [42]:
data[data["pragrma"].notna()][["pragma", "pragrma"]]

KeyError: 'pragrma'

In [43]:
data

Unnamed: 0,hostname,pathname,filetype,filename,protocol,query,pragma,content-type,last-modified,accept-ranges,...,x-publisherdesk-origin,xc-version,x-amzn-waf-action,generated,x-amz-req-time-micros,debug_plat_b,x-ta,x-pbs-appsvrname,x-pbs-appsvrip,tracker
0,tpc.googlesyndication.com,/pagead/js/r20220810/r20110914/client/one_clic...,js,one_click_handler_one_afma_fy2021.js,https:,[],,text/javascript; charset=utf-8,,,...,,,,,,,,,,1
1,res.cdn.office.net,/officehub/bundles/staying-aware.b8b088b355c55...,js,staying-aware.b8b088b355c55b4ea7d6.chunk.v6.js,https:,[],,application/javascript,"mon, 08 aug 2022 21:56:59 gmt",,...,,,,,,,,,,0
2,www.google.com,/ads/ga-audiences,/ads/ga-audiences,ga-audiences,https:,"[array(['t', 'sr'], dtype=object) array(['aip'...",no-cache,image/gif,,,...,,,,,,,,,,1
3,ipv4-c113-lhr004-ix.1.oca.nflxvideo.net,/speedtest/range/0-0,/speedtest/range/0-0,0-0,https:,"[array(['c', 'de'], dtype=object) array(['n', ...",no-cache,,,,...,,,,,,,,,,0
4,shimo.im,/,/,,https:,[],,text/html; charset=utf-8,,,...,,,,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
271999,www.criteo.com,/wp-content/uploads/2021/01/ss-logo-Lamps-Plus...,png,ss-logo-Lamps-Plus.png,https:,[],,image/png,"fri, 12 nov 2021 05:09:38 gmt",bytes,...,,,,,,,,,,0
272000,ads.adfox.ru,/260122/event,/260122/event,event,https:,"[array(['pm', 'cyz'], dtype=object)  array(['h...",no-cache,,"fri, 12 aug 2022 09:22:13 gmt",,...,,,,,,,,,,1
272001,synostatic.synology.com,/font/inter/inter-w400-6.woff2,woff2,inter-w400-6.woff2,https:,[],,binary/octet-stream,"mon, 20 dec 2021 02:24:19 gmt",bytes,...,,,,,,,,,,0
272002,tenor.com,/opensearch.xml,xml,opensearch.xml,https:,[],,application/xml,"thu, 26 apr 2018 18:25:27 gmt",bytes,...,,,,,,,,,,0


In [44]:
data.dtypes

hostname            string[pyarrow]
pathname            string[pyarrow]
filetype            string[pyarrow]
filename            string[pyarrow]
protocol            string[pyarrow]
                         ...       
debug_plat_b        string[pyarrow]
x-ta                string[pyarrow]
x-pbs-appsvrname    string[pyarrow]
x-pbs-appsvrip      string[pyarrow]
tracker              int32[pyarrow]
Length: 4454, dtype: object

In [46]:
# Keep only the columns from position 6 onward.
# NOTE: this cell is not idempotent -- because the result is bound to `data` again,
# every re-run removes another six leading columns.
data = data.iloc[:, 6:]

In [47]:
data

Unnamed: 0,pragma,content-type,last-modified,accept-ranges,etag,p3p,x-xss-protection,x-content-type-options,strict-transport-security,x-robots-tag,...,x-publisherdesk-origin,xc-version,x-amzn-waf-action,generated,x-amz-req-time-micros,debug_plat_b,x-ta,x-pbs-appsvrname,x-pbs-appsvrip,tracker
0,,text/javascript; charset=utf-8,,,15601544113783900868,"policyref=""https://www.googleadservices.com/pa...",0,nosniff,,,...,,,,,,,,,,1
1,,application/javascript,"mon, 08 aug 2022 21:56:59 gmt",,,,,nosniff,max-age=31536000; includesubdomains,,...,,,,,,,,,,0
2,no-cache,image/gif,,,,"policyref=""https://www.googleadservices.com/pa...",0,nosniff,,,...,,,,,,,,,,1
3,no-cache,,,,,,,,,,...,,,,,,,,,,0
4,,text/html; charset=utf-8,,,,,,,max-age=15724800; includesubdomains,,...,,,,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
271999,,image/png,"fri, 12 nov 2021 05:09:38 gmt",bytes,"""618df712-2509""",,,,max-age=300,,...,,,,,,,,,,0
272000,no-cache,,"fri, 12 aug 2022 09:22:13 gmt",,,,,nosniff,,,...,,,,,,,,,,1
272001,,binary/octet-stream,"mon, 20 dec 2021 02:24:19 gmt",bytes,"""4abfa4888190370d347034f9fa43c804""",,,,,,...,,,,,,,,,,0
272002,,application/xml,"thu, 26 apr 2018 18:25:27 gmt",bytes,"w/""29a-1630333f5d8""",,1; mode=block,nosniff,max-age=31536000; preload,,...,,,,,,,,,,0


In [54]:
%%time
list_of_dtypes = create_categories_list(data)

CPU times: user 17 s, sys: 79.2 ms, total: 17.1 s
Wall time: 17.1 s
CPU times: user 17.3 s, sys: 122 ms, total: 17.4 s
Wall time: 17.6 s


In [55]:
list_of_dtypes

{'pragma': 'category',
 'content-type': 'category',
 'last-modified': 'category',
 'accept-ranges': 'category',
 'etag': 'category',
 'p3p': 'category',
 'x-xss-protection': 'category',
 'x-content-type-options': 'category',
 'strict-transport-security': 'category',
 'x-robots-tag': 'category',
 'vary': 'category',
 'content-encoding': 'category',
 'cache-control': 'category',
 'expires': 'category',
 'date': 'category',
 'content-length': 'Int64',
 'connection': 'category',
 'x-amz-server-side-encryption': 'category',
 'x-amz-version-id': 'category',
 'server': 'category',
 'x-cache': 'category',
 'via': 'category',
 'x-amz-cf-pop': 'category',
 'x-amz-cf-id': 'category',
 'age': 'category',
 'x-guploader-uploadid': 'category',
 'x-goog-generation': 'Int64',
 'x-goog-hash': 'category',
 'x-goog-storage-class': 'category',
 'cf-cache-status': 'category',
 'expect-ct': 'category',
 'report-to': 'category',
 'nel': 'category',
 'cf-ray': 'category',
 'alt-svc': 'category',
 'access-contr

In [56]:
data = data.astype(list_of_dtypes)

In [57]:
data.dtypes

pragma                    category
content-type              category
last-modified             category
accept-ranges             category
etag                      category
                         ...      
debug_plat_b              category
x-ta                      category
x-pbs-appsvrname          category
x-pbs-appsvrip            category
tracker             int32[pyarrow]
Length: 4448, dtype: object

In [58]:
def reduced_variance_per_column(column: str, dataset: pd.DataFrame) -> List[Union[str, int, float]]:
    """
    Summarise one column by its number of distinct values and its share of missing values.

    Parameters
    ----------
    column : str
        The name of the column to summarise.
    dataset : pd.DataFrame
        The DataFrame containing the specified column.

    Returns
    -------
    List[Union[str, int, float]]
        ``[column, number of distinct non-missing values, missing share rounded to 3 decimals]``.
    """
    series = dataset[column]
    distinct_count = series.nunique(dropna=True)
    missing_share = series.isna().mean()
    return [column, distinct_count, round(missing_share, 3)]

In [77]:
def create_summary_table(dataset: pd.DataFrame) -> pd.DataFrame:
    """
    Summarise every column of a DataFrame by distinct-value count and missing-value share.

    Parameters
    ----------
    dataset : pd.DataFrame
        The DataFrame for which to compute the summary table.

    Returns
    -------
    pd.DataFrame
        One row per input column with "header_name", "unique_values" (Int32, missing
        values not counted) and "na_ratio" (float32, rounded to 3 decimals), indexed 0..n-1.
    """

    def describe_column(values: pd.Series) -> pd.Series:
        # The three statistics for a single column, keyed by output column name.
        stats = {
            "header_name": values.name,
            "unique_values": values.nunique(dropna=True),
            "na_ratio": round(values.isna().mean(), 3),
        }
        return pd.Series(stats)

    # apply() yields one column per input column; transpose to get one row per column.
    per_column = dataset.apply(describe_column).T
    typed = per_column.astype({"unique_values": "Int32", "na_ratio": "float32"})
    return typed.reset_index(drop=True)

In [78]:
summary_table = create_summary_table(data.iloc[:, :-1])

In [79]:
summary_table

Unnamed: 0,header_name,unique_values,na_ratio
0,pragma,13,0.873
1,content-type,324,0.037
2,last-modified,88351,0.330
3,accept-ranges,2,0.517
4,etag,113969,0.456
...,...,...,...
4442,x-amz-req-time-micros,3,1.000
4443,debug_plat_b,1,1.000
4444,x-ta,1,1.000
4445,x-pbs-appsvrname,1,1.000


In [80]:
# Headers whose na_ratio equals 1. create_summary_table rounds na_ratio to three
# decimals, so this also selects headers that still hold a few values (NA share >= 0.9995).
remove_headers_with_one_na_ratio = summary_table[
    summary_table["na_ratio"] == 1
    ].header_name.values.tolist()
# Headers with at most one distinct value that were not already selected above.
remove_headers_with_one_value = summary_table[
    (summary_table["unique_values"] <= 1) & (summary_table["na_ratio"] != 1)
    ].header_name.values.tolist()

In [83]:
data.drop(remove_headers_with_one_na_ratio, axis=1, inplace=True)
data.drop(remove_headers_with_one_value, axis=1, inplace=True)

In [84]:
data

Unnamed: 0,pragma,content-type,last-modified,accept-ranges,etag,p3p,x-xss-protection,x-content-type-options,strict-transport-security,x-robots-tag,...,z-stale-enabled,z-fastly-info-state,z-backend-name,z-react-reason,z-polopoly-beckend,z-url-original,z-url-modified,z-surrogate-keys,z-actions-log,tracker
0,,text/javascript; charset=utf-8,,,15601544113783900868,"policyref=""https://www.googleadservices.com/pa...",0,nosniff,,,...,,,,,,,,,,1
1,,application/javascript,"mon, 08 aug 2022 21:56:59 gmt",,,,,nosniff,max-age=31536000; includesubdomains,,...,,,,,,,,,,0
2,no-cache,image/gif,,,,"policyref=""https://www.googleadservices.com/pa...",0,nosniff,,,...,,,,,,,,,,1
3,no-cache,,,,,,,,,,...,,,,,,,,,,0
4,,text/html; charset=utf-8,,,,,,,max-age=15724800; includesubdomains,,...,,,,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
271999,,image/png,"fri, 12 nov 2021 05:09:38 gmt",bytes,"""618df712-2509""",,,,max-age=300,,...,,,,,,,,,,0
272000,no-cache,,"fri, 12 aug 2022 09:22:13 gmt",,,,,nosniff,,,...,,,,,,,,,,1
272001,,binary/octet-stream,"mon, 20 dec 2021 02:24:19 gmt",bytes,"""4abfa4888190370d347034f9fa43c804""",,,,,,...,,,,,,,,,,0
272002,,application/xml,"thu, 26 apr 2018 18:25:27 gmt",bytes,"w/""29a-1630333f5d8""",,1; mode=block,nosniff,max-age=31536000; preload,,...,,,,,,,,,,0


In [85]:
del remove_headers_with_one_na_ratio
del summary_table

In [94]:
def count_trackers_and_non_trackers(column: pd.Series, tracker: pd.Series) -> List[Union[str, int]]:
    """
    Count, among the rows where `column` is not null, how many are trackers (1) and
    how many are non-trackers (0).

    Parameters
    ----------
    column : pd.Series
        The column whose non-null rows are considered.
    tracker : pd.Series
        The 'tracker' column from the DataFrame.

    Returns
    -------
    List[Union[str, int]]
        ``[column name, number of trackers, number of non-trackers]``; a class that
        does not occur is reported as 0.
    """
    class_counts = tracker[column.notnull()].value_counts()

    def occurrences(label: int) -> int:
        # value_counts() has no entry for a class that never occurs.
        try:
            return class_counts[label]
        except KeyError:
            return 0

    tracker_total = occurrences(1)
    non_tracker_total = occurrences(0)
    return [column.name, tracker_total, non_tracker_total]


def create_summary_table_2(dataset: pd.DataFrame) -> pd.DataFrame:
    """
    Summarise, per header column, how many tracker and non-tracker rows carry a value.

    Parameters
    ----------
    dataset : pd.DataFrame
        DataFrame with a "tracker" column. The columns at positions 4 up to (but not
        including) the last one are summarised.
        NOTE(review): both offsets are hard-coded -- confirm they still match the
        column layout of the frame that is passed in.

    Returns
    -------
    pd.DataFrame
        One row per summarised column with "header_name", "trackers" (Int32),
        "non_trackers" (float32), "ratio" (trackers / non_trackers * 100) and
        "ratio2" (non_trackers / trackers * 100).
    """
    tracker_flags = dataset["tracker"]
    # Build the rows as plain lists: routing them through np.array would turn the counts
    # into strings and fail when no column is selected.
    rows = [
        count_trackers_and_non_trackers(dataset[column], tracker_flags)
        for column in dataset.columns[4:-1]
    ]
    summary_table_2 = pd.DataFrame(
        rows, columns=["header_name", "trackers", "non_trackers"]
    )
    summary_table_2["trackers"] = summary_table_2["trackers"].astype("Int32")
    summary_table_2["non_trackers"] = summary_table_2["non_trackers"].astype("float32")
    summary_table_2["ratio"] = (
        summary_table_2["trackers"] / summary_table_2["non_trackers"]
    ) * 100
    summary_table_2["ratio2"] = (
        summary_table_2["non_trackers"] / summary_table_2["trackers"]
    ) * 100
    return summary_table_2


In [97]:
%%time
summary_table_2 = create_summary_table_2(data)

CPU times: user 497 ms, sys: 203 ms, total: 700 ms
Wall time: 815 ms


In [111]:
def update_combined_columns(dataset: pd.DataFrame, col_list: List[str], classification: int, column_name: str) -> None:
    """
    Store, for every row of the given class, how many columns of `col_list` are non-null.

    Only rows whose "tracker" value equals `classification` and that have at least one
    non-null value among `col_list` are written; all other rows keep their current
    `column_name` value. An empty `col_list` leaves the dataset unchanged.

    Parameters
    ----------
    dataset : pd.DataFrame
        The dataset to update (modified in place). Must contain a "tracker" column.
    col_list : List[str]
        The list of columns to process.
    classification : int
        The classification value (0 or 1) to filter rows in the dataset.
    column_name : str
        The name of the column to update in the dataset.
    """
    if len(col_list) == 0:
        return

    # Per-row number of non-null entries across col_list, accumulated column by column
    # so that no wide intermediate frame is built.
    hits_per_row = np.zeros(len(dataset), dtype="int64")
    for col in col_list:
        hits_per_row += dataset[col].notnull().to_numpy(dtype=bool)

    # A missing tracker value counts as "not this class".
    in_class = (dataset["tracker"] == classification).to_numpy(
        dtype=bool, na_value=False
    )
    selected = in_class & (hits_per_row > 0)
    if selected.any():
        dataset.loc[selected, column_name] = hits_per_row[selected]


def find_cols_to_combine(information_table: pd.DataFrame) -> Tuple[List[str], List[str]]:
    """
    Pick the header names whose "ratio" or "ratio2" value is at most 10.

    Parameters
    ----------
    information_table : pd.DataFrame
        A summary table with the columns "header_name", "ratio" and "ratio2".

    Returns
    -------
    Tuple[List[str], List[str]]
        First the names with ``ratio <= 10`` (non-trackers), then the names with
        ``ratio2 <= 10`` (trackers).
    """

    def names_at_most_ten(ratio_column: str) -> List[str]:
        low_enough = information_table[ratio_column] <= 10
        return information_table.loc[low_enough, "header_name"].values.tolist()

    return names_at_most_ten("ratio"), names_at_most_ten("ratio2")


def concise_information_wrapper(dataset: pd.DataFrame, table: pd.DataFrame) -> None:
    """
    Add the two combined count columns to the dataset and fill them.

    The columns "comb_col_non_tracker" and "comb_col_tracker" are created with 0 in
    every row and then updated through ``update_combined_columns``, using the column
    lists that ``find_cols_to_combine`` derives from `table`.

    Parameters
    ----------
    dataset : pd.DataFrame
        The dataset to process and update (modified in place).
    table : pd.DataFrame
        A summary table with column information.
    """
    non_tracker_only, tracker_only = find_cols_to_combine(table)

    # Both counters start at zero for every row.
    for counter_column in ("comb_col_non_tracker", "comb_col_tracker"):
        dataset[counter_column] = 0

    update_combined_columns(dataset, tracker_only, 1, "comb_col_tracker")
    update_combined_columns(dataset, non_tracker_only, 0, "comb_col_non_tracker")

In [112]:
concise_information_wrapper(data, summary_table_2)

In [113]:
data

Unnamed: 0,pragma,content-type,last-modified,accept-ranges,etag,p3p,x-xss-protection,x-content-type-options,strict-transport-security,x-robots-tag,...,z-backend-name,z-react-reason,z-polopoly-beckend,z-url-original,z-url-modified,z-surrogate-keys,z-actions-log,tracker,comb_col_non_tracker,comb_col_tracker
0,,text/javascript; charset=utf-8,,,15601544113783900868,"policyref=""https://www.googleadservices.com/pa...",0,nosniff,,,...,,,,,,,,1,0,0
1,,application/javascript,"mon, 08 aug 2022 21:56:59 gmt",,,,,nosniff,max-age=31536000; includesubdomains,,...,,,,,,,,0,2,0
2,no-cache,image/gif,,,,"policyref=""https://www.googleadservices.com/pa...",0,nosniff,,,...,,,,,,,,1,0,0
3,no-cache,,,,,,,,,,...,,,,,,,,0,1,0
4,,text/html; charset=utf-8,,,,,,,max-age=15724800; includesubdomains,,...,,,,,,,,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
271999,,image/png,"fri, 12 nov 2021 05:09:38 gmt",bytes,"""618df712-2509""",,,,max-age=300,,...,,,,,,,,0,5,0
272000,no-cache,,"fri, 12 aug 2022 09:22:13 gmt",,,,,nosniff,,,...,,,,,,,,1,0,0
272001,,binary/octet-stream,"mon, 20 dec 2021 02:24:19 gmt",bytes,"""4abfa4888190370d347034f9fa43c804""",,,,,,...,,,,,,,,0,0,0
272002,,application/xml,"thu, 26 apr 2018 18:25:27 gmt",bytes,"w/""29a-1630333f5d8""",,1; mode=block,nosniff,max-age=31536000; preload,,...,,,,,,,,0,1,0


In [115]:
only_non_tracker_col, only_tracker_col = find_cols_to_combine(summary_table_2)

In [116]:
only_non_tracker_col

['akamai-true-ttl',
 'server-timing',
 'x-serial',
 'x-check-cacheable',
 'content-md5',
 'x-ms-request-id',
 'x-ms-version',
 'x-ms-blob-type',
 'cf-bgj',
 'cf-polished',
 'x-edgeconnect-cache-status',
 'x-cache-hits',
 'x-cache-status',
 'x-client-ip',
 'x-akamai-transformed',
 'link',
 'actual-object-ttl',
 'x-oracle-dms-rid',
 'x-oracle-dms-ecid',
 'x-amz-ir-id',
 'surrogate-key',
 'edge-cache-tag',
 'content-language',
 'x-ua-compatible',
 'x-upstream',
 'x-accel-expires',
 'x-77-nzt',
 'x-77-cache',
 'x-77-nzt-ray',
 'x-77-pop',
 'x_req_id',
 'service-worker-allowed',
 'traceparent',
 'x-cloud-trace-context',
 'x-pantheon-styx-hostname',
 'x-styx-req-id',
 'fastly-io-info',
 'fastly-stats',
 'x-fb-trip-id',
 'x-seen-by',
 'x-wix-request-id',
 'x-varnish',
 'x-jsd-version',
 'x-jsd-version-type',
 'x-proxy-cache',
 'host-header',
 'x-ac',
 'x-nc',
 'wn',
 'x-rq',
 'server-time',
 'x-dispatcher',
 'x-vhost',
 'x-goog-meta-goog-reserved-file-mtime',
 'cf-ipcountry',
 'x-amz-meta-sur

In [119]:
data.drop(only_non_tracker_col, axis=1, inplace=True)
data.drop(only_tracker_col, axis=1, inplace=True)

In [120]:
data

Unnamed: 0,pragma,content-type,last-modified,accept-ranges,etag,p3p,x-xss-protection,x-content-type-options,strict-transport-security,x-robots-tag,...,x-b3-sampled,x-b3-spanid,x-hubspot-correlation-id,x-hs-target-asset,x-hostname,x-cdn-pop,x-cdn-pop-ip,tracker,comb_col_non_tracker,comb_col_tracker
0,,text/javascript; charset=utf-8,,,15601544113783900868,"policyref=""https://www.googleadservices.com/pa...",0,nosniff,,,...,,,,,,,,1,0,0
1,,application/javascript,"mon, 08 aug 2022 21:56:59 gmt",,,,,nosniff,max-age=31536000; includesubdomains,,...,,,,,,,,0,2,0
2,no-cache,image/gif,,,,"policyref=""https://www.googleadservices.com/pa...",0,nosniff,,,...,,,,,,,,1,0,0
3,no-cache,,,,,,,,,,...,,,,,,,,0,1,0
4,,text/html; charset=utf-8,,,,,,,max-age=15724800; includesubdomains,,...,,,,,,,,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
271999,,image/png,"fri, 12 nov 2021 05:09:38 gmt",bytes,"""618df712-2509""",,,,max-age=300,,...,,,,,,,,0,5,0
272000,no-cache,,"fri, 12 aug 2022 09:22:13 gmt",,,,,nosniff,,,...,,,,,,,,1,0,0
272001,,binary/octet-stream,"mon, 20 dec 2021 02:24:19 gmt",bytes,"""4abfa4888190370d347034f9fa43c804""",,,,,,...,,,,,,,,0,0,0
272002,,application/xml,"thu, 26 apr 2018 18:25:27 gmt",bytes,"w/""29a-1630333f5d8""",,1; mode=block,nosniff,max-age=31536000; preload,,...,,,,,,,,0,1,0
