In [1]:
import pandas as pd
import janitor
from IPython.display import display

import warnings

# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this also hides library deprecation/FutureWarning messages;
# consider narrowing the filter to the specific warning being silenced.
warnings.filterwarnings("ignore")

import numpy as np

In [2]:
# Build the individual-level table `df`:
#   1. load the individual x domain browsing rows and derive minute/hour durations,
#   2. left-join per-domain VirusTotal verdict counts,
#   3. flag malicious / suspicious domains and weight visits and time by the
#      malicious flag,
#   4. sum all numeric columns per respondent (`caseid`),
#   5. left-join respondent demographics with human-readable labels.
# The result is written to ../data/ind_data.csv and previewed.
df = (
    # ==================================================
    # Ind-browsing level (n = 186,400)
    pd.read_csv("../data/yg_ind_domain.csv.gz")
    .assign(
        # `duration` is presumably recorded in seconds -- TODO confirm.
        duration_min=lambda df_: df_["duration"] / 60,
        duration_hr=lambda df_: df_["duration"] / 3600,
    )
    # ==================================================
    # Merge to VT
    .merge(
        pd.read_csv("../data/yg_virustotal_dat.csv"),
        how="left",
        left_on="private_domain",
        right_on="filename",
        validate="m:1",
    )
    .assign(
        # Domains without a VirusTotal match carry NaN after the left merge;
        # NaN >= 2 evaluates to False, so they are never flagged as malicious.
        malicious_bool=lambda df_: df_["malicious"].ge(2),
        malicious_visits=lambda df_: df_["malicious_bool"] * df_["visits"],
        malicious_min=lambda df_: df_["malicious_bool"] * df_["duration_min"],
        malicious_hr=lambda df_: df_["malicious_bool"] * df_["duration_hr"],
        # NaN cast straight to bool becomes True, which would count every
        # unmatched domain as suspicious; treat a missing verdict as 0 first.
        suspicious_bool=lambda df_: df_["suspicious"].fillna(0).astype("bool"),
    )
    # ==================================================
    # Aggregate to ind.
    .groupby("caseid")
    # Restrict the sum to numeric/boolean columns explicitly: string columns
    # (e.g. the domain names) must not be aggregated, and on pandas >= 2.0
    # they would otherwise be concatenated and break the integer cast below.
    # After this step the *_bool columns hold per-respondent domain counts.
    .sum(numeric_only=True)
    # NOTE(review): this truncates the fractional part of the summed
    # minute/hour columns (e.g. 4385.25 min -> 4385); kept as-is so the
    # exported file does not change.
    .astype(int)
    .reset_index()
    # ==================================================
    # Merge to ind demo (n = 1200)
    .merge(
        pd.read_csv("../data/profile.csv").assign(
            # https://github.com/themains/bad_domains/blob/main/data/codebook.pdf
            gender_lab=lambda df_: df_["gender"].replace({1: "Male", 2: "Female"}),
            race_lab=lambda df_: df_["race"].replace(
                {
                    1: "White",
                    2: "Black",
                    3: "Hispanic",
                    4: "Asian",
                    5: "Other",
                    6: "Other",
                    7: "Other",
                    8: "Other",
                }
            ),
            educ_lab=lambda df_: df_["educ"].replace(
                {
                    1: "HS or Below",
                    2: "HS or Below",
                    3: "Some college",
                    4: "Some college",
                    5: "College",
                    6: "Postgrad",
                }
            ),
            # pd.cut uses right-closed intervals by default, so each bin is
            # (lower, upper]; birth years outside (1929, 2003] become NaN.
            agegroup_lab=lambda df_: pd.cut(
                df_["birthyr"],
                # (1929, 1958] -> "65+"   : early baby boomers
                # (1958, 1973] -> "50-64" : late baby boomers / early genX
                # (1973, 1988] -> "35-49" : genX, early millennials
                # (1988, 1998] -> "25-34" : millennials
                # (1998, 2003] -> "<25"   : genZ
                bins=[1929, 1958, 1973, 1988, 1998, 2003],
                labels=["65+", "50-64", "35-49", "25-34", "<25"],
            ),
        ),
        how="left",
        on="caseid",
        validate="m:1",
    )
)
df.to_csv("../data/ind_data.csv", index=False)
display(df.head())
df.info()

Unnamed: 0,caseid,duration,visits,duration_min,duration_hr,harmless,malicious,suspicious,undetected,timeout,...,educ,pid3,pid7,presvote20post,inputstate,region,gender_lab,race_lab,educ_lab,agegroup_lab
0,47541,263115,17194,4385,73,37198,19,5,11033,0,...,2,2,6,2,12,3,Female,White,HS or Below,65+
1,56565,187793,11479,3129,52,22541,10,3,6576,0,...,3,3,5,2,17,2,Female,White,Some college,65+
2,203271,94510,6540,1575,26,10298,8,2,2943,0,...,6,1,2,-1,54,3,Female,White,Postgrad,35-49
3,216457,52109,1770,868,14,3295,1,0,971,0,...,2,2,7,2,27,2,Female,White,HS or Below,35-49
4,257495,188945,10012,3149,52,19151,33,6,5589,0,...,1,1,1,1,15,4,Female,Other,HS or Below,65+


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1134 entries, 0 to 1133
Data columns (total 28 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   caseid            1134 non-null   int64   
 1   duration          1134 non-null   int64   
 2   visits            1134 non-null   int64   
 3   duration_min      1134 non-null   int64   
 4   duration_hr       1134 non-null   int64   
 5   harmless          1134 non-null   int64   
 6   malicious         1134 non-null   int64   
 7   suspicious        1134 non-null   int64   
 8   undetected        1134 non-null   int64   
 9   timeout           1134 non-null   int64   
 10  malicious_bool    1134 non-null   int64   
 11  malicious_visits  1134 non-null   int64   
 12  malicious_min     1134 non-null   int64   
 13  malicious_hr      1134 non-null   int64   
 14  suspicious_bool   1134 non-null   int64   
 15  birthyr           1134 non-null   float64 
 16  gender            1134 n

In [3]:
# Sanity check: number of distinct respondents. The per-caseid aggregation
# yields one row per respondent, so this should equal the row count of `df`.
df["caseid"].nunique(dropna=True)

1134