In [1]:
import pandas as pd
import janitor
from utilities import calculate_summary_statistics, pandas_to_tex

import warnings

# Install a global "ignore" filter that applies to every warning category.
# NOTE(review): this also hides deprecation notices from pandas and the local
# helpers; consider narrowing it to the specific category that is noisy.
warnings.filterwarnings(action="ignore", category=Warning)

# Read the input CSV and preview its first three rows.
ind_data_path = "../data/ind_data.csv"
df = pd.read_csv(ind_data_path)
df.iloc[:3]

Unnamed: 0,caseid,duration,visits,duration_min,duration_hr,harmless,malicious,suspicious,undetected,timeout,...,educ,pid3,pid7,presvote20post,inputstate,region,gender_lab,race_lab,educ_lab,agegroup_lab
0,47541,263115,17194,4385,73,37198,19,5,11033,0,...,2,2,6,2,12,3,Female,White,HS or Below,65+
1,56565,187793,11479,3129,52,22541,10,3,6576,0,...,3,3,5,2,17,2,Female,White,Some college,65+
2,203271,94510,6540,1575,26,10298,8,2,2943,0,...,6,1,2,-1,54,3,Female,White,Postgrad,35-49


## Exposure by demographic group

In [2]:
# Percentile levels passed as `percentiles=` to every
# calculate_summary_statistics call in this notebook; each level shows up as
# its own column in the resulting summary tables.
percentiles = [
    5, 10, 25,   # lower tail
    50,          # median
    75, 90, 95,  # upper tail
]

In [3]:
# Summarise `malicious_min` within each `gender_lab` group, hand the table to
# pandas_to_tex for export, and display it as the cell output.
# NOTE(review): the variable name says "bool", but the summarised column is
# `malicious_min` and the output file is "gender_mal_minutes_summ".
gender_tex_path = "../tabs/gender_mal_minutes_summ"
gender_mal_bool_summ = calculate_summary_statistics(
    df, groupby_column="gender_lab", value_column="malicious_min", percentiles=percentiles
)
pandas_to_tex(gender_mal_bool_summ, gender_tex_path)
gender_mal_bool_summ

Unnamed: 0,gender_lab,count,mean,std,min,5,10,25,50,75,90,95,max
0,Female,595 (52.5\%),15.8,150.8,0,0.0,0.0,0.0,0.0,1.0,8.6,31.3,2879
1,Male,539 (47.5\%),10.8,50.5,0,0.0,0.0,0.0,0.0,4.0,18.0,50.0,890


In [4]:
# Summarise `malicious_min` within each `race_lab` group, hand the table to
# pandas_to_tex for export, and display it as the cell output.
# NOTE(review): the variable name says "bool", but the summarised column is
# `malicious_min` and the output file is "race_mal_minutes_summ".
race_tex_path = "../tabs/race_mal_minutes_summ"
race_mal_bool_summ = calculate_summary_statistics(
    df, groupby_column="race_lab", value_column="malicious_min", percentiles=percentiles
)
pandas_to_tex(race_mal_bool_summ, race_tex_path)
race_mal_bool_summ

Unnamed: 0,race_lab,count,mean,std,min,5,10,25,50,75,90,95,max
0,White,720 (63.5\%),14.5,140.1,0,0.0,0.0,0.0,0.0,2.0,9.0,30.1,2879
1,Hispanic,168 (14.8\%),6.5,28.0,0,0.0,0.0,0.0,0.0,1.0,9.0,21.3,250
2,Black,144 (12.7\%),19.8,62.3,0,0.0,0.0,0.0,0.0,5.2,43.8,128.5,407
3,Other,56 (4.9\%),10.0,33.3,0,0.0,0.0,0.0,0.0,6.0,21.0,39.8,229
4,Asian,46 (4.1\%),5.8,15.5,0,0.0,0.0,0.0,0.0,1.0,17.0,49.8,64


In [5]:
# Summarise `malicious_min` within each `educ_lab` group, hand the table to
# pandas_to_tex for export, and display it as the cell output.
# NOTE(review): the variable name says "bool", but the summarised column is
# `malicious_min` and the output file is "educ_mal_minutes_summ".
educ_tex_path = "../tabs/educ_mal_minutes_summ"
educ_mal_bool_summ = calculate_summary_statistics(
    df, groupby_column="educ_lab", value_column="malicious_min", percentiles=percentiles
)
pandas_to_tex(educ_mal_bool_summ, educ_tex_path)
educ_mal_bool_summ

Unnamed: 0,educ_lab,count,mean,std,min,5,10,25,50,75,90,95,max
0,HS or Below,411 (36.2\%),20.9,178.3,0,0.0,0.0,0.0,0.0,3.0,10.0,41.0,2879
1,Some college,326 (28.7\%),11.7,67.4,0,0.0,0.0,0.0,0.0,2.0,14.5,40.8,890
2,College,255 (22.5\%),9.2,35.3,0,0.0,0.0,0.0,0.0,2.0,11.0,51.8,321
3,Postgrad,142 (12.5\%),3.0,8.2,0,0.0,0.0,0.0,0.0,1.0,9.0,18.0,49


In [6]:
# Maps the raw `agegroup_lab` values to LaTeX-ready display labels.
# The dict's insertion order doubles as the row order of the table
# (it is passed as `custom_order` below), so keep the entries sorted by age.
category_names = {
    "<25": "$<$ 25",
    "25-34": "25--34",
    "35-49": "35--49",
    "50-64": "50--64",
    # "65+" includes age 65 (the previous bin ends at 64), so the label must
    # be "greater than or equal", not a strict ">".
    "65+": r"$\geq$ 65",
}

# Summarise `malicious_min` within each `agegroup_lab` group, with rows in the
# explicit age order above and relabelled via `category_names`.
# NOTE(review): the variable name says "bool", but the summarised column is
# `malicious_min` and the output file is "agegroup_mal_minutes_summ".
agegroup_mal_bool_summ = calculate_summary_statistics(
    df,
    groupby_column="agegroup_lab",
    value_column="malicious_min",
    percentiles=percentiles,
    sort_order="custom",
    # Pass a concrete list rather than a dict_keys view so the helper can
    # index, slice or re-iterate it without surprises.
    custom_order=list(category_names),
    category_names=category_names,
)
pandas_to_tex(agegroup_mal_bool_summ, "../tabs/agegroup_mal_minutes_summ")
agegroup_mal_bool_summ

Unnamed: 0,agegroup_lab,count,mean,std,min,5,10,25,50,75,90,95,max
0,$<$ 25,93 (8.2\%),13.6,44.6,0,0.0,0.0,0.0,0.0,5.0,31.8,63.2,329
1,25--34,200 (17.6\%),28.7,209.1,0,0.0,0.0,0.0,0.0,2.0,29.4,85.9,2879
2,35--49,285 (25.1\%),5.3,21.7,0,0.0,0.0,0.0,0.0,1.0,8.6,27.8,192
3,50--64,288 (25.4\%),17.0,135.3,0,0.0,0.0,0.0,0.0,2.0,13.0,34.6,2067
4,$>$ 65,268 (23.6\%),6.6,45.0,0,0.0,0.0,0.0,0.0,3.0,9.0,19.6,711
