In [1]:
import pandas as pd
import janitor
from utilities import calculate_summary_statistics, pandas_to_tex, save_mpl_fig

import warnings

warnings.filterwarnings("ignore")
from matplotlib.lines import Line2D

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats.mstats import winsorize
import statsmodels.api as sm

# Set seaborn's global plotting theme.
sns.set_theme(
    context="notebook",
    style="whitegrid",
    palette="dark",
    font_scale=1.35,
)

# Load the dataset that the summary tables in the following cells are built from.
df = pd.read_csv(
    "../data/ind_data.csv",
)

# Preview the first three rows as the cell output.
df.head(n=3)

Unnamed: 0,caseid,private_domain,duration,visits,duration_min,duration_hr,filename,harmless,malicious,suspicious,...,educ,pid3,pid7,presvote20post,inputstate,region,gender_lab,race_lab,educ_lab,agegroup_lab
0,47541,10best.com1800petmeds.com2uf4ta.net3m.com8x8.c...,263115,17194,4385.25,73.0875,10best.com1800petmeds.com2uf4ta.net3m.com8x8.c...,37198.0,19.0,5.0,...,2,2,6,2,12,3,Female,White,HS or Below,65+
1,56565,11thstreetexpress.com1a-lab.neta-mo.netactivem...,187793,11479,3129.883333,52.164722,11thstreetexpress.com1a-lab.neta-mo.netactivem...,22541.0,10.0,3.0,...,3,3,5,2,17,2,Female,White,Some college,65+
2,203271,acop.comaepohio.comalchemer.comalldayidreamabo...,94510,6540,1575.166667,26.252778,acop.comaepohio.comalchemer.comalldayidreamabo...,10298.0,8.0,2.0,...,6,1,2,-1,54,3,Female,White,Postgrad,35-49


In [2]:
# Percentile cut points passed to each calculate_summary_statistics call
# in the cells that follow.
percentiles = [
    5, 10, 25,   # lower tail
    50,          # median
    75, 90, 95,  # upper tail
]

In [3]:
# Summary statistics of `malicious_visits`, one row per `gender_lab` group.
gender_malvisits_summ = calculate_summary_statistics(
    df,
    value_column="malicious_visits",
    groupby_column="gender_lab",
    percentiles=percentiles,
)

# LaTeX export is currently disabled; uncomment the next line to write the table.
# pandas_to_tex(gender_malvisits_summ, "../tabs/gender_malvisits_summ")

# Display the table as the cell output.
gender_malvisits_summ

Unnamed: 0,gender_lab,count,mean,std,min,5,10,25,50,75,90,95,max
0,Female,595 (52.5\%),18.6,169.7,0,0.0,0.0,0.0,0.0,6.0,23.0,51.3,4006
1,Male,539 (47.5\%),23.4,87.9,0,0.0,0.0,0.0,2.0,12.0,47.0,94.1,1113


In [4]:
# Summary statistics of `malicious_visits`, one row per `race_lab` group.
race_malvisits_summ = calculate_summary_statistics(
    df,
    value_column="malicious_visits",
    groupby_column="race_lab",
    percentiles=percentiles,
)

# LaTeX export is currently disabled; uncomment the next line to write the table.
# pandas_to_tex(race_malvisits_summ, "../tabs/race_malvisits_summ")

# Display the table as the cell output.
race_malvisits_summ

Unnamed: 0,race_lab,count,mean,std,min,5,10,25,50,75,90,95,max
0,White,720 (63.5\%),18.9,158.2,0,0.0,0.0,0.0,0.0,8.0,29.0,58.0,4006
1,Hispanic,168 (14.8\%),13.8,39.0,0,0.0,0.0,0.0,0.0,7.0,33.3,60.6,317
2,Black,144 (12.7\%),43.6,140.2,0,0.0,0.0,0.0,3.0,14.2,100.2,213.7,1113
3,Other,56 (4.9\%),11.8,29.2,0,0.0,0.0,0.0,2.0,10.0,24.5,46.5,161
4,Asian,46 (4.1\%),18.0,38.5,0,0.0,0.0,0.0,0.0,12.5,69.5,107.8,181


In [5]:
# Summary statistics of `malicious_visits`, one row per `educ_lab` group.
educ_malvisits_summ = calculate_summary_statistics(
    df,
    value_column="malicious_visits",
    groupby_column="educ_lab",
    percentiles=percentiles,
)

# LaTeX export is currently disabled; uncomment the next line to write the table.
# pandas_to_tex(educ_malvisits_summ, "../tabs/educ_malvisits_summ")

# Display the table as the cell output.
educ_malvisits_summ

Unnamed: 0,educ_lab,count,mean,std,min,5,10,25,50,75,90,95,max
0,HS or Below,411 (36.2\%),29.8,212.7,0,0.0,0.0,0.0,2.0,10.0,32.0,86.5,4006
1,Some college,326 (28.7\%),17.3,64.8,0,0.0,0.0,0.0,0.0,9.0,38.0,70.2,801
2,College,255 (22.5\%),17.4,68.2,0,0.0,0.0,0.0,2.0,8.0,31.2,60.5,848
3,Postgrad,142 (12.5\%),9.6,27.1,0,0.0,0.0,0.0,0.0,5.0,25.8,47.9,212


In [6]:
# Map the raw `agegroup_lab` values (keys) to LaTeX-ready display labels (values).
# The dict's insertion order is also used as the row order of the table.
category_names = {
    "<25": "$<$ 25",
    "25-34": "25--34",
    "35-49": "35--49",
    "50-64": "50--64",
    # NOTE(review): "65+" presumably includes age 65 (the previous bin ends at
    # 64), so the label uses >= rather than a strict >. Raw string keeps the
    # LaTeX backslash intact.
    "65+": r"$\geq$ 65",
}

# Summary statistics of `malicious_visits`, one row per `agegroup_lab` group,
# with rows in the explicit age order above instead of the default ordering.
agegroup_malvisits_summ = calculate_summary_statistics(
    df,
    groupby_column="agegroup_lab",
    value_column="malicious_visits",
    percentiles=percentiles,
    sort_order="custom",
    # Pass a concrete list (not a dict_keys view) so the helper receives an
    # indexable, reusable sequence.
    custom_order=list(category_names),
    category_names=category_names,
)

# LaTeX export is currently disabled; uncomment the next line to write the table.
# pandas_to_tex(agegroup_malvisits_summ, "../tabs/agegroup_malvisits_summ")

# Display the table as the cell output.
agegroup_malvisits_summ

Unnamed: 0,agegroup_lab,count,mean,std,min,5,10,25,50,75,90,95,max
0,$<$ 25,93 (8.2\%),28.9,89.6,0,0.0,0.0,0.0,2.0,14.0,62.8,98.0,661
1,25--34,200 (17.6\%),25.9,103.5,0,0.0,0.0,0.0,0.5,11.2,40.7,116.0,1113
2,35--49,285 (25.1\%),11.2,35.4,0,0.0,0.0,0.0,0.0,6.0,25.0,45.0,382
3,50--64,288 (25.4\%),31.8,244.5,0,0.0,0.0,0.0,1.5,9.0,36.0,82.9,4006
4,$>$ 65,268 (23.6\%),12.9,54.2,0,0.0,0.0,0.0,2.0,9.0,26.6,51.6,801
