In [1]:
import pandas as pd
import janitor
import numpy as np
pd.set_option('display.max_columns', 100)
from utilities import pandas_to_tex
from IPython.display import display
import warnings
warnings.filterwarnings('ignore')

## CPS data

In [2]:
# https://cps.ipums.org/cps-action/variables/RACE#codes_section
race_codes_cps = {
    100: "White",
    200: "Black",
    300: "American Indian/Aluet/Eskimo",
    650: "Asian or Pacific Islander",
    651: "Asian only",
    652: "Hawaiian/Pacific Islander only",
    700: "Other (single) race, n.e.c.",
    801: "White-Black",
    802: "White-American Indian",
    803: "White-Asian",
    804: "White-Hawaiian/Pacific Islander",
    805: "Black-American Indian",
    806: "Black-Asian",
    807: "Black-Hawaiian/Pacific Islander",
    808: "American Indian-Asian",
    809: "Asian-Hawaiian/Pacific Islander",
    810: "White-Black-American Indian",
    811: "White-Black-Asian",
    812: "White-American Indian-Asian",
    813: "White-Asian-Hawaiian/Pacific Islander",
    814: "White-Black-American Indian-Asian",
    815: "American Indian-Hawaiian/Pacific Islander",
    816: "White-Black-Hawaiian/Pacific Islander",
    817: "White-American Indian-Hawaiian/Pacific Islander",
    818: "Black-American Indian-Asian",
    819: "White-American Indian-Asian-Hawaiian/Pacific Islander",
    820: "Two or three races, unspecified",
    830: "Four or five races, unspecified",
    999: "Blank",
}
race_map = {
    "White": "White",
    "Black": "Black",
    "American Indian/Aluet/Eskimo": "Other",
    "Asian or Pacific Islander": "Asian",
    "Asian only": "Asian",
    "Hawaiian/Pacific Islander only": "Asian",
    "Other (single) race, n.e.c.": "Other",
    "White-Black": "Other",
    "White-American Indian": "Other",
    "White-Asian": "Other",
    "White-Hawaiian/Pacific Islander": "Other",
    "Black-American Indian": "Other",
    "Black-Asian": "Other",
    "Black-Hawaiian/Pacific Islander": "Other",
    "American Indian-Asian": "Other",
    "Asian-Hawaiian/Pacific Islander": "Asian",
    "White-Black-American Indian": "Other",
    "White-Black-Asian": "Other",
    "White-American Indian-Asian": "Other",
    "White-Asian-Hawaiian/Pacific Islander": "Other",
    "White-Black-American Indian-Asian": "Other",
    "American Indian-Hawaiian/Pacific Islander": "Other",
    "White-Black-Hawaiian/Pacific Islander": "Other",
    "White-American Indian-Hawaiian/Pacific Islander": "Other",
    "Black-American Indian-Asian": "Other",
    "White-American Indian-Asian-Hawaiian/Pacific Islander": "Other",
    "Two or three races, unspecified": "Other",
    "Four or five races, unspecified": "Other",
    "Blank": np.nan,
}

In [3]:
# https://cps.ipums.org/cps-action/variables/HISPAN#codes_section
hispanic_codes_cps = {
    0: "Not Hispanic",
    100: "Mexican",
    102: "Mexican American",
    103: "Mexicano/Mexicana",
    104: "Chicano/Chicana",
    108: "Mexican (Mexicano)",
    109: "Mexicano/Chicano",
    200: "Puerto Rican",
    300: "Cuban",
    400: "Dominican",
    500: "Salvadoran",
    600: "Other Hispanic",
    610: "Central/South American",
    611: "Central American (excluding Salvadoran)",
    612: "South American",
    901: "Do not know",
    902: "Not available/no response"
}

# Map codes to "Hispanic" or "Non-Hispanic"
hispanic_mapping = {
    0: "Non-Hispanic",  # Not Hispanic
    100: "Hispanic",
    102: "Hispanic",
    103: "Hispanic",
    104: "Hispanic",
    108: "Hispanic",
    109: "Hispanic",
    200: "Hispanic",
    300: "Hispanic",
    400: "Hispanic",
    500: "Hispanic",
    600: "Hispanic",
    610: "Hispanic",
    611: "Hispanic",
    612: "Hispanic",
    901: None,  # Do not know
    902: None   # Not available/no response
}


In [4]:
# https://cps.ipums.org/cps-action/variables/EDUC#codes_section
education_mapping = {
    # HS or Below
    0: "HS or Below", 1: "HS or Below", 2: "HS or Below", 
    10: "HS or Below", 11: "HS or Below", 12: "HS or Below",
    13: "HS or Below", 14: "HS or Below", 20: "HS or Below",
    21: "HS or Below", 22: "HS or Below", 30: "HS or Below",
    31: "HS or Below", 32: "HS or Below", 40: "HS or Below",
    50: "HS or Below", 60: "HS or Below", 70: "HS or Below",
    71: "HS or Below", 72: "HS or Below", 73: "HS or Below",
    # Some college
    80: "Some college", 81: "Some college", 
    90: "Some college", 91: "Some college", 92: "Some college",
    # College
    100: "College", 110: "College", 111: "College",
    # Postgrad
    120: "Postgrad", 121: "Postgrad", 122: "Postgrad", 
    123: "Postgrad", 124: "Postgrad", 125: "Postgrad",
    # Missing
    999: None,
}


In [5]:
age_bins = [-1, 17, 25, 34, 49, 64, float('inf')]  # Define bin edges
age_labels = ["<18", "18--24", "25--34", "35--49", "50--64", "65+"]  # Labels for each bin

In [6]:
region_codes_cps = {
    11: "Northeast",
    12: "Northeast",
    21: "Midwest",
    22: "Midwest",
    31: "South",
    32: "South",
    33: "South",
    41: "West",
    42: "West",
    97: "Unknown"
}

In [7]:
df_cps = (
    pd.read_csv("../data/cps_00002.csv.gz")
    .clean_names()
    .assign(
        race_lab_cps=lambda df_: df_["race"].replace(race_codes_cps),
        race_lab_nohisp=lambda df_: df_["race_lab_cps"].replace(race_map),
        hispanic_lab=lambda df_: df_["hispan"].replace(hispanic_codes_cps),
        race_lab=lambda df_: np.where(df_["hispanic_lab"]=="Not Hispanic", df_["race_lab_nohisp"], "Hispanic"),
        educ_lab=lambda df_: df_["educ"].replace(education_mapping),
        agegroup_lab=lambda df_: pd.cut(df_['age'], bins=age_bins, labels=age_labels),
        # https://cps.ipums.org/cps-action/variables/SEX#codes_section
        gender_lab=lambda df_: df_["sex"].map({1: "Male", 2: "Female"}),
        # https://cps.ipums.org/cps-action/variables/VOTED#codes_section
        voted_lab=lambda df_: np.where(df_["voted"]==2, "Voted", "DNV/DK/NA"),
        region_lab=lambda df_: df_["region"].replace(region_codes_cps),
    )
    .query("agegroup_lab!='<18'")
)
display(df_cps.head())
df_cps.info()

Unnamed: 0,year,serial,month,hwtfinl,cpsid,asecflag,asecwth,region,pernum,wtfinl,cpsidv,cpsidp,asecwt,age,sex,race,hispan,educ,voted,race_lab_cps,race_lab_nohisp,hispanic_lab,race_lab,educ_lab,agegroup_lab,gender_lab,voted_lab,region_lab
0,2022,2,1,1662.5757,20210100000400,,,32,1,1662.5757,202101000004011,20210100000401,,36,2,100,0,111,,White,White,Not Hispanic,White,College,35--49,Female,DNV/DK/NA,South
1,2022,2,1,1662.5757,20210100000400,,,32,2,1978.1985,202101000004021,20210100000402,,41,1,100,0,123,,White,White,Not Hispanic,White,Postgrad,35--49,Male,DNV/DK/NA,South
4,2022,3,1,2037.9611,20220100000300,,,32,1,2037.9611,202201000003011,20220100000301,,50,2,200,0,81,,Black,Black,Not Hispanic,Black,Some college,50--64,Female,DNV/DK/NA,South
6,2022,3,1,2037.9611,20220100000300,,,32,3,2500.9415,202201000003031,20220100000303,,27,1,200,0,73,,Black,Black,Not Hispanic,Black,HS or Below,25--34,Male,DNV/DK/NA,South
7,2022,4,1,2094.5077,20211200000200,,,32,1,2094.5077,202112000002011,20211200000201,,38,1,200,0,111,,Black,Black,Not Hispanic,Black,College,35--49,Male,DNV/DK/NA,South


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1079085 entries, 0 to 1375634
Data columns (total 28 columns):
 #   Column           Non-Null Count    Dtype   
---  ------           --------------    -----   
 0   year             1079085 non-null  int64   
 1   serial           1079085 non-null  int64   
 2   month            1079085 non-null  int64   
 3   hwtfinl          964927 non-null   float64 
 4   cpsid            1079085 non-null  int64   
 5   asecflag         193374 non-null   float64 
 6   asecwth          114158 non-null   float64 
 7   region           1079085 non-null  int64   
 8   pernum           1079085 non-null  int64   
 9   wtfinl           964927 non-null   float64 
 10  cpsidv           1079085 non-null  int64   
 11  cpsidp           1079085 non-null  int64   
 12  asecwt           114158 non-null   float64 
 13  age              1079085 non-null  int64   
 14  sex              1079085 non-null  int64   
 15  race             1079085 non-null  int64   
 16  

## YouGov

In [8]:
region_codes_yougov = {
    1: "Northeast",
    2: "Midwest",
    3: "South",
    4: "West",
    -1: np.nan
}
voting_mapping_yougov = {
    -1: "DNV/DK/NA",  # No Data
    1: "Voted",       # Joe Biden
    2: "Voted",       # Donald Trump
    3: "Voted",       # Jo Jorgensen
    4: "Voted",       # Howie Hawkins
    5: "Voted",       # Other
    6: "DNV/DK/NA"    # Did not vote for President
}

In [9]:
agegroup_yg = {
    "65+": "65+",
    "50--65": "50--64",
    "35--50": "35--49",
    "25--35": "25--34",
    "<25": "18--24",
}
df_yougov = (
    pd.read_csv("../data/ind_data.csv")
    .assign(
        region_lab=lambda df_: df_["region"].replace(region_codes_yougov),
        voted_lab=lambda df_: df_["presvote20post"].map(voting_mapping_yougov),
        agegroup_lab=lambda df_: df_["agegroup_lab"].map(agegroup_yg),
        age=lambda df_: 2022-df_["birthyr"],
    )
)
df_yougov.head()

Unnamed: 0,caseid,duration,visits,duration_min,duration_hr,harmless,malicious,suspicious,undetected,timeout,malicious_bool,malicious_visits,malicious_min,malicious_hr,suspicious_bool,birthyr,gender,race,educ,pid3,pid7,presvote20post,inputstate,region,gender_lab,race_lab,educ_lab,agegroup_lab,region_lab,voted_lab,age
0,47541,263115,17194,4385,73,37198,19,5,11033,0,2,4,0,0,5,1955.0,2,1,2,2,6,2,12,3,Female,White,HS or Below,65+,South,Voted,67.0
1,56565,187793,11479,3129,52,22541,10,3,6576,0,0,0,0,0,3,1940.0,2,1,3,3,5,2,17,2,Female,White,Some college,65+,Midwest,Voted,82.0
2,203271,94510,6540,1575,26,10298,8,2,2943,0,0,0,0,0,2,1980.0,2,1,6,1,2,-1,54,3,Female,White,Postgrad,35--49,South,DNV/DK/NA,42.0
3,216457,52109,1770,868,14,3295,1,0,971,0,0,0,0,0,0,1976.0,2,1,2,2,7,2,27,2,Female,White,HS or Below,35--49,Midwest,Voted,46.0
4,257495,188945,10012,3149,52,19151,33,6,5589,0,4,19,2,0,6,1952.0,2,7,1,1,1,1,15,4,Female,Other,HS or Below,65+,West,Voted,70.0


## Compare

In [10]:
# import pandas as pd

# # Define a function to summarize field distributions
# def summarize_sample(df, fields):
#     summary = {}
#     for field in fields:
#         summary[field] = df[field].value_counts(normalize=True).reset_index()
#         summary[field].columns = [field, 'Proportion']
#     return summary

# # Fields to compare
# fields_to_compare = ['gender_lab', 'race_lab', 'educ_lab', 'agegroup_lab', 'region_lab']

# # Summarize both datasets
# summary_cps = summarize_sample(df_cps, fields_to_compare)
# summary_yougov = summarize_sample(df_yougov, fields_to_compare)

# # Combine summaries for comparison
# comparison = {}
# for field in fields_to_compare:
#     comparison[field] = pd.merge(
#         summary_cps[field],
#         summary_yougov[field],
#         on=field,
#         how='outer',
#         suffixes=('_CPS', '_YouGov')
#     ).fillna(0)

# # Display the comparison tables for each field
# for field, table in comparison.items():
#     print(f"--- {field} ---")
#     print(table)
#     print()


In [11]:
# var = "gender_lab"

# _dfyg = (
#     df_yougov[var]
#     .value_counts(normalize=True)
#     .reset_index()
#     .rename_column(var, "yougov")
# )
# _dfcps = (
#     df_cps[var]
#     .value_counts(normalize=True)
#     .reset_index()
#     .rename_column(var, "cps")
# )
# _dfsumm = (
#     _dfyg
#     .merge(_dfcps, on="index", validate="1:1")
# )
# _dfsumm

In [14]:
catvars = ["gender_lab", "race_lab", "educ_lab", "agegroup_lab", "region_lab",]
agegroup_order = ["<18", "18--24", "25--34", "35--49", "50--64", "65+"]
region_order = ["West", "Midwest", "Northeast", "South"]

for var in catvars:
    _dfyg = (
        df_yougov[var]
        .value_counts(normalize=True)
        .reset_index()
        .rename_column(var, "yougov")
    )
    _dfcps = (
        df_cps[var]
        .value_counts(normalize=True)
        .reset_index()
        .rename_column(var, "cps")
    )
    _dfsumm = (
        _dfyg
        .merge(_dfcps, on="index", validate="1:1")
        .round(3)
    )
    
    # reorder agegroup
    if var == "agegroup_lab":
        _dfsumm["index"] = pd.Categorical(
            _dfsumm["index"], categories=agegroup_order, ordered=True
        )
        _dfsumm = _dfsumm.sort_values("index")
    
    # reorder regions    
    if var == "region_lab":
        _dfsumm["index"] = pd.Categorical(
            _dfsumm["index"], categories=region_order, ordered=True
        )
        _dfsumm = _dfsumm.sort_values("index")
        
    pandas_to_tex(_dfsumm, f"../tabs/yg_cps_{var}")
    display(_dfsumm)

Unnamed: 0,index,yougov,cps
0,Female,0.525,0.52
1,Male,0.475,0.48


Unnamed: 0,index,yougov,cps
0,White,0.635,0.673
1,Hispanic,0.148,0.141
2,Black,0.127,0.099
3,Other,0.049,0.023
4,Asian,0.041,0.064


Unnamed: 0,index,yougov,cps
0,HS or Below,0.362,0.382
1,Some college,0.287,0.267
2,College,0.225,0.219
3,Postgrad,0.125,0.132


Unnamed: 0,index,yougov,cps
4,18--24,0.094,0.112
3,25--34,0.177,0.143
0,35--49,0.257,0.24
1,50--64,0.247,0.248
2,65+,0.226,0.257


Unnamed: 0,index,yougov,cps
1,West,0.202,0.274
2,Midwest,0.2,0.193
3,Northeast,0.178,0.161
0,South,0.421,0.371


In [15]:
var = "age"
mean_yougov = df_yougov[var].mean()
mean_cps = df_cps[var].mean()

_dfsumm = (
    pd.DataFrame({"var": ["Age (mean)"], "yougov": mean_yougov, "cps": mean_cps})
    .round(1)
)
pandas_to_tex(_dfsumm, "../tabs/yg_cps_agemean")
_dfsumm

Unnamed: 0,var,yougov,cps
0,Age (mean),48.6,49.8
