In [1]:
import pandas as pd
import janitor
import numpy as np
pd.set_option('display.max_columns', 100)
from utilities import pandas_to_tex
from IPython.display import display
import warnings
warnings.filterwarnings('ignore')

## CPS data

In [2]:
# https://cps.ipums.org/cps-action/variables/RACE#codes_section
race_codes_cps = {
    100: "White",
    200: "Black",
    300: "American Indian/Aluet/Eskimo",
    650: "Asian or Pacific Islander",
    651: "Asian only",
    652: "Hawaiian/Pacific Islander only",
    700: "Other (single) race, n.e.c.",
    801: "White-Black",
    802: "White-American Indian",
    803: "White-Asian",
    804: "White-Hawaiian/Pacific Islander",
    805: "Black-American Indian",
    806: "Black-Asian",
    807: "Black-Hawaiian/Pacific Islander",
    808: "American Indian-Asian",
    809: "Asian-Hawaiian/Pacific Islander",
    810: "White-Black-American Indian",
    811: "White-Black-Asian",
    812: "White-American Indian-Asian",
    813: "White-Asian-Hawaiian/Pacific Islander",
    814: "White-Black-American Indian-Asian",
    815: "American Indian-Hawaiian/Pacific Islander",
    816: "White-Black-Hawaiian/Pacific Islander",
    817: "White-American Indian-Hawaiian/Pacific Islander",
    818: "Black-American Indian-Asian",
    819: "White-American Indian-Asian-Hawaiian/Pacific Islander",
    820: "Two or three races, unspecified",
    830: "Four or five races, unspecified",
    999: "Blank",
}
race_map = {
    "White": "White",
    "Black": "Black",
    "American Indian/Aluet/Eskimo": "Other",
    "Asian or Pacific Islander": "Asian",
    "Asian only": "Asian",
    "Hawaiian/Pacific Islander only": "Asian",
    "Other (single) race, n.e.c.": "Other",
    "White-Black": "Other",
    "White-American Indian": "Other",
    "White-Asian": "Other",
    "White-Hawaiian/Pacific Islander": "Other",
    "Black-American Indian": "Other",
    "Black-Asian": "Other",
    "Black-Hawaiian/Pacific Islander": "Other",
    "American Indian-Asian": "Other",
    "Asian-Hawaiian/Pacific Islander": "Asian",
    "White-Black-American Indian": "Other",
    "White-Black-Asian": "Other",
    "White-American Indian-Asian": "Other",
    "White-Asian-Hawaiian/Pacific Islander": "Other",
    "White-Black-American Indian-Asian": "Other",
    "American Indian-Hawaiian/Pacific Islander": "Other",
    "White-Black-Hawaiian/Pacific Islander": "Other",
    "White-American Indian-Hawaiian/Pacific Islander": "Other",
    "Black-American Indian-Asian": "Other",
    "White-American Indian-Asian-Hawaiian/Pacific Islander": "Other",
    "Two or three races, unspecified": "Other",
    "Four or five races, unspecified": "Other",
    "Blank": np.nan,
}

In [3]:
# https://cps.ipums.org/cps-action/variables/HISPAN#codes_section
hispanic_codes_cps = {
    0: "Not Hispanic",
    100: "Mexican",
    102: "Mexican American",
    103: "Mexicano/Mexicana",
    104: "Chicano/Chicana",
    108: "Mexican (Mexicano)",
    109: "Mexicano/Chicano",
    200: "Puerto Rican",
    300: "Cuban",
    400: "Dominican",
    500: "Salvadoran",
    600: "Other Hispanic",
    610: "Central/South American",
    611: "Central American (excluding Salvadoran)",
    612: "South American",
    901: "Do not know",
    902: "Not available/no response"
}

hispanic_mapping = {
    0: "Non-Hispanic",  # Not Hispanic
    100: "Hispanic",
    102: "Hispanic",
    103: "Hispanic",
    104: "Hispanic",
    108: "Hispanic",
    109: "Hispanic",
    200: "Hispanic",
    300: "Hispanic",
    400: "Hispanic",
    500: "Hispanic",
    600: "Hispanic",
    610: "Hispanic",
    611: "Hispanic",
    612: "Hispanic",
    901: None,  # Do not know
    902: None   # Not available/no response
}


In [4]:
# https://cps.ipums.org/cps-action/variables/EDUC#codes_section
education_mapping = {
    # HS or Below
    0: "HS or Below", 1: "HS or Below", 2: "HS or Below", 
    10: "HS or Below", 11: "HS or Below", 12: "HS or Below",
    13: "HS or Below", 14: "HS or Below", 20: "HS or Below",
    21: "HS or Below", 22: "HS or Below", 30: "HS or Below",
    31: "HS or Below", 32: "HS or Below", 40: "HS or Below",
    50: "HS or Below", 60: "HS or Below", 70: "HS or Below",
    71: "HS or Below", 72: "HS or Below", 73: "HS or Below",
    # Some college
    80: "Some college", 81: "Some college", 
    90: "Some college", 91: "Some college", 92: "Some college",
    # College
    100: "College", 110: "College", 111: "College",
    # Postgrad
    120: "Postgrad", 121: "Postgrad", 122: "Postgrad", 
    123: "Postgrad", 124: "Postgrad", 125: "Postgrad",
    # Missing
    999: None,
}


In [5]:
age_bins = [-1, 17, 25, 34, 49, 64, float('inf')]
age_labels = ["<18", "18--24", "25--34", "35--49", "50--64", "65+"]

In [6]:
region_codes_cps = {
    11: "Northeast",
    12: "Northeast",
    21: "Midwest",
    22: "Midwest",
    31: "South",
    32: "South",
    33: "South",
    41: "West",
    42: "West",
    97: "Unknown"
}

In [35]:
df_cps = (
    pd.read_csv("../data/cps_00002.csv.gz")
    .clean_names()
    .assign(
        race_lab_cps=lambda df_: df_["race"].replace(race_codes_cps),
        race_lab_nohisp=lambda df_: df_["race_lab_cps"].replace(race_map),
        hispanic_lab=lambda df_: df_["hispan"].replace(hispanic_codes_cps),
        race_lab=lambda df_: np.where(df_["hispanic_lab"]=="Not Hispanic", df_["race_lab_nohisp"], "Hispanic"),
        educ_lab=lambda df_: df_["educ"].replace(education_mapping),
        agegroup_lab=lambda df_: pd.cut(df_['age'], bins=age_bins, labels=age_labels),
        # https://cps.ipums.org/cps-action/variables/SEX#codes_section
        gender_lab=lambda df_: df_["sex"].map({1: "Male", 2: "Female"}),
        # https://cps.ipums.org/cps-action/variables/VOTED#codes_section
        voted_lab=lambda df_: np.where(df_["voted"]==2, "Voted", "Did not vote/NA"),
#         voted_lab=np.nan,
        region_lab=lambda df_: df_["region"].replace(region_codes_cps),
    )
    .query("agegroup_lab!='<18'")
)
display(df_cps.head())
df_cps.info()

Unnamed: 0,year,serial,month,hwtfinl,cpsid,asecflag,asecwth,region,pernum,wtfinl,cpsidv,cpsidp,asecwt,age,sex,race,hispan,educ,voted,race_lab_cps,race_lab_nohisp,hispanic_lab,race_lab,educ_lab,agegroup_lab,gender_lab,voted_lab,region_lab
0,2022,2,1,1662.5757,20210100000400,,,32,1,1662.5757,202101000004011,20210100000401,,36,2,100,0,111,,White,White,Not Hispanic,White,College,35--49,Female,Did not vote/NA,South
1,2022,2,1,1662.5757,20210100000400,,,32,2,1978.1985,202101000004021,20210100000402,,41,1,100,0,123,,White,White,Not Hispanic,White,Postgrad,35--49,Male,Did not vote/NA,South
4,2022,3,1,2037.9611,20220100000300,,,32,1,2037.9611,202201000003011,20220100000301,,50,2,200,0,81,,Black,Black,Not Hispanic,Black,Some college,50--64,Female,Did not vote/NA,South
6,2022,3,1,2037.9611,20220100000300,,,32,3,2500.9415,202201000003031,20220100000303,,27,1,200,0,73,,Black,Black,Not Hispanic,Black,HS or Below,25--34,Male,Did not vote/NA,South
7,2022,4,1,2094.5077,20211200000200,,,32,1,2094.5077,202112000002011,20211200000201,,38,1,200,0,111,,Black,Black,Not Hispanic,Black,College,35--49,Male,Did not vote/NA,South


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1079085 entries, 0 to 1375634
Data columns (total 28 columns):
 #   Column           Non-Null Count    Dtype   
---  ------           --------------    -----   
 0   year             1079085 non-null  int64   
 1   serial           1079085 non-null  int64   
 2   month            1079085 non-null  int64   
 3   hwtfinl          964927 non-null   float64 
 4   cpsid            1079085 non-null  int64   
 5   asecflag         193374 non-null   float64 
 6   asecwth          114158 non-null   float64 
 7   region           1079085 non-null  int64   
 8   pernum           1079085 non-null  int64   
 9   wtfinl           964927 non-null   float64 
 10  cpsidv           1079085 non-null  int64   
 11  cpsidp           1079085 non-null  int64   
 12  asecwt           114158 non-null   float64 
 13  age              1079085 non-null  int64   
 14  sex              1079085 non-null  int64   
 15  race             1079085 non-null  int64   
 16  

## YouGov

In [8]:
region_codes_yougov = {
    1: "Northeast",
    2: "Midwest",
    3: "South",
    4: "West",
    -1: np.nan
}
voting_mapping_yougov = {
    -1: "Did not vote/NA",  # No Data
    1: "Voted",       # Joe Biden
    2: "Voted",       # Donald Trump
    3: "Voted",       # Jo Jorgensen
    4: "Voted",       # Howie Hawkins
    5: "Voted",       # Other
    6: "Did not vote/NA"    # Did not vote for President
}

In [9]:
agegroup_yg = {
    "65+": "65+",
    "50--65": "50--64",
    "35--50": "35--49",
    "25--35": "25--34",
    "<25": "18--24",
}
df_yougov = (
    pd.read_csv("../data/ind_data.csv")
    .assign(
        region_lab=lambda df_: df_["region"].replace(region_codes_yougov),
        voted_lab=lambda df_: df_["presvote20post"].map(voting_mapping_yougov),
        agegroup_lab=lambda df_: df_["agegroup_lab"].map(agegroup_yg),
        age=lambda df_: 2022-df_["birthyr"],
    )
)
df_yougov.head()

Unnamed: 0,caseid,duration,visits,duration_min,duration_hr,harmless,malicious,suspicious,undetected,timeout,malicious_bool,malicious_visits,malicious_min,malicious_hr,suspicious_bool,birthyr,gender,race,educ,pid3,pid7,presvote20post,inputstate,region,gender_lab,race_lab,educ_lab,agegroup_lab,region_lab,voted_lab,age
0,47541,263115,17194,4385,73,37198,19,5,11033,0,2,4,0,0,5,1955.0,2,1,2,2,6,2,12,3,Female,White,HS or Below,65+,South,Voted,67.0
1,56565,187793,11479,3129,52,22541,10,3,6576,0,0,0,0,0,3,1940.0,2,1,3,3,5,2,17,2,Female,White,Some college,65+,Midwest,Voted,82.0
2,203271,94510,6540,1575,26,10298,8,2,2943,0,0,0,0,0,2,1980.0,2,1,6,1,2,-1,54,3,Female,White,Postgrad,35--49,South,Did not vote/NA,42.0
3,216457,52109,1770,868,14,3295,1,0,971,0,0,0,0,0,0,1976.0,2,1,2,2,7,2,27,2,Female,White,HS or Below,35--49,Midwest,Voted,46.0
4,257495,188945,10012,3149,52,19151,33,6,5589,0,4,19,2,0,6,1952.0,2,7,1,1,1,1,15,4,Female,Other,HS or Below,65+,West,Voted,70.0


## Compare

In [10]:
# import pandas as pd

# # Define a function to summarize field distributions
# def summarize_sample(df, fields):
#     summary = {}
#     for field in fields:
#         summary[field] = df[field].value_counts(normalize=True).reset_index()
#         summary[field].columns = [field, 'Proportion']
#     return summary

# # Fields to compare
# fields_to_compare = ['gender_lab', 'race_lab', 'educ_lab', 'agegroup_lab', 'region_lab']

# # Summarize both datasets
# summary_cps = summarize_sample(df_cps, fields_to_compare)
# summary_yougov = summarize_sample(df_yougov, fields_to_compare)

# # Combine summaries for comparison
# comparison = {}
# for field in fields_to_compare:
#     comparison[field] = pd.merge(
#         summary_cps[field],
#         summary_yougov[field],
#         on=field,
#         how='outer',
#         suffixes=('_CPS', '_YouGov')
#     ).fillna(0)

# # Display the comparison tables for each field
# for field, table in comparison.items():
#     print(f"--- {field} ---")
#     print(table)
#     print()


In [11]:
# var = "gender_lab"

# _dfyg = (
#     df_yougov[var]
#     .value_counts(normalize=True)
#     .reset_index()
#     .rename_column(var, "yougov")
# )
# _dfcps = (
#     df_cps[var]
#     .value_counts(normalize=True)
#     .reset_index()
#     .rename_column(var, "cps")
# )
# _dfsumm = (
#     _dfyg
#     .merge(_dfcps, on="index", validate="1:1")
# )
# _dfsumm

In [26]:
catvars = ["gender_lab", "race_lab", "educ_lab", "agegroup_lab", "region_lab",]
agegroup_order = ["<18", "18--24", "25--34", "35--49", "50--64", "65+"]
region_order = ["West", "Midwest", "Northeast", "South"]

for var in catvars:
    _dfyg = (
        df_yougov[var]
        .value_counts(normalize=True)
        .reset_index()
        .rename_column(var, "yougov")
    )
    _dfcps = (
        df_cps[var]
        .value_counts(normalize=True)
        .reset_index()
        .rename_column(var, "cps")
    )
    _dfsumm = (
        _dfyg
        .merge(_dfcps, on="index", validate="1:1")
        .round(3)
    )
    
    # reorder agegroup
    if var == "agegroup_lab":
        _dfsumm["index"] = (
            pd.Categorical(
                _dfsumm["index"], categories=agegroup_order, ordered=True)
        )
        _dfsumm = _dfsumm.sort_values("index")
    
    # reorder regions    
    if var == "region_lab":
        _dfsumm["index"] = pd.Categorical(
            _dfsumm["index"], categories=region_order, ordered=True
        )
        _dfsumm = _dfsumm.sort_values("index")
        _dfsumm["index"] = _dfsumm["index"].apply(lambda x: x + " region")
        
    if var=="educ_lab":
#         _dfsumm["ed"]
        _dfsumm.loc[_dfsumm['index'] == 'Postgrad', 'index'] = 'Postgraduate degree'
        _dfsumm.loc[_dfsumm['index'] == 'HS or Below', 'index'] = 'High school or below'
        _dfsumm.loc[_dfsumm['index'] == 'College', 'index'] = 'College degree'

    pandas_to_tex(_dfsumm, f"../tabs/yg_cps_{var}")
    display(_dfsumm)

Unnamed: 0,index,yougov,cps
0,Female,0.525,0.52
1,Male,0.475,0.48


Unnamed: 0,index,yougov,cps
0,White,0.635,0.673
1,Hispanic,0.148,0.141
2,Black,0.127,0.099
3,Other,0.049,0.023
4,Asian,0.041,0.064


Unnamed: 0,index,yougov,cps
0,High school or below,0.362,0.382
1,Some college,0.287,0.267
2,College degree,0.225,0.219
3,Postgraduate degree,0.125,0.132


Unnamed: 0,index,yougov,cps
4,18--24,0.094,0.112
3,25--34,0.177,0.143
0,35--49,0.257,0.24
1,50--64,0.247,0.248
2,65+,0.226,0.257


Unnamed: 0,index,yougov,cps
1,West region,0.202,0.274
2,Midwest region,0.2,0.193
3,Northeast region,0.178,0.161
0,South region,0.421,0.371


In [20]:
var = "age"
mean_yougov = df_yougov[var].mean()
mean_cps = df_cps[var].mean()

_dfsumm = (
    pd.DataFrame({"var": ["Age (mean)"], "yougov": mean_yougov, "cps": mean_cps})
    .round(1)
)
pandas_to_tex(_dfsumm, "../tabs/yg_cps_agemean")
_dfsumm

Unnamed: 0,var,yougov,cps
0,Age (mean),48.6,49.8


## Nationscape

In [14]:
# https://www.voterstudygroup.org/downloads/nationscape?key=1015588
race_mapping_ns = {
    "Black, or African American": "Black",
    "White": "White",
    "Asian (Filipino)": "Asian",
    "Asian (Vietnamese)": "Asian",
    "Asian (Asian Indian)": "Asian",
    "Asian (Korean)": "Asian",
    "Pacific Islander (Other)": "Other",
    "Pacific Islander (Native Hawaiian)": "Other",
    "Pacific Islander (Guamanian)": "Other",
    "American Indian or Alaska Native": "Other",
    "Some other race": "Other",
    "Asian (Chinese)": "Asian",
    "Asian (Japanese)": "Asian"
}
education_mapping_ns = {
    "3rd Grade or less": "HS or Below",
    "Middle School - Grades 4 - 8": "HS or Below",
    "Completed some high school": "HS or Below",
    "High school graduate": "HS or Below",
    "Completed some college, but no degree": "Some college",
    "Associate Degree": "Some college",
    "Other post high school vocational training": "Some college",
    "College Degree (such as B.A., B.S.)": "College",
    "Completed some graduate, but no degree": "Postgrad",
    "Masters degree": "Postgrad",
    "Doctorate degree": "Postgrad"
}

In [15]:
vote_2020_mapping_ns = {
    "Joe Biden": "Voted",
    "Donald Trump": "Voted",
    "Someone else": "Voted",
    "I abstained": "Did not vote/NA",
    "I don't recall": "Did not vote/NA",
    "Not Asked": "Did not vote/NA",
    None: "Did not vote/NA"  # Handle NaN or missing values
}

In [16]:
df_ns = (
    pd.read_stata("../data/Nationscape-Weekly-Materials-DTA-2021Dec/phase_3_v20210301/ns20210112/ns20210112.dta")
    .assign(
        race_lab_nohisp=lambda df_: df_["race_ethnicity"].map(race_mapping_ns),
        race_lab=lambda df_: np.where(df_["hispanic"]=="Not Hispanic", df_["race_lab_nohisp"], "Hispanic"),
        gender_lab=lambda df_: df_["gender"],
        educ_lab=lambda df_: df_["education"].map(education_mapping_ns),
        agegroup_lab=lambda df_: pd.cut(df_['age'], bins=age_bins, labels=age_labels),
        region_lab=lambda df_: df_["census_region"],
        voted_lab=lambda df_: df_["vote_2020_retro"].map(vote_2020_mapping_ns),
    )
)
display(df_ns.head())
df_ns.info(verbose=True)

Unnamed: 0,response_id,start_date,right_track,economy_better,interest,registration,news_sources_facebook,news_sources_cnn,news_sources_msnbc,news_sources_fox,news_sources_network,news_sources_localtv,news_sources_telemundo,news_sources_npr,news_sources_amtalk,news_sources_new_york_times,news_sources_local_newspaper,news_sources_other,news_sources_other_TEXT,pres_approval,vote_2016,vote_2016_other_text,vote_intention_retro,vote_2020_retro,vote_2020_retro_other_text,who_won,who_won_other_text,primary_party_retro,group_favorability_whites,group_favorability_blacks,group_favorability_latinos,group_favorability_asians,group_favorability_evangelicals,group_favorability_socialists,group_favorability_muslims,group_favorability_labor_unions,group_favorability_the_police,group_favorability_undocumented,group_favorability_lgbt,group_favorability_republicans,group_favorability_democrats,group_favorability_white_men,group_favorability_jews,group_favorability_blm,group_favorability_trump_s,group_favorability_biden_s,cand_favorability_trump,cand_favorability_obama,cand_favorability_biden,cand_favorability_harris,...,statements_confront_china,statements_foreign_interests,elect_conf_conduct_retro,elect_conf_vote_retro,extra_vote_mail_retr,extra_vacc_flu,extra_vacc_covid,extra_dem_violence,extra_ind_violence,extra_rep_violence,extra_corona_concern,extra_sick_you,extra_sick_family,extra_sick_work,extra_sick_other,extra_covid_worn_mask,extra_covid_socialize_distance,extra_covid_socialize_no_dist,extra_trump_corona,extra_gub_corona,extra_covid_cancel_meet,extra_covid_close_business,extra_covid_close_schools,extra_covid_work_home,extra_covid_restrict_home,extra_covid_testing,extra_covid_require_mask,capitol_approval,capitol_trump_approv,capitol_trump_more,twitter_ban,age,gender,census_region,hispanic,race_ethnicity,household_income,education,state,congress_district,weight,weight_2020,weight_both,race_lab_nohisp,race_lab,gender_lab,educ_lab,agegroup_lab,region_lab,voted_lab
0,7700007,2021-01-12 10:52:22,Off on the wrong track,Better,Most of the time,Registered,Yes,No,Yes,No,Yes,No,Yes,No,Yes,No,Yes,Yes,,Strongly disapprove,Don't recall,,"Yes, I voted",Joe Biden,,Joe Biden,,The Democratic Primary/Caucus,Very unfavorable,Haven't heard enough,Very unfavorable,Very unfavorable,Very unfavorable,Very unfavorable,Very unfavorable,Very unfavorable,Haven't heard enough,Very unfavorable,Haven't heard enough,Very unfavorable,Haven't heard enough,Very unfavorable,Haven't heard enough,Haven't heard enough,Very unfavorable,Haven't heard enough,Somewhat unfavorable,Very favorable,Haven't heard enough,Somewhat favorable,...,Somewhat Agree,Somewhat Disagree,Very confident,Somewhat confident,Voted on Election Day in-person,0.0,0.0,Not at all,Not Asked,Not Asked,Somewhat concerned,No,No,No,No,Yes,No,Yes,Strongly disapprove,Strongly disapprove,Don't know,Strongly oppose,Strongly oppose,Strongly support,Somewhat oppose,Somewhat support,Somewhat oppose,Disapprove,Disapprove,Yes,Somewhat worried,20,Female,South,Not Hispanic,"Black, or African American","$75,000 to $79,999",High school graduate,NC,NC02,2.170623,0.76852,1.577373,Black,Black,Female,HS or Below,18--24,South,Voted
1,7700008,2021-01-12 10:55:11,Generally headed in the right direction,Better,,Registered,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,No,Nothing,Strongly approve,Donald Trump,,"Yes, I voted",Donald Trump,,Donald Trump,,The Republican Primary/Caucus,Very favorable,Very favorable,Very favorable,Somewhat unfavorable,Somewhat favorable,Very favorable,Very favorable,Very favorable,Very favorable,Very favorable,Very favorable,Somewhat unfavorable,Very favorable,Very favorable,Very favorable,Very favorable,Very favorable,Somewhat unfavorable,Somewhat favorable,Somewhat unfavorable,Very favorable,Somewhat unfavorable,...,Not Asked,Not Asked,Very confident,Very confident,Voted on Election Day in-person,100.0,100.0,Not Asked,Not Asked,A lot,Very concerned,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Strongly approve,Strongly approve,Strongly support,Strongly support,Strongly support,Strongly support,Strongly support,Strongly support,Strongly support,Approve,Approve,Yes,Very worried,59,Male,Midwest,Mexican,"Black, or African American","$150,000 to $174,999","College Degree (such as B.A., B.S.)",MI,MI06,3.612443,4.039212,3.223077,Black,Hispanic,Male,College,50--64,Midwest,Voted
2,7700009,2021-01-12 10:50:00,Off on the wrong track,About the same,Some of the time,Registered,Yes,No,No,Yes,Yes,Yes,No,No,Yes,No,No,No,,Strongly approve,Was not eligible to vote,,"Yes, I voted",Donald Trump,,Donald Trump,,The Republican Primary/Caucus,Very favorable,Somewhat favorable,Very favorable,Very favorable,Very favorable,Somewhat unfavorable,Very favorable,Very favorable,Very favorable,Somewhat favorable,Very favorable,Very favorable,Somewhat unfavorable,Very favorable,Very favorable,Very unfavorable,Very favorable,Somewhat unfavorable,Very favorable,Somewhat unfavorable,Somewhat favorable,Somewhat unfavorable,...,Not Asked,Somewhat Disagree,Not at all confident,Not too confident,Received my ballot through the mail and return...,100.0,50.0,Not Asked,Not Asked,Not at all,Very concerned,No,Yes,No,Yes,Yes,Yes,No,Somewhat approve,Somewhat approve,Strongly support,Strongly support,Somewhat oppose,Strongly support,Strongly support,Strongly support,Strongly support,Disapprove,Not sure,Not sure,Very worried,21,Female,West,Not Hispanic,White,"$100,000 to $124,999",Associate Degree,UT,UT02,5.011087,3.066427,5.010591,White,White,Female,Some college,18--24,West,Voted
3,7700010,2021-01-12 10:47:32,Off on the wrong track,About the same,Most of the time,Registered,No,Yes,Yes,No,Yes,Yes,No,No,No,No,No,No,,Somewhat approve,Donald Trump,,"Yes, I voted",Donald Trump,,Joe Biden,,Neither,Somewhat favorable,Somewhat favorable,Somewhat favorable,Somewhat favorable,Haven't heard enough,Very unfavorable,Somewhat favorable,Haven't heard enough,Haven't heard enough,Very unfavorable,Somewhat favorable,Somewhat favorable,Somewhat favorable,Somewhat favorable,Somewhat favorable,Very unfavorable,Somewhat favorable,Somewhat unfavorable,Somewhat favorable,Very unfavorable,Very unfavorable,Very unfavorable,...,Somewhat Disagree,Somewhat Disagree,Somewhat confident,Somewhat confident,Received my ballot through the mail and return...,100.0,100.0,Not Asked,Not Asked,Not at all,Very concerned,No,No,No,Yes,Yes,No,No,Somewhat approve,Somewhat approve,Somewhat support,Somewhat support,Somewhat support,Somewhat support,Somewhat support,Somewhat support,Somewhat support,Disapprove,Approve,Yes,Somewhat worried,68,Female,Midwest,Not Hispanic,White,"$80,000 to $84,999","Completed some college, but no degree",IA,IA01,0.968854,1.306608,1.041669,White,White,Female,Some college,65+,Midwest,Voted
4,7700011,2021-01-12 10:52:57,Off on the wrong track,Worse,Only now and then,Don't know,No,No,No,No,Yes,Yes,No,No,No,No,No,No,,Strongly disapprove,"Did not vote, but was eligible",,"No, I was eligible but did not vote",Not Asked,,Joe Biden,,Neither,Haven't heard enough,Somewhat favorable,Haven't heard enough,Somewhat unfavorable,Haven't heard enough,Somewhat favorable,Very unfavorable,Haven't heard enough,Haven't heard enough,Very unfavorable,Haven't heard enough,Somewhat favorable,Very favorable,Somewhat favorable,Very favorable,Very unfavorable,Somewhat unfavorable,Very favorable,Somewhat unfavorable,Somewhat unfavorable,Very favorable,Very favorable,...,Not Asked,Not Asked,Somewhat confident,Not at all confident,Not Asked,50.0,10.0,Not Asked,Not at all,Not Asked,Very concerned,No,Yes,No,Yes,Yes,No,No,Not sure,Somewhat approve,Strongly oppose,Somewhat oppose,Strongly support,Don't know,Strongly oppose,Somewhat oppose,Don't know,Disapprove,Disapprove,Not sure,Haven't thought much about this,46,Female,South,Not Hispanic,White,"$15,000 to $19,999",High school graduate,VA,VA04,0.17251,0.169041,0.169441,White,White,Female,HS or Below,35--49,South,Did not vote/NA


<class 'pandas.core.frame.DataFrame'>
Int64Index: 4138 entries, 0 to 4137
Data columns (total 241 columns):
 #    Column                           Dtype         
---   ------                           -----         
 0    response_id                      object        
 1    start_date                       datetime64[ns]
 2    right_track                      category      
 3    economy_better                   category      
 4    interest                         category      
 5    registration                     category      
 6    news_sources_facebook            category      
 7    news_sources_cnn                 category      
 8    news_sources_msnbc               category      
 9    news_sources_fox                 category      
 10   news_sources_network             category      
 11   news_sources_localtv             category      
 12   news_sources_telemundo           category      
 13   news_sources_npr                 category      
 14   news_sources_amtalk   

In [37]:
catvars = ["gender_lab", "race_lab", "educ_lab", "agegroup_lab", "region_lab", "voted_lab"]

for var in catvars:
    _dfyg = (
        df_yougov[var]
        .value_counts(normalize=True)
        .reset_index()
        .rename_column(var, "yougov")
    )
    _dfcps = (
        df_cps[var]
        .value_counts(normalize=True)
        .reset_index()
        .rename_column(var, "cps")
    )
    _dfns = (
        df_ns[var]
        .value_counts(normalize=True)
        .reset_index()
        .rename_column(var, "nscape")
    )    
    _dfsumm = (
        _dfyg
        .merge(_dfcps, on="index", validate="1:1")
        .merge(_dfns, on="index", validate="1:1")
        .round(3)
    )
    
    # reorder agegroup
    if var == "agegroup_lab":
        _dfsumm["index"] = (
            pd.Categorical(
                _dfsumm["index"], categories=agegroup_order, ordered=True)
        )
        _dfsumm = _dfsumm.sort_values("index")
    
    # reorder regions    
    if var == "region_lab":
        _dfsumm["index"] = pd.Categorical(
            _dfsumm["index"], categories=region_order, ordered=True
        )
        _dfsumm = _dfsumm.sort_values("index")
        _dfsumm["index"] = _dfsumm["index"].apply(lambda x: x + " region")
        
    if var=="educ_lab":
#         _dfsumm["ed"]
        _dfsumm.loc[_dfsumm['index'] == 'Postgrad', 'index'] = 'Postgraduate degree'
        _dfsumm.loc[_dfsumm['index'] == 'HS or Below', 'index'] = 'High school or below'
        _dfsumm.loc[_dfsumm['index'] == 'College', 'index'] = 'College degree'

    if var=="voted_lab":
        _dfsumm['cps'] = "---"

#     pandas_to_tex(_dfsumm, f"../tabs/yg_cps_nscape_{var}")
    display(_dfsumm)

Unnamed: 0,index,yougov,cps,nscape
0,Female,0.525,0.52,0.59
1,Male,0.475,0.48,0.41


Unnamed: 0,index,yougov,cps,nscape
0,White,0.635,0.673,0.704
1,Hispanic,0.148,0.141,0.124
2,Black,0.127,0.099,0.109
3,Other,0.049,0.023,0.023
4,Asian,0.041,0.064,0.04


Unnamed: 0,index,yougov,cps,nscape
0,High school or below,0.362,0.382,0.277
1,Some college,0.287,0.267,0.354
2,College degree,0.225,0.219,0.248
3,Postgraduate degree,0.125,0.132,0.121


Unnamed: 0,index,yougov,cps,nscape
4,18--24,0.094,0.112,0.111
3,25--34,0.177,0.143,0.142
0,35--49,0.257,0.24,0.258
1,50--64,0.247,0.248,0.319
2,65+,0.226,0.257,0.17


Unnamed: 0,index,yougov,cps,nscape
1,West region,0.202,0.274,0.216
2,Midwest region,0.2,0.193,0.246
3,Northeast region,0.178,0.161,0.176
0,South region,0.421,0.371,0.362


Unnamed: 0,index,yougov,cps,nscape
0,Voted,0.638,---,0.772
1,Did not vote/NA,0.362,---,0.228
