# YouGov Benchmark Survey
Code that reproduces the benchmark survey from YouGov

We start by importing the required libraries and loading the OpenAI API key.

In [19]:
import openai
import os
import pandas as pd
import numpy as np
from ipfn import ipfn
# Remember to activate environment with source synthsurvey/bin/activate

In [20]:
# Read the API key from a file
# (path is relative to the notebook's working directory; surrounding whitespace is stripped)
with open('../code/api_key.txt', 'r') as file:
    api_key = file.read().strip()

# Publish the key through the OPENAI_API_KEY environment variable for this process
os.environ['OPENAI_API_KEY'] = api_key

Next, we load the IPUMS 2022 microdata from which our respondents will be sampled.

In [21]:
# Open ipums data
# (person-level extract; path is relative to the notebook's working directory)
df = pd.read_csv('../data/ipums/ipums_2022.csv')

# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,YEAR,SERIAL,REGION,STATEFIP,CITY,OWNERSHP,HHINCOME,PERNUM,PERWT,SEX,...,RACE,HISPAN,CITIZEN,SCHOOL,EDUC,EMPSTAT,CLASSWKR,OCC,INCTOT,INCWAGE
0,2022,1.0,East South Central Div.,Alabama,Not in identifiable city (or size group),,,1,69,Female,...,Two major races,Not Hispanic,,"No, not in school",1 year of college,Not in labor force,,0,18800.0,0.0
1,2022,2.0,East South Central Div.,Alabama,Not in identifiable city (or size group),,,1,22,Male,...,White,Not Hispanic,,"No, not in school",Grade 12,Not in labor force,Works for wages,9645,12500.0,12500.0
2,2022,3.0,East South Central Div.,Alabama,Not in identifiable city (or size group),,,1,45,Female,...,Black/African American,Not Hispanic,,"No, not in school","Grade 5, 6, 7, or 8",Not in labor force,Works for wages,8800,16400.0,16400.0
3,2022,4.0,East South Central Div.,Alabama,Not in identifiable city (or size group),,,1,4,Male,...,Black/African American,Not Hispanic,,"No, not in school",N/A or no schooling,Not in labor force,,0,8600.0,0.0
4,2022,5.0,East South Central Div.,Alabama,Not in identifiable city (or size group),,,1,47,Male,...,White,Not Hispanic,,"No, not in school",1 year of college,Not in labor force,Self-employed,6230,5000.0,0.0


### Data preparation

In [None]:
# 1. Keep adults only (AGE >= 18); take a copy so the column assignments
#    below write to an independent frame rather than a slice of the raw data
adult_mask = df['AGE'] >= 18
df = df[adult_mask].copy()

# 2. Bucket AGE into four bands. With right-closed bins and include_lowest=True
#    the intervals are [18, 29], (29, 44], (44, 64], (64, inf).
age_labels = ['18-29', '30-44', '45-64', '65+']
age_bins = [18, 29, 44, 64, np.inf]
df['AGE_GROUP'] = pd.cut(
    df['AGE'],
    bins=age_bins,
    labels=age_labels,
    right=True,
    include_lowest=True,
)

# 3. Create RACE_ETHNICITY variable by combining RACE and HISPAN
def determine_race_ethnicity(row):
    """Collapse a row's HISPAN and RACE labels into one race/ethnicity category.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'HISPAN' and 'RACE'; both are converted to str and stripped.

    Returns
    -------
    str
        'Hispanic' whenever HISPAN is anything other than 'Not Hispanic';
        otherwise 'White', 'Black' (for 'Black/African American') or 'Other'.

    Notes
    -----
    Hispanic origin takes precedence over race. Because the check is
    "not equal to 'Not Hispanic'", a missing HISPAN value (stringified as
    'nan') is also labelled 'Hispanic'.
    """
    hispanic_origin = str(row['HISPAN']).strip()
    race_label = str(row['RACE']).strip()

    # Any Hispanic origin wins regardless of the reported race.
    if hispanic_origin != 'Not Hispanic':
        return 'Hispanic'

    non_hispanic_groups = {
        'White': 'White',
        'Black/African American': 'Black',
    }
    return non_hispanic_groups.get(race_label, 'Other')

# Label every row: axis=1 makes apply call determine_race_ethnicity once per row
df['RACE_ETHNICITY'] = df.apply(determine_race_ethnicity, axis=1)

# 4. Create INCOME_GROUP variable using 'HHINCOME' and handle N/A values
def determine_income_group(row):
    """Assign a household-income bracket to a row based on its 'HHINCOME' value.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'HHINCOME' (numeric or missing).

    Returns
    -------
    str
        'No income/did not report' when HHINCOME is null or exactly 0,
        'Less than $50k' below 50,000 (negative values included),
        '$50k-$100k' from 50,000 up to but excluding 100,000,
        '$100k or higher' otherwise.
    """
    income = row['HHINCOME']

    # Missing and zero incomes share a single bucket.
    if pd.isnull(income) or income == 0:
        return 'No income/did not report'
    if income < 50000:
        return 'Less than $50k'
    if income < 100000:
        return '$50k-$100k'
    return '$100k or higher'

# Bracket every row: axis=1 makes apply call determine_income_group once per row
df['INCOME_GROUP'] = df.apply(determine_income_group, axis=1)

# 5. Create REGION_GROUP variable
def determine_region_group(division):
    """Map a census-division label to its region group.

    Parameters
    ----------
    division : any
        Division label; it is converted to str and stripped before lookup.

    Returns
    -------
    str
        'Northeast', 'Midwest', 'South' or 'West' for the nine recognised
        division labels; 'Other' for anything else (including missing values).
    """
    region_by_division = {
        'New England Division': 'Northeast',
        'Middle Atlantic Division': 'Northeast',
        'East North Central Div.': 'Midwest',
        'West North Central Div.': 'Midwest',
        'South Atlantic Division': 'South',
        'East South Central Div.': 'South',
        'West South Central Div.': 'South',
        'Mountain Division': 'West',
        'Pacific Division': 'West',
    }
    # Unrecognised labels fall back to 'Other' instead of raising.
    return region_by_division.get(str(division).strip(), 'Other')

# Map each census division in REGION to its region group
df['REGION_GROUP'] = df['REGION'].apply(determine_region_group)

# Ensure that all variables are strings to avoid any type issues
# (AGE_GROUP comes out of pd.cut as a categorical, so it is converted here too)
variables = ['SEX', 'AGE_GROUP', 'RACE_ETHNICITY', 'INCOME_GROUP', 'REGION_GROUP']
for var in variables:
    df[var] = df[var].astype(str).str.strip()

# New var called total: a unit value per row, summed per group when the marginals are built
df['total'] = 1.0


# If the dataset is exported at this point, it is the "clean" one


In [23]:
# Observed marginal totals: sum of `total` within each level of every
# demographic variable, stored as float Series indexed by level.
xsex, xage, xrace, xinc, xreg = (
    df.groupby(column)['total'].sum().astype(float)
    for column in ('SEX', 'AGE_GROUP', 'RACE_ETHNICITY', 'INCOME_GROUP', 'REGION_GROUP')
)

In [24]:
# Inspect the age-group marginal totals
print(xage)

AGE_GROUP
18-29    476466.0
30-44    617299.0
45-64    879183.0
65+      754724.0
Name: total, dtype: float64


In [None]:
N = df.shape[0]

# Desired population share of every level, per demographic variable.
# (In the application the user picks these values with a slider.)
target_shares = [
    (xsex, {'Female': 0.53, 'Male': 0.47}),
    (xage, {'18-29': 0.22, '30-44': 0.17, '45-64': 0.36, '65+': 0.25}),
    (xrace, {'White': 0.62, 'Black': 0.14, 'Hispanic': 0.14, 'Other': 0.10}),
    (xinc, {
        'Less than $50k': 0.40,
        '$50k-$100k': 0.30,
        '$100k or higher': 0.19,
        'No income/did not report': 0.11,
    }),
    (xreg, {'Northeast': 0.18, 'Midwest': 0.21, 'South': 0.39, 'West': 0.22}),
]

# Overwrite each observed marginal total with share * N, so every marginal
# is expressed on the scale of the full data set.
for marginal, shares in target_shares:
    for level, share in shares.items():
        marginal.loc[level] = share * N

# Open question: what about specifics that are not present in the survey?
#     e.g. "you are a 27-year-old female ..., do you use Android?"
#         if yes, then the next question is asked


In [27]:
# Inspect the region marginal totals
print(xreg)

REGION_GROUP
Midwest       572811.12
Northeast     490980.96
South        1063792.08
West          600087.84
Name: total, dtype: float64


In [None]:
# Target totals: how many people are wanted in each group, one Series per variable
aggregates = [xsex, xage, xrace, xinc, xreg]
# Column(s) each aggregate refers to, in the same order as `aggregates`
dimensions = [['SEX'], ['AGE_GROUP'], ['RACE_ETHNICITY'], ['INCOME_GROUP'], ['REGION_GROUP']]

# Run the ipfn fitting procedure to obtain the weight carried by each row
# NOTE(review): presumably ipfn adjusts the 'total' column in place of the unit weights — confirm against the ipfn docs
IPF = ipfn.ipfn(df, aggregates, dimensions)
df = IPF.iteration()

# df is then meant to be saved to CSV (not done in this cell)

In [34]:
# Preview the frame returned by the IPF step
df.head()

Unnamed: 0,REGION_GROUP,SEX,YEAR,SERIAL,REGION,STATEFIP,CITY,OWNERSHP,HHINCOME,PERNUM,...,EDUC,EMPSTAT,CLASSWKR,OCC,INCTOT,INCWAGE,AGE_GROUP,RACE_ETHNICITY,INCOME_GROUP,total
0,South,Female,2022,1.0,East South Central Div.,Alabama,Not in identifiable city (or size group),,,1,...,1 year of college,Not in labor force,,0,18800.0,0.0,65+,Other,No income/did not report,0.996966
1,South,Male,2022,2.0,East South Central Div.,Alabama,Not in identifiable city (or size group),,,1,...,Grade 12,Not in labor force,Works for wages,9645,12500.0,12500.0,45-64,White,No income/did not report,1.563485
2,South,Female,2022,3.0,East South Central Div.,Alabama,Not in identifiable city (or size group),,,1,...,"Grade 5, 6, 7, or 8",Not in labor force,Works for wages,8800,16400.0,16400.0,30-44,Black,No income/did not report,1.565231
3,South,Male,2022,4.0,East South Central Div.,Alabama,Not in identifiable city (or size group),,,1,...,N/A or no schooling,Not in labor force,,0,8600.0,0.0,65+,Black,No income/did not report,1.425004
4,South,Male,2022,5.0,East South Central Div.,Alabama,Not in identifiable city (or size group),,,1,...,1 year of college,Not in labor force,Self-employed,6230,5000.0,0.0,45-64,White,No income/did not report,1.563485


### Sample Observations Using Adjusted Weights
Draw a weighted random sample of respondents (with replacement), using the IPF-adjusted weights as sampling probabilities.

In [35]:
# Number of respondents to draw.
# Fixed: this was `sample_size == 1098`, a comparison that raises NameError on a
# fresh kernel because `sample_size` is never assigned.
sample_size = 1098

# Normalize the adjusted weights so they sum to 1 (i.e. sampling probabilities)
df['adjusted_weight_normalized'] = df['total'] / df['total'].sum()

# Sample observations with replacement; the fixed seed makes the draw reproducible
random_seed = 50
df_sample = df.sample(n=sample_size, weights='adjusted_weight_normalized', replace=True, random_state=random_seed)


### Validate the Sample's Marginal Distributions
Check that the sample aligns closely with the desired demographic proportions.

In [37]:
def get_distribution(df, variable):
    """Tabulate one column of `df` as counts and as proportions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to tabulate.
    variable : str
        Name of the column.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        Counts per category and proportions per category (fractions that sum
        to 1, not percentages), both sorted by category label.
    """
    column = df[variable]
    absolute = column.value_counts().sort_index()
    relative = column.value_counts(normalize=True).sort_index()
    return absolute, relative

variables = ['SEX', 'AGE_GROUP', 'RACE_ETHNICITY', 'INCOME_GROUP', 'REGION_GROUP']

# For every demographic variable, print the share of df_sample in each category
for var in variables:
    print(f'Distribution of {var}:\n')

    # Only the percentages are reported; the counts are computed but not printed
    sample_counts, sample_percentages = get_distribution(df_sample, var)
    print('Sample distribution in df_sample:', 'Percentages (%):', sep='\n')
    print(sample_percentages.mul(100).round(2))


Distribution of SEX:

Sample distribution in df_sample:
Percentages (%):
SEX
Female    51.37
Male      48.63
Name: proportion, dtype: float64
Distribution of AGE_GROUP:

Sample distribution in df_sample:
Percentages (%):
AGE_GROUP
18-29    21.40
30-44    16.85
45-64    37.25
65+      24.50
Name: proportion, dtype: float64
Distribution of RACE_ETHNICITY:

Sample distribution in df_sample:
Percentages (%):
RACE_ETHNICITY
Black       15.66
Hispanic    13.66
Other        9.20
White       61.48
Name: proportion, dtype: float64
Distribution of INCOME_GROUP:

Sample distribution in df_sample:
Percentages (%):
INCOME_GROUP
$100k or higher             18.58
$50k-$100k                  29.51
Less than $50k              38.89
No income/did not report    13.02
Name: proportion, dtype: float64
Distribution of REGION_GROUP:

Sample distribution in df_sample:
Percentages (%):
REGION_GROUP
Midwest      20.58
Northeast    17.30
South        39.89
West         22.22
Name: proportion, dtype: float64
