In [None]:
import time
import numpy as np
import openai
from tqdm import tqdm
import pickle
import re
import sys
import os
import pandas as pd
from dotenv import load_dotenv

# Load environment variables using python-dotenv's load_dotenv()
# NOTE(review): presumably this supplies the OpenAI credentials needed by do_query — confirm
load_dotenv()


# ==========================================================================

def do_query(prompt, max_tokens=2):
    """Send `prompt` as a single user message to the chat model and return the reply.

    Args:
        prompt: Text placed in the one user-role message of the request.
        max_tokens: Upper bound on generated tokens forwarded to the API
            (defaults to 2).

    Returns:
        Whatever `openai.ChatCompletion.create` returns, unmodified.
    """
    request_options = {
        "model": "gpt-4o-mini",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.7,
        "max_tokens": max_tokens,
        "top_p": 1,
        # NOTE(review): the Chat Completions API appears to document `logprobs`
        # as a boolean with a separate `top_logprobs` count — confirm that the
        # installed client/API accepts an integer here.
        "logprobs": 100,
    }
    reply = openai.ChatCompletion.create(**request_options)
    # Fixed one-second pause after every call to avoid rate limiters
    time.sleep(1.0)
    return reply

# ==========================================================================
def uniqvals(users, field):
    """Return the distinct values of `field` across all user records.

    Args:
        users: Mapping of user id -> record, where each record supports
            `record[field]`.
        field: Key to look up in every record.

    Returns:
        A list of the distinct values, ordered by first appearance in
        `users`. Going through a `set` yields an arbitrary order, so the
        result is built with `dict.fromkeys` to make reruns reproducible.

    Raises:
        KeyError: If any record lacks `field`.
    """
    # dict.fromkeys de-duplicates while preserving insertion order
    return list(dict.fromkeys(record[field] for record in users.values()))

# Lookup tables translating raw survey values into the phrases that are spliced
# into first-person bio sentences. Outer keys are column names; inner keys are
# the raw values found in that column. An empty-string phrase means "say
# nothing about this attribute".
fields_of_interest = {
    "Gender": {
        "Male": "male",
        "Female": "female",
        '': ''
    },
    "Hisp": {
        "Hispanic": "Hispanic",
        "Not Hispanic": '',
        '': ''
    },
    "WHITE": {
        "White": "white",
        "Non-white": '',
        '': ''
    },
    "Ideo": {
        '': '',
        'Liberal': 'liberal',
        'Slightly conservative': 'slightly conservative',
        'Conservative': 'conservative',
        'Slightly liberal': 'slightly liberal',
        "Moderate/Haven't thought about it": 'moderate',
        'Extremely Liberal': 'extremely liberal',
        'Extremely conservative': 'extremely conservative',
    },
    # Phrases are written to follow "Politically, I ..."
    "PID7": {
        '': '',
        'Ind': 'am an independent',
        'Strong D': 'am a strong Democrat',
        'Strong R': 'am a strong Republican',
        'Lean D': 'lean towards Democrats',
        # Spelling fixed ("Rebublicans" -> "Republicans"); this text appears verbatim in bios
        'Lean R': 'lean towards Republicans',
        'Weak D': 'am a weak Democrat',
        'Weak R': 'am a weak Republican',
    },
    # Income brackets are collapsed into coarse class labels
    "Inc": {
        '': '',
        'Less than $15K': 'very poor',
        '$15K to $25K': 'poor',
        '$25K to $50K': 'poor',
        '$50K to $75K': 'middle-class',
        '$75K to $100K': 'middle-class',
        '$100K to $150K': 'middle-class',
        '$150K to $200K': 'upper-class',
        '$200K to $250K': 'upper-class',
        '$250K to $500K': 'upper-class',
        'Prefer not to answer': '',
        '-8': '',
    },
}

def mapper(profile):
    """Translate one raw survey record into bio-ready descriptor phrases.

    Args:
        profile: Mapping of column name -> raw value for a single respondent.

    Returns:
        A dict with one entry per key of `profile` that is also a key of
        `fields_of_interest`; raw values without a mapping become ''.
        An 'Age' entry is added when `profile` carries a non-blank age:
        'young' (18-24), 'middle-aged' (25-39), 'old' (40-59),
        'very old' (60-99), or '' when the age is out of range or cannot be
        parsed as a number (e.g. NaN). When 'Age' is absent or '' no 'Age'
        entry is produced.
    """
    results = {
        field: fields_of_interest[field].get(raw_value, '')
        for field, raw_value in profile.items()
        if field in fields_of_interest
    }

    # A missing 'Age' key is treated like a blank age instead of raising KeyError
    raw_age = profile.get('Age', '')
    if isinstance(raw_age, str) and raw_age == '':
        return results

    try:
        # Going through float() accepts strings such as "42.0"; NaN and
        # infinities fail the int() conversion and fall into the handler.
        age = int(float(raw_age))
    except (TypeError, ValueError, OverflowError):
        results['Age'] = ''
        return results

    if 18 <= age < 25:
        results['Age'] = 'young'
    elif 25 <= age < 40:
        results['Age'] = 'middle-aged'
    elif 40 <= age < 60:
        results['Age'] = 'old'
    elif 60 <= age < 100:
        results['Age'] = 'very old'
    else:
        results['Age'] = ''
    return results


In [None]:

# Read the CSV file into a DataFrame
df = pd.read_csv("../data/ppfull.csv")

# Race: keep a respondent only if at least one race column yields a usable
# phrase, i.e. WHITE is 'White' or Hisp is 'Hispanic'
df = df[(df['WHITE'] == 'White') | (df['Hisp'] == 'Hispanic')]

# Age: coerce to numbers first so blank or non-numeric cells become NaN (and
# are dropped by the range test) instead of aborting the whole conversion.
age_numeric = pd.to_numeric(df['Age'], errors='coerce')
df = df[(age_numeric >= 18) & (age_numeric < 100)]

# Categorical features: keep only raw values whose mapped phrase is non-empty.
# Filtering on the mapped phrase (not merely on membership in the lookup
# table) also removes answers such as 'Prefer not to answer', which map to ''.
for field in ('Ideo', 'PID7', 'Gender', 'Inc'):
    usable_values = [
        raw_value
        for raw_value, phrase in fields_of_interest[field].items()
        if phrase != ''
    ]
    df = df[df[field].isin(usable_values)]

# Convert the DataFrame to a dictionary keyed by the first column's values,
# each entry being a {column name: value} dict for one row
dmap = df.set_index(df.columns[0]).T.to_dict()


# Build one record per respondent: the mapped descriptor phrases plus a
# first-person bio assembled from them.
results = {}
ids = dmap.keys()
for respondent_id in tqdm(ids):
    user_profile = mapper(dmap[respondent_id])

    # Mapped phrases; '' means there is nothing to say about that attribute
    ideology = user_profile.get('Ideo', '')
    party = user_profile.get('PID7', '')
    race_white = user_profile.get('WHITE', '')
    hispanic = user_profile.get('Hisp', '')
    gender = user_profile.get('Gender', '')
    income = user_profile.get('Inc', '')
    age = user_profile.get('Age', '')

    # Assemble the bio sentence by sentence; each fragment carries its own
    # trailing ". " so the pieces can simply be concatenated.
    sentences = []
    if ideology != '':
        sentences.append("Ideologically, I describe myself as " + ideology + ". ")
    if party != '':
        sentences.append("Politically, I " + party + ". ")
    # mapper() yields lowercase 'white' for the raw value 'White'; comparing
    # against 'White' here would never match and the sentence would be lost.
    if race_white == 'white':
        sentences.append("Racially, I am white. ")
    if hispanic == 'Hispanic':
        sentences.append("Racially, I am Hispanic. ")
    if gender != '':
        sentences.append("I am " + gender + ". ")
    if income != '':
        sentences.append("Financially, I am " + income + ". ")
    if age != '':
        sentences.append("In terms of my age, I am " + age + ". ")

    # Store all features and bio
    results[respondent_id] = {
        'id': respondent_id,
        'ideology': ideology,
        'political_affiliation': party,
        'race_white': race_white,
        'hispanic': hispanic,
        'gender': gender,
        'income': income,
        'age': age,
        'bio': "".join(sentences),
    }

# One row per respondent. NOTE: this rebinds `df` from the filtered survey
# table to the feature/bio table.
df = pd.DataFrame.from_dict(results, orient='index')


In [None]:
# Display the per-respondent table of mapped features and generated bios
df

In [None]:
# Tally how many respondents share each distinct combination of feature/bio
# values; the id column is left out because it is unique per respondent
rows_without_id = df.drop(labels='id', axis='columns')
duplicate_value_counts = rows_without_id.value_counts()

# Show the tally for every distinct combination
duplicate_value_counts

In [None]:

# Keep only the first respondent for each distinct combination of the
# non-id columns
feature_columns = df.columns.difference(['id'])
is_repeat = df.duplicated(subset=feature_columns)
unique_df = df[~is_repeat]

# Write the de-duplicated table to CSV, omitting the index column
unique_df.to_csv('../data/pigeonhole_human_data.csv', index=False)
