### Import necessary libraries, set options

In [None]:
import numpy as np
import os
import pandas as pd
import re

pd.set_option('display.max_columns', 175)

### Read in dataset

In [None]:
# Directory that holds the raw survey export (placeholder path).
path_to_raw_data = os.path.join("path/to/raw/data")

data = pd.read_csv(os.path.join(path_to_raw_data, "raw-data.csv"))

# Add the three condition columns as all-missing placeholders; they are
# filled in further down from the quiz-answer columns.
data = data.assign(
    condition=np.nan,
    ineq_condition=np.nan,
    mobil_condition=np.nan,
)

In [None]:
# Show the raw frame's dimensions and preview its first rows.
print(data.shape)
data.head()

### Remove unnecessary rows

In [None]:
# Discard the rows labelled 0 and 1, then every response whose Status is
# "Survey Preview".
# NOTE(review): rows 0-1 are presumably the extra header rows of the survey
# export rather than responses -- confirm against the raw file.
data2 = data.drop(index=[0, 1])
data2 = data2.loc[data2["Status"] != "Survey Preview"]
print(data2.shape)
data2.head()

In [None]:
# Preview the last rows of the filtered frame.
data2.tail()

### Verify that respondents are unique

In [None]:
# Number of distinct respondent ids; compare with the row count printed above.
len(np.unique(data2.rid))

In [None]:
# Rows whose rid repeats that of an earlier row (the first occurrence of each
# rid is not listed, per the default keep='first').
data2[data2.duplicated(subset = "rid")]

In [None]:
#data2[data2.rid == "XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX"] # Didn't give consent, so will be dropped anyway

### Remove respondents who said "No" to the consent question

In [None]:
# Exact text of the affirmative consent option; only rows carrying it are kept.
consent_given = (
    "Yes, I would like to take part in this study, and confirm that "
    "I AM A US RESIDENT and am 18 or older"
)
data3 = data2.loc[data2["IRB Consent"] == consent_given]

In [None]:
# Dimensions and first rows after the consent filter.
print(data3.shape)
data3.head()

### Reorganize/rename columns, removing sensitive info along the way

In [None]:
# Experiment columns: the three condition placeholders, the four quiz-answer
# fields and the three display-order fields.
exp_main = (
    ["condition", "ineq_condition", "mobil_condition"]
    + ["IIQ", "IDQ", "MDQ", "MIQ"]
    + ["FL_3_DO", "FL_19_DO", "FL_20_DO"]
)

# Outcome questions O1-O4 (O2.2 is split into five rank fields), followed by
# the block-level and within-block display-order fields.
outcome = (
    ["O1.1", "O1.2", "O1.3"]
    + ["O2.1"]
    + ["O2.2_" + str(rank) for rank in range(1, 6)]
    + ["O2.3"]
    + ["O3.1", "O3.2", "O3.3"]
    + ["O4." + str(item) for item in range(1, 6)]
    + ["FL_34_DO", "O1_DO", "O2_DO", "O3_DO", "O4_DO"]
)

# MC1/MC2 and their display order.
mc = ["MC1", "MC2", "MC_DO"]

# Demographic questions DQ1 .. DQ15.
demog = ["DQ" + str(number) for number in range(1, 16)]

In [None]:
# Every timer field comes with the same four metric columns.
timing_suffixes = ("First Click", "Last Click", "Page Submit", "Click Count")

# Auxiliary experiment columns: SEQ and LTQ plus the timer metrics, in the
# same order as the explicit listing they replace.
exp_aux = (
    ["SEQ"]
    + [timer + "_" + suffix
       for timer in ("TSEAT", "TSEAF")
       for suffix in timing_suffixes]
    + ["LTQ"]
    + [timer + "_" + suffix
       for timer in ("TLTAT", "TLTAF",
                     "TIIAT", "TIIAF",
                     "TIDAT", "TIDAF",
                     "TMDAT", "TMDAF",
                     "TMIAT", "TMIAF")
       for suffix in timing_suffixes]
)

consent = ["IRB Consent"]

# Survey-platform bookkeeping columns.
qualtrics_other = [
    "StartDate",
    "EndDate",
    "Status",
    "Progress",
    "Duration (in seconds)",
    "Finished",
    "RecordedDate",
    "RecipientLastName",
    "RecipientFirstName",
    "RecipientEmail",
    "ExternalReference",
    "DistributionChannel",
    "UserLanguage",
]

# Panel-supplied respondent attributes.
lucid_other = [
    "age",
    "gender",
    "hhi",
    "ethnicity",
    "hispanic",
    "education",
    "political_party",
    "region",
]

In [None]:
# Identifying fields. Neither list is used by the column selections visible
# in this notebook, so these columns are left behind when columns are picked.
qualtrics_sensitive = ["IPAddress", "ResponseId", "LocationLatitude", "LocationLongitude"]
lucid_sensitive = ["rid", "zip"]

In [None]:
# Keep every listed column group; anything not listed (including the
# sensitive fields) is dropped here.
data4 = data3.loc[
    :,
    exp_main + outcome + mc + demog + exp_aux + consent + qualtrics_other + lucid_other,
]

In [None]:
# Preview the reordered frame.
data4.head()

In [None]:
# Narrow down to the experiment, outcome, MC and demographic columns only.
data5 = data4.loc[:, exp_main + outcome + mc + demog]

In [None]:
# Preview the narrowed frame.
data5.head()

In [None]:
# Survey field code -> descriptive column name, built up section by section.
column_labels = {}

# Experiment: quiz answers and display-order fields
column_labels.update({
    "IIQ": "ineq_ans_ii",
    "IDQ": "ineq_ans_id",
    "MDQ": "mobil_ans_md",
    "MIQ": "mobil_ans_mi",
    "FL_3_DO": "ineq_or_mobil_first",
    "FL_19_DO": "ineq_condition_text",
    "FL_20_DO": "mobil_condition_text",
})

# Outcome questions
column_labels.update({
    "O1.1": "income_result_of_circumstances",
    "O1.2": "equal_outcomes",
    "O1.3": "govt_should_take_active_steps",
    "O2.1": "ineq_is_a_serious_problem",
    "O2.2_1": "how_to_fix_ineq_1st_choice",
    "O2.2_2": "how_to_fix_ineq_2nd_choice",
    "O2.2_3": "how_to_fix_ineq_3rd_choice",
    "O2.2_4": "how_to_fix_ineq_4th_choice",
    "O2.2_5": "how_to_fix_ineq_5th_choice",
    "O2.3": "how_to_fix_ineq_text",
    "O3.1": "high_earners_rarely_deserving",
    "O3.2": "increase_taxes_on_millionares",
    "O3.3": "increase_estate_tax",
    "O4.1": "increase_min_wage",
    "O4.2": "increase_aid_to_poor",
    "O4.3": "increase_food_stamps",
    "O4.4": "support_entrepreneurs",
    "O4.5": "support_housing",
})

# Outcome display-order fields
column_labels.update({
    "FL_34_DO": "outcome_blocks_order",
    "O1_DO": "outcome_block1_order",
    "O2_DO": "outcome_block2_order",
    "O3_DO": "outcome_block3_order",
    "O4_DO": "outcome_block4_order",
})

# MC questions
column_labels.update({
    "MC1": "gap_btw_rich_and_poor_increasing",
    "MC2": "children_have_worse_chances",
    "MC_DO": "mc_order",
})

# Demographics
column_labels.update({
    "DQ1": "is_us_resident",
    "DQ2": "state",
    "DQ3": "gender",
    "DQ4": "age",
    "DQ5": "marital_status",
    "DQ6": "has_children",
    "DQ7": "race",
    "DQ8": "education",
    "DQ9": "employment_status",
    "DQ10": "income",
    "DQ11": "intergen_mobility_down",
    "DQ12": "intragen_mobility_down",
    "DQ13": "income_volatility",
    "DQ14": "liberal_vs_conservative",
    "DQ15": "democrat_vs_republican",
})

data6 = data5.rename(columns=column_labels)

In [None]:
# Preview the renamed columns.
data6.head()

### Fill in condition values

In [None]:
# Replace the index with a fresh 0..n-1 range (the old labels are discarded)
# and work on an independent copy from here on.
data7 = data6.reset_index(drop = True).copy()

In [None]:
# Derive each arm from which quiz-answer column is filled in, using boolean
# masks over whole columns instead of writing to data7 while iterating over
# it with iterrows() (pandas warns against mutating a frame being iterated).
# Precedence matches the former if/elif chain: the first column of each pair
# wins when both happen to be present; rows with neither answer stay missing.
has_ans_ii = data7['ineq_ans_ii'].notna()
has_ans_id = data7['ineq_ans_id'].notna()
data7.loc[has_ans_ii, 'ineq_condition'] = 1
data7.loc[~has_ans_ii & has_ans_id, 'ineq_condition'] = 0

has_ans_md = data7['mobil_ans_md'].notna()
has_ans_mi = data7['mobil_ans_mi'].notna()
data7.loc[has_ans_md, 'mobil_condition'] = 1
data7.loc[~has_ans_md & has_ans_mi, 'mobil_condition'] = 0

In [None]:
# Check the filled-in arm indicators.
data7.head()

In [None]:
# 'condition' starts out as an all-NaN float column; make it object dtype
# before storing text so the assignment does not depend on silent upcasting
# (deprecated since pandas 2.1).
data7['condition'] = data7['condition'].astype(object)

# (ineq_condition, mobil_condition) -> label of the combined condition.
condition_labels = {
    (0, 0): 'decr_ineq_incr_mobil',
    (0, 1): 'decr_ineq_decr_mobil',
    (1, 0): 'incr_ineq_incr_mobil',
    (1, 1): 'incr_ineq_decr_mobil',
}

# One masked assignment per cell of the 2x2 design; rows where either arm is
# missing match no mask and keep a missing label. This replaces a per-row
# iterrows() loop that wrote into the frame it was iterating over.
for (ineq_arm, mobil_arm), label in condition_labels.items():
    in_cell = (data7['ineq_condition'] == ineq_arm) & (data7['mobil_condition'] == mobil_arm)
    data7.loc[in_cell, 'condition'] = label

In [None]:
# Check the combined condition labels.
data7.head()

### Combine quiz answers into a single column

In [None]:
data8 = data7.copy()

# For each quiz, take the answer from the first column of the pair and fall
# back to the second column wherever the first is missing.
data8["ineq_quiz_ans"] = data8["ineq_ans_ii"].fillna(data8["ineq_ans_id"])
data8["mobil_quiz_ans"] = data8["mobil_ans_md"].fillna(data8["mobil_ans_mi"])

data8.head()

### Parse display order columns

In [None]:
# Independent copy for the display-order parsing step.
data9 = data8.copy()

In [None]:
def parse_exp_do(row):
    """Return the display order of the two experiment blocks for one row.

    Reads ``row['ineq_or_mobil_first']``, a "|"-separated string in which
    'FL_19' is relabelled 'ineq' and 'FL_20' is relabelled 'mobil'.
    Returns the relabelled items as a list, or an empty list when the
    entry is NaN.
    """
    raw_order = row['ineq_or_mobil_first']
    # NaN is the only value that is not equal to itself.
    if raw_order != raw_order:
        return []
    relabelled = re.sub('FL_20', 'mobil', re.sub('FL_19', 'ineq', raw_order))
    return relabelled.split("|")

# Row-wise apply: each cell of the new column holds the list returned by
# parse_exp_do for that row.
data9['exp_display_order'] = data9.apply(parse_exp_do, axis = 1)
data9.head()

In [None]:
# Independent copy for the outcome display-order step.
data10 = data9.copy()

In [None]:
# Outcome block code -> column holding the question order inside that block.
_OUTCOME_BLOCK_COLUMNS = {
    "O1": 'outcome_block1_order',
    "O2": 'outcome_block2_order',
    "O3": 'outcome_block3_order',
    "O4": 'outcome_block4_order',
}

# Outcome question code -> descriptive name.
_OUTCOME_QUESTION_NAMES = {
    "O1.1": "income_result_of_circumstances",
    "O1.2": "equal_outcomes",
    "O1.3": "govt_should_take_active_steps",
    "O2.1": "ineq_is_a_serious_problem",
    "O2.2": "how_to_fix_ineq_rank",
    "O2.3": "how_to_fix_ineq_text",
    "O3.1": "high_earners_rarely_deserving",
    "O3.2": "increase_taxes_on_millionares",
    "O3.3": "increase_estate_tax",
    "O4.1": "increase_min_wage",
    "O4.2": "increase_aid_to_poor",
    "O4.3": "increase_food_stamps",
    "O4.4": "support_entrepreneurs",
    "O4.5": "support_housing",
}

# Compiled once. The codes are escaped so that "." only matches a literal dot
# (unescaped, "O1.1" would also match e.g. "O1_1").
_OUTCOME_CODE_RE = re.compile(
    "|".join(re.escape(code) for code in _OUTCOME_QUESTION_NAMES)
)


def parse_outcome_do(row):
    """Return the order in which the outcome questions were displayed.

    ``row['outcome_blocks_order']`` is a "|"-separated sequence of the block
    codes O1-O4; ``row['outcome_block<k>_order']`` is the "|"-separated
    sequence of question codes inside block k. The per-block sequences are
    concatenated in block order and every question code is replaced by its
    descriptive name.

    Returns:
        list of str: the question names in display order. An empty list is
        returned when the block order is NaN (previously this case fell
        through and returned None) or when no question order is available.

    Unknown block codes are ignored. A block whose own order entry is not a
    string (e.g. NaN) is skipped instead of raising TypeError.
    """
    blocks = row['outcome_blocks_order']
    # NaN is the only value that is not equal to itself.
    if blocks != blocks:
        return []

    per_block_orders = []
    for block in blocks.split("|"):
        column = _OUTCOME_BLOCK_COLUMNS.get(block)
        if column is None:
            continue
        within_block = row[column]
        if isinstance(within_block, str):
            per_block_orders.append(within_block)

    codes = "|".join(per_block_orders)
    if not codes:
        return []
    named = _OUTCOME_CODE_RE.sub(
        lambda match: _OUTCOME_QUESTION_NAMES[match.group(0)], codes
    )
    return named.split("|")

# Row-wise apply: each cell of the new column holds whatever
# parse_outcome_do returns for that row.
data10['outcome_display_order'] = data10.apply(parse_outcome_do, axis = 1)
data10.head()

In [None]:
# Independent copy for the MC display-order step.
data11 = data10.copy()

In [None]:
def parse_mc_do(row):
    """Return the display order of the two MC questions for one row.

    Reads ``row['mc_order']``, a "|"-separated string in which 'MC1' is
    relabelled 'ineq_mc' and 'MC2' is relabelled 'mobil_mc'. Returns the
    relabelled items as a list, or an empty list when the entry is NaN.
    """
    raw_order = row['mc_order']
    # NaN is the only value that is not equal to itself.
    if raw_order != raw_order:
        return []
    relabelled = re.sub('MC2', 'mobil_mc', re.sub('MC1', 'ineq_mc', raw_order))
    return relabelled.split("|")

# Row-wise apply: each cell of the new column holds the list returned by
# parse_mc_do for that row.
data11['mc_display_order'] = data11.apply(parse_mc_do, axis = 1)
data11.head()

### Recode values into numeric codes

In [None]:
# Independent copy; all recoding below overwrites columns of data12 in place.
data12 = data11.copy()

In [None]:
# Answer text -> code for the two quiz questions. Any answer text that is not
# a key of the dictionary becomes NaN (Series.map behaviour).
ineq_quiz_codes = {
    "Decreased (More wealth for most)": 1,
    "Stayed the same": 2,
    "Increased (Less wealth for most)": 3,
}
mobil_quiz_codes = {
    "Increased (Rising incomes for most)": 1,
    "Stayed the same": 2,
    "Decreased (Falling incomes for most)": 3,
}

data12['ineq_quiz_ans'] = data12['ineq_quiz_ans'].map(ineq_quiz_codes)
data12['mobil_quiz_ans'] = data12['mobil_quiz_ans'].map(mobil_quiz_codes)

data12.head(3)

In [None]:
# Outcome block 1: answer text -> code. Long option texts are assembled from
# adjacent string literals; the resulting keys are unchanged.
circumstances_codes = {
    "One’s income and position in society is mostly the result of one’s individual effort": 0,
    ("One’s income and position in society is to a large extent the outcome of elements "
     "outside of one’s control "
     "(e.g., family background, luck, health issues)"): 1,
}
equal_outcomes_codes = {"Equal opportunity": 0, "Equal outcomes": 1}
active_steps_codes = {
    ("1 - The government should do only those things necessary to provide the most basic "
     "government functions"): 1,
    "2": 2,
    "3": 3,
    "4": 4,
    ("5 - The government should take active steps in every area it can to try and improve "
     "the lives of its "
     "citizens"): 5,
}

data12['income_result_of_circumstances'] = (
    data12['income_result_of_circumstances'].map(circumstances_codes)
)
data12['equal_outcomes'] = data12['equal_outcomes'].map(equal_outcomes_codes)
data12['govt_should_take_active_steps'] = (
    data12['govt_should_take_active_steps'].map(active_steps_codes)
)

data12.head(3)

In [None]:
seriousness_codes = {
    "Not a problem at all": 1,
    "A small problem": 2,
    "A problem": 3,
    "A serious problem": 4,
    "A very serious problem": 5,
}
data12['ineq_is_a_serious_problem'] = data12['ineq_is_a_serious_problem'].map(seriousness_codes)

# The five rank columns share one coding; the stored values are the strings
# "1".."5", which are turned into integers:
#   1 = Education Policies
#   2 = Government Regulation (e.g., min wage, caps on top compensation)
#   3 = Government Transfers (e.g., food stamps, Medicaid)
#   4 = Private Charity
#   5 = Progressive Taxes
policy_codes = {"1": 1, "2": 2, "3": 3, "4": 4, "5": 5}
for choice_col in (
    'how_to_fix_ineq_1st_choice',
    'how_to_fix_ineq_2nd_choice',
    'how_to_fix_ineq_3rd_choice',
    'how_to_fix_ineq_4th_choice',
    'how_to_fix_ineq_5th_choice',
):
    data12[choice_col] = data12[choice_col].map(policy_codes)

# how_to_fix_ineq_text stays as is for now

data12.head(3)

In [None]:
# Outcome block 3: answer text -> code.
deserving_codes = {"Most of the time": 1, "Sometimes": 2, "Rarely": 3}
millionaire_tax_codes = {"Decreased": 1, "Stay the same": 2, "Increased": 3}
estate_tax_codes = {"Decreased": 1, "Left as is": 2, "Increased": 3}

data12['high_earners_rarely_deserving'] = (
    data12['high_earners_rarely_deserving'].map(deserving_codes)
)
data12['increase_taxes_on_millionares'] = (
    data12['increase_taxes_on_millionares'].map(millionaire_tax_codes)
)
data12['increase_estate_tax'] = data12['increase_estate_tax'].map(estate_tax_codes)

data12.head(3)

In [None]:
# Outcome block 4: answer text -> code.
min_wage_codes = {"Decreased": 1, "Stay the same": 2, "Increased": 3}
data12['increase_min_wage'] = data12['increase_min_wage'].map(min_wage_codes)

# Both spending questions use the same five-point scale.
spending_codes = {
    "Significantly decrease": 1,
    "Slightly decrease": 2,
    "Keep at current level": 3,
    "Slightly increase": 4,
    "Significantly increase": 5,
}
for spending_col in ('increase_aid_to_poor', 'increase_food_stamps'):
    data12[spending_col] = data12[spending_col].map(spending_codes)

# Both support questions are No/Yes.
support_codes = {"No": 0, "Yes": 1}
for support_col in ('support_entrepreneurs', 'support_housing'):
    data12[support_col] = data12[support_col].map(support_codes)

data12.head(3)

In [None]:
# MC questions: answer text -> code.
gap_codes = {"Decreasing": 1, "Same": 2, "Increasing": 3}
chances_codes = {"Better": 1, "Same": 2, "Worse": 3}

data12['gap_btw_rich_and_poor_increasing'] = (
    data12['gap_btw_rich_and_poor_increasing'].map(gap_codes)
)
data12['children_have_worse_chances'] = (
    data12['children_have_worse_chances'].map(chances_codes)
)

data12.head(3)

In [None]:
# Residency: No/Yes -> 0/1.
resident_codes = {"No": 0, "Yes": 1}
data12['is_us_resident'] = data12['is_us_resident'].map(resident_codes)

# state stays as is for now

data12.head(3)

In [None]:
# Demographics, part 1: answer text -> code.
gender_codes = {"Other": 1, "Male": 2, "Female": 3}
marital_codes = {"Single": 0, "Married": 1}
children_codes = {"No": 0, "Yes": 1}

data12['gender'] = data12['gender'].map(gender_codes)

# NOTE(review): astype(int) raises if any age is missing or non-numeric --
# confirm every retained respondent has an age.
data12['age'] = data12['age'].astype(int)

data12['marital_status'] = data12['marital_status'].map(marital_codes)
data12['has_children'] = data12['has_children'].map(children_codes)

data12.head(3)

In [None]:
# Race: answer text -> code.
race_codes = {
    "European American/White": 1,
    "African American/Black": 2,
    "Hispanic/Latino": 3,
    "Asian/Asian American": 4,
    "Other": 5,
}
data12['race'] = data12['race'].map(race_codes)

# Education levels listed in code order; codes run 1..9.
education_levels = [
    "Eighth Grade or Less",
    "Some High School",
    "High School Degree/GED",
    "Some College",
    "2-year College Degree",
    "4-year College Degree",
    "Master's Degree",
    "Doctoral Degree",
    "Professional Degree (JD, MD, MBA)",
]
education_codes = {level: code for code, level in enumerate(education_levels, start=1)}
data12['education'] = data12['education'].map(education_codes)

data12.head(3)

In [None]:
# Employment categories listed in code order; codes run 1..6.
employment_levels = [
    "Full-time employee",
    "Part-time employee",
    "Self-employed or small business owner",
    "Unemployed and looking for work",
    "Student",
    "Not in labor force (for example: retired, or full-time parent)",
]
employment_codes = {level: code for code, level in enumerate(employment_levels, start=1)}
data12['employment_status'] = data12['employment_status'].map(employment_codes)

# Income brackets listed in code order; codes run 1..12.
income_brackets = [
    "$0 - $9,999",
    "$10,000 - 14,999",    # Typo in the original survey (missing $)
    "$15,000 - $19,999",
    "$20,000 - $29,999",
    "$30,000 - $39,999",
    "$40,000 - $49,999",
    "$50,000 - $74,999",
    "$75,000 - $99,999",
    # NOTE(review): "$100,00" lacks a digit -- confirm this matches the survey
    # text verbatim; if it does not, this bracket is silently mapped to NaN.
    "$100,00 - $124,999",
    "$125,000 - $149,999",
    "$150,000 - $199,999",
    "$200,000+",
]
income_codes = {bracket: code for code, bracket in enumerate(income_brackets, start=1)}
data12['income'] = data12['income'].map(income_codes)

# Both mobility questions share one scale, coded so that "worse" is high.
mobility_down_codes = {
    "Much better": 1,
    "Somewhat better": 2,
    "About the same": 3,
    "Somewhat worse": 4,
    "Much worse": 5,
}
for mobility_col in ('intergen_mobility_down', 'intragen_mobility_down'):
    data12[mobility_col] = data12[mobility_col].map(mobility_down_codes)

volatility_codes = {
    "Income is about the same each month": 1,
    "Income varies somewhat from month to month": 2,
    "Income varies a lot from month to month": 3,
}
data12['income_volatility'] = data12['income_volatility'].map(volatility_codes)

data12.head(3)

In [None]:
# Ideology: 1 = "Very conservative" ... 5 = "Very liberal".
ideology_codes = {
    "Very conservative": 1,
    "Conservative": 2,
    "Moderate": 3,
    "Liberal": 4,
    "Very liberal": 5,
}
# Party identification: answer text -> category code.
party_codes = {
    "Republican": 1,
    "Democrat": 2,
    "Independent": 3,
    "None": 4,
}

data12['liberal_vs_conservative'] = data12['liberal_vs_conservative'].map(ideology_codes)
data12['democrat_vs_republican'] = data12['democrat_vs_republican'].map(party_codes)

data12.head(3)

### Reorganize columns again, leaving out unnecessary ones along the way

In [None]:
# Final column layout: experiment, outcome, MC, then demographic columns.
exp_cols = [
    "condition",
    "ineq_condition",
    "mobil_condition",
    "ineq_quiz_ans",
    "mobil_quiz_ans",
    "exp_display_order",
]

outcome_cols = (
    ["income_result_of_circumstances", "equal_outcomes", "govt_should_take_active_steps"]
    + ["ineq_is_a_serious_problem"]
    + ["how_to_fix_ineq_" + ordinal + "_choice"
       for ordinal in ("1st", "2nd", "3rd", "4th", "5th")]
    + ["how_to_fix_ineq_text"]
    + ["high_earners_rarely_deserving", "increase_taxes_on_millionares", "increase_estate_tax"]
    + ["increase_min_wage", "increase_aid_to_poor", "increase_food_stamps",
       "support_entrepreneurs", "support_housing"]
    + ["outcome_display_order"]
)

mc_cols = ["gap_btw_rich_and_poor_increasing", "children_have_worse_chances", "mc_display_order"]

demog_cols = [
    "is_us_resident",
    "state",
    "gender",
    "age",
    "marital_status",
    "has_children",
    "race",
    "education",
    "employment_status",
    "income",
    "intergen_mobility_down",
    "intragen_mobility_down",
    "income_volatility",
    "liberal_vs_conservative",
    "democrat_vs_republican",
]

# Columns not listed above are left out of the processed frame.
data13 = data12.loc[:, exp_cols + outcome_cols + mc_cols + demog_cols]
data13.head()

### Export to csv/pkl

In [None]:
# Output directory for the processed files (placeholder path).
path_to_processed_data = os.path.join("path/to/processed/data")

In [None]:
# Written with the default index=True, so the row index becomes the first
# (unnamed) CSV column. The list-valued display-order columns are written as
# their string representation.
data13.to_csv(os.path.join(path_to_processed_data, "survey_exp_data.csv"))

In [None]:
# Pickle copy of the same frame.
data13.to_pickle(os.path.join(path_to_processed_data, "survey_exp_data.pkl"))

### Export text only data

In [None]:
# Subset for the text export: condition columns, demographic columns and the
# free-text answer.
text_export_cols = [
    'condition', 'ineq_condition', 'mobil_condition',
    'is_us_resident', 'state',
    'gender', 'age', 'marital_status', 'has_children', 'race', 'education',
    'employment_status', 'income', 'income_volatility',
    'liberal_vs_conservative', 'democrat_vs_republican',
    'how_to_fix_ineq_text',
]
data14 = data13.loc[:, text_export_cols]
print(data14.shape)
data14.head()

In [None]:
# Written with the default index=True, so the row index becomes the first
# (unnamed) CSV column.
data14.to_csv(os.path.join(path_to_processed_data, "survey_exp_data_text.csv"))