In [1]:
import pandas as pd
import numpy as np
import json
from typing import List, Dict, Any

import matplotlib.pyplot as plt
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats("retina")

import yaml
# Read the project configuration; the handle gets its own name so that
# `configs` only ever refers to the parsed mapping.
with open("../../src/configs.yml", "r") as config_file:
    configs = yaml.safe_load(config_file)

# Locations taken from the config file.
ROOTPATH = configs["ROOTPATH"]
DATAPATH = configs["DATAPATH"]
DATAPATH2 = configs["DATAPATH2"]
# Destination directory for the cleaned survey written at the end of the notebook.
OUTPATH = "../../data/"

## First extract relevant columns from the raw survey data

In [2]:
# Load the raw SPSS export and keep a CSV copy of it alongside the original.
raw_sav_path = DATAPATH + "/survey/NRC_Adatdonacio_sulyozott_20230620_ENG_v02.SAV"
raw_csv_path = DATAPATH + "/survey/NRC_Adatdonacio_sulyozott_20230620_ENG_v02.csv"
df = pd.read_spss(raw_sav_path)
df.to_csv(raw_csv_path, index=False)

# The recoding scheme maps raw column names to the names used in this notebook.
with open("../../index/recoding_scheme.json", "r") as scheme_file:
    COL_RENAME_MAP = json.load(scheme_file)

# Keep only the columns named in the scheme and rename them in one step;
# `rename` returns a new frame, so `df` itself is left untouched.
pol_df = df[list(COL_RENAME_MAP)].rename(columns=COL_RENAME_MAP)

In [3]:
# Derive age and turn the gender / education labels into integer codes.
GENDER_CODE = {"male": 0, "female": 1}
EDU_CODE = {
    "basic education": 0,
    "secondary education": 1,
    "higher education": 2,
}

# NOTE(review): the reference year is hard-coded to 2024 while the source file
# name carries a 2023 date -- confirm which year age should be measured against.
pol_df["age"] = pol_df["birth_year"].map(lambda birth_year: 2024 - birth_year)

# Plain dict indexing is used on purpose: an unexpected label raises KeyError
# instead of silently turning into NaN.
pol_df["gender"] = pol_df["gender"].map(lambda label: GENDER_CODE[label])
pol_df["education"] = pol_df["education"].map(lambda label: EDU_CODE[label])

## Merge the social-media politics columns (Facebook A/B variants)

In [4]:
def merge_action(val1, val2):
    """Collapse two alternative answers to the same question into one value.

    Parameters
    ----------
    val1, val2 : scalar
        The two candidate answers. Missing values (NaN, None, pd.NA, NaT) are
        ignored.

    Returns
    -------
    scalar
        The single non-missing answer when exactly one is present; NaN when
        both are missing, or when both are present (a conflict). In the
        conflict case the two values are printed so it can be inspected.
    """
    # pd.isna recognises every pandas/numpy missing marker, not only values
    # whose string form happens to be "nan".
    answers = [value for value in (val1, val2) if not pd.isna(value)]
    if len(answers) == 1:
        return answers[0]
    if len(answers) > 1:
        # Both variants were answered: report the pair and treat it as missing.
        print(answers)
    # np.nan (lower case) exists in every NumPy version; np.NaN was removed in 2.0.
    return np.nan

# Each Facebook action has an "_A" and a "_B" column; fold every pair into a
# single column with merge_action and drop the two source columns.
for action in ["post", "share", "react", "comment"]:
    col_a = f"fb_{action}_politics_A"
    col_b = f"fb_{action}_politics_B"
    merged = pol_df.apply(lambda row: merge_action(row[col_a], row[col_b]), axis=1)
    pol_df[f"fb_{action}_politics"] = merged
    pol_df.drop(columns=[col_a, col_b], inplace=True)

## Recode survey responses

In [5]:
## Column groups: each list gathers the survey columns that are recoded
## together with one response map further down.

# Facebook political activity (the merged A/B columns).
SM_USAGE1 = [
    "fb_post_politics", "fb_share_politics",
    "fb_react_politics", "fb_comment_politics",
]

# Exposure to political content on social networks.
SM_USAGE2 = [
    "sns_view_politics", "sns_view_politics_friends", "sns_view_politics_disagree",
    "sns_view_politics_progov", "sns_view_politics_antigov",
]

# Usage frequency per platform.
SM_USAGE3 = ["fb_freq", "ig_freq", "tw_freq", "yt_freq", "lk_freq", "tt_freq", "sp_freq"]

# Channels used to seek political information.
SEEK_POL = [
    "seek_politics_tv", "seek_politics_radio", "seek_politics_newpp",
    "seek_politics_sites", "seek_politics_sns", "seek_politics_convo",
]

# The three per-party question blocks share the same party suffixes, in the
# same order, so they are generated from one list.
_PARTY_CODES = ["fdsz", "mszp", "jbbk", "lmp", "dk", "mtum", "mhzk"]
KNOW_SUPPORT = [f"know_sppt_{party}" for party in _PARTY_CODES]
KNOW_RELATE = [f"know_relate_{party}" for party in _PARTY_CODES]
FEEL_THM = [f"feel_thm_{party}" for party in _PARTY_CODES]

# Attitude items.
ATTITUDE_COLS = [
    "lftRght", "lbrCsvt", "strEnvm", "regBisn", "sanChck", "porEasy", "favImgr",
    "govInef", "milStrg", "racDisc", "govHelp", "bizProf", "homAcpt",
]

# Remaining political items; each of these is recoded with its own map.
MISC_COLS = [
    "talk_politics_frfa", "align_politics_frfa", "interest_politics",
    "vote_party_2022", "would_go_vote_sunday", "would_vote_party_sunday",
]

In [6]:
# Lookup tables that translate survey answer labels into codes.
# Missing / "don't know" answers map to np.nan (the lower-case spelling works
# on every NumPy version; the np.NaN alias was removed in NumPy 2.0).

# Seven-step frequency scale (used for the Facebook activity columns).
FREQ_MAP1 = {
    "Never": 0,
    "Less often": 1,
    "Every few weeks": 2,
    "1 to 2 days a week": 3,
    "3 to 6 days a week": 4,
    "About once a day": 5,
    "Several times a day": 6,
    "DK/NA": np.nan,
}

# Five-step "how often" scale.
FREQ_MAP2 = {
    "Never": 0,
    "Rarely": 1,
    "Sometimes": 2,
    "Often": 3,
    "Very often": 4,
    "DK/NA": np.nan,
}

# Five-step calendar-frequency scale (used for the per-platform columns).
FREQ_MAP3 = {
    "Never": 0,
    "Less frequent": 1,
    "Monthly": 2,
    "Weekly": 3,
    "Daily": 4,
    "DK/NA": np.nan,
}

# Yes / no answers.
BINARY_MAP = {
    "Yes": 1,
    "No": 0,
    "DK/NA": np.nan,
}

# Labelled end points of several numeric scales plus the three-way
# "same / mixed / different" answer. Answers not listed here are cast with
# int() by clean_response_en -- presumably the unlabelled scale points.
MISC_MAP = {
    "Strongly dislike": 0,
    "Strongly like": 10,
    "Left wing": 1,
    "Right wing": 7,
    "Extremely liberal": 1,
    "Extremely conservative": 7,
    "Strongly disagree": 1,
    "Strongly agree": 7,
    "DK/NA": np.nan,
    "They are mostly the same": 2,
    "Mixed": 1,
    "They are mostly different": 0,
}

# Interest in politics, five steps.
INTEREST_MAP = {
    "I don't care at all": 0,
    "I don't really care": 1,
    "I'm neutral": 2,
    "I am interested": 3,
    "I'm very interested": 4,
    "DK/NA": np.nan,
}

# Party voted for in 2022 -> short code.
# NOTE(review): the missing label here is "DK/ NA" (with a space), unlike the
# other maps -- presumably it mirrors the raw label; confirm against the data.
PARTY_MAP = {
    "Fidesz-KDNP": "FDSZ",
    "Opposition cooperation": "OPPO",
    "Hungarian Two-Tailed Dog Party": "HTDP",
    "Other party, national minority list": "OTHER",
    "Our Homeland Movement": "OHM",
    "Could not vote (e.g.: was under 18)": "COULDN'T",
    "DK/ NA": np.nan,
    "Did not vote": "DIDN'T",
}

# Stated likelihood of going to vote, four steps.
VOTE_ACT_MAP = {
    "I'm sure I wouldn't go": 0,
    "I think I wouldn't go": 1,
    "I think I would go": 2,
    "I'm sure I would go": 3,
    "DK/NA": np.nan,
}

# Party the respondent would vote for -> short code.
# NOTE(review): "Our Homeland Movement" is coded "OHW" here but "OHM" in
# PARTY_MAP; left unchanged because the written CSV may already be consumed
# elsewhere -- confirm and align.
# NOTE(review): "DK" and "DK (Democratic Coalition)" are separate labels;
# presumably the bare "DK" means "don't know" -- confirm against the codebook.
PARTY_MAP2 = {
    "Fidesz-KDNP": "FDSZ",
    "DK": "DK",
    "DK (Democratic Coalition)": "DK(DC)",
    "Other party": "OTHER",
    "Our Homeland Movement": "OHW",
    "Momentum": "MMTM",
    "NA": np.nan,
    "Jobbik": "JBBK",
    "MSZP": "MSZP",
    "LMP": "LMP",
    "Would not want to vote": "WOULDN'T",
}

def clean_response_en(answer, mapper):
    """Translate one survey answer into its code.

    Parameters
    ----------
    answer : scalar
        The raw answer: a label, a number, or a missing value.
    mapper : dict
        Maps answer labels to codes.

    Returns
    -------
    The mapped code when ``answer`` is a key of ``mapper``; NaN when ``answer``
    is missing (NaN, None, pd.NA); otherwise ``int(answer)``.

    Raises
    ------
    ValueError
        If ``answer`` is neither a key of ``mapper``, nor missing, nor
        convertible to ``int`` (for example an unknown label).
    """
    if answer in mapper:
        return mapper[answer]
    # A missing answer would otherwise reach int() and raise; keep it missing.
    if pd.isna(answer):
        return np.nan
    # Anything else is expected to be numeric already.
    return int(answer)

In [7]:
# Recode every column group with the response map that belongs to it.
# One (columns, mapper) table replaces eight near-identical loops.
RECODE_GROUPS = [
    (SM_USAGE1, FREQ_MAP1),
    (SM_USAGE2, FREQ_MAP2),
    (SM_USAGE3, FREQ_MAP3),
    (SEEK_POL, FREQ_MAP2),
    (KNOW_SUPPORT, BINARY_MAP),
    (KNOW_RELATE, BINARY_MAP),
    (FEEL_THM, MISC_MAP),
    (ATTITUDE_COLS, MISC_MAP),
]

for columns, mapper in RECODE_GROUPS:
    for c in columns:
        # The lambda is consumed immediately by .map, so it sees this group's mapper.
        pol_df[c] = pol_df[c].map(lambda x: clean_response_en(x, mapper))

# On the attitude items the value 99 is turned into a missing value
# (presumably a "no answer" code -- confirm against the codebook).
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
for c in ATTITUDE_COLS:
    pol_df[c] = pol_df[c].replace(99, np.nan)

In [8]:
# The remaining political items each use their own response map; recode them
# in the listed order.
MISC_RECODERS = {
    "talk_politics_frfa": FREQ_MAP2,
    "align_politics_frfa": MISC_MAP,
    "interest_politics": INTEREST_MAP,
    "vote_party_2022": PARTY_MAP,
    "would_go_vote_sunday": VOTE_ACT_MAP,
    "would_vote_party_sunday": PARTY_MAP2,
}

for column, mapper in MISC_RECODERS.items():
    pol_df[column] = pol_df[column].map(lambda answer: clean_response_en(answer, mapper))

In [9]:
# Finalise: integer respondent ids, report the sample size, write the cleaned
# table to disk, and preview the first rows.
respondent_ids = pol_df["user_id"].astype(int)
pol_df["user_id"] = respondent_ids

n_respondents = len(pol_df)
print("NUMBER OF RESPONDENTS:", n_respondents)

cleaned_csv_path = OUTPATH + "pol_survey_cleaned_NEW_INCLUDEALL.csv"
pol_df.to_csv(cleaned_csv_path, index=False)
pol_df.head()


NUMBER OF RESPONDENTS: 758


Unnamed: 0,user_id,weight,gender,birth_year,education,fb_freq,ig_freq,tw_freq,yt_freq,lk_freq,...,milStrg,racDisc,govHelp,bizProf,homAcpt,age,fb_post_politics,fb_share_politics,fb_react_politics,fb_comment_politics
0,20145,1.563181,0,1960.0,1,3,1.0,1.0,1.0,0.0,...,4,3,2,5,6,64.0,0.0,0.0,0.0,0.0
1,20377,1.116234,0,1977.0,2,4,3.0,2.0,4.0,0.0,...,2,5,2,5,5,47.0,1.0,1.0,3.0,3.0
2,21712,1.733286,0,1975.0,1,4,3.0,1.0,4.0,1.0,...,7,1,5,2,1,49.0,1.0,2.0,1.0,1.0
3,23815,1.116234,0,1975.0,1,4,0.0,0.0,4.0,0.0,...,5,4,4,2,5,49.0,0.0,0.0,0.0,0.0
4,26713,1.689703,0,1949.0,2,4,0.0,0.0,2.0,0.0,...,1,4,1,4,6,75.0,2.0,2.0,1.0,1.0
