# Data loading and processing

Extract and process the domain features.

In [1]:
! pip install pyreadstat

Collecting pyreadstat
  Downloading pyreadstat-1.2.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 4.1 MB/s eta 0:00:01
Installing collected packages: pyreadstat
Successfully installed pyreadstat-1.2.6


In [2]:
import pyreadstat

# Path to the SPSS (.sav) data file, relative to this notebook
file_path = '../data/cumulative_2022_v3_9.sav'

# Read the .sav file; read_sav returns the data together with its metadata
df, meta = pyreadstat.read_sav(file_path)

# df is a pandas DataFrame (one row per sample, one column per variable)

# meta is a metadata object (not a plain dict): its fields are read as attributes,
# e.g. meta.column_labels and meta.variable_value_labels are used later in this notebook



In [3]:
# Show the dimensions of the loaded DataFrame
n_samples, n_variables = df.shape
print(df.shape)
print('there are', n_samples, 'samples(rows) and', n_variables, 'variables(columns) in the DataFrame\n\n')

# Preview of the first rows (left disabled)
# print(df.head())

# Export the complete DataFrame to CSV, leaving out the index column
df.to_csv(path_or_buf='../data/cumulative_2022_v3_9.csv', index=False)


(68224, 1030)
there are 68224 samples(rows) and 1030 variables(columns) in the DataFrame




In [4]:


# Labels come from the metadata and are paired with df.columns by position below
variable_labels = meta.column_labels

# Two lookup tables so a feature can be found from either side:
#   variable_to_column_dict: variable label    -> column name in df
#   column_to_variable_dict: column name in df -> variable label
variable_to_column_dict = {}
column_to_variable_dict = {}

for position, label in enumerate(variable_labels):
    column_name = df.columns[position]
    variable_to_column_dict[label] = column_name
    column_to_variable_dict[column_name] = label

# Labels describing the meaning of the values of each variable
value_labels = meta.variable_value_labels

# Persist both dictionaries to disk.
# np.save stores a dict as a pickled 0-d object array, so reading it back
# needs np.load(path, allow_pickle=True).item()
import numpy as np

np.save('../data/variable_to_column_dict.npy', variable_to_column_dict)
np.save('../data/column_to_variable_dict.npy', column_to_variable_dict)




# import pandas as pd

# variable_to_column_df = pd.DataFrame.from_dict(variable_to_column_dict, orient='index', columns=['column_name'])
# variable_to_column_df.to_csv('../data/variable_to_column_dict.csv')

# column_to_variable_df = pd.DataFrame.from_dict(column_to_variable_dict, orient='index', columns=['variable_label'])

# column_to_variable_df.to_csv('../data/column_to_variable_dict.csv')



In [5]:
# Mapping from each domain name to the list of df column names that belong to it.
# Keys and feature names are kept exactly as spelled (including inconsistent
# casing and misspellings) because the feature names are checked against
# df.columns further down in this cell.
domain_features = {
    'Contextual_Domain': [
        'South',
        'region',
        'racial_composition_nbhood',
        'racial_composition_gradeSchool',
        'racial_composition_juniorHigh',
        'racial_composition_highSchool',
        'racial_composition_shops',
        'racial_composition_friends',
        'length_residence_home',
        'length_residence_community',
        'workedWithcommunity',
        'volunteer',
        'meetingCommuntySchool',
        'living_withFamily',
        'Age',
        'Gender',
        'Race3',
        'Race4',
        'Race7',
        'Education4',
        'education6',
        'education7',
        'church_attendance',
        'VCF0130',
        'VCF0131',
        'religion',
        'religion_fullCode',
    ],
    'Identity_Domain': [
        'sex_orientation',
        'bisexalFamilyorFriends',
        'have_healthInsurance',
        'satisfactionLife',
        'bornAgain',
        'howOftenTrust',
        'trustPeople',
        'VCF0626',
        'VCF0627',
        'VCF0628',
        'VCF0629',
        'concern_war',
        'concern_nuclear_war',
    ],
    'Presidential_Politics_Domain': [
        'approve_pres',
        'approve_pres_strength',
        'Presidency_performance',
        'vote_always_sameParty',
        'Will_PresElectionClose',
        'approve_president_economy',
        'party_handle_problem',
        'VCF9219',
        'VCF9220',
    ],
    'affect_domain': [
        'therm_Blacks',
        'therm_Whites',
        'therm_hispanics',
        'therm_Asians',
        'therm_Protestants',
        'therm_Catholics',
        'therm_Jews',
        'therm_Christians',
        'therm_ChrFundament',
        'therm_Mislims',
        'thermometer_evangelical',
        'thermometer_women',
        'therm_liberals',
        'therm_conservatives',
        'VCF0886',
        'VCF0887',
        'VCF0890',
        'VCF0891',
    ],
    'Cognitive domain': [
        'ideology7',
        'authoritarian1',
        'authoritarian2',
        'authoritarian3',
        'authoritarian4',
        'votingMakedifference',
        'understand_poliIssues',
        'politics2complicated',
        'powerDifference',
        'satisfactionDemocracy',
        'follow_political_info',
        'blackInfluence_Politics',
        'media_type_count',
    ],
    'Electoral_Engagement_domain': [
        'Vote_Nonvote_Pres',
        'Pre_election_inten_vote',
        'intend_v_actual_presVote',
        'Voted_Congress',
        'Voted_Senate',
        'VCF9027',
        'VCF9028',
        'VCF9029',
        'VCF9030',
        'VCF9030a',
        'VCF9030b',
        'VCF9030c',
        'VCF9031',
        'VCF9022',
        'VCF9023',
        'Registered',
        'Voted',
        'Registered_voted',
        'VCF0717',
        'VCF0718',
        'VCF0719',
        'VCF0720',
        'VCF0721',
    ],
    'Political_Inclinations_Domain': [
        'Party_id3',
        'Party_id7',
        'Interest_elections',
        'interest_pubAffair',
        'therm_DemParty',
        'therm_RepParty',
        'Voted_party',
        'Voted_D_R',
    ],
    'Socio_Eco_Domain': [
        'Family_income',
        'occupation',
        'occupation14',
        'occupation71',
        'home_ownership',
        'urbanism',
    ],
}


# Report every listed feature that does not exist as a column of df
for domain, features in domain_features.items():
    for feature in features:
        if feature not in df.columns:
            print(feature, 'is not in the df.columns')


# Total number of features summed over all domains
total_features = sum(len(features) for features in domain_features.values())
print('there are', total_features, 'domain_features in total\n\n')

# Flat list of columns to keep: 'Year' first, then each domain's features in order
all_domain_features = ['Year']
for features in domain_features.values():
    all_domain_features.extend(features)
    



there are 117 domain_features in total




In [6]:
# Keep only the columns named in all_domain_features
df_domain = df.loc[:, all_domain_features]

# Write the reduced DataFrame to CSV, leaving out the index column
df_domain.to_csv(path_or_buf='../data/cumulative_2022_v3_9_domain.csv', index=False)

