# Data loading and processing


Count the missing values of each domain feature, then set the threshold to 0.3 to filter out the features with too many missing values.

In [17]:
import pandas as pd
import numpy as np


# Location of the CSV file that holds the dataset
file_path = '../data/cumulative_2022_v3_9_domain.csv'
data = pd.read_csv(file_path)

# Lookup tables stored as pickled objects inside .npy files; `.item()` unwraps
# the object from the 0-d array returned by `np.load`.
# `column_to_variable_dict` is indexed by column names of `data` further below;
# `variable_to_column_dict` is presumably the reverse mapping (TODO confirm).
# NOTE: allow_pickle=True unpickles arbitrary objects, so only load trusted files.
column_to_variable_dict, variable_to_column_dict = (
    np.load(npy_path, allow_pickle=True).item()
    for npy_path in ('../data/column_to_variable_dict.npy',
                     '../data/variable_to_column_dict.npy')
)

# Show the latest and the earliest value of the "Year" column, one per line
print(data['Year'].max(), data['Year'].min(), sep='\n')

2020.0
1948.0


In [11]:
# Number of missing entries in each column of the full dataset
data.isna().sum()

Year                                  0
South                              1801
region                             1801
racial_composition_nbhood         59420
racial_composition_gradeSchool    60327
                                  ...  
occupation                        28223
occupation14                      51795
occupation71                      51795
home_ownership                    13672
urbanism                          24972
Length: 116, dtype: int64

In [11]:
# Analyze the missing values of each column over different periods:
# 1. overall missing-value count and percentage
# 2. missing-value percentage in the most recent 10 years ("Year" >= 2012)
# 3. missing-value percentage in the most recent 20 years ("Year" >= 2002)
# 4. missing-value percentage in the most recent 30 years ("Year" >= 1992)
# 5. missing-value percentage in the most recent 40 years ("Year" >= 1982)
# 6. missing-value percentage in the most recent 50 years ("Year" >= 1972)
# 7. missing-value percentage in the most recent 60 years ("Year" >= 1962)
# 8. missing-value percentage in the most recent 70 years ("Year" >= 1952)

# Save the result to a CSV file; the first column is the feature name.


def missing_value_analysis(data):
    """Summarize the missing values of every column of ``data``.

    For each column the result holds the overall number and fraction of
    missing values, plus the fraction of missing values among the rows whose
    ``Year`` is greater than or equal to each of several cut-off years.

    Args:
        data: DataFrame that contains a ``Year`` column.

    Returns:
        DataFrame indexed by the column names of ``data``, sorted by
        ``missing_value_percentage`` in descending order, with the columns
        ``missing_value_num``, ``missing_value_percentage`` and one
        ``missing_value_percentage_<span>(>=<year>)`` column per window
        (10/2012, 20/2002, 30/1992, 40/1982, 50/1972, 60/1962, 70/1952).

    Raises:
        KeyError: if ``data`` has no ``Year`` column.
    """
    # (window length in years, first year included in the window)
    windows = (
        (10, 2012),
        (20, 2002),
        (30, 1992),
        (40, 1982),
        (50, 1972),
        (60, 1962),
        (70, 1952),
    )

    # Compute the null mask once and reuse it for every window.
    null_mask = data.isnull()
    years = data['Year']

    missing_value_num = null_mask.sum()
    labels = ['missing_value_num', 'missing_value_percentage']
    parts = [missing_value_num, missing_value_num / len(data)]

    for span, first_year in windows:
        in_window = years >= first_year
        # Fraction of missing values among the rows that fall in this window.
        parts.append(null_mask[in_window].sum() / int(in_window.sum()))
        # The label is derived from the window itself so the two cannot drift
        # apart (the 1952 window used to be labelled "60" instead of "70").
        labels.append('missing_value_percentage_%d(>=%d)' % (span, first_year))

    # One row per original column, one column per statistic.
    missing_value = pd.concat(parts, axis=1)
    missing_value.columns = labels

    # Columns with the most missing values come first.
    return missing_value.sort_values(by='missing_value_percentage', ascending=False)

# Run the missing-value analysis on the full dataset
missing_value = missing_value_analysis(data)

# save the result
# missing_value.to_csv('../data/missing_value_analysis.csv')


In [19]:
# Attach a `variable_name` column: every row label of `missing_value` (a column
# name of the dataset) is looked up in `column_to_variable_dict`.
missing_value['variable_name'] = variable_name = [
    column_to_variable_dict[column] for column in missing_value.index
]

In [40]:
import os

# Feature-selection thresholds. A feature is kept only when its missing-value
# ratio is below the threshold in every one of these windows:
# 1. missing_value_percentage_10(>=2012) < 0.3
# 2. missing_value_percentage_20(>=2002) < 0.4
# 3. missing_value_percentage_30(>=1992) < 0.5
threshold_10 = 0.3
threshold_20 = 0.4
threshold_30 = 0.5

# Build the "keep" mask once and take the rejected set as its complement, so the
# two sets always partition the features. (A NaN ratio fails every `<`
# comparison, so such a feature is rejected instead of ending up in neither set.)
keep_mask = (
    (missing_value['missing_value_percentage_10(>=2012)'] < threshold_10)
    & (missing_value['missing_value_percentage_20(>=2002)'] < threshold_20)
    & (missing_value['missing_value_percentage_30(>=1992)'] < threshold_30)
)
missing_value_used = missing_value[keep_mask]
missing_value_not_used = missing_value[~keep_mask]

# count the number of features
print('number of features used: ', len(missing_value_used))
print('number of features not used: ', len(missing_value_not_used))

# Results go into a folder named after the thresholds
folder_name = f'../data/threshold_10_{threshold_10}_threshold_20_{threshold_20}_threshold_30_{threshold_30}'

# exist_ok avoids the check-then-create race and makes re-running the cell safe
os.makedirs(folder_name, exist_ok=True)

missing_value_used.to_csv(os.path.join(folder_name, 'missing_value_analysis_used.csv'))
missing_value_not_used.to_csv(os.path.join(folder_name, 'missing_value_analysis_not_used.csv'))

# Feature names (row labels) of both groups
used_features = missing_value_used.index.tolist()
not_used_features = missing_value_not_used.index.tolist()

# Write one "<feature> (<variable name>)" line per feature for each group
for file_name, features in (('used_features.txt', used_features),
                            ('not_used_features.txt', not_used_features)):
    with open(os.path.join(folder_name, file_name), 'w') as f:
        f.writelines("%s (%s)\n" % (item, column_to_variable_dict[item]) for item in features)

number of features used:  72
number of features not used:  44


In [27]:
# Target variables and index variables are listed here so they can be kept
# apart from the training features.
target_variable_list = ['Voted', 'Registered_voted', 'Voted_party', 'Vote_Nonvote_Pres']
index_variable_list = ['Year']

# Fraction of rows in which each target variable is missing
print('missing value of the target variable: ')
print(data.loc[:, target_variable_list].isna().sum() / data.shape[0])


missing value of the target variable: 
Voted                0.091551
Registered_voted     0.218061
Voted_party          0.536483
Vote_Nonvote_Pres    0.377067
dtype: float64


In [35]:
target_variable = 'Voted'

'''Voted  {0.0: '0. DK; NA; no Post IW; refused to say if voted;', 1.0: '1. No, did not vote', 2.0: '2. Yes, voted'}'''

# Keep only the samples whose target variable is present
data_new = data.loc[data[target_variable].notna()]

# Report how many of the remaining samples carry each target code (1, 2 and 0)
print('number of samples who not vote : ', int((data_new[target_variable] == 1).sum()))
print('number of samples who vote : ', int((data_new[target_variable] == 2).sum()))
print('number of samples who vote case DK : ', int((data_new[target_variable] == 0).sum()))

# Discard the samples coded 0 and renumber the rows from zero
data_new = data_new.loc[data_new[target_variable] != 0].reset_index(drop=True)
data_new.shape

number of samples who not vote :  17790
number of samples who vote :  44188
number of samples who vote case DK :  0


(61978, 116)

In [39]:
# NOTE(review): the target variables are not removed from `used_features` here
# (the earlier removal attempt was disabled). The displayed `used_features` list
# appears to still contain target columns such as 'Voted_party',
# 'Vote_Nonvote_Pres' and 'Registered_voted', so they take part in the row
# filter below — confirm this is intended.

# Restrict the data to the used features, keep only the rows without any
# missing value, and renumber the rows from zero.
data_new = data_new[used_features].dropna(axis=0, how='any').reset_index(drop=True)
data_new.shape

ValueError: list.remove(x): x not in list

In [38]:
# Display the names of the features kept by the missing-value filter
used_features

['blackInfluence_Politics',
 'powerDifference',
 'votingMakedifference',
 'bisexalFamilyorFriends',
 'therm_Christians',
 'satisfactionDemocracy',
 'follow_political_info',
 'howOftenTrust',
 'therm_Mislims',
 'sex_orientation',
 'satisfactionLife',
 'living_withFamily',
 'meetingCommuntySchool',
 'workedWithcommunity',
 'VCF9022',
 'authoritarian2',
 'authoritarian4',
 'authoritarian1',
 'authoritarian3',
 'have_healthInsurance',
 'volunteer',
 'bornAgain',
 'therm_ChrFundament',
 'VCF0886',
 'church_attendance',
 'Voted_party',
 'therm_hispanics',
 'VCF9029',
 'VCF0890',
 'VCF9031',
 'VCF9028',
 'approve_president_economy',
 'Vote_Nonvote_Pres',
 'approve_pres_strength',
 'therm_RepParty',
 'therm_DemParty',
 'Pre_election_inten_vote',
 'therm_Whites',
 'Will_PresElectionClose',
 'therm_liberals',
 'therm_conservatives',
 'therm_Blacks',
 'ideology7',
 'approve_pres',
 'VCF9030c',
 'VCF9030b',
 'Registered_voted',
 'VCF9030a',
 'VCF0130',
 'VCF9030',
 'home_ownership',
 'VCF0720',
 '