# Feature importance analysis acroos intend-to-vote-but-final-vote groups

In [39]:
import pandas as pd
import numpy as np
import utils
# import model
import os
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

import warnings
warnings.filterwarnings("ignore")



# data path
file_path = '../data/cumulative_2022_v3_9_domain.csv'

data = pd.read_csv(file_path)

column_to_variable_dict = np.load('../data/column_to_variable_dict.npy', allow_pickle=True).item()
variable_to_column_dict = np.load('../data/variable_to_column_dict.npy', allow_pickle=True).item()

value_label_dict = np.load('../data/value_labels.npy', allow_pickle=True).item()

# check the "Year" column's max and min value
print(data['Year'].max())
print(data['Year'].min())

2020.0
1948.0


In [42]:
target_variable = 'Voted'

'''Voted  {0.0: '0. DK; NA; no Post IW; refused to say if voted;', 1.0: '1. No, did not vote', 2.0: '2. Yes, voted'}'''

data_new = data[data[target_variable].notnull()]

sub_target_variable = 'Pre_election_inten_vote'

'''Pre_election_inten_vote  {0.0: '0. DK (1964 only); NA; no Pre IW; DK/NA/RF (1952', 1.0: '1. Democratic candidate (with or without qualifications,', 2.0: '2. Republican candidate (with or without qualifications,', 3.0: '3. Undecided; DK (except 1964)', 4.0: "4. R does not intend to vote (incl. 'no, qualified' if", 9.0: '9. Other candidate'}
'''


# the total valid number of samples, who intend to vote for Democratic candidate or Republican candidate



data_new = data_new[(data_new[sub_target_variable]== 1) | (data_new[sub_target_variable] == 2) | (data_new[sub_target_variable] == 9)   ]
# folder_name = '../data/non-voter-feature-analysis/intent-vote/'



# folder_name = '../data/non-voter-feature-analysis/all/'

data_new = data_new[(data_new["State"]== "WA")  ]

folder_name = '../data/non-voter-feature-analysis/WA/intent-vote/'



len(data_new)

712

In [43]:
missing_value = utils.missing_value_analysis(data)

# threshold_list = [0.2, 0.3, 0.4, 0.5]
threshold_list = [0.2, 0.3, 0.4, 0.5]


# must_include_list = ['urbanism']
must_include_list = None


# folder_name = '../data/non-voter-feature-analysis/'

used_features, not_used_features, folder_name = utils.feature_filter(data, threshold_list,column_to_variable_dict, folder_name, must_include_list)




In [44]:
# use the used features to filter out the data

# set the target variable set and index variable set, these variables will not be used for training

target_variable_list = ['Voted','Registered_voted','Voted_party','Vote_Nonvote_Pres']

race_variable_list = ['Race3','Race4','Race7']

religion_variable_list = ['religion']

index_variable_list = ['Year', ]

not_used_features = ['Pre_election_inten_vote']
# not_used_features = []


state_variable_list = ['State']

non_feature_list = target_variable_list +  race_variable_list + religion_variable_list + index_variable_list + not_used_features + state_variable_list

# check the missing ratio of the target variable
print('missing value of the non-feature variable: ')
print(data_new[non_feature_list].isnull().sum() / len(data))



missing value of the non-feature variable: 
Voted                      0.000000
Registered_voted           0.000293
Voted_party                0.001143
Vote_Nonvote_Pres          0.000117
Race3                      0.000059
Race4                      0.000059
Race7                      0.000059
religion                   0.000059
Year                       0.000000
Pre_election_inten_vote    0.000000
State                      0.000000
dtype: float64


In [45]:
target_variable = 'Voted'

'''Voted  {0.0: '0. DK; NA; no Post IW; refused to say if voted;', 1.0: '1. No, did not vote', 2.0: '2. Yes, voted'}'''

# filter out the samples with missing value of the target variable,drop the index
data_new = data_new[data_new[target_variable].notnull()]
# filter out the samples with target variable value = 0, count the number of samples whose target variable value = 0, 1 or 2
print('number of samples who not vote : ', len(data_new[data_new[target_variable] == 1]))
print('number of samples who vote : ', len(data_new[data_new[target_variable] == 2]))
print('number of samples who vote case DK : ', len(data_new[data_new[target_variable] == 0]))

data_new = data_new[data_new[target_variable] != 0]
data_new = data_new.reset_index(drop=True)
print(data_new.shape)

year_threshold = 1948

folder_name = folder_name + '/'+ str(year_threshold)+ '/'

# filter out the samples whose year > year_threshold
data_new = data_new[data_new['Year'] > year_threshold]
data_new = data_new.reset_index(drop=True)
print(data_new.shape)


numerical_feature_list, categorical_feature_list = utils.feature_type_analysis(data_new, used_features, non_feature_list)

number of samples who not vote :  48
number of samples who vote :  664
number of samples who vote case DK :  0
(712, 119)
(712, 119)
number of numerical features:  9
number of categorical features:  38
numerical features list: ['therm_ChrFundament', 'therm_hispanics', 'therm_RepParty', 'therm_DemParty', 'therm_Whites', 'therm_liberals', 'therm_conservatives', 'therm_Blacks', 'Age']


In [46]:
# slipt the group by race and religion

data_race7_dict = utils.group_split_race7(data_new) 
data_religion_dict = utils.group_split_religon(data_new)


number of samples of White:  610
number of samples of Black:  17
number of samples of Asian:  22
number of samples of American_Indian:  4
number of samples of Hispanic:  40
number of samples of Other:  15
number of samples of Protestant:  373
number of samples of Catholic:  107
number of samples of Jewish:  8
number of samples of Other:  220


In [60]:
# from sklearn.inspection import permutation_importance
# result = permutation_importance(model, X, Y, n_repeats=10, random_state=0)

In [47]:
utils.feature_importance_analysis(data_new, numerical_feature_list, categorical_feature_list, target_variable, value_label_dict, folder_name, group='whole-group', group_cat='')

(712, 48)
average accuracy:  0.9241701960011819
average recall:  0.0988095238095238
average precision:  0.19666666666666666
average f1 score:  0.12803030303030302
average roc auc score:  0.5418725203267188


In [48]:

group = 'race'

for group_cat in data_race7_dict.keys():

    data_group = data_race7_dict[group_cat]

    utils.feature_importance_analysis(data_group, numerical_feature_list, categorical_feature_list, target_variable, value_label_dict, folder_name, group=group, group_cat=group_cat)

(610, 48)
average accuracy:  0.9344262295081966
average recall:  0.02857142857142857
average precision:  0.05
average f1 score:  0.03636363636363636
average roc auc score:  0.5090831873440569
(17, 48)


ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.

In [49]:
group = 'religon'

for group_cat in data_religion_dict.keys():

    data_group = data_religion_dict[group_cat]

    utils.feature_importance_analysis(data_group, numerical_feature_list, categorical_feature_list, target_variable, value_label_dict, folder_name, group=group, group_cat=group_cat)

(373, 48)
average accuracy:  0.9141981981981981
average recall:  0.02857142857142857
average precision:  0.2
average f1 score:  0.05
average roc auc score:  0.5043029675638371
(107, 48)
average accuracy:  0.8874458874458874
average recall:  0.0
average precision:  0.0
average f1 score:  0.0
average roc auc score:  0.48473684210526313
(8, 48)


ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0