# Feature importance analysis acroos intend-to-vote-but-final-vote groups

In [19]:
import pandas as pd
import numpy as np
import utils
# import model
import os
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

import warnings
warnings.filterwarnings("ignore")



# data path
file_path = '../data/cumulative_2022_v3_9_domain.csv'

data = pd.read_csv(file_path)

column_to_variable_dict = np.load('../data/column_to_variable_dict.npy', allow_pickle=True).item()
variable_to_column_dict = np.load('../data/variable_to_column_dict.npy', allow_pickle=True).item()

value_label_dict = np.load('../data/value_labels.npy', allow_pickle=True).item()

# check the "Year" column's max and min value
print(data['Year'].max())
print(data['Year'].min())

2020.0
1948.0


In [20]:
target_variable = 'Voted'

'''Voted  {0.0: '0. DK; NA; no Post IW; refused to say if voted;', 1.0: '1. No, did not vote', 2.0: '2. Yes, voted'}'''

data_new = data[data[target_variable].notnull()]

sub_target_variable = 'Pre_election_inten_vote'

'''Pre_election_inten_vote  {0.0: '0. DK (1964 only); NA; no Pre IW; DK/NA/RF (1952', 1.0: '1. Democratic candidate (with or without qualifications,', 2.0: '2. Republican candidate (with or without qualifications,', 3.0: '3. Undecided; DK (except 1964)', 4.0: "4. R does not intend to vote (incl. 'no, qualified' if", 9.0: '9. Other candidate'}
'''


# the total valid number of samples, who intend to vote for Democratic candidate or Republican candidate
data_new = data_new[(data_new[sub_target_variable]== 1) | (data_new[sub_target_variable] == 2)]

len(data_new)

31087

In [21]:
missing_value = utils.missing_value_analysis(data)

threshold_list = [0.2, 0.3, 0.4, 0.5]

# must_include_list = ['urbanism']
must_include_list = None


folder_name = '../data/non-voter-feature-analysis/'

used_features, not_used_features, folder_name = utils.feature_filter(data, threshold_list,column_to_variable_dict, folder_name, must_include_list)




In [22]:
# use the used features to filter out the data

# set the target variable set and index variable set, these variables will not be used for training

target_variable_list = ['Voted','Registered_voted','Voted_party','Vote_Nonvote_Pres']

race_variable_list = ['Race3','Race4','Race7']

religion_variable_list = ['religion']

index_variable_list = ['Year', ]

not_used_features = ['Pre_election_inten_vote']

state_variable_list = ['State']

non_feature_list = target_variable_list +  race_variable_list + religion_variable_list + index_variable_list + not_used_features + state_variable_list

# check the missing ratio of the target variable
print('missing value of the non-feature variable: ')
print(data_new[non_feature_list].isnull().sum() / len(data))



missing value of the non-feature variable: 
Voted                      0.000000
Registered_voted           0.016666
Voted_party                0.058205
Vote_Nonvote_Pres          0.003811
Race3                      0.001832
Race4                      0.001832
Race7                      0.001656
religion                   0.000835
Year                       0.000000
Pre_election_inten_vote    0.000000
State                      0.000000
dtype: float64


In [23]:
target_variable = 'Voted'

'''Voted  {0.0: '0. DK; NA; no Post IW; refused to say if voted;', 1.0: '1. No, did not vote', 2.0: '2. Yes, voted'}'''

# filter out the samples with missing value of the target variable,drop the index
data_new = data_new[data_new[target_variable].notnull()]
# filter out the samples with target variable value = 0, count the number of samples whose target variable value = 0, 1 or 2
print('number of samples who not vote : ', len(data_new[data_new[target_variable] == 1]))
print('number of samples who vote : ', len(data_new[data_new[target_variable] == 2]))
print('number of samples who vote case DK : ', len(data_new[data_new[target_variable] == 0]))

data_new = data_new[data_new[target_variable] != 0]
data_new = data_new.reset_index(drop=True)
print(data_new.shape)

year_threshold = 1950

folder_name = folder_name + '/'+ str(year_threshold)+ '/'

# filter out the samples whose year > year_threshold
data_new = data_new[data_new['Year'] > year_threshold]
data_new = data_new.reset_index(drop=True)
print(data_new.shape)


numerical_feature_list, categorical_feature_list = utils.feature_type_analysis(data_new, used_features, non_feature_list)

number of samples who not vote :  3472
number of samples who vote :  27615
number of samples who vote case DK :  0
(31087, 119)
(30757, 119)
number of numerical features:  9
number of categorical features:  38
numerical features list: ['therm_ChrFundament', 'therm_hispanics', 'therm_RepParty', 'therm_DemParty', 'therm_Whites', 'therm_liberals', 'therm_conservatives', 'therm_Blacks', 'Age']


In [24]:
def custom_combiner(feature, category):
    return str(feature) + "_XX_" + str(category)

data_XY = data_new[numerical_feature_list + categorical_feature_list+[target_variable]]
# data_XY = data_XY[data_XY.notnull().all(axis=1)]
data_XY = data_XY.reset_index(drop=True)
print(data_XY.shape)

X_continuous = data_XY[numerical_feature_list]
X_categorical = data_XY[categorical_feature_list]
Y_target = data_XY[target_variable]

# impute + process(one-hot)  categorical features (also get the new names)

X_categorical_imp = X_categorical.fillna(-1.0)

enc = OneHotEncoder(handle_unknown='ignore',feature_name_combiner=custom_combiner).fit(X_categorical_imp)

# enc

X_categorical_transformed = enc.transform(X_categorical_imp).toarray()

initial_list = enc.get_feature_names_out().tolist()
# enc_categorical_feature_list = utils.enc_feature_list(initial_list, enc, value_label_dict)    

(30757, 48)


In [27]:
def enc_feature_list(initial_list, enc, value_label_dict):
    new_list = []

    for string in initial_list:
        feature_name, category_name = get_feature_name_category_name(string, enc, value_label_dict)
        new_list.append((feature_name+'_'+ category_name))
    return new_list

def get_feature_name_category_name(string, enc, value_label_dict):
    # feature_id = int(string.split('_XX_')[0][1:])
    feature_name = string.split('_XX_')[0]
    category_index = float(string.split('_XX_')[1])
    
    if category_index == -1:
        category_name = 'Missing'
    else:
        category_name = value_label_dict[feature_name][category_index]

    return feature_name, category_name

In [28]:
enc_categorical_feature_list = enc_feature_list(initial_list, enc, value_label_dict)    

In [7]:
# print sklearn version
import sklearn
print(sklearn.__version__)

1.4.1.post1


In [17]:
# slipt the group by race and religion

data_race7_dict = utils.group_split_race7(data_new) 
data_religion_dict = utils.group_split_religon(data_new)


number of samples of White:  23640
number of samples of Black:  3555
number of samples of Asian:  439
number of samples of American_Indian:  226
number of samples of Hispanic:  2288
number of samples of Other:  500
number of samples of Protestant:  16883
number of samples of Catholic:  7126
number of samples of Jewish:  766
number of samples of Other:  5926


In [37]:
enc.feature_names_in_

array(['howOftenTrust', 'follow_political_info', 'bisexalFamilyorFriends',
       'blackInfluence_Politics', 'sex_orientation', 'powerDifference',
       'satisfactionLife', 'votingMakedifference', 'living_withFamily',
       'satisfactionDemocracy', 'workedWithcommunity',
       'meetingCommuntySchool', 'have_healthInsurance', 'authoritarian2',
       'authoritarian4', 'authoritarian1', 'authoritarian3', 'VCF0886',
       'volunteer', 'VCF9022', 'approve_president_economy', 'VCF0890',
       'approve_pres_strength', 'VCF9031', 'VCF9029', 'approve_pres',
       'ideology7', 'VCF0130', 'VCF9028', 'Voted_D_R', 'VCF9030b',
       'VCF9030c', 'VCF9030a', 'VCF9030', 'VCF0720', 'Family_income',
       'Interest_elections', 'Will_PresElectionClose', 'education6',
       'education7', 'VCF0721', 'VCF0719', 'VCF0718', 'VCF0717',
       'Party_id7', 'Party_id3', 'South', 'region', 'Education4',
       'Gender'], dtype=object)

In [18]:

# start from all-clear case:  further filter out the samples with missing value of the used features

group = 'race'

for group_cat in data_race7_dict.keys():

    data_group = data_race7_dict[group_cat]

    X_categorical_transformed, X_continuous_transformed, Y_target, enc_categorical_feature_list = utils.feature_process(data_group, numerical_feature_list, categorical_feature_list, target_variable,value_label_dict)

    X_continuous_categorical = np.concatenate((X_continuous_transformed, X_categorical_transformed), axis=1)

    model = LogisticRegression(l1_ratio = 0.5, max_iter = 500, solver = 'saga', penalty = 'elasticnet')

    accuracy_list, recall_list, precision_list, f1_list, roc_auc_list, importance_list = utils.cross_validation(X_continuous_categorical, Y_target, model, k = 5)

    print('average accuracy: ', np.mean(accuracy_list))
    print('average recall: ', np.mean(recall_list))
    print('average precision: ', np.mean(precision_list))
    print('average f1 score: ', np.mean(f1_list))
    print('average roc auc score: ', np.mean(roc_auc_list))

    # build the feature importance dataframe
    feature_importance = pd.DataFrame({'feature': numerical_feature_list + enc_categorical_feature_list, 'importance': np.mean(importance_list, axis=0)})

    top_15_positive = feature_importance.sort_values('importance', ascending = False).head(15)
    top_15_negative = feature_importance.sort_values('importance', ascending = True).head(15)

    # build a folder to save the results
    sub_folder_name = folder_name + group + '/' + group_cat + '/'
    if not os.path.exists(sub_folder_name):
        os.makedirs(sub_folder_name)

    feature_importance.to_csv(sub_folder_name + 'feature_importance.csv', index = False)
    top_15_positive.to_csv(sub_folder_name + 'top_15_voter.csv', index = False)
    top_15_negative.to_csv(sub_folder_name + 'top_15_non_voter.csv', index = False)

    # save the mean of the metrics
    metrics = pd.DataFrame({'accuracy': np.mean(accuracy_list), 'recall': np.mean(recall_list), 'precision': np.mean(precision_list), 'f1': np.mean(f1_list), 'roc_auc': np.mean(roc_auc_list)}, index = [0])
    metrics.to_csv(sub_folder_name + 'metrics.csv', index = False)


(23640, 62)


KeyError: 4

In [24]:
    data_XY = data_new[numerical_feature_list + categorical_feature_list+[target_variable]]
    # data_XY = data_XY[data_XY.notnull().all(axis=1)]
    data_XY = data_XY.reset_index(drop=True)
    print(data_XY.shape)

    X_continuous = data_XY[numerical_feature_list]
    X_categorical = data_XY[categorical_feature_list]
    Y_target = data_XY[target_variable]

    # impute + process(one-hot)  categorical features (also get the new names)

    X_categorical_imp = X_categorical.fillna(-1)

    enc = OneHotEncoder(handle_unknown='ignore')

    enc.fit(X_categorical_imp)

    X_categorical_transformed = enc.transform(X_categorical_imp).toarray()

    initial_list = enc.get_feature_names().tolist()

(30757, 62)


In [33]:
    feature_id = int(string.split('_')[0][1:])
    category_index = int(float(string.split('_')[1]))
    feature_name = enc.feature_names_in_[feature_id]
    print('feature id: ', feature_id)
    print('category index: ', category_index)
    print('feature name: ', feature_name)

feature id:  4
category index:  4
feature name:  sex_orientation


In [36]:
value_label_dict['sex_orientation']

{-9.0: '-9. RF; NA; Inap',
 -8.0: '-8. DK',
 1.0: '1. Heterosexual or straight',
 2.0: '2. Bisexual',
 3.0: '3. Homosexual or gay (or lesbian)'}

In [28]:
    new_list = []

    for string in initial_list:
        # feature_name, category_name = utils.get_feature_name_category_name(string, enc, value_label_dict)




        
        new_list.append((feature_name+'_'+ category_name))
    return new_list

KeyError: 4

In [13]:

# start from all-clear case:  further filter out the samples with missing value of the used features

group = 'religon'

for group_cat in data_religion_dict.keys():

    data_group = data_religion_dict[group_cat]

    X_categorical_transformed, X_continuous_transformed, Y_target, enc_categorical_feature_list = utils.feature_process(data_group, numerical_feature_list, categorical_feature_list, target_variable,value_label_dict)

    X_continuous_categorical = np.concatenate((X_continuous_transformed, X_categorical_transformed), axis=1)

    model = LogisticRegression(l1_ratio = 0.5, max_iter = 500, solver = 'saga', penalty = 'elasticnet')

    accuracy_list, recall_list, precision_list, f1_list, roc_auc_list, importance_list = utils.cross_validation(X_continuous_categorical, Y_target, model, k = 5)

    print('average accuracy: ', np.mean(accuracy_list))
    print('average recall: ', np.mean(recall_list))
    print('average precision: ', np.mean(precision_list))
    print('average f1 score: ', np.mean(f1_list))
    print('average roc auc score: ', np.mean(roc_auc_list))

    # build the feature importance dataframe
    feature_importance = pd.DataFrame({'feature': numerical_feature_list + enc_categorical_feature_list, 'importance': np.mean(importance_list, axis=0)})

    top_15_positive = feature_importance.sort_values('importance', ascending = False).head(15)
    top_15_negative = feature_importance.sort_values('importance', ascending = True).head(15)

    # build a folder to save the results
    sub_folder_name = folder_name + group + '/' + group_cat + '/'
    if not os.path.exists(sub_folder_name):
        os.makedirs(sub_folder_name)

    feature_importance.to_csv(sub_folder_name + 'feature_importance.csv', index = False)
    top_15_positive.to_csv(sub_folder_name + 'top_15_voter.csv', index = False)
    top_15_negative.to_csv(sub_folder_name + 'top_15_non_voter.csv', index = False)

    # save the mean of the metrics
    metrics = pd.DataFrame({'accuracy': np.mean(accuracy_list), 'recall': np.mean(recall_list), 'precision': np.mean(precision_list), 'f1': np.mean(f1_list), 'roc_auc': np.mean(roc_auc_list)}, index = [0])
    metrics.to_csv(sub_folder_name + 'metrics.csv', index = False)


(18623, 50)


average accuracy:  0.8480371686647107
average recall:  0.5845162305310698
average precision:  0.7658612660156082
average f1 score:  0.6629330300518849
average roc auc score:  0.7615593240322965
(8686, 50)
average accuracy:  0.8411231750839543
average recall:  0.5523585929784071
average precision:  0.7290241880938019
average f1 score:  0.6281950296782736
average roc auc score:  0.7431751796911741
(746, 50)
average accuracy:  0.8780492170022371
average recall:  0.3091754385964912
average precision:  0.5468686868686868
average f1 score:  0.39288689611270255
average roc auc score:  0.6361456378935157
(9319, 50)
average accuracy:  0.8415062695960873
average recall:  0.6603457886651961
average precision:  0.7736979566419017
average f1 score:  0.7124332614824447
average roc auc score:  0.7893140910932284
