# Feature importance analysis across different groups (race/religion)

In [7]:
import pandas as pd
import numpy as np
import utils
# import model
import os
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

import warnings
warnings.filterwarnings("ignore")



# data path
file_path = '../data/cumulative_2022_v3_9_domain.csv'

data = pd.read_csv(file_path)

column_to_variable_dict = np.load('../data/column_to_variable_dict.npy', allow_pickle=True).item()
variable_to_column_dict = np.load('../data/variable_to_column_dict.npy', allow_pickle=True).item()

value_label_dict = np.load('../data/value_labels.npy', allow_pickle=True).item()

# check the "Year" column's max and min value
print(data['Year'].max())
print(data['Year'].min())

2020.0
1948.0


In [8]:
missing_value = utils.missing_value_analysis(data)

threshold_list = [0.2, 0.3, 0.4, 0.5]

must_include_list = ['urbanism']

used_features, not_used_features, folder_name = utils.feature_filter(data, threshold_list,column_to_variable_dict, must_include_list)




In [9]:
# use the used features to filter out the data

# set the target variable set and index variable set, these variables will not be used for training

target_variable_list = ['Voted','Registered_voted','Voted_party','Vote_Nonvote_Pres']

race_variable_list = ['Race3','Race4','Race7']

religion_variable_list = ['religion']

index_variable_list = ['Year', ]

state_variable_list = ['State']

non_feature_list = target_variable_list +  race_variable_list + religion_variable_list + index_variable_list

# check the missing ratio of the target variable
print('missing value of the non-feature variable: ')
print(data[non_feature_list].isnull().sum() / len(data))



missing value of the non-feature variable: 
Voted                0.091551
Registered_voted     0.218061
Voted_party          0.536483
Vote_Nonvote_Pres    0.377067
Race3                0.024874
Race4                0.024874
Race7                0.024068
religion             0.007065
Year                 0.000000
dtype: float64


In [10]:
target_variable = 'Voted'

'''Voted  {0.0: '0. DK; NA; no Post IW; refused to say if voted;', 1.0: '1. No, did not vote', 2.0: '2. Yes, voted'}'''

# filter out the samples with missing value of the target variable,drop the index
data_new = data[data[target_variable].notnull()]
# filter out the samples with target variable value = 0, count the number of samples whose target variable value = 0, 1 or 2
print('number of samples who not vote : ', len(data_new[data_new[target_variable] == 1]))
print('number of samples who vote : ', len(data_new[data_new[target_variable] == 2]))
print('number of samples who vote case DK : ', len(data_new[data_new[target_variable] == 0]))

data_new = data_new[data_new[target_variable] != 0]
data_new = data_new.reset_index(drop=True)
print(data_new.shape)

year_threshold = 1982

folder_name = folder_name + '/'+ str(year_threshold)+ '/'

# filter out the samples whose year > year_threshold
data_new = data_new[data_new['Year'] > year_threshold]
data_new = data_new.reset_index(drop=True)
print(data_new.shape)


numerical_feature_list, categorical_feature_list = utils.feature_type_analysis(data, used_features, non_feature_list)

number of samples who not vote :  17790
number of samples who vote :  44188
number of samples who vote case DK :  0
(61978, 118)
(37513, 118)
number of numerical features:  9
number of categorical features:  40
numerical features list: ['therm_ChrFundament', 'therm_hispanics', 'therm_RepParty', 'therm_DemParty', 'therm_Whites', 'therm_liberals', 'therm_conservatives', 'therm_Blacks', 'Age']


In [11]:
# slipt the group by race and religion

data_race7_dict = utils.group_split_race7(data_new) 
data_religion_dict = utils.group_split_religon(data_new)


number of samples of White:  26879
number of samples of Black:  4655
number of samples of Asian:  696
number of samples of American_Indian:  428
number of samples of Hispanic:  3849
number of samples of Other:  725
number of samples of Protestant:  18623
number of samples of Catholic:  8686
number of samples of Jewish:  746
number of samples of Other:  9319


In [12]:

# start from all-clear case:  further filter out the samples with missing value of the used features

group = 'race'

for group_cat in data_race7_dict.keys():

    data_group = data_race7_dict[group_cat]

    X_categorical_transformed, X_continuous_transformed, Y_target, enc_categorical_feature_list = utils.feature_process(data_group, numerical_feature_list, categorical_feature_list, target_variable,value_label_dict)

    X_continuous_categorical = np.concatenate((X_continuous_transformed, X_categorical_transformed), axis=1)

    model = LogisticRegression(l1_ratio = 0.5, max_iter = 500, solver = 'saga', penalty = 'elasticnet')

    accuracy_list, recall_list, precision_list, f1_list, roc_auc_list, importance_list = utils.cross_validation(X_continuous_categorical, Y_target, model, k = 5)

    print('average accuracy: ', np.mean(accuracy_list))
    print('average recall: ', np.mean(recall_list))
    print('average precision: ', np.mean(precision_list))
    print('average f1 score: ', np.mean(f1_list))
    print('average roc auc score: ', np.mean(roc_auc_list))

    # build the feature importance dataframe
    feature_importance = pd.DataFrame({'feature': numerical_feature_list + enc_categorical_feature_list, 'importance': np.mean(importance_list, axis=0)})

    top_15_positive = feature_importance.sort_values('importance', ascending = False).head(15)
    top_15_negative = feature_importance.sort_values('importance', ascending = True).head(15)

    # build a folder to save the results
    sub_folder_name = folder_name + group + '/' + group_cat + '/'
    if not os.path.exists(sub_folder_name):
        os.makedirs(sub_folder_name)

    feature_importance.to_csv(sub_folder_name + 'feature_importance.csv', index = False)
    top_15_positive.to_csv(sub_folder_name + 'top_15_voter.csv', index = False)
    top_15_negative.to_csv(sub_folder_name + 'top_15_non_voter.csv', index = False)

    # save the mean of the metrics
    metrics = pd.DataFrame({'accuracy': np.mean(accuracy_list), 'recall': np.mean(recall_list), 'precision': np.mean(precision_list), 'f1': np.mean(f1_list), 'roc_auc': np.mean(roc_auc_list)}, index = [0])
    metrics.to_csv(sub_folder_name + 'metrics.csv', index = False)


(26879, 50)


average accuracy:  0.857844545957918
average recall:  0.5902974140840849
average precision:  0.7661200618794732
average f1 score:  0.6667050852190337
average roc auc score:  0.766497464955573
(4655, 50)
average accuracy:  0.8161117078410312
average recall:  0.5658855535320377
average precision:  0.7169315645735456
average f1 score:  0.6322157806803577
average roc auc score:  0.7395545146375999
(696, 50)
average accuracy:  0.794563206577595
average recall:  0.5269227970084763
average precision:  0.6782051282051281
average f1 score:  0.5916610997462569
average roc auc score:  0.7131760024691828
(428, 50)
average accuracy:  0.7569630642954857
average recall:  0.6892585321655089
average precision:  0.7031446351802303
average f1 score:  0.6942463412974214
average roc auc score:  0.7465784968500943
(3849, 50)
average accuracy:  0.7843598534105687
average recall:  0.6491549672106925
average precision:  0.7342200353076385
average f1 score:  0.6881180998862091
average roc auc score:  0.75641104

In [13]:

# start from all-clear case:  further filter out the samples with missing value of the used features

group = 'religon'

for group_cat in data_religion_dict.keys():

    data_group = data_religion_dict[group_cat]

    X_categorical_transformed, X_continuous_transformed, Y_target, enc_categorical_feature_list = utils.feature_process(data_group, numerical_feature_list, categorical_feature_list, target_variable,value_label_dict)

    X_continuous_categorical = np.concatenate((X_continuous_transformed, X_categorical_transformed), axis=1)

    model = LogisticRegression(l1_ratio = 0.5, max_iter = 500, solver = 'saga', penalty = 'elasticnet')

    accuracy_list, recall_list, precision_list, f1_list, roc_auc_list, importance_list = utils.cross_validation(X_continuous_categorical, Y_target, model, k = 5)

    print('average accuracy: ', np.mean(accuracy_list))
    print('average recall: ', np.mean(recall_list))
    print('average precision: ', np.mean(precision_list))
    print('average f1 score: ', np.mean(f1_list))
    print('average roc auc score: ', np.mean(roc_auc_list))

    # build the feature importance dataframe
    feature_importance = pd.DataFrame({'feature': numerical_feature_list + enc_categorical_feature_list, 'importance': np.mean(importance_list, axis=0)})

    top_15_positive = feature_importance.sort_values('importance', ascending = False).head(15)
    top_15_negative = feature_importance.sort_values('importance', ascending = True).head(15)

    # build a folder to save the results
    sub_folder_name = folder_name + group + '/' + group_cat + '/'
    if not os.path.exists(sub_folder_name):
        os.makedirs(sub_folder_name)

    feature_importance.to_csv(sub_folder_name + 'feature_importance.csv', index = False)
    top_15_positive.to_csv(sub_folder_name + 'top_15_voter.csv', index = False)
    top_15_negative.to_csv(sub_folder_name + 'top_15_non_voter.csv', index = False)

    # save the mean of the metrics
    metrics = pd.DataFrame({'accuracy': np.mean(accuracy_list), 'recall': np.mean(recall_list), 'precision': np.mean(precision_list), 'f1': np.mean(f1_list), 'roc_auc': np.mean(roc_auc_list)}, index = [0])
    metrics.to_csv(sub_folder_name + 'metrics.csv', index = False)


(18623, 50)


average accuracy:  0.8480371686647107
average recall:  0.5845162305310698
average precision:  0.7658612660156082
average f1 score:  0.6629330300518849
average roc auc score:  0.7615593240322965
(8686, 50)
average accuracy:  0.8411231750839543
average recall:  0.5523585929784071
average precision:  0.7290241880938019
average f1 score:  0.6281950296782736
average roc auc score:  0.7431751796911741
(746, 50)
average accuracy:  0.8780492170022371
average recall:  0.3091754385964912
average precision:  0.5468686868686868
average f1 score:  0.39288689611270255
average roc auc score:  0.6361456378935157
(9319, 50)
average accuracy:  0.8415062695960873
average recall:  0.6603457886651961
average precision:  0.7736979566419017
average f1 score:  0.7124332614824447
average roc auc score:  0.7893140910932284
