# Feature importance analysis across different groups (race/religion)

In [1]:
import pandas as pd
import numpy as np
import utils
# import model
import os
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

import warnings

# NOTE(review): this silences every warning for the rest of the session,
# which would also hide any solver warnings from the models fitted later.
warnings.filterwarnings("ignore")

# Survey table used throughout the notebook.
file_path = '../data/cumulative_2022_v3_9_domain.csv'
data = pd.read_csv(file_path)

# Lookup dictionaries stored as pickled 0-d object arrays; .item() unwraps
# the stored object.
# NOTE(review): allow_pickle=True executes pickle on load -- only use these
# .npy files if they come from a trusted source.
column_to_variable_dict = np.load(
    '../data/column_to_variable_dict.npy', allow_pickle=True
).item()
variable_to_column_dict = np.load(
    '../data/variable_to_column_dict.npy', allow_pickle=True
).item()
value_label_dict = np.load(
    '../data/value_labels.npy', allow_pickle=True
).item()

# Report the range of the "Year" column: latest value first, then earliest.
year_column = data['Year']
print(year_column.max())
print(year_column.min())

2020.0
1948.0


In [2]:
# Thresholds handed to the feature filter
# (presumably missing-ratio cut-offs -- confirm in utils.feature_filter).
threshold_list = [0.2, 0.3, 0.4, 0.5]

# Missing-value summary of the raw table; kept in a variable for inspection.
missing_value = utils.missing_value_analysis(data)

# The filter returns the retained features, the rejected features, and the
# folder name under which later cells write their results.
used_features, not_used_features, folder_name = utils.feature_filter(
    data,
    threshold_list,
    column_to_variable_dict,
)




In [3]:
# Columns that are excluded from the predictors: candidate targets, the
# race / religion grouping columns, and the index column.
target_variable_list = ['Voted', 'Registered_voted', 'Voted_party', 'Vote_Nonvote_Pres']
race_variable_list = ['Race3', 'Race4', 'Race7']
religion_variable_list = ['religion']
index_variable_list = ['Year']

non_feature_list = [
    *target_variable_list,
    *race_variable_list,
    *religion_variable_list,
    *index_variable_list,
]

# Fraction of missing entries in each excluded column.
print('missing value of the non-feature variable: ')
missing_counts = data[non_feature_list].isnull().sum()
print(missing_counts / len(data))



missing value of the non-feature variable: 
Voted                0.091551
Registered_voted     0.218061
Voted_party          0.536483
Vote_Nonvote_Pres    0.377067
Race3                0.024874
Race4                0.024874
Race7                0.024068
religion             0.007065
Year                 0.000000
dtype: float64


In [4]:
# Build the modelling sample: keep rows with a usable 'Voted' answer that were
# collected after `year_threshold`, then classify the retained features.
target_variable = 'Voted'

# Value labels of 'Voted':
#   0.0: '0. DK; NA; no Post IW; refused to say if voted;'
#   1.0: '1. No, did not vote'
#   2.0: '2. Yes, voted'

# Drop the rows whose target is missing.
data_new = data[data[target_variable].notnull()]

# Report the class counts before the "DK" rows (value 0) are removed.
print('number of samples who not vote : ', len(data_new[data_new[target_variable] == 1]))
print('number of samples who vote : ', len(data_new[data_new[target_variable] == 2]))
print('number of samples who vote case DK : ', len(data_new[data_new[target_variable] == 0]))

# Remove the "DK" rows and renumber the index.
data_new = data_new[data_new[target_variable] != 0]
data_new = data_new.reset_index(drop=True)
print(data_new.shape)

year_threshold = 1982

# Append the year sub-folder only once, so that re-running this cell does not
# keep growing the output path ('.../1982//1982/...').
year_suffix = '/' + str(year_threshold) + '/'
if not folder_name.endswith(year_suffix):
    folder_name = folder_name + year_suffix

# Keep only the samples collected after year_threshold (strictly greater).
data_new = data_new[data_new['Year'] > year_threshold]
data_new = data_new.reset_index(drop=True)
print(data_new.shape)

# NOTE(review): the feature types are derived from the unfiltered `data`, not
# from `data_new` -- confirm that this is intended.
numerical_feature_list, categorical_feature_list = utils.feature_type_analysis(data, used_features, non_feature_list)

number of samples who not vote :  17790
number of samples who vote :  44188
number of samples who vote case DK :  0
(61978, 118)
(37513, 118)
number of numerical features:  9
number of categorical features:  39
numerical features list: ['therm_ChrFundament', 'therm_hispanics', 'therm_RepParty', 'therm_DemParty', 'therm_Whites', 'therm_liberals', 'therm_conservatives', 'therm_Blacks', 'Age']


In [5]:
# Split the filtered samples into per-group subsets, once by race and once by religion.
# Later cells iterate over the keys of each result and index it by group name.
# NOTE(review): the race split presumably uses the 'Race7' column -- confirm in utils.group_split_race7.

data_race7_dict = utils.group_split_race7(data_new) 
data_religion_dict = utils.group_split_religon(data_new)


number of samples of White:  26879
number of samples of Black:  4655
number of samples of Asian:  696
number of samples of American_Indian:  428
number of samples of Hispanic:  3849
number of samples of Other:  725
number of samples of Protestant:  18623
number of samples of Catholic:  8686
number of samples of Jewish:  746
number of samples of Other:  9319


In [6]:

# For every race group: build the design matrix, cross-validate an elastic-net
# logistic regression, print the mean metrics and save the averaged feature
# importances, the top-15 lists and the metrics as CSV files.
# NOTE(review): any handling of missing feature values presumably happens
# inside utils.feature_process -- confirm there.

group = 'race'

for group_cat, data_group in data_race7_dict.items():

    X_categorical_transformed, X_continuous_transformed, Y_target, enc_categorical_feature_list = utils.feature_process(
        data_group, numerical_feature_list, categorical_feature_list, target_variable, value_label_dict)

    # Continuous columns first, encoded categorical columns second; the
    # feature-name list built below uses the same order.
    X_continuous_categorical = np.concatenate((X_continuous_transformed, X_categorical_transformed), axis=1)

    # 'saga' shuffles the data internally, so pin random_state to make the
    # fitted coefficients (and the saved importances) reproducible.
    # NOTE(review): any shuffling done inside utils.cross_validation is not
    # controlled from here.
    model = LogisticRegression(l1_ratio=0.5, max_iter=500, solver='saga', penalty='elasticnet', random_state=0)

    accuracy_list, recall_list, precision_list, f1_list, roc_auc_list, importance_list = utils.cross_validation(
        X_continuous_categorical, Y_target, model, k=5)

    # Average each metric once; the same values are printed and saved.
    mean_metrics = {
        'accuracy': np.mean(accuracy_list),
        'recall': np.mean(recall_list),
        'precision': np.mean(precision_list),
        'f1': np.mean(f1_list),
        'roc_auc': np.mean(roc_auc_list),
    }

    print('average accuracy: ', mean_metrics['accuracy'])
    print('average recall: ', mean_metrics['recall'])
    print('average precision: ', mean_metrics['precision'])
    print('average f1 score: ', mean_metrics['f1'])
    print('average roc auc score: ', mean_metrics['roc_auc'])

    # One row per model input column, with the importance averaged over the
    # first axis of importance_list (presumably one entry per fold).
    feature_importance = pd.DataFrame({
        'feature': numerical_feature_list + enc_categorical_feature_list,
        'importance': np.mean(importance_list, axis=0),
    })

    top_15_positive = feature_importance.sort_values('importance', ascending=False).head(15)
    top_15_negative = feature_importance.sort_values('importance', ascending=True).head(15)

    # Output folder for this group; exist_ok avoids the check-then-create race.
    sub_folder_name = folder_name + group + '/' + group_cat + '/'
    os.makedirs(sub_folder_name, exist_ok=True)

    feature_importance.to_csv(sub_folder_name + 'feature_importance.csv', index=False)
    top_15_positive.to_csv(sub_folder_name + 'top_15_positive.csv', index=False)
    top_15_negative.to_csv(sub_folder_name + 'top_15_negative.csv', index=False)

    # Single-row table holding the mean of every metric.
    metrics = pd.DataFrame(mean_metrics, index=[0])
    metrics.to_csv(sub_folder_name + 'metrics.csv', index=False)


(26879, 49)


average accuracy:  0.8574725290697675
average recall:  0.5887577990088648
average precision:  0.7656928979271854
average f1 score:  0.6655579085581008
average roc auc score:  0.7657272141667288
(4655, 49)
average accuracy:  0.8154672395273899
average recall:  0.5643850656788532
average precision:  0.7156280357109586
average f1 score:  0.6308105964822539
average roc auc score:  0.7386576631243853
(696, 49)
average accuracy:  0.8003186022610482
average recall:  0.5477195654368238
average precision:  0.6866038452245349
average f1 score:  0.6081165096614191
average roc auc score:  0.7235743866833566
(428, 49)
average accuracy:  0.7593160054719562
average recall:  0.6892585321655089
average precision:  0.7080707435546145
average f1 score:  0.696432133647148
average roc auc score:  0.7484652893029244
(3849, 49)
average accuracy:  0.784619255906642
average recall:  0.648563057207304
average precision:  0.7351665565639365
average f1 score:  0.6881565163399478
average roc auc score:  0.75648104

In [8]:

# For every religion group: build the design matrix, cross-validate an
# elastic-net logistic regression, print the mean metrics and save the averaged
# feature importances, the top-15 lists and the metrics as CSV files.
# NOTE(review): any handling of missing feature values presumably happens
# inside utils.feature_process -- confirm there.

# NOTE(review): 'religon' is misspelled, but it is part of the output path, so
# it is left unchanged to keep existing result folders valid.
group = 'religon'

for group_cat, data_group in data_religion_dict.items():

    X_categorical_transformed, X_continuous_transformed, Y_target, enc_categorical_feature_list = utils.feature_process(
        data_group, numerical_feature_list, categorical_feature_list, target_variable, value_label_dict)

    # Continuous columns first, encoded categorical columns second; the
    # feature-name list built below uses the same order.
    X_continuous_categorical = np.concatenate((X_continuous_transformed, X_categorical_transformed), axis=1)

    # 'saga' shuffles the data internally, so pin random_state to make the
    # fitted coefficients (and the saved importances) reproducible.
    # NOTE(review): any shuffling done inside utils.cross_validation is not
    # controlled from here.
    model = LogisticRegression(l1_ratio=0.5, max_iter=500, solver='saga', penalty='elasticnet', random_state=0)

    accuracy_list, recall_list, precision_list, f1_list, roc_auc_list, importance_list = utils.cross_validation(
        X_continuous_categorical, Y_target, model, k=5)

    # Average each metric once; the same values are printed and saved.
    mean_metrics = {
        'accuracy': np.mean(accuracy_list),
        'recall': np.mean(recall_list),
        'precision': np.mean(precision_list),
        'f1': np.mean(f1_list),
        'roc_auc': np.mean(roc_auc_list),
    }

    print('average accuracy: ', mean_metrics['accuracy'])
    print('average recall: ', mean_metrics['recall'])
    print('average precision: ', mean_metrics['precision'])
    print('average f1 score: ', mean_metrics['f1'])
    print('average roc auc score: ', mean_metrics['roc_auc'])

    # One row per model input column, with the importance averaged over the
    # first axis of importance_list (presumably one entry per fold).
    feature_importance = pd.DataFrame({
        'feature': numerical_feature_list + enc_categorical_feature_list,
        'importance': np.mean(importance_list, axis=0),
    })

    top_15_positive = feature_importance.sort_values('importance', ascending=False).head(15)
    top_15_negative = feature_importance.sort_values('importance', ascending=True).head(15)

    # Output folder for this group; exist_ok avoids the check-then-create race.
    sub_folder_name = folder_name + group + '/' + group_cat + '/'
    os.makedirs(sub_folder_name, exist_ok=True)

    feature_importance.to_csv(sub_folder_name + 'feature_importance.csv', index=False)
    top_15_positive.to_csv(sub_folder_name + 'top_15_positive.csv', index=False)
    top_15_negative.to_csv(sub_folder_name + 'top_15_negative.csv', index=False)

    # Single-row table holding the mean of every metric.
    metrics = pd.DataFrame(mean_metrics, index=[0])
    metrics.to_csv(sub_folder_name + 'metrics.csv', index=False)


(18623, 49)
average accuracy:  0.8486815648901735
average recall:  0.5851334293953852
average precision:  0.7679692980600379
average f1 score:  0.6641036069347583
average roc auc score:  0.7621923530162531
(8686, 49)
average accuracy:  0.8411230425856253
average recall:  0.551874282844073
average precision:  0.729324176612425
average f1 score:  0.62800348391284
average roc auc score:  0.743009682663672
(746, 49)
average accuracy:  0.880724832214765
average recall:  0.3011754385964912
average precision:  0.5705050505050504
average f1 score:  0.39262515262515263
average roc auc score:  0.6344243241195704
(9319, 49)
average accuracy:  0.8412917925078155
average recall:  0.6575916779559444
average precision:  0.7749745275570239
average f1 score:  0.7113819161035778
average roc auc score:  0.7883793962221805
