# Feature importance analysis acroos different group (race/religon)

In [1]:
import pandas as pd
import numpy as np
import utils
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score


# data path
file_path = '../data/cumulative_2022_v3_9_domain.csv'

data = pd.read_csv(file_path)

column_to_variable_dict = np.load('../data/column_to_variable_dict.npy', allow_pickle=True).item()
variable_to_column_dict = np.load('../data/variable_to_column_dict.npy', allow_pickle=True).item()

value_label_dict = np.load('../data/value_labels.npy', allow_pickle=True).item()

# check the "Year" column's max and min value
print(data['Year'].max())
print(data['Year'].min())

2020.0
1948.0


In [4]:
missing_value = utils.missing_value_analysis(data)

threshold_list = [0.2, 0.3, 0.4, 0.5]

used_features, not_used_features, folder_name = utils.feature_filter(data, threshold_list,column_to_variable_dict)




In [4]:
# add one column to indicate the variable name of each row,using the index of the missing_value as the key

variable_name = [ column_to_variable_dict[var] for var in missing_value.index]
missing_value['variable_name'] = variable_name

In [5]:
# use the used features to filter out the data

# set the target variable set and index variable set, these variables will not be used for training

target_variable_list = ['Voted','Registered_voted','Voted_party','Vote_Nonvote_Pres']

race_variable_list = ['Race3','Race4','Race7']

religion_variable_list = ['religion']

index_variable_list = ['Year', ]

non_feature_list = target_variable_list +  race_variable_list + religion_variable_list + index_variable_list

# check the missing ratio of the target variable
print('missing value of the non-feature variable: ')
print(data[non_feature_list].isnull().sum() / len(data))



missing value of the non-feature variable: 
Voted                0.091551
Registered_voted     0.218061
Voted_party          0.536483
Vote_Nonvote_Pres    0.377067
Race3                0.024874
Race4                0.024874
Race7                0.024068
religion             0.007065
Year                 0.000000
dtype: float64


In [6]:
target_variable = 'Voted'

'''Voted  {0.0: '0. DK; NA; no Post IW; refused to say if voted;', 1.0: '1. No, did not vote', 2.0: '2. Yes, voted'}'''

# filter out the samples with missing value of the target variable,drop the index
data_new = data[data[target_variable].notnull()]
# filter out the samples with target variable value = 0, count the number of samples whose target variable value = 0, 1 or 2
print('number of samples who not vote : ', len(data_new[data_new[target_variable] == 1]))
print('number of samples who vote : ', len(data_new[data_new[target_variable] == 2]))
print('number of samples who vote case DK : ', len(data_new[data_new[target_variable] == 0]))

data_new = data_new[data_new[target_variable] != 0]
data_new = data_new.reset_index(drop=True)
print(data_new.shape)

year_threshold = 1982

# filter out the samples whose year > year_threshold
data_new = data_new[data_new['Year'] > year_threshold]
data_new = data_new.reset_index(drop=True)
print(data_new.shape)


numerical_feature_list, categorical_feature_list = utils.feature_type_analysis(data, used_features, non_feature_list)

number of samples who not vote :  17790
number of samples who vote :  44188
number of samples who vote case DK :  0
(61978, 118)
(37513, 118)
number of numerical features:  9
number of categorical features:  39
numerical features list: ['therm_ChrFundament', 'therm_hispanics', 'therm_RepParty', 'therm_DemParty', 'therm_Whites', 'therm_liberals', 'therm_conservatives', 'therm_Blacks', 'Age']


In [7]:
# slipt the group by race and religion

data_race7_dict = utils.group_split_race7(data_new) 
data_religion_dict = utils.group_split_religon(data_new)


number of samples of White:  26879
number of samples of Black:  4655
number of samples of Asian:  696
number of samples of American_Indian:  428
number of samples of Hispanic:  3849
number of samples of Other:  725
number of samples of Protestant:  18623
number of samples of Catholic:  8686
number of samples of Jewish:  746
number of samples of Other:  9319


In [16]:

# start from all-clear case:  further filter out the samples with missing value of the used features

def feature_process(data, numerical_feature_list, categorical_feature_list, target_variable,value_label_dict):
    data_XY = data[numerical_feature_list + categorical_feature_list+[target_variable]]
    # data_XY = data_XY[data_XY.notnull().all(axis=1)]
    data_XY = data_XY.reset_index(drop=True)
    print(data_XY.shape)

    X_continuous = data_XY[numerical_feature_list]
    X_categorical = data_XY[categorical_feature_list]
    Y_target = data_XY[target_variable]

    # impute + process(one-hot)  categorical features (also get the new names)

    X_categorical_imp = X_categorical.fillna(-1)

    enc = OneHotEncoder(handle_unknown='ignore')

    enc.fit(X_categorical_imp)

    X_categorical_transformed = enc.transform(X_categorical_imp).toarray()

    initial_list = enc.get_feature_names().tolist()
    enc_categorical_feature_list = utils.enc_feature_list(initial_list, enc, value_label_dict)    

    #impute + process(normalize) the numerical features
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')

    X_continuous_imp = imp.fit_transform(X_continuous)

    X_continuous_transformed = StandardScaler().fit_transform(X_continuous_imp)


    return X_categorical_transformed, X_continuous_transformed, Y_target, enc_categorical_feature_list


data_group = data_race7_dict['White']

X_categorical_transformed, X_continuous_transformed, Y_target, enc_categorical_feature_list = feature_process(data_group, numerical_feature_list, categorical_feature_list, target_variable,value_label_dict)

(26879, 49)




In [17]:
# concatenate the continuous features and categorical features, then do logistic regression

X_continuous_categorical = np.concatenate((X_continuous_transformed, X_categorical_transformed), axis=1)

X_train, X_test, y_train, y_test = train_test_split(X_continuous_categorical, Y_target, test_size=0.3, random_state=0)

# use the default parameters
logisticRegr = LogisticRegression()
logisticRegr.fit(X_train, y_train)

# get the accuracy, recall, precision, f1 score

y_pred = logisticRegr.predict(X_test)

print('accuracy: ', accuracy_score(y_test, y_pred))
print('recall: ', recall_score(y_test, y_pred))
print('precision: ', precision_score(y_test, y_pred))
print('f1 score: ', f1_score(y_test, y_pred))
print('roc auc score: ', roc_auc_score(y_test, y_pred))


feature_importance = pd.DataFrame({'feature': numerical_feature_list + enc_categorical_feature_list, 'importance': logisticRegr.coef_[0]})

accuracy:  0.8571428571428571
recall:  0.5905218317358892
precision:  0.7432975871313673
f1 score:  0.6581602373887241
roc auc score:  0.7643039161912553


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
# show the top 10 features with the highest positive value of the coefficient
print('top 10 features with the highest positive value of the coefficient: ')
print(feature_importance.sort_values(by='importance', ascending=False).head(10))

top 10 features with the highest positive value of the coefficient: 
                                               feature  importance
75   Pre_election_inten_vote_3. Undecided; DK (exce...    1.040202
74   Pre_election_inten_vote_2. Republican candidat...    0.831866
73   Pre_election_inten_vote_1. Democratic candidat...    0.809941
143         Interest_elections_3. Very much interested    0.619552
24                              authoritarian1_Missing    0.584249
115                     VCF9030_4. Yes, contact: other    0.562360
9                        meetingCommuntySchool_Missing    0.522379
55                                     VCF9028_Missing    0.518244
105  VCF0130_1. Every week (Except 1970: almost eve...    0.517111
130                                     VCF0718_2. Yes    0.493744


In [42]:
enc.get_feature_names().tolist()



['x0_-1.0',
 'x0_1.0',
 'x0_2.0',
 'x1_-1.0',
 'x1_1.0',
 'x1_2.0',
 'x2_-1.0',
 'x2_1.0',
 'x2_2.0',
 'x2_3.0',
 'x2_4.0',
 'x3_-1.0',
 'x3_1.0',
 'x3_2.0',
 'x3_3.0',
 'x4_-1.0',
 'x4_1.0',
 'x4_2.0',
 'x4_3.0',
 'x5_-1.0',
 'x5_1.0',
 'x5_2.0',
 'x5_3.0',
 'x6_-1.0',
 'x6_1.0',
 'x6_2.0',
 'x7_-1.0',
 'x7_1.0',
 'x7_2.0',
 'x8_-1.0',
 'x8_1.0',
 'x8_2.0',
 'x8_3.0',
 'x8_8.0',
 'x9_-1.0',
 'x9_1.0',
 'x9_3.0',
 'x9_8.0',
 'x10_-1.0',
 'x10_1.0',
 'x10_2.0',
 'x10_3.0',
 'x10_8.0',
 'x11_-1.0',
 'x11_1.0',
 'x11_5.0',
 'x12_-1.0',
 'x12_1.0',
 'x12_2.0',
 'x12_3.0',
 'x12_8.0',
 'x13_-1.0',
 'x13_1.0',
 'x13_2.0',
 'x13_4.0',
 'x13_5.0',
 'x13_8.0',
 'x14_-1.0',
 'x14_1.0',
 'x14_2.0',
 'x14_3.0',
 'x14_4.0',
 'x14_8.0',
 'x15_-1.0',
 'x15_1.0',
 'x15_2.0',
 'x15_3.0',
 'x15_4.0',
 'x15_9.0',
 'x16_-1.0',
 'x16_1.0',
 'x16_2.0',
 'x16_9.0',
 'x17_-1.0',
 'x17_1.0',
 'x17_2.0',
 'x17_3.0',
 'x17_4.0',
 'x17_5.0',
 'x17_6.0',
 'x17_7.0',
 'x17_9.0',
 'x18_-1.0',
 'x18_1.0',
 'x18_2.0',

In [54]:
string_example = 'x37_4.0'

feature_id = int(string_example.split('_')[0][1:])
category_index = int(float(string_example.split('_')[1]))

feature_name = enc.feature_names_in_[feature_id]

if category_index == -1:
    category_name = 'Missing'
else:
    category_name = value_label_dict[feature_name][category_index]

print(feature_name, category_name)

# print(enc.categories_[feature_id], value_label_dict[enc.categories_[feature_id]][category_index])


def get_feature_name_category_name(string, enc, value_label_dict):
    feature_id = int(string.split('_')[0][1:])
    category_index = int(float(string.split('_')[1]))

    feature_name = enc.feature_names_in_[feature_id]

    if category_index == -1:
        category_name = 'Missing'
    else:
        category_name = value_label_dict[feature_name][category_index]

    return feature_name, category_name

def enc_feature_list(initial_list, enc, value_label_dict):
    new_list = []

    for string in initial_list:
        feature_name, category_name = get_feature_name_category_name(string, enc, value_label_dict)
        new_list.append((feature_name+'_'+ category_name))
    return new_list


enc_cat_feature_list 

# new_list = []

# for string in initial_list:
#     feature_id = int(string.split('_')[0][1:])
#     category_id = int(float(string.split('_')[1]))

#     feature_name = feature_mapping[feature_id][1]
#     category_name = feature_mapping[2]

#     new_list.append((feature_name+'_'+ category_name))
# new_list    

Education4 4. College or advanced degree (no cases 1948)




['meetingCommuntySchool_Missing',
 'meetingCommuntySchool_1. Yes',
 'meetingCommuntySchool_2. No',
 'workedWithcommunity_Missing',
 'workedWithcommunity_1. Yes',
 'workedWithcommunity_2. No',
 'authoritarian2_Missing',
 'authoritarian2_1. Obedience',
 'authoritarian2_2. Both (VOL)',
 'authoritarian2_3. Self-reliance',
 'authoritarian2_4. Neither (2020)',
 'authoritarian4_Missing',
 'authoritarian4_1. Independence',
 'authoritarian4_2. Both (VOL)',
 'authoritarian4_3. Respect for elders',
 'authoritarian1_Missing',
 'authoritarian1_1. Curiosity',
 'authoritarian1_2. Both (VOL)',
 'authoritarian1_3. Good manners',
 'authoritarian3_Missing',
 'authoritarian3_1. Being considerate',
 'authoritarian3_2. Both (VOL)',
 'authoritarian3_3. Well behaved',
 'have_healthInsurance_Missing',
 'have_healthInsurance_1. Yes',
 'have_healthInsurance_2. No',
 'volunteer_Missing',
 'volunteer_1. Yes',
 'volunteer_2. No',
 'VCF0886_Missing',
 'VCF0886_1.   Increased',
 'VCF0886_2.   Same',
 'VCF0886_3.   De

In [49]:
enc.feature_names_in_[feature_id]

'meetingCommuntySchool'