# Feature importance analysis acroos different group (race/religon)

In [1]:
import pandas as pd
import numpy as np
import utils


# data path
file_path = '../data/cumulative_2022_v3_9_domain.csv'

data = pd.read_csv(file_path)

column_to_variable_dict = np.load('../data/column_to_variable_dict.npy', allow_pickle=True).item()
variable_to_column_dict = np.load('../data/variable_to_column_dict.npy', allow_pickle=True).item()

# check the "Year" column's max and min value
print(data['Year'].max())
print(data['Year'].min())

2020.0
1948.0


In [2]:
missing_value = utils.missing_value_analysis(data)

threshold_list = [0.2, 0.3, 0.4, 0.5]

used_features, not_used_features, folder_name = utils.feature_filter(data, threshold_list,column_to_variable_dict)


In [4]:
# add one column to indicate the variable name of each row,using the index of the missing_value as the key

variable_name = [ column_to_variable_dict[var] for var in missing_value.index]
missing_value['variable_name'] = variable_name

In [6]:
# use the used features to filter out the data

# set the target variable set and index variable set, these variables will not be used for training

target_variable_list = ['Voted','Registered_voted','Voted_party','Vote_Nonvote_Pres']

index_variable_list = ['Year', ]

# check the missing ratio of the target variable
print('missing value of the target variable: ')
print(data[target_variable_list].isnull().sum() / len(data))


missing value of the target variable: 
Voted                0.091551
Registered_voted     0.218061
Voted_party          0.536483
Vote_Nonvote_Pres    0.377067
dtype: float64


In [3]:
target_variable = 'Voted'

'''Voted  {0.0: '0. DK; NA; no Post IW; refused to say if voted;', 1.0: '1. No, did not vote', 2.0: '2. Yes, voted'}'''

# filter out the samples with missing value of the target variable,drop the index
data_new = data[data[target_variable].notnull()]
# filter out the samples with target variable value = 0, count the number of samples whose target variable value = 0, 1 or 2
print('number of samples who not vote : ', len(data_new[data_new[target_variable] == 1]))
print('number of samples who vote : ', len(data_new[data_new[target_variable] == 2]))
print('number of samples who vote case DK : ', len(data_new[data_new[target_variable] == 0]))

data_new = data_new[data_new[target_variable] != 0]
data_new = data_new.reset_index(drop=True)
data_new.shape

number of samples who not vote :  17790
number of samples who vote :  44188
number of samples who vote case DK :  0


(61978, 118)

In [4]:
data_new['Voted'].value_counts()


2.0    44188
1.0    17790
Name: Voted, dtype: int64

In [7]:
# go through all used features, check the num of the categories of each feature: if the num of categories > 10, then this feature is a continuous/numerical feature, otherwise, this feature is a categorical feature-> need to do one-hot encoding

numerical_feature_list = []
categorical_feature_list = []

for feature in used_features:

    if feature not in target_variable_list and feature not in index_variable_list:

        if len(data_new[feature].value_counts()) > 10:
            numerical_feature_list.append(feature)
        else:
            categorical_feature_list.append(feature)

print('number of numerical features: ', len(numerical_feature_list))

print('number of categorical features: ', len(categorical_feature_list))

print('numerical features list:',numerical_feature_list)

number of numerical features:  9
number of categorical features:  43
numerical features list: ['therm_ChrFundament', 'therm_hispanics', 'therm_RepParty', 'therm_DemParty', 'therm_Whites', 'therm_liberals', 'therm_conservatives', 'therm_Blacks', 'Age']


In [10]:

# start from all-clear case:  further filter out the samples with missing value of the used features

data_XY = data_new[numerical_feature_list + categorical_feature_list+[target_variable]]
# data_XY = data_XY[data_XY.notnull().all(axis=1)]
# data_XY = data_XY.reset_index(drop=True)
print(data_XY.shape)

X_continuous = data_XY[numerical_feature_list]
X_categorical = data_XY[categorical_feature_list]
Y_target = data_XY[target_variable]

Y_target.value_counts()

(61978, 70)


2.0    44188
1.0    17790
Name: Voted, dtype: int64

In [21]:
# only use the continuous features to do logistic regression by sklearn

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# impute the missing value of the continuous features by using the mean value of the feature

from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

X_continuous_imp = imp.fit_transform(X_continuous)

X_continuous_transformed = StandardScaler().fit_transform(X_continuous_imp)

X_train, X_test, y_train, y_test = train_test_split(X_continuous_transformed, Y_target, test_size=0.3, random_state=1)

# use the default parameters
logisticRegr = LogisticRegression()
logisticRegr.fit(X_train, y_train)

# get the accuracy, recall, precision, f1 score

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

y_pred = logisticRegr.predict(X_test)

print('accuracy: ', accuracy_score(y_test, y_pred))
print('recall: ', recall_score(y_test, y_pred))
print('precision: ', precision_score(y_test, y_pred))
print('f1 score: ', f1_score(y_test, y_pred))


# count the number of predicted samples for each class
print('number of predicted samples for each class: ')
print(pd.Series(y_pred).value_counts())

print('model just predict the majority class: ', pd.Series(y_pred).value_counts()[1] / len(y_pred))

accuracy:  0.7150693772184575
recall:  0.03727409638554217
precision:  0.518324607329843
f1 score:  0.06954689146469968
number of predicted samples for each class: 
2.0    18212
1.0      382
dtype: int64
model just predict the majority class:  0.020544261589760138


In [22]:
# only use the categorical features to do logistic regression by sklearn, for the NaN value, set as a new category, then do one-hot encoding

from sklearn.preprocessing import OneHotEncoder

X_categorical_imp = X_categorical.fillna(-1)

enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(X_categorical_imp)
X_categorical_transformed = enc.transform(X_categorical_imp).toarray()

X_train, X_test, y_train, y_test = train_test_split(X_categorical_transformed, Y_target, test_size=0.3, random_state=0)

# use the default parameters
logisticRegr = LogisticRegression()
logisticRegr.fit(X_train, y_train)

# get the accuracy, recall, precision, f1 score

y_pred = logisticRegr.predict(X_test)

print('accuracy: ', accuracy_score(y_test, y_pred))
print('recall: ', recall_score(y_test, y_pred))
print('precision: ', precision_score(y_test, y_pred))
print('f1 score: ', f1_score(y_test, y_pred))

# count the number of predicted samples for each class
print('number of predicted samples for each class: ')
print(pd.Series(y_pred).value_counts())



accuracy:  0.8758739378294074
recall:  0.7312957059816239
precision:  0.8167539267015707
f1 score:  0.7716660071230709
number of predicted samples for each class: 
2.0    13819
1.0     4775
dtype: int64


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [23]:
# 建立映射
feature_mapping = {}
current_index = 0
for feature_index, categories in enumerate(enc.categories_):
    for category_index, category in enumerate(categories):
        feature_mapping[current_index] = (f'Feature_id {feature_index}', 
         enc.feature_names_in_[feature_index],                             
        f'Category {category_index + 1}', category)
        current_index += 1

# # 打印映射结果
# print("Feature Mapping:")
# for k, v in feature_mapping.items():
#     print(f"Encoded feature {k}: Original {v}")

# identify the top 10 features that have the largest absolute value of the coefficient,

top_10_index = np.argsort(np.abs(logisticRegr.coef_[0]))[-10:]

# print the top 10 features
print('top 10 features: ')
for index in top_10_index:
    print(feature_mapping[index])

top 10 features: 
('Feature_id 4', 'satisfactionDemocracy', 'Category 1', -1.0)
('Feature_id 25', 'VCF9028', 'Category 4', 3.0)
('Feature_id 44', 'Interest_elections', 'Category 4', 3.0)
('Feature_id 28', 'Pre_election_inten_vote', 'Category 4', 3.0)
('Feature_id 21', 'church_attendance', 'Category 1', -1.0)
('Feature_id 18', 'volunteer', 'Category 1', -1.0)
('Feature_id 28', 'Pre_election_inten_vote', 'Category 5', 4.0)
('Feature_id 12', 'VCF9022', 'Category 3', 5.0)
('Feature_id 12', 'VCF9022', 'Category 2', 1.0)
('Feature_id 12', 'VCF9022', 'Category 1', -1.0)


In [14]:
# concatenate the continuous features and categorical features, then do logistic regression

X_continuous_categorical = np.concatenate((X_continuous_transformed, X_categorical_transformed), axis=1)

X_train, X_test, y_train, y_test = train_test_split(X_continuous_categorical, Y_target, test_size=0.3, random_state=0)

# use the default parameters
logisticRegr = LogisticRegression()
logisticRegr.fit(X_train, y_train)

# get the accuracy, recall, precision, f1 score

y_pred = logisticRegr.predict(X_test)

print('accuracy: ', accuracy_score(y_test, y_pred))
print('recall: ', recall_score(y_test, y_pred))
print('precision: ', precision_score(y_test, y_pred))
print('f1 score: ', f1_score(y_test, y_pred))

# check the top 10 features with the highest absolute value of the coefficient

feature_importance = pd.DataFrame({'feature': numerical_feature_list + enc.get_feature_names().tolist(), 'importance': logisticRegr.coef_[0]})

accuracy:  0.8796923738840486
recall:  0.7416088505531596
precision:  0.8215621105110096
f1 score:  0.7795407509608752


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [16]:
# try the decision tree model

from sklearn.tree import DecisionTreeClassifier

X_train, X_test, y_train, y_test = train_test_split(X_continuous_categorical, Y_target, test_size=0.3, random_state=0)

# use the default parameters
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

# get the accuracy, recall, precision, f1 score

y_pred = clf.predict(X_test)

print('accuracy: ', accuracy_score(y_test, y_pred))
print('recall: ', recall_score(y_test, y_pred))
print('precision: ', precision_score(y_test, y_pred))
print('f1 score: ', f1_score(y_test, y_pred))


accuracy:  0.8295148972786921
recall:  0.7022313894618414
precision:  0.7030223390275953
f1 score:  0.7026266416510318
