## Universal Prediction Model for Non-voter-Group

- Idea 1: train the model on the voter group, then predict on the non-voter group.
- Idea 2: use transfer learning (Transfer Component Analysis, TCA) to transform the features first, then train the model on the voter group and predict on the non-voter group.




In [1]:
import pandas as pd
import numpy as np
import utils
# import model
import os
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

import warnings
warnings.filterwarnings("ignore")
import scipy.stats as stats
import matplotlib.pyplot as plt


# Survey extract that every later cell works from.
file_path = '../data/cumulative_2022_v3_9_domain.csv'
data = pd.read_csv(file_path)

# Lookup tables saved as pickled objects inside .npy files; `.item()` pulls the
# stored Python object back out of the array that np.load returns.
# NOTE(review): allow_pickle=True unpickles on load, which can execute arbitrary
# code — only point these paths at files you trust.
column_to_variable_dict = np.load('../data/column_to_variable_dict.npy', allow_pickle=True).item()
variable_to_column_dict = np.load('../data/variable_to_column_dict.npy', allow_pickle=True).item()
value_label_dict = np.load('../data/value_labels.npy', allow_pickle=True).item()

# Sanity check: latest and earliest value of the "Year" column, one per line.
print(data['Year'].max(), data['Year'].min(), sep='\n')

2020.0
1948.0


In [2]:
target_variable = 'Voted_D_R'

'''Voted_D_R  {0.0: '0. Did not vote; DK/NA if voted; refused to say if', 1.0: '1. Democrat', 2.0: '2. Republican'}'''

# Training pool: rows whose Voted_D_R code is 1 (Democrat) or 2 (Republican).
data_train = data[data['Voted_D_R'].isin([1, 2])]
# Prediction pool: rows with Voted == 1, reported below as "do not vote".
data_test = data[data['Voted'] == 1]

# Combined frame, used by the feature analysis in the following cells.
data_new = pd.concat([data_train, data_test])

# Per-code row counts in the combined frame, followed by the size of each pool.
print('number of samples who vote Democrat : ', int((data_new[target_variable] == 1).sum()))
print('number of samples who vote Republican: ', int((data_new[target_variable] == 2).sum()))
print('number of samples who vote case DK : ', int((data_new[target_variable] == 0).sum()))
print('number of samples who do not vote : ', data_test.shape[0])
print('number of samples who do vote D or R : ', data_train.shape[0])



number of samples who vote Democrat :  16419
number of samples who vote Republican:  14526
number of samples who vote case DK :  0
number of samples who do not vote :  17790
number of samples who do vote D or R :  30945


In [3]:
# Result of utils.missing_value_analysis on the combined train/test frame.
missing_value = utils.missing_value_analysis(data_new)

# Thresholds handed to utils.feature_filter.
threshold_list = [0.2, 0.3, 0.4, 0.5]

# No variable is force-included; pass e.g. ['urbanism'] here to require one.
must_include_list = None

# Base output folder; feature_filter hands back the folder name to use from here on.
folder_name = '../data/universal_predict/'

used_features, not_used_features, folder_name = utils.feature_filter(
    data_new,
    threshold_list,
    column_to_variable_dict,
    folder_name,
    must_include_list,
)



In [4]:
# Restrict the data to recent survey years and split the retained features by type.

# Columns that must never be used as model inputs: targets, demographic
# grouping variables, the year index, the state, and manually excluded columns.
target_variable_list = ['Voted','Registered_voted','Voted_party','Vote_Nonvote_Pres','Voted_D_R']

race_variable_list = ['Race3','Race4','Race7']

religion_variable_list = ['religion']

index_variable_list = ['Year', ]

# NOTE: this rebinds `not_used_features`, replacing the value returned by
# utils.feature_filter in the previous cell with a manual exclusion list.
not_used_features = ['Pre_election_inten_vote']

state_variable_list = ['State']

non_feature_list = target_variable_list +  race_variable_list + religion_variable_list + index_variable_list + not_used_features + state_variable_list

year_threshold = 1982

# Append the year sub-folder only once, so re-running this cell does not keep
# nesting ".../1982/1982/..." onto the output path.
year_suffix = '/' + str(year_threshold) + '/'
if not folder_name.endswith(year_suffix):
    folder_name = folder_name + year_suffix

# Keep only the samples whose year is strictly greater than year_threshold.
# The filter is idempotent, so re-running the cell is safe; .copy() makes the
# results independent frames, since later steps assign new columns to them.
data_train = data_train[data_train['Year'] > year_threshold].copy()
data_test = data_test[data_test['Year'] > year_threshold].copy()

data_new = pd.concat([data_train, data_test]).reset_index(drop=True)

print(data_train.shape)
print(data_test.shape)
print(data_new.shape)


numerical_feature_list, categorical_feature_list = utils.feature_type_analysis(data_new, used_features, non_feature_list)



(21348, 119)
(9809, 119)
(31157, 119)
number of numerical features:  11
number of categorical features:  48
numerical features list: ['therm_Christians', 'therm_Mislims', 'therm_ChrFundament', 'therm_hispanics', 'therm_RepParty', 'therm_DemParty', 'therm_Whites', 'therm_liberals', 'therm_conservatives', 'therm_Blacks', 'Age']


In [7]:

# Idea 1 baseline: fit on the D/R voters and predict for the non-voter rows.
target_variable = 'Voted_D_R'

utils.universal_predict(
    data_train,
    data_test,
    numerical_feature_list,
    categorical_feature_list,
    target_variable,
    value_label_dict,
    folder_name,
    group='',
    group_cat='',
)

(31157, 60)
average accuracy:  0.9270187062878559
average recall:  0.9350664489219414
average precision:  0.9344787175531423
average f1 score:  0.93473250813851
average roc auc score:  0.9260400980063377
1    5722
0    4087
dtype: int64


In [8]:

def universal_predict_TCA(data_source,data_target, numerical_feature_list, categorical_feature_list, target_variable, value_label_dict, folder_name, group='', group_cat='', random_state=None):
    """Fit a classifier on ``data_source`` and predict ``target_variable`` for ``data_target``.

    NOTE(review): despite the name, no transfer component analysis (TCA) step is
    applied anywhere in this function yet; the output of
    ``utils.feature_process`` is fed to the classifier unchanged.

    Steps:
      1. Stack source and target rows and encode them jointly with
         ``utils.feature_process``.
      2. Evaluate an elastic-net logistic regression on the source rows only via
         ``utils.cross_validation(..., k=5)`` and print the mean metrics.
      3. Write feature-importance tables and the mean metrics as CSV files.
      4. Refit on all source rows, predict the target rows, print the value
         counts of the predictions and write ``prediction.csv``.

    Parameters
    ----------
    data_source : pandas.DataFrame
        Rows used for training / cross-validation.
    data_target : pandas.DataFrame
        Rows to predict for. Modified in place (see Side effects).
    numerical_feature_list, categorical_feature_list : list of str
        Feature columns, forwarded to ``utils.feature_process``.
    target_variable : str
        Name of the label column.
    value_label_dict : dict
        Forwarded to ``utils.feature_process``.
    folder_name : str
        Output prefix; ``group`` (and ``group_cat``, when non-empty) are
        appended by plain string concatenation to form the output folder.
    group, group_cat : str, optional
        Sub-folder components; both default to ``''``.
    random_state : int or None, optional
        Seed forwarded to ``LogisticRegression`` (the ``saga`` solver is
        stochastic). The default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    None

    Side effects
    ------------
    Creates the output folder if needed, writes six CSV files into it, and
    adds (or overwrites) a ``'prediction'`` column on ``data_target`` in place.

    Raises
    ------
    ValueError
        If ``data_source`` is empty, or if ``utils.feature_process`` returns a
        different number of rows than it was given (the positional
        source/target split below would then be misaligned).
    """
    n_source = len(data_source)
    if n_source == 0:
        raise ValueError('data_source is empty: there is nothing to train on')

    # Source rows first, target rows after: the positional split below depends on this order.
    data_group = pd.concat([data_source, data_target]).reset_index(drop=True)

    X_categorical_transformed, X_continuous_transformed, Y_target, enc_categorical_feature_list = utils.feature_process(data_group, numerical_feature_list, categorical_feature_list, target_variable,value_label_dict)

    # Continuous columns first, then the encoded categorical columns; the
    # feature-name list for the importance table is built in the same order.
    X_continuous_categorical = np.concatenate((X_continuous_transformed, X_categorical_transformed), axis=1)

    # Fail early with a clear message instead of at the final column assignment,
    # after the model has been trained and result files have been written.
    if X_continuous_categorical.shape[0] != len(data_group):
        raise ValueError(
            'utils.feature_process returned %d rows for %d input rows; '
            'cannot split source and target by position'
            % (X_continuous_categorical.shape[0], len(data_group))
        )

    # Only the source rows are used to train the model.
    Y_target_train = Y_target[:n_source]
    X_continuous_categorical_train = X_continuous_categorical[:n_source]
    X_continuous_categorical_test = X_continuous_categorical[n_source:]

    model = LogisticRegression(l1_ratio = 0.5, max_iter = 500, solver = 'saga', penalty = 'elasticnet', random_state = random_state)

    accuracy_list, recall_list, precision_list, f1_list, roc_auc_list, importance_list = utils.cross_validation(X_continuous_categorical_train, Y_target_train, model, k = 5)

    print('average accuracy: ', np.mean(accuracy_list))
    print('average recall: ', np.mean(recall_list))
    print('average precision: ', np.mean(precision_list))
    print('average f1 score: ', np.mean(f1_list))
    print('average roc auc score: ', np.mean(roc_auc_list))

    # Feature importance averaged over the cross-validation runs.
    feature_names = list(numerical_feature_list) + list(enc_categorical_feature_list)
    feature_importance = pd.DataFrame({'feature': feature_names, 'importance': np.mean(importance_list, axis=0)})

    # Drop the features whose name contains 'DK', 'NA', 'RF' or 'Missing'.
    feature_importance_effect = feature_importance[~feature_importance['feature'].str.contains('DK|NA|RF|Missing')]

    top_15_positive = feature_importance_effect.sort_values('importance', ascending = False).head(15)
    top_15_negative = feature_importance_effect.sort_values('importance', ascending = True).head(15)

    # Output folder, built by plain concatenation exactly as callers expect
    # (folder_name is used as a raw prefix, not joined as a path component).
    sub_folder_name = folder_name + group + '/'
    if group_cat != '':
        sub_folder_name = sub_folder_name + group_cat + '/'

    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(sub_folder_name, exist_ok=True)

    # The highest importances are saved as "Demo" and the lowest as "Repub".
    # NOTE(review): this presumes utils.feature_process encodes Democrat as the
    # positive class — confirm against utils.
    feature_importance.to_csv(sub_folder_name + 'feature_importance_full.csv', index = False)
    feature_importance_effect.to_csv(sub_folder_name + 'feature_importance_effect.csv', index = False)
    top_15_positive.to_csv(sub_folder_name + 'top_15_Demo.csv', index = False)
    top_15_negative.to_csv(sub_folder_name + 'top_15_Repub.csv', index = False)

    # Share of rows (source + target combined) whose raw target value equals 1.
    # NOTE(review): the 'non-voter-ratio' column name is kept so the CSV schema
    # stays stable, but this function is called with a D/R vote target, so the
    # value is not a non-voter share here — confirm the intended metric.
    non_voter_ratio = len(data_group[data_group[target_variable] == 1]) / len(data_group)

    # Save the mean of the metrics.
    metrics = pd.DataFrame({ 'non-voter-ratio': non_voter_ratio ,   'accuracy': np.mean(accuracy_list), 'recall': np.mean(recall_list), 'precision': np.mean(precision_list), 'f1': np.mean(f1_list), 'roc_auc': np.mean(roc_auc_list)}, index = [0])
    metrics.to_csv(sub_folder_name + 'metrics.csv', index = False)

    # Refit on every source row, then apply the model to the target rows.
    model.fit(X_continuous_categorical_train, Y_target_train)
    Y_target_predict = model.predict(X_continuous_categorical_test)

    # Value counts of the prediction.
    print(pd.Series(Y_target_predict).value_counts())

    # Attach the predictions to the caller's frame (in place, as before) and save them.
    data_target['prediction'] = Y_target_predict
    data_target.to_csv(sub_folder_name + 'prediction.csv', index = False)


'../data/universal_predict/_threshold_10_0.2_threshold_20_0.3_threshold_30_0.4_threshold_40_0.5/1982/'