# Feature Selection (2008)
This notebook follows the same process detailed in Feature Selection (2012), but with 2008 as the target year.

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.metrics import f1_score, confusion_matrix, mutual_info_score, roc_auc_score
from helpers.machine_learning import Normalizer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, chi2, RFECV, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, VotingClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from scipy.stats import pearsonr

import warnings
warnings.filterwarnings('ignore')

pd.options.display.max_columns = 50
pd.options.display.max_rows = 50

In [2]:
df_orig = pd.read_csv('../data/anes_cdf_converted.csv')
df_orig.head()

Unnamed: 0.1,Unnamed: 0,year,age,congressional_district,state,gender,weight,final_vote,VCF0108,VCF0113,VCF0127,VCF0143,VCF0146,VCF0311,VCF0346,VCF0347,VCF0348,VCF0349,VCF0358,VCF0359,VCF0360,VCF0361,VCF0370,VCF0371,VCF0372,...,VCF0904_oh2,VCF0904_oh3,VCF1004_oh0,VCF1004_oh1,VCF1004_oh2,VCF1004_oh3,VCF1004_oh4,VCF9030_oh0,VCF9030_oh1,VCF9030_oh2,VCF9030_oh3,VCF9030_oh4,VCF9030_oh5,VCF9131_oh0,VCF9131_oh1,VCF9131_oh2,VCF9131_oh3,VCF9132_oh0,VCF9132_oh1,VCF9132_oh2,VCF9132_oh3,VCF9133_oh0,VCF9133_oh1,VCF9133_oh2,VCF9133_oh3
0,0,2000,49.0,MN01,MN,0,1.2886,2,0.0,0,0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,1,2000,35.0,MI01,MI,1,0.8959,0,0.0,0,0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,2,2000,57.0,IL11,IL,1,1.0454,1,0.0,0,0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,3,2000,63.0,ME02,ME,0,0.6005,1,0.0,0,1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,4,2000,40.0,MA01,MA,1,1.927,2,0.0,0,0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [3]:
def cv_test(X_train, y_train, preprocessing, classifiers, clf_names, scoring = 'f1', cv = 5):
    scores = []
    if preprocessing != None:
        X_cv = preprocessing.fit_transform(X_train)
    else:
        X_cv = X_train
    for model, name in zip(classifiers, clf_names):
        cv_score = cross_val_score(X = X_cv, y = y_train, estimator = model, cv = cv, scoring = scoring)
        scores.append([cv_score.mean(), cv_score.ptp(), cv_score.std()])
    scores = pd.DataFrame(scores, columns = ['mean','range','std'])
    scores.index = clf_names
    return scores.sort_values(by = 'mean', ascending = False)

In [4]:
df_orig = df_orig.drop(['Unnamed: 0', 'congressional_district','state','final_vote'], axis = 1)

df_orig = df_orig.iloc[:, ~df_orig.columns.str.contains('VCF0734')]
df_orig = df_orig.iloc[:, ~df_orig.columns.str.contains('VCF0736')]
df_orig = df_orig.iloc[:, ~df_orig.columns.str.contains('VCF1011')]
df_orig = df_orig.iloc[:, ~df_orig.columns.str.contains('VCF0704')]
df_orig = df_orig.iloc[:, ~df_orig.columns.str.contains('VCF0710')]
df_orig = df_orig.iloc[:, ~df_orig.columns.str.contains('VCF0709')]
df_orig = df_orig.iloc[:, ~df_orig.columns.str.contains('VCF0703')]
df_orig = df_orig.iloc[:, ~df_orig.columns.str.contains('VCF0707')]
df_orig = df_orig.iloc[:, ~df_orig.columns.str.contains('VCF0708')]
df_orig = df_orig.iloc[:, ~df_orig.columns.str.contains('VCF1011')]

# Label non-voters as positive cases and voters as negative cases
df_orig.VCF0702 = df_orig.VCF0702.apply(lambda x: 0 if x==1 else 1)

Separate training (pre-2008) and test (2008) data so that we can test the generalizability of a model trained on data from earlier years to that of a target year.

In [5]:
df = df_orig[:]
df_train = df[df.year < 2008]

todrop = ['VCF0702','year']
X_train_orig = df_train.drop(todrop, axis = 1)
y_train = df_train.VCF0702

columns = X_train_orig.columns

Test various classifiers on training data

In [6]:
imp = Imputer(missing_values = 'NaN', strategy = 'median')

preprocessing = Pipeline([('imp',imp),('scale', Normalizer())])

classifiers = [LogisticRegression(), RandomForestClassifier(), AdaBoostClassifier(),
               BernoulliNB(), GaussianNB(), SVC()]
clf_names = ['Logistic Regression','Random Forest', 'AdaBoost',
             'Bernoulli Naive Bayes','Gaussian Naive Bayes', 'SVM']

scores = cv_test(X_train_orig, y_train, preprocessing, classifiers, clf_names)
scores

Unnamed: 0,mean,range,std
Bernoulli Naive Bayes,0.634344,0.022709,0.007901
AdaBoost,0.619434,0.074746,0.030931
Logistic Regression,0.613121,0.080711,0.030597
Gaussian Naive Bayes,0.574042,0.052484,0.020824
SVM,0.567656,0.141523,0.050305
Random Forest,0.517763,0.091912,0.030849


Test effect of PCA on training data with various classifiers

In [7]:
preprocessing = Pipeline([('imp', imp),('scale', Normalizer()),('pca', PCA(n_components = 200))])
scores = cv_test(X_train_orig, y_train, preprocessing, classifiers, clf_names)
scores

Unnamed: 0,mean,range,std
Logistic Regression,0.618364,0.132407,0.043008
Gaussian Naive Bayes,0.581504,0.031694,0.012033
SVM,0.566595,0.144979,0.051957
AdaBoost,0.545817,0.089637,0.029517
Bernoulli Naive Bayes,0.421027,0.155533,0.061324
Random Forest,0.245806,0.186596,0.070328


# Adding new features

### Adding thermometer intensity

In [8]:
def add_thermometer_intensity(df):
    
    def thermometer_to_intensity(x):
        if x == np.nan:
            return np.nan
        else:
            return abs(x - 5) ** 2
    
    columns_to_convert = (df.max() >= 10) & (df.min() == 0)
    columns_to_convert[[df.columns.get_loc('VCF0114_r1'),df.columns.get_loc('VCF1015')]] = False
    thermometer_df = df.loc[:,columns_to_convert]
    thermometer_df = thermometer_df.applymap(thermometer_to_intensity)
    thermometer_df.columns = thermometer_df.columns + '_int'
    thermometer_df['int_sum_therm'] = thermometer_df.sum(axis = 1) ** 1.2
    return pd.concat([df, thermometer_df], axis = 1)

In [9]:
X_train = add_thermometer_intensity(X_train_orig)
preprocessing = Pipeline([('imp', Imputer(missing_values = 'NaN', strategy = 'median')),('scale', Normalizer())])
scores = cv_test(X_train, y_train, preprocessing = preprocessing, classifiers = classifiers, clf_names = clf_names)
scores

Unnamed: 0,mean,range,std
AdaBoost,0.632951,0.117341,0.04218
Bernoulli Naive Bayes,0.628739,0.058211,0.021233
Logistic Regression,0.591557,0.140685,0.049709
Gaussian Naive Bayes,0.579488,0.034577,0.014732
SVM,0.550181,0.14723,0.053293
Random Forest,0.513956,0.083912,0.032764


### Adding ordinal intensity

In [10]:
def add_ordinal_intensity(df):
    columns_to_convert = ['VCF0803','VCF0806','VCF0830','VCF0851','VCF9014','VCF9015','VCF9039','VCF9042','VCF0301',
                     'VCF0303','VCF0502','VCF0604','VCF0605','VCF0880a','VCF9009','VCF9045']
    intensity_df = df.loc[:,columns_to_convert]
    intensity_df = abs(intensity_df - (intensity_df.max() + intensity_df.min()) / 2)
    intensity_df.columns = intensity_df.columns + '_int'
    intensity_df['int_sum_ord'] = intensity_df.sum(axis = 1) ** 2
    return pd.concat([df, intensity_df], axis = 1)

In [11]:
X_train = add_ordinal_intensity(X_train_orig)
scores = cv_test(X_train, y_train, preprocessing, classifiers, clf_names)
scores

Unnamed: 0,mean,range,std
Bernoulli Naive Bayes,0.625655,0.043083,0.015443
AdaBoost,0.619817,0.078645,0.031874
Logistic Regression,0.602748,0.086129,0.032368
Gaussian Naive Bayes,0.575457,0.053159,0.018406
SVM,0.547263,0.112605,0.039643
Random Forest,0.480667,0.144444,0.053304


### Summing "don't know" responses

In [12]:
def add_dk_sum(df):
    df = df[:]
    dk_column_index = df.columns.str.contains('dk')
    dk_df = df.loc[:,dk_column_index]
    df['dk_sum'] = dk_df.sum(axis = 1) ** 2
    return df

In [13]:
X_train = add_dk_sum(X_train_orig)
scores = cv_test(X_train, y_train, preprocessing, classifiers, clf_names)
scores

Unnamed: 0,mean,range,std
Bernoulli Naive Bayes,0.628253,0.02756,0.009201
AdaBoost,0.626057,0.057635,0.021547
Logistic Regression,0.611278,0.084034,0.029646
Gaussian Naive Bayes,0.577785,0.04772,0.018183
SVM,0.562592,0.169231,0.055509
Random Forest,0.470861,0.189232,0.066853


# Removing features
### First one-hot features

In [18]:
def drop_first_onehot(df):
    return df.loc[:, ~df.columns.str.contains('oh0')]

### Correlated features

In [19]:
def break_correlation(in_X, y, threshhold = 0.85, scoring = mutual_info_score):
       
    X = pd.DataFrame(Imputer(missing_values='NaN', strategy = 'median').fit_transform(in_X), columns = in_X.columns)
    
    corr_mask = abs(X.corr()) > threshhold
    corr_list = corr_mask.sum()
    corr_list = corr_list[corr_list > 1]
    corr_dict = dict(corr_list)
    
    print('Correlated features:', len(corr_dict))
    
    cluster_list = []
    while len(corr_list) > 0:
        cluster = dict()
        key = corr_list.index[0]
        correlations = list(corr_mask[key][corr_mask[key]].index)

        i = 1
        while i < len(correlations):
            key = correlations[i]
            temp_correlations = list(corr_mask[key][corr_mask[key]].index)
            difference = list(set(temp_correlations) - set(correlations))
            if len(difference) != 0:
                correlations = correlations + difference
            i = i + 1
            
        for i in range(0, len(correlations)):
            cluster[correlations[i]] = corr_dict[correlations[i]]
            corr_dict.pop(correlations[i])
            corr_list.pop(correlations[i])
            
        cluster_list.append(cluster)
        
    removed = []
        
    for cluster in cluster_list:
        scores = dict()
        for feature in cluster:
            scores[feature] = scoring(X[feature], y)

        while(len(cluster) > 0):
            min_key = min(scores, key = scores.get)
            removed.append(min_key)
            temp_correlations = list(corr_mask[min_key][corr_mask[min_key]].index)
            temp_correlations = list(set(temp_correlations).intersection(cluster))
            temp_correlations.remove(min_key)
            for each in temp_correlations:
                if cluster[each] > 2:
                    cluster[each] = cluster[each] - 1
                else:
                    cluster.pop(each)
                    scores.pop(each)
                                
            cluster.pop(min_key)
            scores.pop(min_key)
        
    print('Removed {} features:\n'.format(len(removed)),removed)        
    return in_X.drop(removed, axis = 1)

In [53]:
columns_to_drop = X_train_orig.describe().loc[:,X_train_orig.describe().loc['count',:] == 0].columns
X_train = X_train_orig.drop(columns_to_drop, axis = 1)
X_train = drop_first_onehot(X_train)
X_train = break_correlation(X_train, y_train)
scores = cv_test(X_train, y_train, preprocessing, classifiers, clf_names)
scores

Correlated features: 54
Removed 28 features:
 ['VCF0105a_oh5', 'VCF0108', 'VCF0112_oh2', 'VCF0127', 'VCF0450', 'VCF0904_oh1', 'VCF9030a', 'VCF0211_dk', 'VCF0224', 'VCF0424_dk', 'VCF0426', 'VCF0425_dk', 'VCF0624', 'VCF0114_r2', 'VCF0504_dk', 'VCF0513_dk', 'VCF0541_dk', 'VCF0549_dk', 'VCF0804_dk', 'VCF0804_oh4', 'VCF0804', 'VCF0110', 'VCF0303', 'VCF0870', 'VCF0714_oh1', 'VCF9131_oh2', 'VCF9132_oh1', 'VCF9133_oh2']


Unnamed: 0,mean,range,std
AdaBoost,0.625615,0.093137,0.036213
Bernoulli Naive Bayes,0.625063,0.047727,0.017448
Logistic Regression,0.611356,0.123331,0.042084
Gaussian Naive Bayes,0.598436,0.106732,0.038877
SVM,0.560981,0.152137,0.052673
Random Forest,0.509991,0.082257,0.030915


# Recursive feature elimination

In [54]:
def feature_elimination(X, y, preprocessing, estimator = LogisticRegression()):
    X = preprocessing.fit_transform(X)
    rfecv = RFECV(estimator = estimator, step = 5, cv = StratifiedKFold(3), scoring = 'f1')
    return rfecv.fit_transform(X, y), rfecv

In [55]:
X_train = X_train_orig
columns = X_train.columns
X_train, rfecv = feature_elimination(X_train, y_train, preprocessing, LogisticRegression())
print('Number of features selected:', rfecv.n_features_)
scores = cv_test(X_train, y_train, None, classifiers, clf_names)
scores

Number of features selected: 33


Unnamed: 0,mean,range,std
Bernoulli Naive Bayes,0.688208,0.088942,0.030648
AdaBoost,0.65937,0.130351,0.04897
Logistic Regression,0.652292,0.103178,0.043109
Random Forest,0.614636,0.114124,0.041528
Gaussian Naive Bayes,0.595178,0.0865,0.031459
SVM,0.570487,0.138496,0.051007


### Most important features

In [56]:
selected_features = columns[rfecv.ranking_ == 1]
X_rfe = break_correlation(pd.DataFrame(X_train, columns = selected_features), y_train)

Correlated features: 0
Removed 0 features:
 []


In [57]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [58]:
feature_rankings = pd.concat([pd.DataFrame(lr.coef_.T), pd.DataFrame(selected_features)], axis = 1)
feature_rankings.columns = ['coefficient','feature']
feature_rankings.coefficient = feature_rankings.coefficient.apply(lambda x: abs(x))
feature_rankings['sum'] = feature_rankings.feature.apply(lambda x: sum(df_train.loc[:,x].dropna()))
feature_rankings.sort_values(by = 'coefficient', ascending = False).head(50)

Unnamed: 0,coefficient,feature,sum
30,2.230005,VCF0713_oh2,684.0
31,1.138393,VCF9030_oh3,13.0
19,1.0311,VCF9016_dk,8.0
12,1.027395,VCF0426_dk,10.0
17,1.012824,VCF9095_dk,165.0
9,0.991275,VCF0231_dk,38.0
8,0.951009,VCF0224_dk,44.0
28,0.948049,VCF0302_oh5,576.0
2,0.935274,VCF0380,850.0
18,0.915152,VCF0852_dk,4.0


### Classifying with VCF0713_oh4 as only feature

In [60]:
df = df_orig[:]
X_train = df[df.year < 2008]
score_cv = cross_val_score(lr, X = X_train.VCF0713_oh4.reshape(-1,1), y = y_train, scoring = 'f1', cv = 10)
print('f1-score:', score_cv.mean())

f1-score: 0.564954988834


### Classifying with everything else

In [61]:
df = df_orig[:]
X_train = df[df.year < 2008]
X_train = X_train.drop(['VCF0713_oh4','VCF0713_oh3','VCF0713_oh2','VCF0713_oh1','VCF0702'], axis = 1)
X_exp = preprocessing.fit_transform(X_train)
score_cv = cross_val_score(lr, X = X_exp, y = y_train, scoring = 'f1', cv = 10)
score_cv.mean()

0.57510046820571525

# Testing Combinations

### Optimizing feature sets for voting classifiers

In [63]:
max_f1 = [0,0,0,0]
best_config = [0,0,0,0]
X_train_features = [0,0,0,0]

classifiers = [LogisticRegression(), AdaBoostClassifier(), BernoulliNB(), SVC()]
clf_names = ['Logistic Regression', 'AdaBoost', 'Bernoulli Naive Bayes', 'SVM']

for i in range(0, 32):
    X_train = X_train_orig.drop(columns_to_drop, axis = 1)
    operations = []
    flags = '{:05d}'.format(int(bin(i)[2:]))
    if int(flags[0]):
        X_train = add_thermometer_intensity(X_train)
        operations.append('add thermometer intensity')
    if int(flags[1]):
        X_train = add_ordinal_intensity(X_train)
        operations.append('add ordinal intensity')
    if int(flags[2]):
        X_train = add_dk_sum(X_train)
        operations.append('add dk sum')
    if int(flags[3]):        
        X_train = drop_first_onehot(X_train)
        operations.append('drop first one-hot')
    if int(flags[4]):
        X_train = break_correlation(X_train, y_train)
        operations.append('break correlation')
    columns = X_train.columns
    X_train, rfecv = feature_elimination(X_train, y_train, preprocessing, LogisticRegression())
    print('Operations:', operations)
    print('Number of features selected:', rfecv.n_features_)
    scores = cv_test(X_train, y_train, None, classifiers, clf_names, cv = 5)
    print(scores.sort_values(by = 'mean', ascending = False))
    
    selected_features = columns[rfecv.ranking_ == 1]
    for i in range(0,4):
        if max_f1[i] < scores['mean'][i]:
            max_f1[i] = scores['mean'][i]
            best_config[i] = operations
            X_train_features[i] = selected_features
    print('')

for f1, config, name in zip(max_f1, best_config, clf_names):
    print('best f1 for {} is {} with combination: {}'.format(name, f1, config))

Operations: []
Number of features selected: 33
                           mean     range       std
Bernoulli Naive Bayes  0.688208  0.088942  0.030648
AdaBoost               0.659370  0.130351  0.048970
Logistic Regression    0.652292  0.103178  0.043109
SVM                    0.570487  0.138496  0.051007

Correlated features: 54
Removed 28 features:
 ['VCF0105a_oh5', 'VCF0108', 'VCF0112_oh2', 'VCF0127', 'VCF0450', 'VCF0904_oh1', 'VCF9030a', 'VCF0211_dk', 'VCF0224', 'VCF0424_dk', 'VCF0426', 'VCF0425_dk', 'VCF0624', 'VCF0114_r2', 'VCF0504_dk', 'VCF0513_dk', 'VCF0541_dk', 'VCF0549_dk', 'VCF0804_dk', 'VCF0804_oh4', 'VCF0804', 'VCF0110', 'VCF0303', 'VCF0870', 'VCF0714_oh1', 'VCF9131_oh2', 'VCF9132_oh1', 'VCF9133_oh2']
Operations: ['break correlation']
Number of features selected: 30
                           mean     range       std
Bernoulli Naive Bayes  0.678914  0.106627  0.035355
AdaBoost               0.647446  0.163981  0.060967
Logistic Regression    0.642089  0.126948  0.042908
SV

In [67]:
X_train_all = add_dk_sum(add_thermometer_intensity(add_ordinal_intensity(X_train_orig)))
X_train_config = [0,0,0,0]
for i in range(0, len(X_train_config)):
    X_train_config[i] = X_train_all.loc[:,X_train_features[i]]

print(cv_test(X_train_config[0], y_train, preprocessing, [LogisticRegression()], ['Logistic Regression'], cv = 10),'\n')
print(cv_test(X_train_config[1], y_train, preprocessing, [AdaBoostClassifier()], ['AdaBoost'], cv = 10),'\n')
print(cv_test(X_train_config[2], y_train, preprocessing, [BernoulliNB()], ['Bernoulli Naive Bayes'], cv = 10),'\n')
print(cv_test(X_train_config[3], y_train, preprocessing, [SVC()], ['SVM'], cv = 10),'\n')

                         mean     range       std
Logistic Regression  0.724357  0.121061  0.032978 

              mean     range       std
AdaBoost  0.649711  0.190972  0.052995 

                           mean     range       std
Bernoulli Naive Bayes  0.692118  0.130251  0.043752 

         mean     range       std
SVM  0.601068  0.222222  0.075014 



In [68]:
for i in range(0,len(X_train_config)):
    print(X_train_config[i].shape)

(1784, 75)
(1784, 61)
(1784, 50)
(1784, 111)


In [69]:
X_train_config[0].to_csv('../data/anes_cdf_training_2008_lr.csv')
X_train_config[1].to_csv('../data/anes_cdf_training_2008_ada.csv')
X_train_config[2].to_csv('../data/anes_cdf_training_2008_bnb.csv')
X_train_config[3].to_csv('../data/anes_cdf_training_2008_svm.csv')