The data dictionary for this dataset is located here:

http://archive.ics.uci.edu/ml/machine-learning-databases/voting-records/house-votes-84.names

In [2]:
import pandas as pd, numpy as np
from patsy import dmatrices, dmatrix, demo_data

from sklearn import linear_model, decomposition, datasets
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report

In [3]:
columns = [
    "class", 
    "handicapped_infants", 
    "water_project_cost", 
    "adoption_of_the_budget_resolution", 
    "physician_fee_freeze",
    "el_salvador_aid",
    "religious_groups_in_schools",
    "anti_satellite_test_ban",
    "aid_to_nicaraguan_contras",
    "mx_missile",
    "immigration",
    "synfuels_corporation_cutback",
    "education_spending",
    "superfund_right_to_sue",
    "crime",
    "duty_free_exports",
    "export_administration_act_south_africa"
]

csv_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/voting-records/house-votes-84.data"
house_df = pd.read_csv(csv_url, names = columns)
house_df['class'] = house_df['class'].map(lambda value: 0 if value == "republican" else 1 )

## Quick Data Cleaning
This is usually a bad idea to ffill this much but there isn't too much missing data.

In [4]:
house_df.head()

Unnamed: 0,class,handicapped_infants,water_project_cost,adoption_of_the_budget_resolution,physician_fee_freeze,el_salvador_aid,religious_groups_in_schools,anti_satellite_test_ban,aid_to_nicaraguan_contras,mx_missile,immigration,synfuels_corporation_cutback,education_spending,superfund_right_to_sue,crime,duty_free_exports,export_administration_act_south_africa
0,0,n,y,n,y,y,y,n,n,n,y,?,y,y,y,n,y
1,0,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,?
2,1,?,y,y,?,y,y,n,n,n,n,y,n,y,y,n,n
3,1,n,y,y,n,?,y,n,n,n,n,y,n,y,n,n,y
4,1,y,y,y,n,y,y,n,n,n,n,y,?,y,y,y,y


In [4]:
house_df.replace('?', np.nan, inplace=True)
house_df.ffill(inplace=True)

### Setup Patsy to Use All Variables as Predictors
This is mainly a convience to avoid typing all the variables out.

In [5]:
all_columns = "C(" + ") + C(".join(house_df.columns - ["class"]) + ')' # We remove class becuase that's our predictor!
all_columns

  if __name__ == '__main__':


'C(adoption_of_the_budget_resolution) + C(aid_to_nicaraguan_contras) + C(anti_satellite_test_ban) + C(crime) + C(duty_free_exports) + C(education_spending) + C(el_salvador_aid) + C(export_administration_act_south_africa) + C(handicapped_infants) + C(immigration) + C(mx_missile) + C(physician_fee_freeze) + C(religious_groups_in_schools) + C(superfund_right_to_sue) + C(synfuels_corporation_cutback) + C(water_project_cost)'

In [6]:
formula = str("~ " + all_columns)
formula

'~ C(adoption_of_the_budget_resolution) + C(aid_to_nicaraguan_contras) + C(anti_satellite_test_ban) + C(crime) + C(duty_free_exports) + C(education_spending) + C(el_salvador_aid) + C(export_administration_act_south_africa) + C(handicapped_infants) + C(immigration) + C(mx_missile) + C(physician_fee_freeze) + C(religious_groups_in_schools) + C(superfund_right_to_sue) + C(synfuels_corporation_cutback) + C(water_project_cost)'

## Use Our Patsy Forumula to Encode our Data

In [7]:
X = dmatrix(formula, house_df)  # Encoded variables / predictors
y = house_df['class']           # Target / response

## Setup Logistic Regression

In [8]:
logistic = linear_model.LogisticRegression()

## Define Search Parameters

Full list of parameters here:

http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [9]:
Cs = np.logspace(0.0, 5.0, 20)

search_parameters = {
    "penalty":             ['l1','l2'],   # Used to specify the norm used in the penalization.
    "C":                   Cs,  # Regularization paramter
    # "dual":                [True, False], # Dual or primal formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer dual=False when n_samples > n_features
    "fit_intercept":       [False, True], # Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.
    "class_weight":        [None, "balanced"], # The “balanced” mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y))
    "intercept_scaling":   [2, 1],        # Useful only if solver is liblinear. when self.fit_intercept is True, instance vector x becomes [x, self.intercept_scaling], i.e. a “synthetic” feature with constant value equals to intercept_scaling is appended to the instance vector. 
    "solver":              ['liblinear'],
    "warm_start":          [False, True]
}

estimator = GridSearchCV(logistic, search_parameters)

## Test Train Split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X[0:434], y[0:434], test_size=0.33, random_state=42)

In [12]:
estimator.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'warm_start': [False, True], 'C': array([  1.00000e+00,   1.83298e+00,   3.35982e+00,   6.15848e+00,
         1.12884e+01,   2.06914e+01,   3.79269e+01,   6.95193e+01,
         1.27427e+02,   2.33572e+02,   4.28133e+02,   7.84760e+02,
         1.43845e+03,   2.63665e+03,   4.83293e+03,  ...near'], 'fit_intercept': [False, True], 'penalty': ['l1', 'l2'], 'class_weight': [None, 'balanced']},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

## GridSearch Estimator Report

In [13]:
print "Best C / Regularization Param:", estimator.best_estimator_.C # This estimator.best_estimator_ object has many great reporting metrics
print "Best Params:", estimator.best_params_
print "Best Score:", estimator.best_score_

Best C / Regularization Param: 1.0
Best Params: {'warm_start': False, 'C': 1.0, 'intercept_scaling': 2, 'fit_intercept': True, 'solver': 'liblinear', 'penalty': 'l1', 'class_weight': None}
Best Score: 0.579310344828


In [14]:
y_true, y_pred = y_test, estimator.predict(X_test)
print classification_report(y_true, y_pred, target_names=["Republican", "Democrat"])


             precision    recall  f1-score   support

 Republican       0.46      0.24      0.32        49
   Democrat       0.69      0.85      0.76        95

avg / total       0.61      0.65      0.61       144

