# Logistic Regression with Grid Search (scikit-learn)

In [1]:
import os
import itertools

import joblib

import pandas as pd
import numpy as np

from sklearn import model_selection
from sklearn import linear_model
from sklearn import metrics

import warnings
warnings.filterwarnings('ignore')

## Data Loading

In [2]:
# load pre-cleaned data from CSV file into pandas DataFrame
df = pd.read_csv(os.path.join("..", "data", "census", "cleaned-census-data.csv"), delimiter=',')

df.head()

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,...,occupation_Handlers-cleaners,occupation_Machine-op-inspct,occupation_Other-service,occupation_Priv-house-serv,occupation_Prof-specialty,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,>50K
0,39,2174,0,40,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,50,0,0,13,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,38,0,0,40,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,53,0,0,40,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,28,0,0,40,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [3]:
# split into features and labels
features_df = df.drop('>50K', axis='columns')
labels_df = df['>50K']  # we are predicting whether an individual's income exceeds $50k/yr

## Data Splitting

In [4]:
# extract NumPy arrays from DataFrames
X = features_df.values
y = labels_df.values

# split data into training and testing sets
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.33)

# instantiate iterator that yields train/val indices for each fold of cross validation
validation_splitter = model_selection.KFold(n_splits=5, shuffle=True)

## Cross Validation

In [5]:
hyperparam_candidates = {
    'C': [1e-1, 1, 1e1],
    'solver': ['lbfgs'],
    'max_iter': [1e3, 1e4, 1e5],
}
hyperparam_sets = [dict(zip(hyperparam_candidates.keys(), values))
                   for values
                   in itertools.product(*hyperparam_candidates.values())]

In [6]:
val_accs = []  # track performance of each hyperparam set
for hyperparams in hyperparam_sets:
    print(hyperparams, end=' ')
    
    val_acc = 0  # track average validation accuracy across folds
    for idxs_train, idxs_val in validation_splitter.split(X_train, y_train):
        # index into training data to produce train/val splits
        X_val_train, y_val_train = X[idxs_train], y[idxs_train]
        X_val, y_val = X[idxs_val], y[idxs_val]

        # create and fit model
        model = linear_model.LogisticRegression(**hyperparams)
        model.fit(X_val_train, y_val_train)

        # accumulate average validation accuracy
        val_acc += model.score(X_val, y_val)/validation_splitter.get_n_splits()

    # record performance of current hyperparam set
    val_accs.append(val_acc)
    print(f"Validation accuracy: {val_acc}")

# pair validation accuracies with hyperparam sets
hyperparam_accs = zip(hyperparam_sets, val_accs)

{'C': 0.1, 'solver': 'lbfgs', 'max_iter': 1000.0} Validation accuracy: 0.8252383733655096
{'C': 0.1, 'solver': 'lbfgs', 'max_iter': 10000.0} Validation accuracy: 0.8259519849361866
{'C': 0.1, 'solver': 'lbfgs', 'max_iter': 100000.0} Validation accuracy: 0.8254328733328161
{'C': 1, 'solver': 'lbfgs', 'max_iter': 1000.0} Validation accuracy: 0.824881583348734
{'C': 1, 'solver': 'lbfgs', 'max_iter': 10000.0} Validation accuracy: 0.8248166904561719
{'C': 1, 'solver': 'lbfgs', 'max_iter': 100000.0} Validation accuracy: 0.8247517082084199
{'C': 10.0, 'solver': 'lbfgs', 'max_iter': 1000.0} Validation accuracy: 0.8248815623239834
{'C': 10.0, 'solver': 'lbfgs', 'max_iter': 10000.0} Validation accuracy: 0.8253679856964417
{'C': 10.0, 'solver': 'lbfgs', 'max_iter': 100000.0} Validation accuracy: 0.826211004606733


## Testing

In [7]:
best_hyperparams, best_val_acc = sorted(hyperparam_accs, key=lambda item: item[1])[-1]
print(f"{best_hyperparams} Validation accuracy: {best_val_acc}")

{'C': 10.0, 'solver': 'lbfgs', 'max_iter': 100000.0} Validation accuracy: 0.826211004606733


In [8]:
# create and fit model using best set of hyperparameters
model = linear_model.LogisticRegression(**best_hyperparams)
model.fit(X_train, y_train)

LogisticRegression(C=10.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100000.0, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [9]:
print(f"Training accuracy: {model.score(X_train, y_train)}")
print(f"Testing accuracy: {model.score(X_test, y_test)}")

Training accuracy: 0.8251734647558524
Testing accuracy: 0.8239088934237377


In [10]:
print(f"Training F-score: {metrics.f1_score(y_train, model.predict(X_train))}")
print(f"Testing F-score: {metrics.f1_score(y_test, model.predict(X_test))}")

Training F-score: 0.5744949494949495
Testing F-score: 0.5715201025148167


## Save Model

In [11]:
joblib.dump(model, os.path.join("..", "output", "logreg_gridsearch.gz"))

['../output/logreg_gridsearch.gz']