# Logistic Regression with Grid Search (scikit-learn)

---

In [None]:
# Credentials passed to ModelDBClient further down; both values are
# intentionally left blank and must be filled in before running the notebook.
email = 
dev_key = 

# Log Workflow

In [None]:
import os, sys
import itertools
import time

import joblib

import numpy as np

from sklearn import model_selection
from sklearn import linear_model
from sklearn import metrics

from verta import ModelDBClient

## Instantiate Client

In [None]:
# Build the ModelDB client from the credentials defined at the top of the notebook.
client = ModelDBClient(email, dev_key)
# Select the project and then the experiment by name; the runs created later
# through `client` are presumably filed under them (verify against the verta docs).
proj = client.set_project("Income Classification")
expt = client.set_experiment("Logistic Regression")

In [None]:
# Relative locations of the census train/test archives.
_CENSUS_DIR = os.path.join("..", "data", "census")
TRAIN_DATA_PATH = os.path.join(_CENSUS_DIR, "train.npz")
TEST_DATA_PATH = os.path.join(_CENSUS_DIR, "test.npz")
# Filename template for saved models; the "{}" placeholder is filled in per run.
_MODEL_DIR = os.path.join("..", "output", "client-demo")
MODEL_PATH = os.path.join(_MODEL_DIR, "logreg_gridsearch_{}.gz")

## Prepare Data

In [None]:
# Load the training features and labels. For a .npz archive np.load returns an
# NpzFile that keeps the underlying file open and reads arrays on access, so
# both arrays are pulled out inside a `with` block, which closes the file.
with np.load(TRAIN_DATA_PATH) as train_data:
    X_train, y_train = train_data['X'], train_data['y']

## Prepare Hyperparameters

In [None]:
# Candidate values for every LogisticRegression hyperparameter in the grid.
hyperparam_candidates = {
    'C': [1e-4, 1e-3, 1e-2, 1e-1],
    'solver': ['lbfgs'],
    # must be an integer: scikit-learn rejects a float such as 1e4 here
    'max_iter': [10000],
}
# Expand the grid into one {name: value} dict per combination (Cartesian product).
hyperparam_names = list(hyperparam_candidates)
hyperparam_sets = [dict(zip(hyperparam_names, values))
                   for values
                   in itertools.product(*hyperparam_candidates.values())]

## Run Validation

In [None]:
# Make sure the directory the fitted models are written to exists;
# joblib.dump does not create missing parent directories.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

# Create the validation split ONCE, before the loop, and under separate names.
# Splitting inside the loop while reassigning X_train/y_train would shrink the
# training set by another 10% on every iteration and validate each candidate on
# a different slice, so the logged accuracies would not be comparable.
X_fit, X_val, y_fit, y_val = model_selection.train_test_split(X_train, y_train,
                                                              test_size=0.1, shuffle=False)

for hyperparams in hyperparam_sets:
    # timestamp (whole seconds) used to name this run's saved model file
    start_time = int(time.time())

    # create object to track experiment run
    run = client.set_experiment_run()

    # log data
    run.log_dataset("train_data", TRAIN_DATA_PATH)
    run.log_dataset("test_data", TEST_DATA_PATH)

    # log hyperparameters
    for key, val in hyperparams.items():
        run.log_hyperparameter(key, val)
    print(hyperparams, end=' ')

    # create and train model on the fitting portion of the split
    model = linear_model.LogisticRegression(**hyperparams)
    model.fit(X_fit, y_fit)

    # calculate and log validation accuracy on the shared held-out portion
    val_acc = model.score(X_val, y_val)
    run.log_metric("val_acc", val_acc)
    print("Validation accuracy: {:.4f}".format(val_acc))

    # save and log model
    model_path = MODEL_PATH.format(start_time)
    joblib.dump(model, model_path)
    run.log_model("model", model_path)

## Clean Up

In [None]:
# IPython magic: clear the interactive namespace without asking for
# confirmation (-f), so the Load Workflow below starts without any of the
# variables defined above.
%reset -f

---

In [None]:
# Credentials passed to ModelDBClient further down; both values are
# intentionally left blank and must be filled in before running the notebook.
email = 
dev_key = 

# Load Workflow

In [None]:
import os, sys
import itertools

import joblib

import numpy as np

from sklearn import model_selection
from sklearn import linear_model
from sklearn import metrics

from verta import ModelDBClient

## Instantiate Client

In [None]:
# Build the ModelDB client from the credentials defined just above.
client = ModelDBClient(email, dev_key)
# Select the same project and experiment names that the Log Workflow used, so
# that `client.expt_runs` below presumably refers to those runs (verify against
# the verta docs).
proj = client.set_project("Income Classification")
expt = client.set_experiment("Logistic Regression")

## Select Best Experiment Run

In [None]:
def _val_acc(expt_run):
    """Return the validation accuracy logged on *expt_run*."""
    return expt_run.get_metrics()['val_acc']


# order the runs from client.expt_runs by ascending validation accuracy and
# keep the last one, i.e. the run with the highest score
best_run = sorted(client.expt_runs, key=_val_acc)[-1]

## Restore Training Data

In [None]:
# Path of the training archive as it was logged on the best run.
TRAIN_DATA_PATH = best_run.get_datasets()['train_data']

# For a .npz archive np.load returns an NpzFile that keeps the file open and
# reads arrays on access; extract both arrays inside a `with` block so the
# file handle is closed afterwards.
with np.load(TRAIN_DATA_PATH) as train_data:
    X_train, y_train = train_data['X'], train_data['y']

## Restore Hyperparameters

In [None]:
# Fetch the hyperparameters and the validation score logged on the best run.
best_hyperparams = best_run.get_hyperparameters()
best_metrics = best_run.get_metrics()
best_val_acc = best_metrics['val_acc']

# Report them together on one line.
summary = "{} Validation accuracy: {:.4f}".format(best_hyperparams, best_val_acc)
print(summary)

## Restore Earlier Model to Verify

In [None]:
# Look up where the best run's model was saved and deserialize it.
logged_models = best_run.get_models()
MODEL_PATH = logged_models['model']
model = joblib.load(MODEL_PATH)

# Sanity check: score the restored model on the complete training set. The
# model was fit on only part of it (the rest was held out for validation).
acc = model.score(X_train, y_train)
print("Accuracy on full training set: {:.4f}".format(acc))

## Retrain Model on Full Training Set

In [None]:
# Work on a copy so the restored hyperparameter dict itself is not modified.
retrain_params = dict(best_hyperparams)
# scikit-learn requires an integer `max_iter`. A value that was logged as a
# float (e.g. 1e4) may be restored as 10000.0, so coerce it before use.
if 'max_iter' in retrain_params:
    retrain_params['max_iter'] = int(retrain_params['max_iter'])

# Refit with the best hyperparameters, this time on the full training set.
model = linear_model.LogisticRegression(**retrain_params)
model.fit(X_train, y_train)

# Record the training accuracy on the same run the hyperparameters came from.
train_acc = model.score(X_train, y_train)
best_run.log_metric("train_acc", train_acc)
print("Training accuracy: {:.4f}".format(train_acc))

## Run and Log Testing

In [None]:
# Path of the test archive as it was logged on the best run.
TEST_DATA_PATH = best_run.get_datasets()['test_data']

# For a .npz archive np.load returns an NpzFile that keeps the file open and
# reads arrays on access; extract both arrays inside a `with` block so the
# file handle is closed afterwards.
with np.load(TEST_DATA_PATH) as test_data:
    X_test, y_test = test_data['X'], test_data['y']

# Score the retrained model on the test set and record the result on the best run.
test_acc = model.score(X_test, y_test)
best_run.log_metric("test_acc", test_acc)
print("Testing accuracy: {:.4f}".format(test_acc))