# Logistic Regression with Grid Search (scikit-learn)

---

# Log Workflow

In [1]:
import os, sys
import itertools

import joblib

import numpy as np

from sklearn import model_selection
from sklearn import linear_model
from sklearn import metrics

sys.path.append(os.path.join("..", "modeldb"))
from modeldbclient import ModelDBClient

## Instantiate Client

In [2]:
client = ModelDBClient()
proj = client.set_project("Income Classification")
expt = client.set_experiment("Logistic Regression")

In [3]:
TRAIN_DATA_PATH = os.path.join("..", "data", "census", "train.npz")
TEST_DATA_PATH = os.path.join("..", "data", "census", "test.npz")
MODEL_PATH = os.path.join("..", "output", "client-demo", "logreg_gridsearch_{}.gz")

## Prepare Data

In [4]:
train_data = np.load(TRAIN_DATA_PATH)

X_train, y_train = train_data['X'], train_data['y']

## Prepare Hyperparameters

In [5]:
hyperparam_candidates = {
    'C': [1e-4, 1e-3, 1e-2, 1e-1],
    'solver': ['lbfgs'],
    'max_iter': [1e4],
}
hyperparam_sets = [dict(zip(hyperparam_candidates.keys(), values))
                   for values
                   in itertools.product(*hyperparam_candidates.values())]

## Run Validation

In [6]:
for hyperparams in hyperparam_sets:
    # create object to track experiment run
    run = client.set_experiment_run()
        
    # log data
    run.log_dataset("train data", TRAIN_DATA_PATH)
    run.log_dataset("test data", TEST_DATA_PATH)
    
    # create validation split
    X_train, X_val, y_train, y_val = model_selection.train_test_split(X_train, y_train,
                                                                      test_size=0.1, shuffle=False)
    # log hyperparameters
    for key, val in hyperparams.items():
        run.log_hyperparameter(key, val)
    print(hyperparams, end=' ')
    
    # create and train model
    model = linear_model.LogisticRegression(**hyperparams)
    model.fit(X_train, y_train)
    
    # calculate and log validation accuracy
    val_acc = model.score(X_val, y_val)
    run.log_metric("val acc", val_acc)
    print(f"Validation accuracy: {val_acc:.4f}")
    
    # save and log model
    joblib.dump(model, MODEL_PATH.format(run.id))
    run.log_model("model", MODEL_PATH.format(run.id))

{'C': 0.0001, 'solver': 'lbfgs', 'max_iter': 10000.0} Validation accuracy: 0.7988
{'C': 0.001, 'solver': 'lbfgs', 'max_iter': 10000.0} Validation accuracy: 0.8120
{'C': 0.01, 'solver': 'lbfgs', 'max_iter': 10000.0} Validation accuracy: 0.8393
{'C': 0.1, 'solver': 'lbfgs', 'max_iter': 10000.0} Validation accuracy: 0.8434


## Close Client

In [7]:
client.disconnect()

%reset -f

---

# Load Workflow

In [8]:
import os, sys
import itertools

import joblib

import numpy as np

from sklearn import model_selection
from sklearn import linear_model
from sklearn import metrics

sys.path.append(os.path.join("..", "modeldb"))
from modeldbclient import ModelDBClient

## Instantiate Client

In [9]:
client = ModelDBClient()
proj = client.set_project("Income Classification")
expt = client.set_experiment("Logistic Regression")

## Load Experiment Runs

In [10]:
# retrieve all ExperimentRuns under the Experiment we set
client.set_experiment_runs()

## Select Best Experiment Run

In [11]:
# fetch best experiment run based on validation accuracy
best_run = sorted(client.expt_runs, key=lambda run: run.get_metrics()['val acc'])[-1]

## Restore Training Data

In [12]:
TRAIN_DATA_PATH = best_run.get_datasets()['train data']

train_data = np.load(TRAIN_DATA_PATH)
X_train, y_train = train_data['X'], train_data['y']

## Restore Hyperparameters

In [13]:
best_hyperparams = best_run.get_hyperparameters()
best_val_acc = best_run.get_metrics()['val acc']

print(f"{best_hyperparams} Validation accuracy: {best_val_acc:.4f}")

{'C': 0.1, 'solver': 'lbfgs', 'max_iter': 10000.0} Validation accuracy: 0.8434


## Restore Earlier Model to Verify

In [14]:
MODEL_PATH = best_run.get_models()['model']

model = joblib.load(MODEL_PATH)

acc = model.score(X_train, y_train)
print(f"Accuracy on full training set: {acc:.4f}")

Accuracy on full training set: 0.8469


## Retrain Model on Full Training Set

In [15]:
model = linear_model.LogisticRegression(**best_hyperparams)
model.fit(X_train, y_train)

train_acc = model.score(X_train, y_train)
best_run.log_metric("train acc", train_acc)
print(f"Training accuracy: {train_acc:.4f}")

Training accuracy: 0.8464


## Run and Log Testing

In [16]:
TEST_DATA_PATH = best_run.get_datasets()['test data']

test_data = np.load(TEST_DATA_PATH)
X_test, y_test = test_data['X'], test_data['y']

test_acc = model.score(X_test, y_test)
best_run.log_metric("test acc", test_acc)
print(f"Testing accuracy: {test_acc:.4f}")

Testing accuracy: 0.8463


## Close Client

In [17]:
client.disconnect()