<a href="https://colab.research.google.com/github/VertaAI/modeldb-client/blob/development/workflows/demos/sklearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Logistic Regression with Grid Search (scikit-learn)

In [None]:
# Run this cell only if you are running this notebook on Google Colab, then restart the runtime when prompted.
#!pip install verta

In [None]:
import warnings
from sklearn.exceptions import ConvergenceWarning

# Suppress scikit-learn's ConvergenceWarning for the rest of the session.
warnings.simplefilter("ignore", category=ConvergenceWarning)

In [None]:
import os, sys
import itertools
import time
from multiprocessing import Pool

import numpy as np

from sklearn import model_selection
from sklearn import linear_model
from sklearn import metrics

from verta import ModelDBClient


# Census dataset archives, addressed relative to the current working directory.
DATA_DIR = os.path.join("..", "data", "census")
TRAIN_DATA_PATH, TEST_DATA_PATH = (
    os.path.join(DATA_DIR, filename) for filename in ("train.npz", "test.npz")
)

In [None]:
# Address handed to ModelDBClient below; note that PORT is a string, not an int.
HOST, PORT = (
    "eks-alb-http-betamax-prod-513501981.us-east-1.elb.amazonaws.com",
    "6244",
)

---

# Log Workflow

## Instantiate Client

In [None]:
# Instantiate the ModelDB client for the server at HOST:PORT.
client = ModelDBClient(HOST, PORT)
# Select the project and experiment by name. `client` is used by run_experiment
# to create runs, and `expt` is queried later to find the best run.
# NOTE(review): presumably these calls create the project/experiment when they
# do not exist yet — confirm against the verta client documentation.
proj = client.set_project("Income Classification")
expt = client.set_experiment("Logistic Regression")

## Prepare Data

In [None]:
# np.load on an .npz archive returns an NpzFile that keeps the underlying file
# open; the context manager closes it as soon as both arrays have been read.
with np.load(TRAIN_DATA_PATH) as train_data:
    X_train, y_train = train_data['X'], train_data['y']

## Prepare Hyperparameters

In [None]:
# Candidate values for each LogisticRegression hyperparameter.
hyperparam_candidates = dict(
    C=[1e-6, 1e-4, 1e-3, 1e-2],
    solver=['lbfgs'],
    max_iter=[15, 28, 45, 66],
)
# Expand the candidates into one dict per combination (Cartesian product),
# keeping the key order of `hyperparam_candidates`.
hyperparam_sets = [
    {name: value for name, value in zip(hyperparam_candidates, combination)}
    for combination in itertools.product(*hyperparam_candidates.values())
]

## Run Validation

In [None]:
def run_experiment(hyperparams):
    """Train, validate, and log one logistic regression configuration.

    Creates a new experiment run on the module-level `client`, records the
    training dataset reference and the hyperparameters, fits a model on 80%
    of `X_train`/`y_train`, scores it on the held-out 20%, and logs the
    validation accuracy and the fitted model to the run.

    Parameters
    ----------
    hyperparams : dict
        Keyword arguments forwarded to `linear_model.LogisticRegression`
        and logged to the run as hyperparameters.

    Returns
    -------
    None
        Results are reported through the experiment run and standard output.
    """
    # create object to track experiment run
    run = client.set_experiment_run()

    # log data
    run.log_dataset("train_data", "train-data.csv.zip")

    # hold out 20% of the training data for validation; no random_state is
    # set, so the shuffle is not seeded
    (X_val_train, X_val_test,
     y_val_train, y_val_test) = model_selection.train_test_split(X_train, y_train,
                                                                 test_size=0.2,
                                                                 shuffle=True)

    # log hyperparameters
    run.log_hyperparameters(**hyperparams)

    # create and train model
    model = linear_model.LogisticRegression(**hyperparams)
    model.fit(X_val_train, y_val_train)

    # log validation accuracy
    val_acc = model.score(X_val_test, y_val_test)
    run.log_metric("val_acc", val_acc)
    # Report hyperparameters and accuracy with a single print call so that each
    # run's record stays on one line when several workers print concurrently.
    print("{} Validation accuracy: {:.4f}".format(hyperparams, val_acc))

    # log model
    run.log_model("model", model)

In [None]:
# Fan the hyperparameter sets out across a pool of worker processes; each
# worker calls run_experiment on one set at a time. The pool is shut down when
# the with-block exits.
# NOTE(review): workers use the module-level `client`, `X_train` and `y_train`,
# which presumably relies on a fork start method — confirm on platforms that
# spawn worker processes instead.
with Pool() as pool:
    pool.map(run_experiment, hyperparam_sets)

---

## Select Best Experiment Run

In [None]:
# fetch best experiment run based on validation accuracy
best_run = expt.expt_runs.top_k("metrics.val_acc", 1)[0]
best_hyperparams = best_run.get_hyperparameters()

print(best_hyperparams)

## Test Best Model

In [None]:
# np.load on an .npz archive returns an NpzFile that keeps the underlying file
# open; the context manager closes it as soon as both arrays have been read.
with np.load(TEST_DATA_PATH) as test_data:
    X_test, y_test = test_data['X'], test_data['y']

# record the test dataset reference on the best run
best_run.log_dataset("test_data", "test-data.csv.zip")

In [None]:
# build a fresh estimator from the best run's hyperparameters (this is a new
# model, not the one logged during validation)
model = linear_model.LogisticRegression(**best_hyperparams)

# fit on full training set
model.fit(X_train, y_train)

# log training accuracy
# NOTE(review): this metric and test_acc below are attached to best_run even
# though they come from the model refit here on the full training set.
train_acc = model.score(X_train, y_train)
best_run.log_metric("train_acc", train_acc)
print("Training accuracy: {:.4f}".format(train_acc))

# log testing accuracy
test_acc = model.score(X_test, y_test)
best_run.log_metric("test_acc", test_acc)
print("Testing accuracy: {:.4f}".format(test_acc))