<a href="https://colab.research.google.com/github/VertaAI/modeldb-client/blob/development/workflows/demos/sklearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Logistic Regression with Grid Search (scikit-learn)

In [None]:
# Run this cell only if you are running this notebook on Google Colab, then restart the runtime when prompted.
#!pip install verta

In [1]:
import warnings
from sklearn.exceptions import ConvergenceWarning
# Suppress every ConvergenceWarning for the rest of the session — presumably
# triggered by the small max_iter values tried in the grid search below.
warnings.simplefilter("ignore", category=ConvergenceWarning)

In [2]:
import os, sys
import json
import itertools
import time
from multiprocessing import Pool

import numpy as np

from sklearn import model_selection
from sklearn import linear_model
from sklearn import metrics

from verta import ModelDBClient


# Census CSVs are resolved relative to the working directory: <parent>/data/census.
DATA_DIR = os.path.join(os.pardir, "data", "census")
# Training and held-out test files inside DATA_DIR.
TRAIN_DATA_PATH = os.path.join(DATA_DIR, "train-data.csv")
TEST_DATA_PATH = os.path.join(DATA_DIR, "test-data.csv")

In [3]:
# Host name and port that the ModelDB client below connects to.
HOST, PORT = "demo1.verta.ai", 6244

---

# Log Workflow

## Instantiate Client

In [4]:
# Open a connection to the ModelDB server at HOST:PORT.
client = ModelDBClient(HOST, PORT)
# Select the project and experiment by name. The returned handles are kept at
# notebook scope; `expt` is used later to query this experiment's runs.
proj = client.set_project("Conrado 4 - Income Classification")
expt = client.set_experiment("Conrado 4 - Logistic Regression")

connection successfully established
set existing Project: Conrado 4 - Income Classification
set existing Experiment: Conrado 4 - Logistic Regression


## Create model API

In [5]:
# Column names are taken from the first line of the training CSV.
with open(TRAIN_DATA_PATH) as f:
    header = f.readline().strip()
headers = header.split(",")

# Every column but the last is declared as a float input; the last is the output.
# NOTE(review): the training cell slices features as columns 1:-1 (dropping the
# first column as well) — confirm which feature set the API should describe.
*feature_names, label_name = headers
input_fields = [{"name": column, "type": "float"} for column in feature_names]
api = {
    "input": {"type": "list", "fields": input_fields},
    "output": {"name": label_name, "type": "float"},
}

# Persist the description so it can be attached to each run via its file path.
with open('model_api.json', 'w') as f:
    json.dump(api, f, indent=2)

## Prepare Data

In [6]:
csv = np.genfromtxt(TRAIN_DATA_PATH, delimiter=",")
# The first parsed row comes from the header line, so drop it.
train_rows = csv[1:]
# Features: every column except the first and the last. Label: the last column.
X_train = train_rows[:, 1:-1]
y_train = train_rows[:, -1]

## Prepare Hyperparameters

In [7]:
# Candidate values for each LogisticRegression keyword argument.
hyperparam_candidates = {
    'C': [1e-6, 1e-4, 1e-3, 1e-2],
    'solver': ['lbfgs'],
    'max_iter': [15, 28, 45, 66],
}

# Expand the candidates into their full Cartesian product:
# one {name: value} dict per combination, in insertion order of the keys.
param_names = list(hyperparam_candidates)
value_grid = itertools.product(*(hyperparam_candidates[name] for name in param_names))
hyperparam_sets = [dict(zip(param_names, combo)) for combo in value_grid]

## Run Train and Validation

In [8]:
def run_experiment(hyperparams, random_state=0):
    """Train and validate one LogisticRegression configuration, logging it to ModelDB.

    Reads the notebook-level names ``client``, ``X_train``, ``y_train`` and
    ``TRAIN_DATA_PATH``. The files ``requirements.txt`` and ``model_api.json``
    are passed to the run by relative path.

    Parameters
    ----------
    hyperparams : dict
        Keyword arguments forwarded to ``linear_model.LogisticRegression`` and
        logged as the run's hyperparameters.
    random_state : int, optional
        Seed for the train/validation split. Using the same seed for every
        call scores all hyperparameter sets on the same held-out rows, so
        their ``val_acc`` values are directly comparable and reproducible.

    Returns
    -------
    None
    """
    # create object to track experiment run
    run = client.set_experiment_run(desc="", tags=['AnacondaCon', 'demo'])

    # log data
    run.log_dataset("train_data", TRAIN_DATA_PATH)

    # hold out 20% of the training rows for validation (seeded, see docstring)
    (X_val_train, X_val_test,
     y_val_train, y_val_test) = model_selection.train_test_split(X_train, y_train,
                                                                 test_size=0.2,
                                                                 shuffle=True,
                                                                 random_state=random_state)

    # log hyperparameters
    run.log_hyperparameters(**hyperparams)

    # create and train model
    model = linear_model.LogisticRegression(**hyperparams)
    model.fit(X_val_train, y_val_train)

    # log validation accuracy
    val_acc = model.score(X_val_test, y_val_test)
    run.log_metric("val_acc", val_acc)
    # Emit the whole report line in one print so that output from concurrent
    # pool workers cannot be interleaved between the two halves of the line.
    print("{} Validation accuracy: {:.4f}".format(hyperparams, val_acc))

    # log model
    run.log_model("model", model)
    run.log_model_requirements_file("requirements.txt")
    run.log_model_api("model_api.json")

In [9]:
# Run every hyperparameter set through run_experiment in a pool of worker
# processes; map() blocks until all of them have finished.
with Pool() as pool:
    pool.map(func=run_experiment, iterable=hyperparam_sets)

{'C': 0.0001, 'solver': 'lbfgs', 'max_iter': 28} Validation accuracy: 0.7841
{'C': 0.0001, 'solver': 'lbfgs', 'max_iter': 15} Validation accuracy: 0.7844
{'C': 1e-06, 'solver': 'lbfgs', 'max_iter': 28} Validation accuracy: 0.7841
{'C': 0.0001, 'solver': 'lbfgs', 'max_iter': 45} Validation accuracy: 0.7939
{'C': 1e-06, 'solver': 'lbfgs', 'max_iter': 45} Validation accuracy: 0.7909
{'C': 0.0001, 'solver': 'lbfgs', 'max_iter': 66} Validation accuracy: 0.8017
{'C': 1e-06, 'solver': 'lbfgs', 'max_iter': 15} Validation accuracy: 0.7852
{'C': 0.001, 'solver': 'lbfgs', 'max_iter': 15} Validation accuracy: 0.7857
{'C': 0.001, 'solver': 'lbfgs', 'max_iter': 28} Validation accuracy: 0.7801
{'C': 0.001, 'solver': 'lbfgs', 'max_iter': 45} Validation accuracy: 0.8043
{'C': 0.01, 'solver': 'lbfgs', 'max_iter': 15} Validation accuracy: 0.7857
{'C': 0.001, 'solver': 'lbfgs', 'max_iter': 66} Validation accuracy: 0.8074
{'C': 0.01, 'solver': 'lbfgs', 'max_iter': 28} Validation accuracy: 0.7799
{'C': 0.01

---

## Select Best Experiment Run

In [10]:
# Ask the experiment for its single top run by logged validation accuracy,
# then read back the hyperparameters that run was trained with.
top_runs = expt.expt_runs.top_k("metrics.val_acc", 1)
best_run = top_runs[0]
best_hyperparams = best_run.get_hyperparameters()

print(best_hyperparams)

{'C': 0.01, 'solver': 'lbfgs', 'max_iter': 66}


## Test Best Model

In [11]:
csv = np.genfromtxt(TEST_DATA_PATH, delimiter=",")
# The first parsed row comes from the header line, so drop it.
test_rows = csv[1:]
# Same column layout as the training data: features are columns 1:-1, label is last.
X_test = test_rows[:, 1:-1]
y_test = test_rows[:, -1]
# Record on the best run which file the test metrics are computed from.
best_run.log_dataset("test_data", TEST_DATA_PATH)

In [12]:
# Rebuild the classifier from the best run's hyperparameters and fit it on the
# full training set (no validation hold-out); fit() returns the estimator.
model = linear_model.LogisticRegression(**best_hyperparams).fit(X_train, y_train)

# Accuracy on the data the model was fitted on, attached to the best run.
train_acc = model.score(X_train, y_train)
best_run.log_metric("train_acc", train_acc)
print("Training accuracy: {:.4f}".format(train_acc))

# Accuracy on the separate test file, attached to the same run.
test_acc = model.score(X_test, y_test)
best_run.log_metric("test_acc", test_acc)
print("Testing accuracy: {:.4f}".format(test_acc))

Training accuracy: 0.8386
Testing accuracy: 0.8371
