<a href="https://colab.research.google.com/github/VertaAI/modeldb-client/blob/development/workflows/demos/sklearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Logistic Regression with Grid Search (scikit-learn)

In [None]:
#Run this cell if you are running this python notebook on Google Colab and restart your notebook when prompted
#!pip install verta

In [72]:
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [73]:
import os, sys
import json
import itertools
import time
from multiprocessing import Pool

import numpy as np

from sklearn import model_selection
from sklearn import linear_model
from sklearn import metrics

from verta import ModelDBClient


DATA_DIR = os.path.join("..", "data", "census")
TRAIN_DATA_PATH = os.path.join(DATA_DIR, "train-data.csv")
TEST_DATA_PATH = os.path.join(DATA_DIR, "test-data.csv")

In [74]:
HOST = "demo1.verta.ai"
PORT = 6244

---

# Log Workflow

## Instantiate Client

In [75]:
client = ModelDBClient(HOST, PORT)
proj = client.set_project("Conrado 04/02 5 - Income Classification")
expt = client.set_experiment("Logistic Regression")

connection successfully established
set existing Project: Conrado 04/02 5 - Income Classification
set existing Experiment: Logistic Regression


## Create model API

In [76]:
with open(TRAIN_DATA_PATH) as f:
    header = f.readline().strip()
headers = header.split(",")
input_headers = headers[:-1]
output_header = headers[-1]
api = {
    "input": {
        "type": "list",
        "fields": [{"name":n, "type": "float"} for n in input_headers]
    },
    "output": {
        "name": output_header,
        "type": "float"
    }
}

with open('model_api.json', 'w') as f:
    json.dump(api, f, indent=2)

## Prepare Data

In [77]:
csv = np.genfromtxt(TRAIN_DATA_PATH, delimiter=",")
# Skip first line due to header. Last column is the label
X_train = csv[1:,:-1]
y_train = csv[1:, -1]

## Prepare Hyperparameters

In [86]:
hyperparam_candidates = {
    'C': [1e-6, 1e-4, 1e-3, 1e-2],
    'solver': ['lbfgs'],
    'max_iter': [15, 28, 45, 66],
}
hyperparam_sets = [dict(zip(hyperparam_candidates.keys(), values))
                   for values
                   in itertools.product(*hyperparam_candidates.values())]

## Run Train and Validation

In [87]:
def run_experiment(hyperparams):
    # create object to track experiment run
    run = client.set_experiment_run(desc="", tags=['AnacondaCon', 'demo'])
        
    # log data
    run.log_dataset("train_data", TRAIN_DATA_PATH)
    
    # log validation splits
    (X_val_train, X_val_test,
     y_val_train, y_val_test) = model_selection.train_test_split(X_train, y_train,
                                                                 test_size=0.2,
                                                                 shuffle=True)
    
    # log hyperparameters
    run.log_hyperparameters(**hyperparams)
    print(hyperparams, end=' ')
    
    # create and train model
    model = linear_model.LogisticRegression(**hyperparams)
    model.fit(X_val_train, y_val_train)
    
    # log validation accuracy
    val_acc = model.score(X_val_test, y_val_test)
    run.log_metric("val_acc", val_acc)
    print("Validation accuracy: {:.4f}".format(val_acc))
    
    # log model
    run.log_model("model", model)
    run.log_model_requirements_file("requirements.txt")
    run.log_model_api("model_api.json")

In [88]:
with Pool() as pool:
    pool.map(run_experiment, hyperparam_sets)

{'C': 0.0001, 'solver': 'lbfgs', 'max_iter': 66} Validation accuracy: 0.8104
{'C': 0.0001, 'solver': 'lbfgs', 'max_iter': 15} Validation accuracy: 0.7991
{'C': 1e-06, 'solver': 'lbfgs', 'max_iter': 66} Validation accuracy: 0.8101
{'C': 1e-06, 'solver': 'lbfgs', 'max_iter': 28} Validation accuracy: 0.7993
{'C': 0.0001, 'solver': 'lbfgs', 'max_iter': 45} Validation accuracy: 0.8060
{'C': 1e-06, 'solver': 'lbfgs', 'max_iter': 45} Validation accuracy: 0.8039
{'C': 0.0001, 'solver': 'lbfgs', 'max_iter': 28} Validation accuracy: 0.7999
{'C': 1e-06, 'solver': 'lbfgs', 'max_iter': 15} Validation accuracy: 0.7989
{'C': 0.001, 'solver': 'lbfgs', 'max_iter': 15} Validation accuracy: 0.7843
{'C': 0.001, 'solver': 'lbfgs', 'max_iter': 28} Validation accuracy: 0.7828
{'C': 0.01, 'solver': 'lbfgs', 'max_iter': 15} Validation accuracy: 0.7844
{'C': 0.01, 'solver': 'lbfgs', 'max_iter': 28} Validation accuracy: 0.7826
{'C': 0.01, 'solver': 'lbfgs', 'max_iter': 45} Validation accuracy: 0.7963
{'C': 0.01,

---

## Select Best Experiment Run

In [89]:
# fetch best experiment run based on validation accuracy
best_run = expt.expt_runs.top_k("metrics.val_acc", 1)[0]
best_hyperparams = best_run.get_hyperparameters()

print(best_hyperparams)

{'C': 0.01, 'solver': 'lbfgs', 'max_iter': 66}


## Test Best Model

In [90]:
csv = np.genfromtxt(TEST_DATA_PATH, delimiter=",")
# Skip first line due to header. Last column is the label
X_test = csv[1:,:-1]
y_test = csv[1:, -1]
best_run.log_dataset("test_data", TEST_DATA_PATH)

In [91]:
model = linear_model.LogisticRegression(**best_hyperparams)

# fit on full training set
model.fit(X_train, y_train)

# log training accuracy
train_acc = model.score(X_train, y_train)
best_run.log_metric("train_acc", train_acc)
print("Training accuracy: {:.4f}".format(train_acc))

# log testing accuracy
test_acc = model.score(X_test, y_test)
best_run.log_metric("test_acc", test_acc)
print("Testing accuracy: {:.4f}".format(test_acc))

Training accuracy: 0.8372
Testing accuracy: 0.8349


## Deploy the model
Choose the best model in the UI and deploy it. Then run the code below to create some requests.

In [94]:
import requests
#url = "http://{}:{}/api/v1/deployment/status/{}".format(HOST, PORT, best_run._id)
url = "http://{}:{}/api/v1/deployment/status/{}".format(HOST, PORT, "126b3b47-3fa5-4bb2-91b0-de0f18315967")
status = json.loads(requests.get(url)._content)
token = status['token']
prediction_url = "http://{}:{}{}".format(HOST, PORT, status['api'])
print(prediction_url)
print(token)

http://demo1.verta.ai:6244/api/v1/predict/126b3b47-3fa5-4bb2-91b0-de0f18315967
f0b76729-d4f9-4566-b8c1-99c85bdeb398


In [95]:
input_request={input_headers[i]: X_train[1,i] for i in range(len(input_headers))}
input_request={'token': token, 'data': json.dumps(input_request)}
response = requests.post(prediction_url, data=input_request)
prediction = response.json()
print(prediction)

{'>50k': 0.0}


Now let's create some continuous load.

In [96]:
import time
while True:
    for i in range(X_train.shape[0]):
        input_request={input_headers[j]: X_train[i,j] for j in range(len(input_headers))}
        input_request['education_9th']=1
        input_request={'token': token, 'data': json.dumps(input_request)}
        response = requests.post(prediction_url, data=input_request)
        if response.status_code != 200:
            print("Got a failure while predicting sample {}".format(i+1))
        #time.sleep(0.1)

KeyboardInterrupt: 