<a href="https://colab.research.google.com/github/VertaAI/modeldb-client/blob/development/workflows/demos/parallel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Multiprocess Grid Search

In [None]:
#Run this cell if you are running this python notebook on Google Colab and restart your notebook when prompted
!pip install verta

In [None]:
import itertools
import os
import time
import uuid
from multiprocessing import Pool

import grpc
import joblib
import numpy as np
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection

from verta import ModelDBClient


# Input data and saved models live in sibling folders one level above this notebook.
data_dir = os.path.join("..", "data", "census")
output_dir = os.path.join("..", "output", "grid-search")

# Create both folders up front; `exist_ok` makes re-running this cell harmless.
for _dir in (data_dir, output_dir):
    os.makedirs(_dir, exist_ok=True)

In [None]:
# Connection settings for the ModelDB server; both values are passed to
# `ModelDBClient(HOST, PORT)` when the client is instantiated.
# These are blank placeholders: assign a value to each before running, because the
# cell is not valid Python as written.
HOST = 
PORT = 

---

## Instantiate Client

In [None]:
# Connect to the ModelDB server identified by HOST and PORT.
client = ModelDBClient(HOST, PORT)
# Select the project and the experiment by name. Runs created later through
# `client.set_experiment_run()` are presumably filed under this experiment, so the
# call order likely matters — TODO confirm against the verta client docs.
proj = client.set_project("Income Classification")
expt = client.set_experiment("Logistic Regression in Parallel")

In [None]:
# Paths of the training and test archives inside `data_dir`.
TRAIN_DATA_PATH = os.path.join(data_dir, "train.npz")
TEST_DATA_PATH = os.path.join(data_dir, "test.npz")
# Template for saved-model files inside `output_dir`; `run_experiment` fills the
# `{}` placeholder through `str.format` when it saves a model.
MODEL_PATH = os.path.join(output_dir, "logreg_gridsearch_{}.gz")

## Design Validation Run

In [None]:
# Candidate values for each LogisticRegression hyperparameter; the grid search
# tries every combination of them.
hyperparam_candidates = {
    'C': [1e-4, 1e-3, 1e-2],
    'solver': ['lbfgs'],
    # An int, not the float 1e4: max_iter is an iteration count, and current
    # scikit-learn releases reject non-integer values for it.
    'max_iter': [10000],
}
# Expand the candidates into one {name: value} dict per grid point
# (the Cartesian product of all candidate lists).
hyperparam_sets = [dict(zip(hyperparam_candidates.keys(), values))
                   for values
                   in itertools.product(*hyperparam_candidates.values())]

def run_experiment(hyperparams):
    """Train, evaluate and log one logistic regression model.

    Creates a new experiment run on the module-level `client`, loads the training
    data, holds out an unshuffled 10% validation split, fits a
    `LogisticRegression` with the given hyperparameters, and logs the dataset
    paths, hyperparameters, validation accuracy and saved model file to the run.

    Parameters
    ----------
    hyperparams : dict
        Keyword arguments for `linear_model.LogisticRegression`. Each item is
        also logged to the run as a hyperparameter.

    Returns
    -------
    None
    """
    start_time = int(time.time())

    # create object to track experiment run
    run = client.set_experiment_run()

    # load and log data; the context manager closes the .npz archive as soon as
    # the arrays have been read out of it
    with np.load(TRAIN_DATA_PATH) as train_data:
        X_train, y_train = train_data['X'], train_data['y']
    run.log_dataset("train_data", TRAIN_DATA_PATH)
    run.log_dataset("test_data", TEST_DATA_PATH)

    # create validation split
    X_train, X_val, y_train, y_val = model_selection.train_test_split(X_train, y_train,
                                                                      test_size=0.1, shuffle=False)
    # log hyperparameters
    for key, val in hyperparams.items():
        run.log_hyperparameter(key, val)

    # create and train model
    model = linear_model.LogisticRegression(**hyperparams)
    model.fit(X_train, y_train)

    # calculate and log validation accuracy
    val_acc = model.score(X_val, y_val)
    run.log_metric("val_acc", val_acc)
    # emit the hyperparameters and the score in a single print so that lines from
    # concurrently running worker processes do not interleave
    print("{} Validation accuracy: {:.4f}".format(hyperparams, val_acc))

    # save and log model
    # The timestamp alone is not unique: pool workers typically start within the
    # same second, so a random suffix keeps parallel runs from overwriting each
    # other's model files.
    model_path = MODEL_PATH.format("{}_{}".format(start_time, uuid.uuid4().hex))
    joblib.dump(model, model_path)
    run.log_model("model", model_path)

## Run Validation

In [None]:
# Run one experiment per hyperparameter set across a default-sized pool of worker
# processes. `map` blocks until every run has finished, and leaving the `with`
# block shuts the pool down.
# NOTE(review): with the "spawn" start method, workers may be unable to import
# `run_experiment` when it is defined only in the notebook — verify on the
# target platform.
with Pool() as p:
    p.map(run_experiment, hyperparam_sets)