In [1]:
%load_ext autoreload
%autoreload 2
import clipper_manager as cl
import os
import pandas as pd
import numpy as np
from sklearn import linear_model as lm
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.externals import joblib
import sys
import json


## Connect to EC2

In [2]:
# Connection settings for the remote host that will run Clipper.
# `cl` (clipper_manager) and `os` come from the import cell at the top of the
# notebook, so they are not imported again here.
# NOTE(review): the hostname below points at one specific EC2 instance; update
# it before re-running against a different machine.
ec2_host = "ec2-52-53-151-0.us-west-1.compute.amazonaws.com"
user = "ubuntu"
# Path of the key file handed to cl.Cluster (`~` is expanded to the home directory).
key = os.path.expanduser("~/.ssh/aws_rsa")
# Cluster handle used by every later cell in this notebook.
clipper = cl.Cluster(ec2_host, user, key)

Checking if Docker running...
Found Docker running
Creating internal Docker network
Creating local model repository


## Start Clipper

In [3]:
# Launch Clipper on the cluster with its default settings; the effective
# configuration is echoed in this cell's output.
clipper.start_clipper()

starting Clipper with default settings:
models = []
redis_port = 6379
use_lsh = false
window_size = -1
name = "clipper-demo"
input_type = "float"
num_update_workers = 1
num_predict_workers = 1
redis_ip = "redis-clipper"
cache_size = 49999
slo_micros = 20000
correction_policy = "logistic_regression"
input_length = 784
[batching]
sample_size = 1000
strategy = "aimd"

[ec2-52-53-151-0.us-west-1.compute.amazonaws.com] sudo: docker run -d --network=clipper_nw -p 6379:6379 --cpuset-cpus="0" --name redis-clipper redis:alpine
[ec2-52-53-151-0.us-west-1.compute.amazonaws.com] sudo: docker run -d --network=clipper_nw -p 1337:1337 --cpuset-cpus="1-4" --name clipper -v ~/conf.toml:/tmp/conf.toml dcrankshaw/clipper


In [4]:
# Pretty-print the metrics currently reported by Clipper.
# The call form of print is used so the cell is valid on Python 2 and Python 3
# and matches the print(...) calls used elsewhere in this notebook.
print(json.dumps(clipper.get_metrics(), indent=4))

{
    "meters": [
        {
            "rate": 0.0, 
            "name": "prediction_thruput", 
            "unit": "events per second"
        }, 
        {
            "rate": 0.0, 
            "name": "update_thruput", 
            "unit": "events per second"
        }
    ], 
    "histograms": [
        {
            "std": 0.0, 
            "p99": 0.0, 
            "name": "prediction_latency", 
            "min": 0, 
            "max": 0, 
            "p95": 0.0, 
            "p50": 0.0, 
            "mean": 0.0
        }, 
        {
            "std": 0.0, 
            "p99": 0.0, 
            "name": "in_time_predictions", 
            "min": 0, 
            "max": 0, 
            "p95": 0.0, 
            "p50": 0.0, 
            "mean": 0.0
        }, 
        {
            "std": 0.0, 
            "p99": 0.0, 
            "name": "update_latency", 
            "min": 0, 
            "max": 0, 
            "p95": 0.0, 
            "p50": 0.0, 
            "mean": 0.0
        

## Start a serving workload

Next, switch to a [different notebook](run_serving_workload.ipynb) to start querying the model from a separate process.

# Train a Scikit-Learn model

In [5]:
def load_digits(digits_location, digits_filename="train.data", norm=True):
    """Read a headerless CSV of digits and split it into features and labels.

    Column 0 of every row is the label; the remaining columns are the features.

    Args:
        digits_location: Directory that contains the data file.
        digits_filename: File name inside ``digits_location``.
        norm: When true, standardize each feature column by subtracting its
            mean and dividing by its standard deviation. Columns whose
            variance is not positive are only mean-centered (divisor 1).

    Returns:
        A ``(features, labels)`` tuple of NumPy arrays.
    """
    source = "/".join((digits_location, digits_filename))
    print("Source file: %s" % source)
    rows = pd.read_csv(source, sep=",", header=None).values
    print("Number of image files: %d" % len(rows))

    labels, features = rows[:, 0], rows[:, 1:]
    if not norm:
        return features, labels

    col_mean = np.mean(features, 0)
    col_var = np.var(features, 0)
    # Constant columns would otherwise divide by zero, so their divisor is 1.
    divisor = np.where(col_var > 0, np.sqrt(col_var), 1.)
    return (features - col_mean) / divisor, labels

def filter_data(data):
    """Reduce a digits dataset to a binary 3-versus-6 problem.

    Args:
        data: A ``(features, labels)`` pair; ``features`` is indexed as
            ``features[i, :]`` for each position ``i`` in ``labels``.

    Returns:
        A ``(features, targets)`` tuple of NumPy arrays holding only the rows
        labelled 3 (target 1.0) or 6 (target 0.0), in their original order.
    """
    features, labels = data
    kept_rows = []
    targets = []
    for idx, label in enumerate(labels):
        if label == 3:
            target = 1.0
        elif label == 6:
            target = 0.0
        else:
            # Every other digit is dropped.
            continue
        kept_rows.append(features[idx, :])
        targets.append(target)
    return np.array(kept_rows), np.array(targets)
            
# Both splits live in the same directory; load each one and keep only the
# 3-vs-6 rows.
mnist_dir = os.path.expanduser("~/model-serving/data/mnist_data")
train_x, train_y = filter_data(load_digits(mnist_dir))
test_x, test_y = filter_data(load_digits(mnist_dir, digits_filename="test.data"))

Source file: /Users/crankshaw/model-serving/data/mnist_data/train.data
Number of image files: 60000
Source file: /Users/crankshaw/model-serving/data/mnist_data/test.data
Number of image files: 10000


## Train a bad model

In [6]:
# Intentionally weak baseline: a random forest of two depth-1 trees.
# No random_state is passed, so the fitted model and its score can differ
# from run to run.
rf_model = RFC(max_depth=1, n_estimators=2).fit(train_x, train_y)
# Score on the held-out test split; shown as the cell's output.
rf_model.score(test_x, test_y)

0.79469854469854473

## Deploy a bad model

In [7]:
# Deploy the weak random forest to Clipper under the name "bad_rf_model";
# the recorded output shows a model container being started for it.
clipper.add_sklearn_model("bad_rf_model", rf_model)

/tmp/clipper-models/bad_rf_model/1
[ec2-52-53-151-0.us-west-1.compute.amazonaws.com] sudo: docker run -d --network=clipper_nw --name bad_rf_model_v1_r0 -v /tmp/clipper-models/bad_rf_model/1/bad_rf_model:/model:ro dcrankshaw/clipper-sklearn-mw


## Train a Spark Model

Now let's go train a model using Spark in a Databricks Cloud [Notebook](https://amplab-berkeley-research.cloud.databricks.com/#notebook/46987).

## Deploy a Spark Model from S3

In [8]:
# Deploy the Spark-trained SVM under the name "spark_svm"; the recorded output
# shows the model files being copied from the given S3 path onto the host.
clipper.add_pyspark_model("spark_svm", "s3://clipperdbdemo/spark_svm_model/svm_predict_3")

/tmp/clipper-models/spark_svm/1
[ec2-52-53-151-0.us-west-1.compute.amazonaws.com] run: aws s3 cp s3://clipperdbdemo/spark_svm_model/svm_predict_3 /tmp/clipper-models/spark_svm/1/svm_predict_3 --recursive
[ec2-52-53-151-0.us-west-1.compute.amazonaws.com] out: download: s3://clipperdbdemo/spark_svm_model/svm_predict_3/data/_SUCCESS to svm_predict_3/data/_SUCCESS
[ec2-52-53-151-0.us-west-1.compute.amazonaws.com] out: Completed 1 of 6 part(s) with 5 file(s) remaining
[ec2-52-53-151-0.us-west-1.compute.amazonaws.com] out: download: s3://clipperdbdemo/spark_svm_model/svm_predict_3/metadata/_SUCCESS to svm_predict_3/metadata/_SUCCESS
[ec2-52-53-151-0.us-west-1.compute.amazonaws.com] out: Completed 2 of 6 part(s) with 4 file(s) remaining
[ec2-52-53-151-0.us-west-1.compute.amazonaws.com] out: download: s3://clipperdbdemo/spark_svm_model/svm_predict_3/metadata/part-00000 to svm_predict_3/metadata/part-00000
[ec2-52-53-151-0.us-west-1.compute.amazonaws.com] out: Completed 3 of 6 part(s) with 3 fi

## Train an SVM with RBF Kernel

In [10]:
# SVC with its default settings (the RBF kernel, per the section title).
svm_model = svm.SVC().fit(train_x, train_y)
# Score on the held-out test split; shown as the cell's output.
svm_model.score(test_x, test_y)

0.99480249480249483

In [11]:
# Deploy the SVM to Clipper under the name "rbf_svm_model"; the recorded
# output shows a model container being started for it.
clipper.add_sklearn_model("rbf_svm_model", svm_model)

/tmp/clipper-models/rbf_svm_model/1
[ec2-52-53-151-0.us-west-1.compute.amazonaws.com] sudo: docker run -d --network=clipper_nw --name rbf_svm_model_v1_r0 -v /tmp/clipper-models/rbf_svm_model/1/rbf_svm_model:/model:ro dcrankshaw/clipper-sklearn-mw


In [13]:
# Pretty-print the correction model that Clipper has learned over the
# deployed models.
# NOTE(review): the argument 0 presumably identifies which correction model
# to fetch -- confirm against clipper_manager.
# The call form of print is used so the cell is valid on Python 2 and Python 3
# and matches the print(...) calls used elsewhere in this notebook.
print(json.dumps(clipper.get_correction_model(0), indent=4))

{
    "linear_model": {
        "label": [
            1, 
            0
        ], 
        "bias": -1.0, 
        "nr_class": 2, 
        "w": [
            1.9580837366019657, 
            0.6497062643846426, 
            -0.02436218717027392
        ], 
        "nr_feature": 3
    }, 
    "anytime_estimators": [
        0.56, 
        0.6, 
        0.52
    ], 
    "offline_model_order": [
        "rbf_svm_model", 
        "spark_svm", 
        "bad_rf_model"
    ]
}


## Send some corrections

Next, switch to a [different notebook](send_updates.ipynb) to send more training data to Clipper.

## Stop Clipper

In [None]:
# Tear down the deployment.
# NOTE(review): presumably stops Clipper and every model container started by
# this notebook -- confirm against clipper_manager.
clipper.stop_all()

In [None]:
# Logistic regression with default settings, trained on the same 3-vs-6 data.
lm_model = lm.LogisticRegression().fit(train_x, train_y)
# Score on the held-out test split; shown as the cell's output.
lm_model.score(test_x, test_y)

In [None]:
# Request additional replicas of "bad_rf_model".
# NOTE(review): the positional 1 is presumably the model version -- confirm
# against clipper_manager.
# NOTE(review): this cell sits after clipper.stop_all(); it presumably needs to
# be run while the cluster is still up -- confirm the intended position.
clipper.add_replicas("bad_rf_model", 1, num_replicas=3)