# Nuclio - Inference function

## Environment

In [2]:
# nuclio: ignore
import nuclio

### Configurations

In [3]:
%%nuclio config

# Cron trigger named "retrain": fires the function on a fixed 1-hour interval
spec.triggers.retrain.kind = "cron"
spec.triggers.retrain.attributes.interval = "1h"

# Base image used for the function build
spec.build.baseImage = "python:3.6-jessie"

%nuclio: setting spec.triggers.retrain.kind to 'cron'
%nuclio: setting spec.triggers.retrain.attributes.interval to '1h'
%nuclio: setting spec.build.baseImage to 'python:3.6-jessie'


### Variables

In [99]:
%%nuclio env

# DB connection settings (read by init_context to build the v3io_frames client)
V3IO_FRAMESD=${V3IO_FRAMESD}
V3IO_USERNAME=${V3IO_USERNAME}
V3IO_ACCESS_KEY=${V3IO_ACCESS_KEY}

# Features table queried by the handler
FEATURES_TABLE=netops_features

# Training window and train/test split fraction
TRAIN_ON_LAST=7d
TRAIN_SIZE=0.7

# Parallelism
NUMBER_OF_SHARDS=4

# Model file name and the directory it is downloaded from at build time
MODEL_FILENAME=netops.v2.model.pickle
MODEL_DIR=/bigdata/netops/models

%nuclio: setting 'V3IO_FRAMESD' environment variable
%nuclio: setting 'V3IO_USERNAME' environment variable
%nuclio: setting 'V3IO_ACCESS_KEY' environment variable
%nuclio: setting 'FEATURES_TABLE' environment variable
%nuclio: setting 'TRAIN_ON_LAST' environment variable
%nuclio: setting 'TRAIN_SIZE' environment variable
%nuclio: setting 'NUMBER_OF_SHARDS' environment variable
%nuclio: setting 'MODEL_FILENAME' environment variable
%nuclio: setting 'MODEL_DIR' environment variable


%nuclio: cannot find "=" in line
%nuclio: cannot find "=" in line
%nuclio: cannot find "=" in line
%nuclio: cannot find "=" in line
%nuclio: cannot find "=" in line


In [95]:
# CURRENT_MODEL_DIR is the directory init_context loads the model file from.
# NOTE(review): -c presumably sets the value only in the deployed function's
# configuration and -l only in the local notebook process — confirm against the
# nuclio-jupyter documentation.
%nuclio env -c CURRENT_MODEL_DIR=/models
%nuclio env -l CURRENT_MODEL_DIR=/v3io/bigdata/netops/models

%nuclio: setting 'CURRENT_MODEL_DIR' environment variable


### Commands

In [96]:
%%nuclio cmd -c

############
# installs #
############

# Data handling utilities
pip install pyarrow
pip install pandas

# Iguazio DB client (imported by the function as v3io_frames)
pip install v3io_frames --upgrade

# Libraries imported by the function code; only scikit-learn is version-pinned
pip install scikit-learn==0.20.1
pip install xgboost --upgrade
pip install dask["complete"] --upgrade
pip install dask-ml["complete"] --upgrade

# wget is needed by the build command that downloads the model file
apt-get update && apt-get install -y wget
# Directory the model file is downloaded into
mkdir -p ${CURRENT_MODEL_DIR}

In [100]:
%%nuclio cmd -c 
# Download the model file over HTTP into ${CURRENT_MODEL_DIR} so it ships with the function.
# The request authenticates with the x-v3io-session-key header built from ${V3IO_ACCESS_KEY}.
# NOTE(review): V3IO_WEBAPI_SERVICE_HOST is not set anywhere in this notebook — presumably
# it comes from the surrounding platform environment; confirm it is defined where this runs.
wget -O ${CURRENT_MODEL_DIR}/${MODEL_FILENAME} --header "x-v3io-session-key: ${V3IO_ACCESS_KEY}" http://${V3IO_WEBAPI_SERVICE_HOST}:8081${MODEL_DIR}/${MODEL_FILENAME}

## Function

### Imports

In [79]:
import os
import pickle

import dask.dataframe as dd
import pandas as pd
import sklearn
import v3io_frames as v3f
from dask.distributed import Client
from dask_ml import model_selection as dcv
from dask_ml import xgboost as xgb
from dask_ml.wrappers import ParallelPostFit

### Helper functions

In [80]:
def get_data(context):
    """Read the recent feature rows and return them as a partitioned Dask frame.

    Queries every column of ``context.features_table`` through ``context.v3f``
    for the window ``now-<context.train_on_last>`` .. ``now``, discards the
    index that comes back from the read, and hands the flat frame to
    ``dd.from_pandas`` with ``context.shards`` partitions.

    Args:
        context: object carrying ``v3f``, ``features_table``, ``train_on_last``
            and ``shards`` attributes.

    Returns:
        Whatever ``dd.from_pandas`` produces for the flattened frame.
    """
    read = context.v3f.read
    select_all = f'select * from {context.features_table}'
    window_start = f'now-{context.train_on_last}'

    raw_features = read(
        backend='tsdb',
        query=select_all,
        start=window_start,
        end='now',
        multi_index=True,
    )

    # The index values are dropped, not turned into columns.
    flat_features = raw_features.reset_index(drop=True)
    return dd.from_pandas(flat_features, npartitions=context.shards)

In [81]:
def get_train_test_sets_from_data(context, df: pd.DataFrame):
    """Split a feature frame into train/test feature matrices and label vectors.

    Every column whose name contains ``'is_error'`` is removed from the feature
    matrix; the ``'is_error'`` column itself is used as the label. The split is
    delegated to ``dcv.train_test_split`` with ``context.train_size`` as the
    training fraction and ``1 - context.train_size`` as the test fraction.

    Args:
        context: object carrying a numeric ``train_size`` attribute.
        df: frame holding the feature columns and an ``'is_error'`` column.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    label_related = []
    for column_name in df.columns:
        if 'is_error' in column_name:
            label_related.append(column_name)

    features = df.drop(label_related, axis=1)
    labels = df.loc[:, 'is_error']

    split = dcv.train_test_split(
        features,
        labels,
        train_size=context.train_size,
        test_size=1 - context.train_size,
    )
    # Unpack explicitly so a split of the wrong arity still fails here.
    features_train, features_test, labels_train, labels_test = split
    return features_train, features_test, labels_train, labels_test

### Init context

In [82]:
def init_context(context):
    """Attach the clients, settings and model that the function reads from ``context``.

    Attributes set on ``context``:
        v3f: v3io_frames client for the ``bigdata`` container.
        dask: Dask distributed client created with default arguments.
        features_table: table name taken from ``FEATURES_TABLE``.
        train_on_last: look-back window string taken from ``TRAIN_ON_LAST``.
        train_size: training fraction (float) taken from ``TRAIN_SIZE``.
        shards: partition count (int) taken from ``NUMBER_OF_SHARDS``.
        model: the unpickled estimator wrapped in ``ParallelPostFit``.

    Raises:
        KeyError: if ``V3IO_ACCESS_KEY`` or ``V3IO_USERNAME`` is not set.
        ValueError: if ``TRAIN_SIZE`` or ``NUMBER_OF_SHARDS`` is not numeric.
        OSError: if the model file cannot be opened for reading.
    """
    # Create our DB client
    frames_address = 'http://' + os.getenv('V3IO_FRAMESD', 'framesd:8081')
    context.v3f = v3f.Client(address=frames_address,
                             container='bigdata',
                             password=os.environ['V3IO_ACCESS_KEY'],
                             user=os.environ['V3IO_USERNAME'])

    # Create Dask client
    context.dask = Client()

    # Name of the features table that is queried
    context.features_table = os.getenv('FEATURES_TABLE', 'netops_features')

    # Settings read by get_data / get_train_test_sets_from_data. Environment
    # values are strings, so the numeric ones are converted here once.
    context.train_on_last = os.getenv('TRAIN_ON_LAST', '7d')
    context.train_size = float(os.getenv('TRAIN_SIZE', '0.7'))
    context.shards = int(os.getenv('NUMBER_OF_SHARDS', '4'))

    # Load the model
    model_path = os.path.join(os.getenv('CURRENT_MODEL_DIR', '/models'),
                              os.getenv('MODEL_FILENAME', 'netops.v1.model'))
    # Read-only mode: the file is never written, and 'rb+' fails on a
    # read-only file.
    with open(model_path, 'rb') as model_file:
        # NOTE(security): pickle.load executes code embedded in the file; the
        # model file must come from a trusted location.
        model = pickle.load(model_file)
    context.model = ParallelPostFit(estimator=model, scoring='accuracy')

### Handler

In [87]:
def handler(context, event):
    """Predict on the last hour of feature rows and store the predictions.

    Reads every column of ``context.features_table`` for ``now-1h`` .. ``now``
    through ``context.v3f``, adds a ``prediction`` column computed by
    ``context.model.predict`` and writes the result to the
    ``netops_predictions`` table.

    Args:
        context: object carrying ``v3f``, ``features_table`` and ``model``.
        event: triggering event; not read by this handler.

    Returns:
        None. When the window holds no rows nothing is predicted or written.
    """
    # Load last hour data
    df = context.v3f.read(backend='tsdb', query=f'select * from {context.features_table}',
                          start="now-1h", end='now', multi_index=True)

    # A window without rows has nothing to predict or store.
    if df.empty:
        return

    # Predict
    # NOTE(review): the whole frame is passed to predict, while
    # get_train_test_sets_from_data drops every 'is_error' column before
    # training — confirm the model expects the column set read here.
    df['prediction'] = context.model.predict(df)

    # Save: the index levels become regular columns so they can be named as index_cols
    df = df.reset_index()
    context.v3f.write(backend='tsdb', table='netops_predictions', dfs=[df],
                      index_cols=['company', 'data_center', 'device', 'time'])



## Test

In [85]:
# nuclio: ignore
# Populate the notebook's context (DB client, Dask client, settings, model)
# so the handler can be invoked locally.
init_context(context)

In [86]:
# nuclio: ignore
# Local smoke test: invoke the handler once with an empty-bodied event.
# init_context(context) must already have been run on this context (previous cell).
event = nuclio.Event(body='')
output = handler(context, event)
# Display whatever the handler returned.
output

UnboundLocalError: local variable 'result' referenced before assignment

## Deployment

In [108]:
# Deploy this notebook as function "predict" in project "netops".
# NOTE(review): -c presumably creates the project when it does not exist — confirm
# against the nuclio-jupyter documentation.
%nuclio deploy -p netops -n predict -c

%nuclio: ['deploy', '-p', 'netops', '-n', 'predict', '-c', '/User/netops/tutorials/demos/netops/Nuclio-Inference.ipynb']
%nuclio: [nuclio.deploy] 2019-05-18 21:03:09,773 (info) Building processor image
%nuclio: [nuclio.deploy] 2019-05-18 21:03:18,207 (info) Pushing image
%nuclio: [nuclio.deploy] 2019-05-18 21:03:18,207 (info) Build complete
%nuclio: [nuclio.deploy] 2019-05-18 21:03:48,563 (warn) Create function failed failed, setting function status
%nuclio: [nuclio.deploy] 2019-05-18 21:03:48,563
%nuclio: Error - NuclioFunction in error state (
%nuclio: Error - context deadline exceeded
%nuclio:     .../platform/kube/controller/nucliofunction.go:122
%nuclio: 
%nuclio: Call stack:
%nuclio: Failed to wait for function resources to be available
%nuclio:     .../platform/kube/controller/nucliofunction.go:122
%nuclio: )
%nuclio:     .../nuclio/nuclio/pkg/platform/kube/deployer.go:185
%nuclio: 
%nuclio: Call stack:
%nuclio: NuclioFunction in error state (
%nuclio: Error - context deadline e

%nuclio: cannot deploy
error: cannot deploy
