# Nuclio - Inference function

## Environment

In [1]:
# nuclio: ignore
import nuclio

### Configurations

In [2]:
%%nuclio config

# Trigger
spec.triggers.retrain.kind = "cron"
spec.triggers.retrain.attributes.interval = "1h"

# Base image
spec.build.baseImage = "python:3.6-jessie"

%nuclio: setting spec.triggers.retrain.kind to 'cron'
%nuclio: setting spec.triggers.retrain.attributes.interval to '1h'
%nuclio: setting spec.build.baseImage to 'python:3.6-jessie'


### Variables

In [128]:
%%nuclio env

# Work from TSDB or Parquet?
FROM_TSDB=1

# DB Config
V3IO_FRAMESD=${V3IO_FRAMESD}
V3IO_USERNAME=${V3IO_USERNAME}
V3IO_ACCESS_KEY=${V3IO_ACCESS_KEY}

# Features
FEATURES_TABLE=netops_features
# Parquet alternative for FEATURES_TABLE: /v3io/bigdata/netops_features_parquet

# Predictions
PREDICTIONS_TABLE=netops_predictions
# Parquet alternative for PREDICTIONS_TABLE: /v3io/bigdata/netops_predictions_parquet

# Training
TRAIN_ON_LAST=1d
TRAIN_SIZE=0.7

# Parallelism
NUMBER_OF_SHARDS=4

# Model
# Alternative MODEL_FILENAME value: netops.model.pickle
MODEL_FILENAME=netops.v3.model.pickle
SOURCE_MODEL_DIR=/bigdata/netops/models
FIXED_WEB_DIR=/models

%nuclio: setting 'FROM_TSDB' environment variable
%nuclio: setting 'V3IO_FRAMESD' environment variable
%nuclio: setting 'V3IO_USERNAME' environment variable
%nuclio: setting 'V3IO_ACCESS_KEY' environment variable
%nuclio: setting 'FEATURES_TABLE' environment variable
%nuclio: setting '# FEATURES_TABLE' environment variable
%nuclio: setting 'PREDICTIONS_TABLE' environment variable
%nuclio: setting '# PREDICTIONS_TABLE' environment variable
%nuclio: setting 'TRAIN_ON_LAST' environment variable
%nuclio: setting 'TRAIN_SIZE' environment variable
%nuclio: setting 'NUMBER_OF_SHARDS' environment variable
%nuclio: setting '# MODEL_FILENAME' environment variable
%nuclio: setting 'MODEL_FILENAME' environment variable
%nuclio: setting 'SOURCE_MODEL_DIR' environment variable
%nuclio: setting 'FIXED_WEB_DIR' environment variable


%nuclio: cannot find "=" in line
%nuclio: cannot find "=" in line
%nuclio: cannot find "=" in line
%nuclio: cannot find "=" in line
%nuclio: cannot find "=" in line
%nuclio: cannot find "=" in line
%nuclio: cannot find "=" in line


In [129]:
%nuclio env -c CURRENT_MODEL_DIR=/models
%nuclio env -l CURRENT_MODEL_DIR=/v3io/bigdata/netops/models

%nuclio: setting 'CURRENT_MODEL_DIR' environment variable


### Commands

In [130]:
%%nuclio cmd -c

############
# installs #
############

# Utils
pip install pyarrow
pip install pandas

# Igz DB
pip install v3io_frames --upgrade

# Function
pip install xgboost
pip install scikit-learn==0.20.1

apt-get update && apt-get install -y wget
mkdir -p ${FIXED_WEB_DIR}

In [131]:
%%nuclio cmd -c 
# Copy the model file into the function
wget -O ${FIXED_WEB_DIR}/${MODEL_FILENAME} --header "x-v3io-session-key: ${V3IO_ACCESS_KEY}" http://${V3IO_WEBAPI_SERVICE_HOST}:8081${SOURCE_MODEL_DIR}/${MODEL_FILENAME}

## Function

### Imports

In [132]:
import os
import pickle

import v3io_frames as v3f

import pandas as pd
import xgboost as xgb

### Helper functions

In [165]:
def format_df_from_tsdb(context, df):
    """Name the four index levels of ``df`` and return the same frame.

    The names are assigned in place on ``df.index``; no copy is made.
    ``context`` is not used.
    """
    index_level_names = ['timestamp', 'company', 'data_center', 'device']
    df.index.names = index_level_names
    return df

In [166]:
def set_indexes(df):
    """Return a frame indexed by the timestamp, company, data_center and device columns.

    The input frame is left untouched; ``DataFrame.set_index`` produces a new frame.
    """
    index_columns = ['timestamp', 'company', 'data_center', 'device']
    return df.set_index(index_columns)

In [167]:
def get_data_tsdb(context):
    """Read feature rows from TSDB through ``context.v3f``.

    Queries every column of ``context.features_table`` for the window
    ``now-<context.train_on_last>`` to ``now``, names the index levels with
    ``format_df_from_tsdb``, and returns the frame restricted to the columns
    whose name does not contain ``is_error``, in sorted column order.
    """
    query = f'select * from {context.features_table}'
    window_start = f'now-{context.train_on_last}'
    raw = context.v3f.read(backend='tsdb', query=query,
                           start=window_start, end='now', multi_index=True)
    features = format_df_from_tsdb(context, raw)

    # Drop every column whose name contains 'is_error'; sort the rest by name
    selected = sorted(col for col in features.columns if 'is_error' not in col)
    return features[selected]

In [168]:
def get_data_parquet(context):
    """Read the most recently modified file in the features directory.

    Lists ``context.features_table``, picks the entry with the newest
    modification time, loads it with ``pandas.read_parquet`` and returns the
    frame restricted to the columns whose name does not contain ``is_error``,
    in sorted column order.

    Raises:
        ValueError: if the features directory contains no entries.
    """
    features_dir = context.features_table

    # Candidate files: every entry of the features directory
    candidates = [os.path.join(features_dir, name) for name in os.listdir(features_dir)]
    if not candidates:
        # max() on an empty list would raise an uninformative ValueError
        raise ValueError(f'No feature files found in: {features_dir}')

    # Latest file by modification time
    latest = max(candidates, key=os.path.getmtime)
    context.logger.debug(f'Reading data from: {latest}')

    # Load the parquet file into a pandas DataFrame
    df = pd.read_parquet(latest)

    # Drop every column whose name contains 'is_error'; sort the rest by name
    keep_columns = sorted(col for col in df.columns if 'is_error' not in col)
    return df[keep_columns]

In [169]:
def save_to_tsdb(context, df: pd.DataFrame):
    """Write ``df`` to the TSDB predictions table.

    The frame is first re-indexed with ``set_indexes`` and then written
    through ``context.v3f`` to ``context.predictions_table``.
    """
    indexed = set_indexes(df)
    context.v3f.write('tsdb', context.predictions_table, indexed)

In [170]:
def save_to_parquet(context, df: pd.DataFrame):
    """Write ``df`` as one Parquet file under ``context.predictions_table``.

    The file is named ``<first>-<last>.parquet``, where the two parts are the
    timestamps of the first and last rows formatted as ``%Y%m%dT%H%M%S``.
    A frame with no rows is skipped: nothing is written and the function
    returns immediately.
    """
    # Nothing to name the file after when there are no rows
    if len(df) == 0:
        print('No predictions to save to Parquet')
        return

    print('Saving predictions to Parquet')

    # Need to fix timestamps from ns to ms if we write to parquet
    df = df.reset_index()
    df['timestamp'] = df.loc[:, 'timestamp'].astype('datetime64[ms]')

    # Fix indexes
    df = set_indexes(df)

    # Build the file name from the first and last row timestamps
    timestamp_format = '%Y%m%dT%H%M%S'
    first_timestamp = df.index[0][0].strftime(timestamp_format)
    last_timestamp = df.index[-1][0].strftime(timestamp_format)
    filename = first_timestamp + '-' + last_timestamp + '.parquet'
    filepath = os.path.join(context.predictions_table, filename)

    # Save parquet
    with open(filepath, 'wb+') as f:
        df.to_parquet(f)

### Init context

In [171]:
def init_context(context):
    """Populate ``context`` with configuration, I/O callables and the model.

    Reads its settings from environment variables and sets these attributes:

    * ``features_table`` / ``predictions_table``: from ``FEATURES_TABLE`` and
      ``PREDICTIONS_TABLE``.
    * ``read`` / ``write``: the TSDB pair when ``FROM_TSDB`` equals 1 (the
      default), otherwise the Parquet pair.
    * ``v3f`` and ``train_on_last``: set only in TSDB mode.
    * ``model``: unpickled from ``CURRENT_MODEL_DIR``/``MODEL_FILENAME``.

    Raises:
        KeyError: in TSDB mode, if ``V3IO_ACCESS_KEY`` or ``V3IO_USERNAME``
            is not set.
    """
    # Feature source and prediction destination
    context.features_table = os.getenv('FEATURES_TABLE', 'netops_features')
    context.predictions_table = os.getenv('PREDICTIONS_TABLE', 'netops_predictions')

    # FROM_TSDB == 1 selects TSDB; any other integer selects Parquet
    is_from_tsdb = (int(os.getenv('FROM_TSDB', 1)) == 1)

    if is_from_tsdb:
        # Create our DB client
        context.v3f = v3f.Client(address='http://' + os.getenv('V3IO_FRAMESD', 'framesd:8081'),
                                 container='bigdata',
                                 password=os.environ['V3IO_ACCESS_KEY'],
                                 user=os.environ['V3IO_USERNAME'])

        # Create predictions table if needed
        context.v3f.create('tsdb', context.predictions_table, attrs={'rate': '1/s'}, if_exists=1)

        # Length of the read window, used by the TSDB reader
        context.train_on_last = os.getenv('TRAIN_ON_LAST', '1h')

        # TSDB reading and saving functions
        context.read = get_data_tsdb
        context.write = save_to_tsdb
    else:
        # Create saving directory if needed; exist_ok avoids a check-then-create race
        os.makedirs(context.predictions_table, exist_ok=True)

        # Parquet reading and saving functions
        context.read = get_data_parquet
        context.write = save_to_parquet

    # Load the model
    # NOTE(review): pickle.load executes arbitrary code from the file; the
    # model directory must only ever contain trusted files.
    model_path = os.path.join(os.getenv('CURRENT_MODEL_DIR', '/models'),
                              os.getenv('MODEL_FILENAME', 'netops.v1.model'))
    with open(model_path, 'rb') as f:
        context.model = pickle.load(f)

### Handler

In [172]:
def handler(context, event):
    """Score feature rows with the loaded model and store the predictions.

    Reads a frame through ``context.read``, adds a ``prediction`` column from
    ``context.model.predict``, turns the index levels into columns and passes
    the result to ``context.write``. ``event`` is not used and nothing is
    returned.
    """
    # Load the feature rows; the configured reader decides the source and window
    features = context.read(context)

    # Limit for testing.
    # NOTE(review): only the first two rows are ever scored - confirm whether
    # this cap should stay in the deployed function.
    # The copy keeps the new column from being assigned onto a slice of the
    # reader's frame (which triggers pandas' SettingWithCopyWarning).
    test_row_limit = 2
    df = features.head(test_row_limit).copy()

    # Predict
    df['prediction'] = context.model.predict(df.values)

    print(df.head(1))

    # Prepare to save predictions: index levels become columns. Unnamed levels
    # come out as level_N and are renamed to the column names that
    # set_indexes() expects ('timestamp', not 'time').
    df = df.reset_index()
    df = df.rename({'level_0': 'timestamp',
                    'level_1': 'company',
                    'level_2': 'data_center',
                    'level_3': 'device'}, axis=1)

    # Save
    context.write(context, df)

## Test

In [173]:
# nuclio: ignore
init_context(context)

In [174]:
# nuclio: ignore
event = nuclio.Event(body='')
output = handler(context, event)
output

                                                                 cpu_utilization_hourly  \
timestamp           company         data_center   device                                  
2019-05-19 08:56:33 Vargas_and_Sons Collier_Ranch 0313798941486               75.131431   

                                                                 cpu_utilization_minute  \
timestamp           company         data_center   device                                  
2019-05-19 08:56:33 Vargas_and_Sons Collier_Ranch 0313798941486                74.46279   

                                                                 cpu_utilization_raw  \
timestamp           company         data_center   device                               
2019-05-19 08:56:33 Vargas_and_Sons Collier_Ranch 0313798941486            71.817727   

                                                                 latency_hourly  \
timestamp           company         data_center   device                          
2019-05-19 08:56:33 V

## Deployment

In [175]:
%nuclio deploy -p netops -n predict -c

%nuclio: ['deploy', '-p', 'netops', '-n', 'predict', '-c', '/User/netops/tutorials/demos/netops/Nuclio-Inference.ipynb']
%nuclio: [nuclio.deploy] 2019-05-20 08:57:42,217 (info) Building processor image
%nuclio: [nuclio.deploy] 2019-05-20 08:57:48,294 (info) Pushing image
%nuclio: [nuclio.deploy] 2019-05-20 08:57:48,294 (info) Build complete
%nuclio: [nuclio.deploy] 2019-05-20 08:57:52,436 (info) Function deploy complete
%nuclio: [nuclio.deploy] 2019-05-20 08:57:52,445 done updating predict, function address: 3.122.56.83:32642
%nuclio: function deployed
