# Nuclio - Training function

## Environment

In [1]:
# nuclio: ignore
import nuclio

### Configurations

In [2]:
%%nuclio config

# Trigger
spec.triggers.retrain.kind = "cron"
spec.triggers.retrain.attributes.interval = "1h"

# Base image
spec.build.baseImage = "daskdev/dask"

%nuclio: setting spec.triggers.retrain.kind to 'cron'
%nuclio: setting spec.triggers.retrain.attributes.interval to '1h'
%nuclio: setting spec.build.baseImage to 'daskdev/dask'


In [3]:
%nuclio mount /User ~/

mounting volume path /User as ~/


### Commands

In [4]:
%%nuclio cmd -c

# apt-get update && apt-get install -y libaio1
# apt-get install libgomp1 

############
# installs #
############

# Igz DB
pip install v3io_frames

# Utils
pip install 'fsspec>=0.3.3'
pip install pyarrow

# Function
pip install dask-ml
pip install dask-xgboost --upgrade

### Variables

In [5]:
# DB Config
%nuclio env %v3io

## Function

### Imports

In [46]:
# Utils
import os

# DB Connection
import v3io_frames as v3f

# Parallelization
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster

# Function
import dask_xgboost as dxgb
import dask_ml.model_selection as dcv
from sklearn.metrics import roc_auc_score

### Helper functions

In [7]:
def format_df_from_tsdb(context, df):
    df.index.names = ['timestamp', 'company', 'data_center', 'device']
    df = df.reset_index()
    df = dd.from_pandas(df, npartitions=context.shards)
    return df

In [8]:
def get_data_tsdb(context, features_table, train_on_last, dask_shards):
    df = context.v3f.read(backend='tsdb', query=f'select * from {features_table}',
                          start=f'now-{train_on_last}', end='now', multi_index=True)
    df = df.reset_index(drop=True)
    df = df[sorted(df.columns)]
    df = dd.from_pandas(df, npartitions=dask_shards)
    return df

In [9]:
def get_data_parquet(context, features_table, train_on_last, dask_shards):
    # Get parquet files
    mpath = [os.path.join(features_table, file) for file in os.listdir(features_table) if os.path.isdir(os.path.join(features_table, file))]
    
    # Get latest filename
    latest = max(mpath, key=os.path.getmtime)
    context.logger.info(f'Reading data from: {latest}')
    
    # Load parquet to dask
    df = dd.read_parquet(latest, infer_divisions=False)
    
    return df

In [10]:
def get_train_test_sets_from_data(context, 
                                  df, 
                                  metrics, 
                                  labels, 
                                  train_size):
    X = df.loc[:, metrics]
    y = df.loc[:, labels]
    X_train, X_test, y_train, y_test = dcv.train_test_split(X, y, train_size=train_size, test_size=1-train_size)
    return X_train, X_test, y_train, y_test

### Handler

In [42]:
def trainer_dask_xgboost(context, 
            save_to_tsdb=0,
            labels=[],
            metrics=[],
            features_table='/v3io/bigdata/netops_features_parquet',
            model_filepath='/v3io/bigdata/netops/models/netops.model',
            train_on_last='7d',
            train_size=0.7,
            dask_shards=4):
    
    # Setup context   
    if save_to_tsdb:
        # Create V3IO connection
        v3io_client = v3f.Client(address='framesd:8081', 
                                 container='bigdata')
        setattr(context, 'v3f', v3io_client)
        
        # Create features table if neede
        context.v3f.create('tsdb', 
                           features_table, 
                           attrs={'rate': '1/s'}, 
                           if_exists=1)
    
        # Set TSDB reading function
        setattr(context, 'read', get_data_tsdb)
    
    # Save to Parquet
    else:
         # Create saving directory if needed
        filepath = os.path.join(features_table)
        if not os.path.exists(filepath):
            os.makedirs(filepath)
            
        # Set Parquet reading function
        setattr(context, 'read', get_data_parquet)
        
    # Setup Dask
    dask_client = Client(LocalCluster(n_workers=dask_shards))  
    
    # Create save-to folder if needed
    model_dir = os.path.dirname(model_filepath)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    
    # Get data
    df = context.read(context, features_table, train_on_last, dask_shards) 

    # Split to Train / Test datasets
    X_train, X_test, y_train, y_test = get_train_test_sets_from_data(context,
                                                                     df, 
                                                                     metrics, 
                                                                     labels, 
                                                                     train_size)
    
    # Persist to memory to ensure fast computation on training
    X_train = dask_client.persist(X_train)
    X_test = dask_client.persist(X_test)
    y_train = dask_client.persist(y_train)
    y_test = dask_client.persist(y_test)
    
    # Train
    params = {'objective': 'binary:logistic', 'nround': 1000, 
              'max_depth': 3, 'eta': 0.01, 'subsample': 0.5, 
              'min_child_weight': 1}
    model = dxgb.train(dask_client, params, X_train, y_train)
    
    # Score
    predictions = dxgb.predict(dask_client, model, X_test)
    
    score = roc_auc_score(y_test.compute(), predictions.compute())
    context.log_result('accuracy', score)
    
    # Save model
    model.save_model(model_filepath)
    context.log_artifact('model', local_path=model_filepath)

In [12]:
# nuclio: end-code

## Test locally

In [None]:
trainer_dask_xgboost(context, 
            save_to_tsdb=0,
            labels='is_error',
            metrics=['cpu_utilization', 'throughput', 'latency', 'packet_loss'],
            features_table='/User/netops_features_parquet',
            model_filepath='/User/netops/models/netops.model',
            train_on_last='7d',
            train_size=0.7,
            dask_shards=4)

## Deploy to cluster

In [13]:
from mlrun import code_to_function, mount_v3io, mlconf

In [14]:
mlconf.dbpath = 'http://mlrun-api:8080'

In [15]:
trainer = code_to_function(name='trainer',
                           runtime='job',
                           project='netops',
                           handler='trainer_dask_xgboost')
trainer = trainer.apply(mount_v3io())
trainer.deploy()

[mlrun] 2020-01-09 10:38:38,510 starting remote build, image: .mlrun/func-netops-trainer-latest
[36mINFO[0m[0000] Resolved base name daskdev/dask to daskdev/dask 
[36mINFO[0m[0000] Resolved base name daskdev/dask to daskdev/dask 
[36mINFO[0m[0000] Downloading base image daskdev/dask          
[36mINFO[0m[0000] Error while retrieving image from cache: getting file info: stat /cache/sha256:2ac5385ebc20fe2982a22f8fcf3cf765e7a01dc5e5003b42aa44493af0a06438: no such file or directory 
[36mINFO[0m[0000] Downloading base image daskdev/dask          
[36mINFO[0m[0000] Built cross stage deps: map[]                
[36mINFO[0m[0000] Downloading base image daskdev/dask          
[36mINFO[0m[0000] Error while retrieving image from cache: getting file info: stat /cache/sha256:2ac5385ebc20fe2982a22f8fcf3cf765e7a01dc5e5003b42aa44493af0a06438: no such file or directory 
[36mINFO[0m[0000] Downloading base image daskdev/dask          
[36mINFO[0m[0000] Unpacking rootfs as cmd RUN pip 

True

In [44]:
trainer.with_code()

<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f79f580dcf8>

In [45]:
params = {
    'windows': {'minutely': 3, 
                'hourly': 3*60},
    'metrics': ['cpu_utilization', 'throughput', 'latency', 'packet_loss'],
    'labels': ['is_error'],
    'save_to_tsdb': 0,
    'features_table': '/User/netops_features_parquet',
    'model_filepath': '/User/netops/models/model.bst',
    'dask_shards': 4,
}

run = trainer.run(params=params, watch=True, handler='trainer_dask_xgboost')

[mlrun] 2020-01-09 11:37:46,190 starting run trainer_dask_xgboost uid=e720db2eb8724ceda7898bc6def5acc0  -> http://mlrun-api:8080
[11:37:56] Tree method is automatically selected to be 'approx' for distributed training.
[mlrun] 2020-01-09 11:37:55,416 Reading data from: /User/netops_features_parquet/20200102T074701-20200102T084511

[mlrun] 2020-01-09 11:37:57,087 run executed, status=completed
final state: succeeded


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...f5acc0,0,Jan 09 11:37:53,completed,trainer,host=trainer-dask-xgboost-d2sgpkind=jobowner=admin,,"dask_shards=4features_table=/User/netops_features_parquetlabels=['is_error']metrics=['cpu_utilization', 'throughput', 'latency', 'packet_loss']model_filepath=/User/netops/models/model.bstsave_to_tsdb=0windows={'hourly': 180, 'minutely': 3}",accuracy=0.9990536277602523,model


to track results use .show() or .logs() or in CLI: 
!mlrun get run e720db2eb8724ceda7898bc6def5acc0  , !mlrun logs e720db2eb8724ceda7898bc6def5acc0 
[mlrun] 2020-01-09 11:38:05,488 run executed, status=completed
