# Nuclio - Data preparation function

## Environment

In [None]:
# nuclio: ignore
import nuclio

### Configurations

In [None]:
%%nuclio config

# Trigger
spec.triggers.retrain.kind = "cron"
spec.triggers.retrain.attributes.interval = "1h"

# Base image
spec.build.baseImage = "python:3.6-jessie"

### Commands

In [None]:
%%nuclio cmd -c

############
# installs #
############

# Utils
pip install pyyaml
pip install pyarrow
pip install pandas

# Igz DB
pip install v3io_frames --upgrade

# Function
pip install dask["complete"]
pip install dask-ml
pip install scikit-learn

### Variables

In [None]:
%%nuclio env

# DB Config
V3IO_FRAMESD=${V3IO_FRAMESD}
V3IO_USERNAME=${V3IO_USERNAME}
V3IO_ACCESS_KEY=${V3IO_ACCESS_KEY}

# Metrics
METRICS_TABLE=netops_metrics

# Features
FEATURES_TABLE=netops_features

# Parallelism
NUMBER_OF_SHARDS=4

## Function

### Imports

In [None]:
# Utils
import os
import time
import yaml
import pandas as pd
import datetime
import itertools

# DB Connection
import v3io_frames as v3f

# Parallelization
import dask.dataframe as dd
from dask.distributed import Client

# Function
import dask_ml.model_selection as dcv
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline

### Helper functions

In [None]:
def normalize_timestamp(df):
    """Return ``df`` as received.

    Placeholder hook for timestamp normalization. The formatting step
    (``df['timestamp'].dt.strftime('%Y%m%d %H:%M')``) is currently disabled,
    so the input object is handed back untouched.
    """
    normalized = df
    return normalized

In [None]:
def format_df_from_tsdb(df, npartitions=None):
    """Flatten a multi-indexed TSDB frame and convert it to a Dask dataframe.

    The index levels are named ``timestamp``, ``company``, ``data_center`` and
    ``device``, moved into regular columns, passed through
    ``normalize_timestamp`` and finally split into Dask partitions.

    Note: the index names are assigned on ``df`` itself, so the caller's frame
    is modified in place before it is reset.

    Args:
        df: pandas DataFrame whose index has four levels.
        npartitions: number of Dask partitions. When omitted it is read from
            the ``NUMBER_OF_SHARDS`` environment variable (default 4), the
            same source ``init_context`` uses for ``context.shards``.

    Returns:
        A Dask dataframe built from the flattened frame.
    """
    # The previous implementation read ``context.shards`` from a global name
    # that is not defined in this module; resolve the value locally instead.
    if npartitions is None:
        npartitions = int(os.getenv('NUMBER_OF_SHARDS', 4))

    df.index.names = ['timestamp', 'company', 'data_center', 'device']
    df = df.reset_index()
    df = normalize_timestamp(df)
    return dd.from_pandas(df, npartitions=npartitions)

In [None]:
def get_data(context):
    """Read the last two hours of metrics from TSDB as a Dask dataframe.

    Queries ``context.metrics_table`` through the frames client stored on
    ``context.v3f`` and hands the result to ``format_df_from_tsdb``.
    """
    metrics_query = (
        'select cpu_utilization, latency, packet_loss, throughput, is_error '
        f'from {context.metrics_table}'
    )
    # NOTE(review): the keyword is spelled ``steps`` here; confirm against the
    # v3io_frames read API that this is the intended argument name.
    raw_metrics = context.v3f.read(
        backend='tsdb',
        query=metrics_query,
        start='now-2h',
        end='now',
        steps='1m',
        multi_index=True,
    )
    return format_df_from_tsdb(raw_metrics)

In [None]:
def create_rolling_featuers(context, df, window_size: int):
    """Build rolling-window features from the metrics dataframe.

    A ``key`` column (``<company>_<data_center>_<device>``) is added, the four
    metric columns are replaced by their rolling mean and ``is_error`` by its
    rolling max, then incomplete and duplicate rows are dropped.

    Args:
        context: function context (not used by this helper).
        df: Dask dataframe with ``company``, ``data_center``, ``device``,
            ``cpu_utilization``, ``latency``, ``packet_loss``, ``throughput``
            and ``is_error`` columns.
        window_size: number of rows in each rolling window.

    Returns:
        A Dask dataframe with the rolled columns and the ``key`` column.
    """
    features = df.copy()

    # Describe the result of the row-wise apply explicitly instead of
    # materializing the whole frame just to hand Dask a ``meta``.
    features['key'] = features.apply(
        lambda row: f'{row["company"]}_{row["data_center"]}_{row["device"]}',
        axis=1,
        meta=('key', 'object'),
    )

    # NOTE(review): a ``features.set_index('key')`` call whose result was
    # discarded used to live here. It had no effect on ``features`` and was
    # removed; as before, the windows below roll over rows in frame order
    # and are not grouped per key - confirm that this is intended.
    for column in ('cpu_utilization', 'latency', 'packet_loss', 'throughput'):
        features[column] = features[column].rolling(window=window_size).mean()
    features['is_error'] = features['is_error'].rolling(window=window_size).max()

    # Rows without a full window contain NaN and are dropped.
    features = features.dropna()
    features = features.drop_duplicates()

    return features

In [None]:
def save_to_tsdb(context, features: pd.DataFrame):
    """Write ``features`` to ``context.features_table`` via the frames client."""
    frames_client = context.v3f
    target_table = context.features_table
    frames_client.write('tsdb', target_table, features)

### Init context

In [None]:
def init_context(context):
    """Attach clients and configuration to the function context.

    Sets the following attributes on ``context``:

    * ``v3f`` - frames client for the ``bigdata`` container
    * ``dask`` - Dask distributed client
    * ``train_on_last`` / ``train_size`` - training window and split ratio
    * ``metrics_table`` / ``features_table`` - TSDB table names
    * ``shards`` - number of Dask partitions

    The features TSDB table is created as part of initialization.
    ``V3IO_ACCESS_KEY`` and ``V3IO_USERNAME`` must be set in the environment;
    every other setting has a default.
    """
    # DB client
    frames_address = 'http://' + os.getenv('V3IO_FRAMESD', 'framesd:8081')
    context.v3f = v3f.Client(
        address=frames_address,
        container='bigdata',
        password=os.environ['V3IO_ACCESS_KEY'],
        user=os.environ['V3IO_USERNAME'],
    )

    # Dask client
    context.dask = Client()

    # Time span to train on
    context.train_on_last = os.getenv('TRAIN_ON_LAST', '7d')

    # Training set size
    context.train_size = float(os.getenv('TRAIN_SIZE', 0.7))

    # Netops metrics table
    context.metrics_table = os.getenv('METRICS_TABLE', 'netops_metrics')

    # Netops features table
    context.features_table = os.getenv('FEATURES_TABLE', 'netops_features')
    context.v3f.create('tsdb', context.features_table, attrs={'rate': '1/s'}, if_exists=1)

    # Dask shards / CV
    context.shards = int(os.getenv('NUMBER_OF_SHARDS', 4))

### Handler

In [None]:
def handler(context, event):
    """Build the feature vector from the latest metrics and save it to TSDB.

    Steps: read the raw metrics, derive two sets of rolling features (window
    of 3 rows and window of 3*60 rows), join both onto the raw data by
    timestamp/company/data_center/device, drop the helper ``key`` columns and
    write the result to the features table.

    Args:
        context: function context prepared by ``init_context``.
        event: triggering event (not used).

    Returns:
        None. Progress is reported through ``print``.
    """
    join_keys = ['timestamp', 'company', 'data_center', 'device']

    # Get data
    raw = get_data(context)
    print('got raw data')

    # Get minute features
    minute = create_rolling_featuers(context, raw, 3)
    print('created minute data')

    # Get hour features
    hour = create_rolling_featuers(context, raw, 3*60)
    column_names = {'cpu_utilization': 'cpu_utilization_hourly',
                    'latency': 'latency_hourly',
                    'packet_loss': 'packet_loss_hourly',
                    'throughput': 'throughput_hourly'}
    hour = hour.rename(columns=column_names)
    print('created hour data')

    # Create feature vector from data sources.
    # The merges are lazy; the intermediate result is no longer computed and
    # thrown away, so the whole graph is evaluated once by the final compute.
    features_rm = raw.merge(minute, on=join_keys, suffixes=('_raw', '_minute'))
    print('merged raw')
    features = features_rm.merge(hour, on=join_keys, suffixes=('_raw', '_hourly'))
    features = features.compute()
    print('merged hour')

    # Drop key columns
    features = features.reset_index(drop=True)
    key_cols = [col for col in features.columns if 'key' in col]
    features = features.drop(key_cols, axis=1)
    print('dropped columns')

    # Fix indexes before saving
    features = features.set_index(join_keys)
    print('set indexes')

    # Save to TSDB
    save_to_tsdb(context, features)
    print('saved to TSDB')

## Test

In [None]:
# nuclio: ignore
# Local test only: populate the context with the clients and settings used by
# the handler. `context` is not defined in the visible cells - presumably it
# is provided by the notebook environment; confirm before running.
init_context(context)

In [None]:
# nuclio: ignore
# Local test only: invoke the handler once with an empty event body.
# Run the init_context cell first (the call below is left disabled).
# init_context(context)
event = nuclio.Event(body='')
output = handler(context, event)
# handler has no return statement, so `output` is None; progress is printed.
output

## Deployment

In [48]:
!rm -r /v3io/bigdata/netops_metrics_parquet