# Model Training

In [1]:
!pip install dask-ml



In [2]:
import scipy as sp
import numpy as np
import pandas as pd
import pickle
import os
import sys
sys.path.append('../../')
import v3io_frames_local as v3f
import dask.dataframe as dd
from dask.distributed import Client, progress

import matplotlib.pyplot as plt; plt.rcdefaults()
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.layouts import column, row, gridplot
from bokeh.models import ColumnDataSource

import dask_ml.model_selection as dcv
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline

import pprint

In [3]:
output_notebook()
pp = pprint.PrettyPrinter(indent=4)
client = v3f.Client('framesd:8081')

In [4]:
dask_client = Client()

  json = yaml.load(f)
Port 8787 is already in use. 
Perhaps you already have a cluster running?
Hosting the diagnostics dashboard on a random port instead.


## Load data

In [5]:
df = dd.from_pandas(client.read(backend='tsdb', query='select * from netops_metrics_jupyter',
            start="now-1d", end='now+1d', multi_index=True).reset_index(), npartitions=4)
df.head(5)

Unnamed: 0,time,data_center,device,company,cpu_utilization,packet_loss_is_error,is_error,throughput_is_error,throughput,cpu_utilization_is_error,latency,latency_is_error,packet_loss
0,2020-01-10 18:30:34.637,John_Curve,7026488238274,Roberts_PLC,88.484602,0.0,0.0,0.0,223.201614,0.0,2.354705,0.0,0.0
1,2020-01-10 18:30:39.637,John_Curve,7026488238274,Roberts_PLC,65.315625,0.0,0.0,0.0,238.413831,0.0,0.638181,0.0,0.0
2,2020-01-10 18:30:44.637,John_Curve,7026488238274,Roberts_PLC,85.228517,0.0,0.0,0.0,272.113396,0.0,0.0,0.0,0.0
3,2020-01-10 18:30:49.637,John_Curve,7026488238274,Roberts_PLC,75.384545,0.0,0.0,0.0,284.499619,0.0,0.0,0.0,0.0
4,2020-01-10 18:30:54.637,John_Curve,7026488238274,Roberts_PLC,63.015509,0.0,0.0,0.0,239.421695,0.0,0.0,0.0,1.415631


In [6]:
df['key'] = df.apply(lambda row: f'{row["company"]}_{row["data_center"]}_{row["device"]}', axis=1, meta=df.compute().dtypes)
df.set_index('key')
df.head(10)
#df.sort_values('timestamp', ascending=True, inplace=True)

Unnamed: 0,time,data_center,device,company,cpu_utilization,packet_loss_is_error,is_error,throughput_is_error,throughput,cpu_utilization_is_error,latency,latency_is_error,packet_loss,key
0,2020-01-10 18:30:34.637,John_Curve,7026488238274,Roberts_PLC,88.484602,0.0,0.0,0.0,223.201614,0.0,2.354705,0.0,0.0,Roberts_PLC_John_Curve_7026488238274
1,2020-01-10 18:30:39.637,John_Curve,7026488238274,Roberts_PLC,65.315625,0.0,0.0,0.0,238.413831,0.0,0.638181,0.0,0.0,Roberts_PLC_John_Curve_7026488238274
2,2020-01-10 18:30:44.637,John_Curve,7026488238274,Roberts_PLC,85.228517,0.0,0.0,0.0,272.113396,0.0,0.0,0.0,0.0,Roberts_PLC_John_Curve_7026488238274
3,2020-01-10 18:30:49.637,John_Curve,7026488238274,Roberts_PLC,75.384545,0.0,0.0,0.0,284.499619,0.0,0.0,0.0,0.0,Roberts_PLC_John_Curve_7026488238274
4,2020-01-10 18:30:54.637,John_Curve,7026488238274,Roberts_PLC,63.015509,0.0,0.0,0.0,239.421695,0.0,0.0,0.0,1.415631,Roberts_PLC_John_Curve_7026488238274
5,2020-01-10 18:30:59.637,John_Curve,7026488238274,Roberts_PLC,78.258995,0.0,0.0,0.0,244.586052,0.0,10.545494,0.0,0.064035,Roberts_PLC_John_Curve_7026488238274
6,2020-01-10 18:31:04.637,John_Curve,7026488238274,Roberts_PLC,62.552714,0.0,0.0,0.0,229.478398,0.0,0.0,0.0,4.073991,Roberts_PLC_John_Curve_7026488238274
7,2020-01-10 18:31:09.637,John_Curve,7026488238274,Roberts_PLC,68.960976,0.0,0.0,0.0,253.682738,0.0,0.0,0.0,0.822157,Roberts_PLC_John_Curve_7026488238274
8,2020-01-10 18:31:14.637,John_Curve,7026488238274,Roberts_PLC,65.545961,0.0,0.0,0.0,241.973549,0.0,3.768121,0.0,0.796025,Roberts_PLC_John_Curve_7026488238274
9,2020-01-10 18:31:19.637,John_Curve,7026488238274,Roberts_PLC,63.707748,0.0,0.0,0.0,223.357996,0.0,0.0,0.0,0.0,Roberts_PLC_John_Curve_7026488238274


In [7]:
df["cpu_1h_mean"] = df.cpu_utilization.rolling(window=12).mean()
df["latency_1h_mean"] = df.latency.rolling(window=12).mean()
df["packet_loss_1h_mean"] = df.packet_loss.rolling(window=12).mean()
df["throughput_1h_mean"] = df.throughput.rolling(window=12).mean()

df.head(10)

Unnamed: 0,time,data_center,device,company,cpu_utilization,packet_loss_is_error,is_error,throughput_is_error,throughput,cpu_utilization_is_error,latency,latency_is_error,packet_loss,key,cpu_1h_mean,latency_1h_mean,packet_loss_1h_mean,throughput_1h_mean
0,2020-01-10 18:30:34.637,John_Curve,7026488238274,Roberts_PLC,88.484602,0.0,0.0,0.0,223.201614,0.0,2.354705,0.0,0.0,Roberts_PLC_John_Curve_7026488238274,,,,
1,2020-01-10 18:30:39.637,John_Curve,7026488238274,Roberts_PLC,65.315625,0.0,0.0,0.0,238.413831,0.0,0.638181,0.0,0.0,Roberts_PLC_John_Curve_7026488238274,,,,
2,2020-01-10 18:30:44.637,John_Curve,7026488238274,Roberts_PLC,85.228517,0.0,0.0,0.0,272.113396,0.0,0.0,0.0,0.0,Roberts_PLC_John_Curve_7026488238274,,,,
3,2020-01-10 18:30:49.637,John_Curve,7026488238274,Roberts_PLC,75.384545,0.0,0.0,0.0,284.499619,0.0,0.0,0.0,0.0,Roberts_PLC_John_Curve_7026488238274,,,,
4,2020-01-10 18:30:54.637,John_Curve,7026488238274,Roberts_PLC,63.015509,0.0,0.0,0.0,239.421695,0.0,0.0,0.0,1.415631,Roberts_PLC_John_Curve_7026488238274,,,,
5,2020-01-10 18:30:59.637,John_Curve,7026488238274,Roberts_PLC,78.258995,0.0,0.0,0.0,244.586052,0.0,10.545494,0.0,0.064035,Roberts_PLC_John_Curve_7026488238274,,,,
6,2020-01-10 18:31:04.637,John_Curve,7026488238274,Roberts_PLC,62.552714,0.0,0.0,0.0,229.478398,0.0,0.0,0.0,4.073991,Roberts_PLC_John_Curve_7026488238274,,,,
7,2020-01-10 18:31:09.637,John_Curve,7026488238274,Roberts_PLC,68.960976,0.0,0.0,0.0,253.682738,0.0,0.0,0.0,0.822157,Roberts_PLC_John_Curve_7026488238274,,,,
8,2020-01-10 18:31:14.637,John_Curve,7026488238274,Roberts_PLC,65.545961,0.0,0.0,0.0,241.973549,0.0,3.768121,0.0,0.796025,Roberts_PLC_John_Curve_7026488238274,,,,
9,2020-01-10 18:31:19.637,John_Curve,7026488238274,Roberts_PLC,63.707748,0.0,0.0,0.0,223.357996,0.0,0.0,0.0,0.0,Roberts_PLC_John_Curve_7026488238274,,,,


In [8]:
# Drop first 'Window' samples due to no featuers
# (Don't want to confuse the ML algorithm)
feature_vectors = df.dropna()[['cpu_utilization', 'latency', 'packet_loss', 'throughput', 'cpu_1h_mean', 'latency_1h_mean', 'packet_loss_1h_mean', 'throughput_1h_mean', 'is_error']]
feature_vectors.head(10)

Unnamed: 0,cpu_utilization,latency,packet_loss,throughput,cpu_1h_mean,latency_1h_mean,packet_loss_1h_mean,throughput_1h_mean,is_error
11,59.534368,0.752885,0.0,254.303641,70.010186,1.700553,0.599096,244.865357,0.0
12,63.926298,0.749091,0.0,254.030072,67.963661,1.566752,0.599096,247.434395,0.0
13,81.087116,13.068888,0.0,233.671279,69.277952,2.602644,0.599096,247.039183,0.0
14,76.480438,3.313845,1.322079,256.707807,68.548945,2.878798,0.709269,245.755383,0.0
15,71.103633,0.0,0.0,202.725881,68.192202,2.878798,0.709269,238.940905,0.0
16,85.370503,4.022155,0.0,255.781661,70.055119,3.213977,0.5913,240.304236,0.0
17,71.25909,0.0,1.895075,263.233415,69.471793,2.335186,0.743887,241.858183,0.0
18,87.812752,0.0,1.736162,252.518276,71.576796,2.335186,0.549068,243.778172,0.0
19,65.727765,0.0,0.0,244.129823,71.307362,2.335186,0.480555,242.982096,0.0
20,63.760104,0.0,1.373622,234.295685,71.158541,2.021176,0.528688,242.342274,0.0


In [9]:
from dask_ml.model_selection import train_test_split

## Training

In [10]:
X = feature_vectors[['cpu_1h_mean', 'latency_1h_mean', 'packet_loss_1h_mean', 'throughput_1h_mean']]
y = feature_vectors['is_error']
X_train, X_test, y_train, y_test = dcv.train_test_split(X, y, train_size=0.7, test_size=0.3)

In [11]:
param_grid = {
    'n_estimators': [5, 10, 20, 30]
}

model = GradientBoostingClassifier()
model = dcv.GridSearchCV(model, param_grid)

print(X_train.compute().shape, y_train.compute().shape)

model.fit(X_train.compute(), y_train.compute())

(4033, 4) (4033,)


GridSearchCV(cache_cv=True, cv=None, error_score='raise',
             estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                                  criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_no_

In [12]:
model.score(X_test, y_test)

0.9918793503480279

In [13]:
model.best_estimator_

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=20,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [14]:
!mkdir models

mkdir: cannot create directory ‘models’: File exists


In [15]:
version = '1.0'
model_filepath = 'models/netops.v{}.model'.format(version)
with open(model_filepath, 'wb+') as f:
    pickle.dump(model.best_estimator_, f)