# Model Training

In [None]:
!pip install dask-ml

In [1]:
import scipy as sp
import numpy as np
import pandas as pd
import pickle
import os
import v3io_frames as v3f
import dask.dataframe as dd
from dask.distributed import Client, progress

import matplotlib.pyplot as plt; plt.rcdefaults()
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.layouts import column, row, gridplot
from bokeh.models import ColumnDataSource

import dask_ml.model_selection as dcv
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline

import pprint

In [2]:
output_notebook()
pp = pprint.PrettyPrinter(indent=4)
client = v3f.Client('framesd:8081')

In [3]:
dask_client = Client()

Port 8787 is already in use. 
Perhaps you already have a cluster running?
Hosting the diagnostics dashboard on a random port instead.


## Load data

In [4]:
df = dd.from_pandas(client.read(backend='tsdb', query='select * from netops_metrics_jupyter',
            start="now-1d", end='now+1d', multi_index=True).reset_index(), npartitions=4)
df.head(5)

Unnamed: 0,time,company,data_center,device,latency,throughput,latency_is_error,packet_loss,is_error,cpu_utilization,packet_loss_is_error,throughput_is_error,cpu_utilization_is_error
0,2019-04-24 16:29:53.996,Henderson-Lopez,Spencer_Greens,9909774422784,0.0,270.349839,0.0,1.88073,0.0,69.491442,0.0,0.0,0.0
1,2019-04-24 16:29:59.996,Henderson-Lopez,Spencer_Greens,9909774422784,0.0,200.724432,0.0,0.187542,0.0,82.425018,0.0,0.0,0.0
2,2019-04-24 16:30:05.996,Henderson-Lopez,Spencer_Greens,9909774422784,1.029673,265.47445,0.0,0.0,0.0,60.817191,0.0,0.0,0.0
3,2019-04-24 16:30:11.996,Henderson-Lopez,Spencer_Greens,9909774422784,0.0,268.298557,0.0,0.035432,0.0,66.563655,0.0,0.0,0.0
4,2019-04-24 16:30:17.996,Henderson-Lopez,Spencer_Greens,9909774422784,2.279885,261.555663,0.0,2.438059,0.0,77.089529,0.0,0.0,0.0


In [5]:
df['key'] = df.apply(lambda row: f'{row["company"]}_{row["data_center"]}_{row["device"]}', axis=1, meta=df.compute().dtypes)
df.set_index('key')
df.head(10)
#df.sort_values('timestamp', ascending=True, inplace=True)

Unnamed: 0,time,company,data_center,device,latency,throughput,latency_is_error,packet_loss,is_error,cpu_utilization,packet_loss_is_error,throughput_is_error,cpu_utilization_is_error,key
0,2019-04-24 16:29:53.996,Henderson-Lopez,Spencer_Greens,9909774422784,0.0,270.349839,0.0,1.88073,0.0,69.491442,0.0,0.0,0.0,Henderson-Lopez_Spencer_Greens_9909774422784
1,2019-04-24 16:29:59.996,Henderson-Lopez,Spencer_Greens,9909774422784,0.0,200.724432,0.0,0.187542,0.0,82.425018,0.0,0.0,0.0,Henderson-Lopez_Spencer_Greens_9909774422784
2,2019-04-24 16:30:05.996,Henderson-Lopez,Spencer_Greens,9909774422784,1.029673,265.47445,0.0,0.0,0.0,60.817191,0.0,0.0,0.0,Henderson-Lopez_Spencer_Greens_9909774422784
3,2019-04-24 16:30:11.996,Henderson-Lopez,Spencer_Greens,9909774422784,0.0,268.298557,0.0,0.035432,0.0,66.563655,0.0,0.0,0.0,Henderson-Lopez_Spencer_Greens_9909774422784
4,2019-04-24 16:30:17.996,Henderson-Lopez,Spencer_Greens,9909774422784,2.279885,261.555663,0.0,2.438059,0.0,77.089529,0.0,0.0,0.0,Henderson-Lopez_Spencer_Greens_9909774422784
5,2019-04-24 16:30:23.996,Henderson-Lopez,Spencer_Greens,9909774422784,0.0,300.0,0.0,1.733723,0.0,48.045429,0.0,0.0,0.0,Henderson-Lopez_Spencer_Greens_9909774422784
6,2019-04-24 16:30:29.996,Henderson-Lopez,Spencer_Greens,9909774422784,0.0,257.501858,0.0,0.0,0.0,63.269681,0.0,0.0,0.0,Henderson-Lopez_Spencer_Greens_9909774422784
7,2019-04-24 16:30:35.996,Henderson-Lopez,Spencer_Greens,9909774422784,6.219771,251.818551,0.0,0.600471,0.0,74.93198,0.0,0.0,0.0,Henderson-Lopez_Spencer_Greens_9909774422784
8,2019-04-24 16:30:41.996,Henderson-Lopez,Spencer_Greens,9909774422784,5.231056,256.647878,0.0,1.69004,0.0,69.037508,0.0,0.0,0.0,Henderson-Lopez_Spencer_Greens_9909774422784
9,2019-04-24 16:30:47.996,Henderson-Lopez,Spencer_Greens,9909774422784,0.0,239.106875,0.0,0.0,0.0,66.865006,0.0,0.0,0.0,Henderson-Lopez_Spencer_Greens_9909774422784


In [6]:
df["cpu_1h_mean"] = df.cpu_utilization.rolling(window=12).mean()
df["latency_1h_mean"] = df.latency.rolling(window=12).mean()
df["packet_loss_1h_mean"] = df.packet_loss.rolling(window=12).mean()
df["throughput_1h_mean"] = df.throughput.rolling(window=12).mean()

df.head(10)

Unnamed: 0,time,company,data_center,device,latency,throughput,latency_is_error,packet_loss,is_error,cpu_utilization,packet_loss_is_error,throughput_is_error,cpu_utilization_is_error,key,cpu_1h_mean,latency_1h_mean,packet_loss_1h_mean,throughput_1h_mean
0,2019-04-24 16:29:53.996,Henderson-Lopez,Spencer_Greens,9909774422784,0.0,270.349839,0.0,1.88073,0.0,69.491442,0.0,0.0,0.0,Henderson-Lopez_Spencer_Greens_9909774422784,,,,
1,2019-04-24 16:29:59.996,Henderson-Lopez,Spencer_Greens,9909774422784,0.0,200.724432,0.0,0.187542,0.0,82.425018,0.0,0.0,0.0,Henderson-Lopez_Spencer_Greens_9909774422784,,,,
2,2019-04-24 16:30:05.996,Henderson-Lopez,Spencer_Greens,9909774422784,1.029673,265.47445,0.0,0.0,0.0,60.817191,0.0,0.0,0.0,Henderson-Lopez_Spencer_Greens_9909774422784,,,,
3,2019-04-24 16:30:11.996,Henderson-Lopez,Spencer_Greens,9909774422784,0.0,268.298557,0.0,0.035432,0.0,66.563655,0.0,0.0,0.0,Henderson-Lopez_Spencer_Greens_9909774422784,,,,
4,2019-04-24 16:30:17.996,Henderson-Lopez,Spencer_Greens,9909774422784,2.279885,261.555663,0.0,2.438059,0.0,77.089529,0.0,0.0,0.0,Henderson-Lopez_Spencer_Greens_9909774422784,,,,
5,2019-04-24 16:30:23.996,Henderson-Lopez,Spencer_Greens,9909774422784,0.0,300.0,0.0,1.733723,0.0,48.045429,0.0,0.0,0.0,Henderson-Lopez_Spencer_Greens_9909774422784,,,,
6,2019-04-24 16:30:29.996,Henderson-Lopez,Spencer_Greens,9909774422784,0.0,257.501858,0.0,0.0,0.0,63.269681,0.0,0.0,0.0,Henderson-Lopez_Spencer_Greens_9909774422784,,,,
7,2019-04-24 16:30:35.996,Henderson-Lopez,Spencer_Greens,9909774422784,6.219771,251.818551,0.0,0.600471,0.0,74.93198,0.0,0.0,0.0,Henderson-Lopez_Spencer_Greens_9909774422784,,,,
8,2019-04-24 16:30:41.996,Henderson-Lopez,Spencer_Greens,9909774422784,5.231056,256.647878,0.0,1.69004,0.0,69.037508,0.0,0.0,0.0,Henderson-Lopez_Spencer_Greens_9909774422784,,,,
9,2019-04-24 16:30:47.996,Henderson-Lopez,Spencer_Greens,9909774422784,0.0,239.106875,0.0,0.0,0.0,66.865006,0.0,0.0,0.0,Henderson-Lopez_Spencer_Greens_9909774422784,,,,


In [7]:
# Drop first 'Window' samples due to no featuers
# (Dont want to confuse the ML algorithm)
feature_vectors = df.dropna()[['cpu_utilization', 'latency', 'packet_loss', 'throughput', 'cpu_1h_mean', 'latency_1h_mean', 'packet_loss_1h_mean', 'throughput_1h_mean', 'is_error']]
feature_vectors.head(10)

Unnamed: 0,cpu_utilization,latency,packet_loss,throughput,cpu_1h_mean,latency_1h_mean,packet_loss_1h_mean,throughput_1h_mean,is_error
11,64.259285,3.252629,0.0,246.26175,66.701207,1.501084,0.872012,252.694008,0.0
12,72.220273,3.617189,0.0,261.796905,66.92861,1.802517,0.715284,251.981264,0.0
13,73.605601,0.0,1.7305,266.26711,66.193658,1.802517,0.843864,257.443153,0.0
14,75.458159,0.0,0.0,249.766203,67.413739,1.716711,0.843864,256.134133,0.0
15,76.809393,0.0,0.78127,272.817708,68.26755,1.716711,0.906017,256.510729,0.0
16,74.173947,0.0,1.434033,218.947365,68.024585,1.52672,0.822348,252.960037,0.0
17,88.501879,0.0,0.0,242.930991,71.395956,1.52672,0.677871,248.204287,0.0
18,63.62242,4.325036,1.147024,277.539168,71.425351,1.88714,0.773457,249.874062,0.0
19,48.186029,4.50442,0.0,262.617006,69.196522,1.744194,0.723418,250.773934,0.0
20,74.591433,0.0,0.0,260.016214,69.659349,1.308273,0.582581,251.054628,0.0


In [8]:
from dask_ml.model_selection import train_test_split

## Training

In [9]:
X = feature_vectors[['cpu_1h_mean', 'latency_1h_mean', 'packet_loss_1h_mean', 'throughput_1h_mean']]
y = feature_vectors['is_error']
X_train, X_test, y_train, y_test = dcv.train_test_split(X, y, train_size=0.7, test_size=0.3)

In [10]:
param_grid = {
    'n_estimators': [5, 10, 20, 30]
}

model = GradientBoostingClassifier()
model = dcv.GridSearchCV(model, param_grid)

print(X_train.compute().shape, y_train.compute().shape)

model.fit(X_train.compute(), y_train.compute())

(3305, 4) (3305,)


GridSearchCV(cache_cv=True, cv=None, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       iid=True, n_jobs=-1, param_grid={'n_estimators': [5, 10, 20, 30]},
       refit=True, return_train_score='warn', scheduler=None, scoring=None)

In [11]:
model.score(X_test, y_test)

0.9912868632707775

In [12]:
model.best_estimator_

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=10,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [13]:
!mkdir models

mkdir: cannot create directory 'models': File exists


In [14]:
version = '1.0'
model_filepath = 'models/netops.v{}.model'.format(version)
with open(model_filepath, 'wb+') as f:
    pickle.dump(model.best_estimator_, f)