# Model Training

In [None]:
!pip install dask-ml

In [2]:
import scipy as sp
import numpy as np
import pandas as pd
import pickle
import os
import v3io_frames as v3f
import dask.dataframe as dd
from dask.distributed import Client, progress

import matplotlib.pyplot as plt; plt.rcdefaults()
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.layouts import column, row, gridplot
from bokeh.models import ColumnDataSource

import dask_ml.model_selection as dcv
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline

import pprint

In [3]:
output_notebook()
pp = pprint.PrettyPrinter(indent=4)
client = v3f.Client('framesd:8081')

In [None]:
dask_client = Client()

## Load data

In [5]:
df = dd.from_pandas(client.read(backend='tsdb', query='select * from netops_metrics_jupyter',
            start="now-1d", end='now+1d', multi_index=True).reset_index(), npartitions=4)
df.head(5)

Unnamed: 0,time,company,data_center,device,packet_loss_is_error,packet_loss,cpu_utilization,is_error,throughput_is_error,throughput,latency_is_error,cpu_utilization_is_error,latency
0,2019-03-05 14:38:47.338,Boyd-Faulkner,Nancy_Mall,3916108545629,0.0,0.0,55.59,0.0,0.0,257.54,0.0,0.0,9.96
1,2019-03-05 14:38:53.338,Boyd-Faulkner,Nancy_Mall,3916108545629,0.0,0.0,82.64,0.0,0.0,243.49,0.0,0.0,0.0
2,2019-03-05 14:38:59.338,Boyd-Faulkner,Nancy_Mall,3916108545629,0.0,0.0,73.17,0.0,0.0,268.42,0.0,0.0,0.0
3,2019-03-05 14:39:05.338,Boyd-Faulkner,Nancy_Mall,3916108545629,0.0,0.0,58.67,0.0,0.0,248.26,0.0,0.0,0.0
4,2019-03-05 14:39:11.338,Boyd-Faulkner,Nancy_Mall,3916108545629,0.0,0.0,62.35,0.0,0.0,223.03,0.0,0.0,0.0


In [6]:
df['key'] = df.apply(lambda row: f'{row["company"]}_{row["data_center"]}_{row["device"]}', axis=1, meta=df.compute().dtypes)
df.set_index('key')
df.head(10)
#df.sort_values('timestamp', ascending=True, inplace=True)

Unnamed: 0,time,company,data_center,device,packet_loss_is_error,packet_loss,cpu_utilization,is_error,throughput_is_error,throughput,latency_is_error,cpu_utilization_is_error,latency,key
0,2019-03-05 14:38:47.338,Boyd-Faulkner,Nancy_Mall,3916108545629,0.0,0.0,55.59,0.0,0.0,257.54,0.0,0.0,9.96,Boyd-Faulkner_Nancy_Mall_3916108545629
1,2019-03-05 14:38:53.338,Boyd-Faulkner,Nancy_Mall,3916108545629,0.0,0.0,82.64,0.0,0.0,243.49,0.0,0.0,0.0,Boyd-Faulkner_Nancy_Mall_3916108545629
2,2019-03-05 14:38:59.338,Boyd-Faulkner,Nancy_Mall,3916108545629,0.0,0.0,73.17,0.0,0.0,268.42,0.0,0.0,0.0,Boyd-Faulkner_Nancy_Mall_3916108545629
3,2019-03-05 14:39:05.338,Boyd-Faulkner,Nancy_Mall,3916108545629,0.0,0.0,58.67,0.0,0.0,248.26,0.0,0.0,0.0,Boyd-Faulkner_Nancy_Mall_3916108545629
4,2019-03-05 14:39:11.338,Boyd-Faulkner,Nancy_Mall,3916108545629,0.0,0.0,62.35,0.0,0.0,223.03,0.0,0.0,0.0,Boyd-Faulkner_Nancy_Mall_3916108545629
5,2019-03-05 14:39:17.338,Boyd-Faulkner,Nancy_Mall,3916108545629,0.0,0.0,52.57,0.0,0.0,227.83,0.0,0.0,0.0,Boyd-Faulkner_Nancy_Mall_3916108545629
6,2019-03-05 14:39:23.338,Boyd-Faulkner,Nancy_Mall,3916108545629,0.0,0.0,72.28,0.0,0.0,239.97,0.0,0.0,3.19,Boyd-Faulkner_Nancy_Mall_3916108545629
7,2019-03-05 14:39:29.338,Boyd-Faulkner,Nancy_Mall,3916108545629,0.0,0.0,54.51,0.0,0.0,241.34,0.0,0.0,0.0,Boyd-Faulkner_Nancy_Mall_3916108545629
8,2019-03-05 14:39:35.338,Boyd-Faulkner,Nancy_Mall,3916108545629,0.0,2.0,82.47,0.0,0.0,249.91,0.0,0.0,3.89,Boyd-Faulkner_Nancy_Mall_3916108545629
9,2019-03-05 14:39:41.338,Boyd-Faulkner,Nancy_Mall,3916108545629,0.0,0.0,74.17,0.0,0.0,257.91,0.0,0.0,0.0,Boyd-Faulkner_Nancy_Mall_3916108545629


In [7]:
df["cpu_1h_mean"] = df.cpu_utilization.rolling(window=12).mean()
df["latency_1h_mean"] = df.latency.rolling(window=12).mean()
df["packet_loss_1h_mean"] = df.packet_loss.rolling(window=12).mean()
df["throughput_1h_mean"] = df.throughput.rolling(window=12).mean()

df.head(10)

Unnamed: 0,time,company,data_center,device,packet_loss_is_error,packet_loss,cpu_utilization,is_error,throughput_is_error,throughput,latency_is_error,cpu_utilization_is_error,latency,key,cpu_1h_mean,latency_1h_mean,packet_loss_1h_mean,throughput_1h_mean
0,2019-03-05 14:38:47.338,Boyd-Faulkner,Nancy_Mall,3916108545629,0.0,0.0,55.59,0.0,0.0,257.54,0.0,0.0,9.96,Boyd-Faulkner_Nancy_Mall_3916108545629,,,,
1,2019-03-05 14:38:53.338,Boyd-Faulkner,Nancy_Mall,3916108545629,0.0,0.0,82.64,0.0,0.0,243.49,0.0,0.0,0.0,Boyd-Faulkner_Nancy_Mall_3916108545629,,,,
2,2019-03-05 14:38:59.338,Boyd-Faulkner,Nancy_Mall,3916108545629,0.0,0.0,73.17,0.0,0.0,268.42,0.0,0.0,0.0,Boyd-Faulkner_Nancy_Mall_3916108545629,,,,
3,2019-03-05 14:39:05.338,Boyd-Faulkner,Nancy_Mall,3916108545629,0.0,0.0,58.67,0.0,0.0,248.26,0.0,0.0,0.0,Boyd-Faulkner_Nancy_Mall_3916108545629,,,,
4,2019-03-05 14:39:11.338,Boyd-Faulkner,Nancy_Mall,3916108545629,0.0,0.0,62.35,0.0,0.0,223.03,0.0,0.0,0.0,Boyd-Faulkner_Nancy_Mall_3916108545629,,,,
5,2019-03-05 14:39:17.338,Boyd-Faulkner,Nancy_Mall,3916108545629,0.0,0.0,52.57,0.0,0.0,227.83,0.0,0.0,0.0,Boyd-Faulkner_Nancy_Mall_3916108545629,,,,
6,2019-03-05 14:39:23.338,Boyd-Faulkner,Nancy_Mall,3916108545629,0.0,0.0,72.28,0.0,0.0,239.97,0.0,0.0,3.19,Boyd-Faulkner_Nancy_Mall_3916108545629,,,,
7,2019-03-05 14:39:29.338,Boyd-Faulkner,Nancy_Mall,3916108545629,0.0,0.0,54.51,0.0,0.0,241.34,0.0,0.0,0.0,Boyd-Faulkner_Nancy_Mall_3916108545629,,,,
8,2019-03-05 14:39:35.338,Boyd-Faulkner,Nancy_Mall,3916108545629,0.0,2.0,82.47,0.0,0.0,249.91,0.0,0.0,3.89,Boyd-Faulkner_Nancy_Mall_3916108545629,,,,
9,2019-03-05 14:39:41.338,Boyd-Faulkner,Nancy_Mall,3916108545629,0.0,0.0,74.17,0.0,0.0,257.91,0.0,0.0,0.0,Boyd-Faulkner_Nancy_Mall_3916108545629,,,,


In [8]:
# Drop first 'Window' samples due to no featuers
# (Dont want to confuse the ML algorithm)
feature_vectors = df.dropna()[['cpu_utilization', 'latency', 'packet_loss', 'throughput', 'cpu_1h_mean', 'latency_1h_mean', 'packet_loss_1h_mean', 'throughput_1h_mean', 'is_error']]
feature_vectors.head(10)

Unnamed: 0,cpu_utilization,latency,packet_loss,throughput,cpu_1h_mean,latency_1h_mean,packet_loss_1h_mean,throughput_1h_mean,is_error
11,78.76,0.0,4.0,279.5,67.254167,1.42,0.583333,247.419167,0.0
12,68.78,0.0,0.0,277.02,68.353333,0.59,0.583333,249.0425,0.0
13,65.86,6.25,0.0,261.62,66.955,1.110833,0.583333,250.553333,0.0
14,68.96,0.0,2.0,234.32,66.604167,1.110833,0.75,247.711667,0.0
15,66.83,0.0,0.0,244.19,67.284167,1.110833,0.75,247.3725,0.0
16,72.46,5.56,0.0,246.29,68.126667,1.574167,0.75,249.310833,0.0
17,76.64,0.0,0.0,207.5,70.1325,1.574167,0.75,247.616667,0.0
18,68.16,0.0,3.0,232.01,69.789167,1.308333,1.0,246.953333,0.0
19,60.22,0.0,0.0,259.76,70.265,1.308333,1.0,248.488333,0.0
20,76.74,0.0,0.0,240.82,69.7875,0.984167,0.833333,247.730833,0.0


In [9]:
from dask_ml.model_selection import train_test_split

## Training

In [10]:
X = feature_vectors[['cpu_1h_mean', 'latency_1h_mean', 'packet_loss_1h_mean', 'throughput_1h_mean']]
y = feature_vectors['is_error']
X_train, X_test, y_train, y_test = dcv.train_test_split(X, y, train_size=0.7, test_size=0.3)

In [11]:
param_grid = {
    'n_estimators': [5, 10, 20, 30]
}

model = GradientBoostingClassifier()
model = dcv.GridSearchCV(model, param_grid)

print(X_train.compute().shape, y_train.compute().shape)

model.fit(X_train.compute(), y_train.compute())

(5550, 4) (5550,)


GridSearchCV(cache_cv=True, cv=None, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       iid=True, n_jobs=-1, param_grid={'n_estimators': [5, 10, 20, 30]},
       refit=True, return_train_score='warn', scheduler=None, scoring=None)

In [12]:
model.score(X_test, y_test)

0.997129971299713

In [13]:
model.best_estimator_

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=30,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [14]:
!mkdir models

In [14]:
version = '1.0'
model_filepath = 'models/netops.v{}.model'.format(version)
with open(model_filepath, 'wb+') as f:
    pickle.dump(model.best_estimator_, f)