# Objective

This notebook visualizes clusters of spot instances that behave similarly (the cluster labels are loaded from the input file). The main goal is to know where to migrate to when a spot instance is about to be evicted.

# Code

## Load libs

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

## Input params

In [None]:
# Input: clustered features file and how it is compressed
processed_dir = "../data/processed"
in_fname = "step_7_clustered_features.csv.zip"
compression = "zip"

# Output: directory and file names of the HTML figures
report_dir = "../reports/figures"
out_fname_cluster = "plot_step_8_cluster_3d.html"
out_fname_bar = "plot_step_8_barplot.html"

In [None]:
# Papermill parameters injection ... do not delete!

## Load data

In [None]:
# Load the feature table that carries the cluster labels;
# the first CSV column is used as the DataFrame index.
file = f'{processed_dir}/{in_fname}'
cluster_df = pd.read_csv(
    file,
    index_col=0,
    compression=compression,
)

# Quick sanity check: dimensions, then a preview of the first rows
print(cluster_df.shape)
cluster_df.head()

## Plot 3D

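The goal is to see how the clusters relate to the price update interval: its mean (`updist_mean`), its standard deviation (`updist_std`), and its minimum (`updist_min`, which is the time after which a spot instance will be evicted). The standard deviation of the price (`price_std`) is shown in the hover tooltip.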

In [None]:
# Expose the index as a regular "instance" column so it can be shown on hover.
# rename_axis() names the index before it is reset, so the column is called
# "instance" whether or not the loaded index already had a name.
plot_df = cluster_df.rename_axis('instance').reset_index()

# One point per instance, colored by its cluster label.
fig = px.scatter_3d(plot_df,
                    x="updist_mean",
                    y="updist_std",
                    z="updist_min",
                    color='cluster_num',
                    hover_data=["instance",
                                "updist_mean",
                                "price_std",
                                "updist_min"],
                    )

# Drop the margins so the 3D scene fills the output area.
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
fig.show()

# Save a standalone HTML copy. write_html() only writes the file; it does not
# try to open a browser, which matters when the notebook is run unattended.
fig.write_html(f'{report_dir}/{out_fname_cluster}')

## Clustering instances

The goal is to see the main characteristics of each group.

In [None]:
# Calculate the size of each group.
# The index (cluster id) and the counts are named explicitly instead of
# renaming the default labels: value_counts() labels its result differently
# from pandas 2.0 onwards, so a rename keyed on the default labels is not
# portable. The result always has the columns 'cluster_group' and 'quantity'.
cdf2 = (
    cluster_df["cluster_num"]
    .value_counts()
    .rename_axis('cluster_group')
    .reset_index(name='quantity')
)
cdf2

In [None]:
# Calculate the statistics of each group

# describe() returns one (feature, statistic) column pair per numeric feature.
# Only the 'min' and 'max' statistics are kept, and they are selected by label:
# picking them by position depends on how the statistic columns happen to be
# ordered, which is not stable across pandas versions.
grp = (
    cluster_df
    .groupby('cluster_num')
    .describe()
    .loc[:, lambda stats: stats.columns.get_level_values(1).isin(['min', 'max'])]
)

# Flatten the two column levels into '<feature>|<statistic>' names
grp.columns = grp.columns.map('|'.join).str.strip('|')
grp = grp.round(decimals=2)
grp

In [None]:
# Attach the per-cluster statistics to the cluster sizes.
# The cluster id is a column named 'cluster_group' on the left side and the
# (reset) index 'cluster_num' on the right side.
m1 = pd.merge(
    cdf2,
    grp.reset_index(),
    left_on='cluster_group',
    right_on='cluster_num',
)
m1.head()

In [None]:
# Bar chart of the cluster sizes; the per-cluster min/max statistics
# are available in the hover tooltip.
fig = px.bar(
    m1,
    x="cluster_group",
    y="quantity",
    color="cluster_group",
    hover_data=[
        'price_count|min',
        'price_count|max',
        'updist_mean|min',
        'updist_max|max',
        'price_min|min',
        'price_max|max',
    ],
)

# Fixed linear ticks: every 1 unit on the x axis (starting at 1) and
# every 10 units on the y axis (starting at 10).
fig.update_layout(
    xaxis=dict(tickmode='linear', tick0=1, dtick=1),
    yaxis=dict(tickmode='linear', tick0=10, dtick=10),
)

fig.show()

# Save a standalone HTML copy. write_html() only writes the file; it does not
# try to open a browser, which matters when the notebook is run unattended.
fig.write_html(f'{report_dir}/{out_fname_bar}')