# Objective

The objective is to cluster similar spot instances together. That way, when a user needs another spot instance, they can pick one from a pool whose price changes and update intervals are similar.

# Code

## Load libs

In [None]:
import sys
sys.path.append('..')

import random
import pickle
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

## Input params

In [None]:
# Default notebook parameters. The cell below is reserved for Papermill
# parameter injection, so these values are presumably overridable at run time.

# Compression format handed to pandas for CSV I/O.
compression = "zip"

# Directory where the fitted model pickle is written.
models_dir = "../models"
# Directory holding the input feature files and receiving the output file.
processed_dir = "../data/processed"

# Input feature files (produced by an earlier step, judging by the names).
in_fname_pricing = "step_5_features_pricing_var.csv.zip"
in_fname_updist = "step_5_features_updist.csv.zip"

# Output file: the joined features plus the assigned cluster number.
out_fname = "step_7_clustered_features.csv.zip"

In [None]:
# Papermill parameters injection ... do not delete!

## Load data

In [None]:
# Pricing info: read the pricing feature file, transpose it, and prefix every
# resulting column with 'price_' so the names stay distinct after the join.
# NOTE(review): the transpose presumably turns each instance into a row --
# confirm against the file produced by the earlier step.
file = f'{processed_dir}/{in_fname_pricing}'
pricing_df = (
    # Use the `compression` input parameter rather than a hard-coded 'zip',
    # so an injected override applies to this input as it does to the others.
    pd.read_csv(file, compression=compression, index_col=0)
    .T
    .add_prefix('price_')
)

print(pricing_df.shape)
pricing_df.head()

In [None]:
# Update distribution info: read the update-distribution feature file,
# transpose it, and prefix every resulting column with 'updist_'.
file = f'{processed_dir}/{in_fname_updist}'
updist_df = (
    pd.read_csv(file, compression=compression, index_col=0)
    .T
    .add_prefix('updist_')
)

print(updist_df.shape)
updist_df.head()

In [None]:
# Perform an inner join of the update-distribution features with the pricing
# features on their shared index; the result is the combined feature df.
# `how='inner'` must be explicit: DataFrame.join defaults to a left join,
# which would keep updist rows lacking pricing data and fill them with NaN,
# and KMeans refuses input containing NaN.
fdf = updist_df.join(pricing_df, how='inner')
fdf.head()

## Data normalization

We need to normalize the data because the features are on different scales; without normalization, the clustering algorithm would be biased towards the features with larger magnitudes.

In [None]:
# Standardize every feature column with StandardScaler (default settings)
# so that no single feature dominates the distance computations in KMeans.
# The result is a plain NumPy array, not a DataFrame.
scaler = StandardScaler()
scaled_fdf = scaler.fit(fdf).transform(fdf)
print(scaled_fdf)

In [None]:
# Preview the scaled features as a table. Reuse fdf's index and columns so
# the rows and features stay identifiable; a bare DataFrame(scaled_fdf)
# would label everything 0..N. Run this before any column is added to fdf,
# otherwise the shapes no longer match.
pd.DataFrame(scaled_fdf, index=fdf.index, columns=fdf.columns)

In [None]:
# Fit one KMeans model per candidate cluster count (1 through 39) and keep
# the inertia of each fit, in order, for the plot in the next cell.
# random_state is fixed so repeated runs give the same curve.
inertia = [
    KMeans(n_clusters=k, init='k-means++', random_state=42)
    .fit(scaled_fdf)
    .inertia_
    for k in range(1, 40)
]

In [None]:
# Plot inertia against the cluster count that produced it.
# The sweep starts at 1 cluster, so the x values must be 1..len(inertia);
# plt.plot(inertia) alone would use the list positions 0..len-1 and shift
# every point one cluster to the left.
# NOTE(review): the 13 in the title was presumably read off the old,
# shifted axis -- re-check it against the corrected plot.
plt.plot(range(1, len(inertia) + 1), inertia)
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.title('Number of cluster when it becomes linear = 13')

In [None]:
# Fit the final model with 13 clusters (the count named in the plot title
# above), then assign every row of the scaled features to a cluster.
# fit() returns the estimator itself, so the two steps can be chained.
kmeans = KMeans(n_clusters=13, init='k-means++', random_state=42).fit(scaled_fdf)
cluster_num = kmeans.predict(scaled_fdf)
cluster_num

In [None]:
# Persist the fitted KMeans model so it can be reloaded without refitting.
# NOTE(review): the fitted StandardScaler is not saved in the visible part
# of this notebook; reusing the model on new data needs the same scaling.
model_path = f"{models_dir}/kmeans_model.pkl"
with open(model_path, "wb") as f:
    pickle.dump(kmeans, f)

In [None]:
# Attach the predicted cluster number to the (unscaled) feature df as a new
# column; the labels are in the same row order as fdf.
fdf["cluster_num"] = cluster_num
fdf.head()

In [None]:
# Write the features together with their cluster assignment to the
# processed-data directory, compressed per the `compression` parameter.
out_path = f'{processed_dir}/{out_fname}'
fdf.to_csv(out_path, compression=compression)