# Objective

Generate features for machine learning algorithms. The goal is to use unsupervised machine learning algorithms to cluster similar instances together, e.g., highly volatile instances, and/or cheaper least volatile instances.

This will help us in the migration between spot instances, i.e., if we need to move from one spot instance that is going to expire, which should be the next of where we should go next.

# Code

## Load libs

In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('..')

import random
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from src.data.helpers import load_aws_dataset
from src.data.helpers import remove_consecutive_repeated_price_entries
from src.data.helpers import calc_pdf_price_update_interval_seconds
from src.data.helpers import generate_price_update_interval

## Input params

In [15]:
data_compression = 'zip'

interim_data_dir = '../data/interim'
interim_data_filename = 'aws_filtered_sample.csv.zip'

processed_data_dir = '../data/processed'
processed_data_filename = 'step_5_features_pricing_var.csv.zip'

## Load data

In [3]:
file = f'{interim_data_dir}/{interim_data_filename}'
data = load_aws_dataset(file)
print(data.shape)
data.head()

(1666418, 4)


  mask |= (ar1 == a)


Unnamed: 0,Timestamp,AvailabilityZone,InstanceType,SpotPrice
0,2020-06-01 00:00:04,us-east-1f,r5d.large,0.0356
1,2020-06-01 00:00:04,us-east-1c,r5d.large,0.0356
2,2020-06-01 00:00:04,us-east-1d,r5d.large,0.0356
3,2020-06-01 00:00:04,us-east-1b,r5d.large,0.0356
4,2020-06-01 00:00:50,us-west-2c,r5.2xlarge,0.156


### Pivot table to wide format

Wide format allows to have a price column for each instance

In [4]:
%%time

df = data.query('AvailabilityZone == "us-east-1a"')\
         .drop('AvailabilityZone', axis=1)

print(df.shape)

# Pivot table to change a wide format for the data. Thus, we can remove
# instances that do not have any price update.
# Dropping MultiIndex column 'SpotPrice' as there is no use for it.
pvt = df.pivot_table(index=['Timestamp'], 
                     columns=['InstanceType'])\
        .droplevel(0, axis=1)

pvt.head()

(58756, 3)
CPU times: user 381 ms, sys: 185 ms, total: 565 ms
Wall time: 563 ms


InstanceType,a1.2xlarge,a1.4xlarge,a1.large,a1.medium,a1.metal,a1.xlarge,c1.medium,c1.xlarge,c3.2xlarge,c3.4xlarge,...,x1e.4xlarge,x1e.8xlarge,x1e.xlarge,z1d.12xlarge,z1d.2xlarge,z1d.3xlarge,z1d.6xlarge,z1d.large,z1d.metal,z1d.xlarge
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-06-01 00:05:28,,,,,,,,,,,...,,,,,,,,,,
2020-06-01 00:05:31,,,,,,,,,,,...,,,,,,,,,,
2020-06-01 00:06:06,,,,,,,,,,,...,,,,,,,,,,
2020-06-01 00:13:29,,,,,,,,,,,...,,,,,,,,,,
2020-06-01 00:14:13,,,,,,,,,,,...,,,,,,,,,,


## Feature Engineering

As features, we will extract the following:
1. Price variation function: extract the probabilistic density function (pdf) of price variation;
2. Volatility curve: this represents the  pdf of the number of price changes for a given instance;
3. Price update interval curve: represents the pdf of price update interval

We will use these three pdfs to cluster our instances together.

### Price variation (PDF)

In [5]:
var_list = [{'Instance': cname, 'price_var': cdf.dropna().var()} for cname, cdf in pvt.items()]
var_df = pd.DataFrame(var_list).set_index('Instance').T
var_df

Instance,a1.2xlarge,a1.4xlarge,a1.large,a1.medium,a1.metal,a1.xlarge,c1.medium,c1.xlarge,c3.2xlarge,c3.4xlarge,...,x1e.4xlarge,x1e.8xlarge,x1e.xlarge,z1d.12xlarge,z1d.2xlarge,z1d.3xlarge,z1d.6xlarge,z1d.large,z1d.metal,z1d.xlarge
price_var,7.96051e-34,3.1877460000000004e-33,4.884844e-07,3.1095739999999996e-36,7.944461e-34,4.970142e-35,1.4899e-34,0.002035,9e-06,0.000193,...,2.4410519999999998e-30,9.76317e-30,1.5253370000000001e-31,2.78722e-07,0.0008,7e-06,9.8e-05,0.000588,4.9812089999999995e-30,0.000191


### Volatility (PDF)



In [6]:
# Iterate over each instance type, drop NaN and get the price distribution.
volatility_list = [cdf.dropna().describe(include='all').to_frame() for _, cdf in pvt.items()]
volatility_pdf = pd.concat(volatility_list, axis=1).round(3)
volatility_pdf

Unnamed: 0,a1.2xlarge,a1.4xlarge,a1.large,a1.medium,a1.metal,a1.xlarge,c1.medium,c1.xlarge,c3.2xlarge,c3.4xlarge,...,x1e.4xlarge,x1e.8xlarge,x1e.xlarge,z1d.12xlarge,z1d.2xlarge,z1d.3xlarge,z1d.6xlarge,z1d.large,z1d.metal,z1d.xlarge
count,31.0,30.0,63.0,31.0,33.0,32.0,97.0,357.0,111.0,114.0,...,97.0,98.0,99.0,104.0,296.0,112.0,119.0,182.0,98.0,243.0
mean,0.067,0.134,0.022,0.008,0.134,0.034,0.013,0.14,0.142,0.298,...,1.001,2.002,0.25,1.339,0.248,0.336,0.673,0.077,1.339,0.129
std,0.0,0.0,0.001,0.0,0.0,0.0,0.0,0.045,0.003,0.014,...,0.0,0.0,0.0,0.001,0.028,0.003,0.01,0.024,0.0,0.014
min,0.067,0.134,0.02,0.008,0.134,0.034,0.013,0.074,0.137,0.272,...,1.001,2.002,0.25,1.339,0.223,0.335,0.67,0.056,1.339,0.112
25%,0.067,0.134,0.021,0.008,0.134,0.034,0.013,0.11,0.141,0.284,...,1.001,2.002,0.25,1.339,0.228,0.335,0.67,0.056,1.339,0.119
50%,0.067,0.134,0.022,0.008,0.134,0.034,0.013,0.13,0.142,0.3,...,1.001,2.002,0.25,1.339,0.234,0.335,0.67,0.068,1.339,0.123
75%,0.067,0.134,0.022,0.008,0.134,0.034,0.013,0.154,0.143,0.31,...,1.001,2.002,0.25,1.339,0.276,0.335,0.67,0.096,1.339,0.143
max,0.067,0.134,0.023,0.008,0.134,0.034,0.013,0.289,0.149,0.318,...,1.001,2.002,0.25,1.344,0.317,0.348,0.707,0.14,1.339,0.16


### Merge Price and Volatility PDFs

In [7]:
res_df = pd.concat([volatility_pdf, var_df], axis=0).round(3)
res_df

Unnamed: 0,a1.2xlarge,a1.4xlarge,a1.large,a1.medium,a1.metal,a1.xlarge,c1.medium,c1.xlarge,c3.2xlarge,c3.4xlarge,...,x1e.4xlarge,x1e.8xlarge,x1e.xlarge,z1d.12xlarge,z1d.2xlarge,z1d.3xlarge,z1d.6xlarge,z1d.large,z1d.metal,z1d.xlarge
count,31.0,30.0,63.0,31.0,33.0,32.0,97.0,357.0,111.0,114.0,...,97.0,98.0,99.0,104.0,296.0,112.0,119.0,182.0,98.0,243.0
mean,0.067,0.134,0.022,0.008,0.134,0.034,0.013,0.14,0.142,0.298,...,1.001,2.002,0.25,1.339,0.248,0.336,0.673,0.077,1.339,0.129
std,0.0,0.0,0.001,0.0,0.0,0.0,0.0,0.045,0.003,0.014,...,0.0,0.0,0.0,0.001,0.028,0.003,0.01,0.024,0.0,0.014
min,0.067,0.134,0.02,0.008,0.134,0.034,0.013,0.074,0.137,0.272,...,1.001,2.002,0.25,1.339,0.223,0.335,0.67,0.056,1.339,0.112
25%,0.067,0.134,0.021,0.008,0.134,0.034,0.013,0.11,0.141,0.284,...,1.001,2.002,0.25,1.339,0.228,0.335,0.67,0.056,1.339,0.119
50%,0.067,0.134,0.022,0.008,0.134,0.034,0.013,0.13,0.142,0.3,...,1.001,2.002,0.25,1.339,0.234,0.335,0.67,0.068,1.339,0.123
75%,0.067,0.134,0.022,0.008,0.134,0.034,0.013,0.154,0.143,0.31,...,1.001,2.002,0.25,1.339,0.276,0.335,0.67,0.096,1.339,0.143
max,0.067,0.134,0.023,0.008,0.134,0.034,0.013,0.289,0.149,0.318,...,1.001,2.002,0.25,1.344,0.317,0.348,0.707,0.14,1.339,0.16
price_var,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002,0.0,0.0,...,0.0,0.0,0.0,0.0,0.001,0.0,0.0,0.001,0.0,0.0


### Export to csv

In [16]:
res_df.to_csv(f'{processed_data_dir}/{processed_data_filename}', compression=data_compression)

### Price Update Interval (PDF)

In [8]:
pvt.head()

InstanceType,a1.2xlarge,a1.4xlarge,a1.large,a1.medium,a1.metal,a1.xlarge,c1.medium,c1.xlarge,c3.2xlarge,c3.4xlarge,...,x1e.4xlarge,x1e.8xlarge,x1e.xlarge,z1d.12xlarge,z1d.2xlarge,z1d.3xlarge,z1d.6xlarge,z1d.large,z1d.metal,z1d.xlarge
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-06-01 00:05:28,,,,,,,,,,,...,,,,,,,,,,
2020-06-01 00:05:31,,,,,,,,,,,...,,,,,,,,,,
2020-06-01 00:06:06,,,,,,,,,,,...,,,,,,,,,,
2020-06-01 00:13:29,,,,,,,,,,,...,,,,,,,,,,
2020-06-01 00:14:13,,,,,,,,,,,...,,,,,,,,,,


In [9]:
# Remove repeated price entries without actual change

sample_instance = pvt.loc[:, 'r5n.16xlarge']
sample_instance_norepeat = remove_consecutive_repeated_price_entries(sample_instance)
sample_instance_norepeat

Timestamp
2020-06-01 03:51:56    1.1501
2020-06-01 08:54:20    1.1504
2020-06-01 15:36:25    1.1502
2020-06-01 22:44:34    1.1519
2020-06-02 04:36:46    1.1546
                        ...  
2020-08-30 19:28:26    1.4237
2020-08-31 00:15:40    1.4264
2020-08-31 05:27:33    1.4300
2020-08-31 10:30:13    1.4345
2020-08-31 22:10:12    1.4337
Name: r5n.16xlarge, Length: 362, dtype: float64

In [10]:
update_interval_dist = calc_pdf_price_update_interval_seconds(sample_instance_norepeat)
update_interval_dist

Unnamed: 0,r5n.16xlarge
count,361.0
mean,21962.038781
std,3465.494613
min,2051.0
25%,19167.0
50%,22208.0
75%,24271.0
max,41999.0


In [11]:
uplist = []
for cname, cdf in pvt.items():
    uplist.append((cdf.pipe(remove_consecutive_repeated_price_entries)\
         .pipe(calc_pdf_price_update_interval_seconds)))

    
res_df = pd.concat(uplist, axis=1).iloc[1:].dropna(axis=1, how='all')
res_df

Unnamed: 0,a1.large,c1.xlarge,c3.2xlarge,c3.4xlarge,c3.8xlarge,c3.large,c3.xlarge,c4.2xlarge,c4.4xlarge,c4.8xlarge,...,t3a.micro,t3a.nano,t3a.small,t3a.xlarge,z1d.12xlarge,z1d.2xlarge,z1d.3xlarge,z1d.6xlarge,z1d.large,z1d.xlarge
mean,48002.215686,23118.774854,23718.355769,23279.242991,22547.162162,396009.0,109994.909091,23105.559767,22788.354467,22270.766197,...,252460.6,202394.6,138695.754386,58808.932836,609845.2,29743.18,147586.5,285948.8,71639.79,39064.06
std,34996.722451,5713.610456,6900.502253,6876.12545,5310.423968,209784.830316,122003.650071,7306.742782,5815.030315,4706.520829,...,401050.4,338656.2,154888.722623,54426.187035,1009967.0,106908.4,511520.5,947159.0,292746.2,153996.3
min,17637.0,11821.0,16564.0,16493.0,16623.0,208548.0,19172.0,16429.0,481.0,4453.0,...,16657.0,3631.0,16602.0,7105.0,7377.0,7120.0,1016.0,8784.0,16605.0,3661.0
25%,24173.0,19163.0,19655.0,19383.5,19635.5,268342.5,42589.5,19141.5,19650.5,19168.0,...,47325.0,22855.5,43341.0,24529.5,16225.5,19315.75,17670.0,20003.75,19665.25,19211.0
50%,40816.0,22421.0,23159.5,22198.0,21681.0,343734.0,77583.0,21712.0,22661.0,21727.0,...,123548.0,29586.0,94720.0,42921.0,160516.0,22212.5,19610.0,22941.5,22662.0,22666.0
75%,57136.5,25161.75,25578.5,24936.5,24207.5,471400.5,102504.25,24979.0,24708.0,24681.5,...,304258.0,248669.2,176723.0,70133.5,754135.8,25170.75,25199.0,25177.75,25696.75,25734.0
max,207049.0,45531.0,54365.0,58437.0,46775.0,688020.0,522133.0,73019.0,78592.0,50882.0,...,2103855.0,1393036.0,925199.0,374915.0,2110972.0,1762407.0,2131480.0,4236366.0,2734244.0,2100592.0


In [12]:


res_df = generate_price_update_interval(pvt)
res_df

Unnamed: 0,a1.large,c1.xlarge,c3.2xlarge,c3.4xlarge,c3.8xlarge,c3.large,c3.xlarge,c4.2xlarge,c4.4xlarge,c4.8xlarge,...,t3a.micro,t3a.nano,t3a.small,t3a.xlarge,z1d.12xlarge,z1d.2xlarge,z1d.3xlarge,z1d.6xlarge,z1d.large,z1d.xlarge
mean,48002.215686,23118.774854,23718.355769,23279.242991,22547.162162,396009.0,109994.909091,23105.559767,22788.354467,22270.766197,...,252460.6,202394.6,138695.754386,58808.932836,609845.2,29743.18,147586.5,285948.8,71639.79,39064.06
std,34996.722451,5713.610456,6900.502253,6876.12545,5310.423968,209784.830316,122003.650071,7306.742782,5815.030315,4706.520829,...,401050.4,338656.2,154888.722623,54426.187035,1009967.0,106908.4,511520.5,947159.0,292746.2,153996.3
min,17637.0,11821.0,16564.0,16493.0,16623.0,208548.0,19172.0,16429.0,481.0,4453.0,...,16657.0,3631.0,16602.0,7105.0,7377.0,7120.0,1016.0,8784.0,16605.0,3661.0
25%,24173.0,19163.0,19655.0,19383.5,19635.5,268342.5,42589.5,19141.5,19650.5,19168.0,...,47325.0,22855.5,43341.0,24529.5,16225.5,19315.75,17670.0,20003.75,19665.25,19211.0
50%,40816.0,22421.0,23159.5,22198.0,21681.0,343734.0,77583.0,21712.0,22661.0,21727.0,...,123548.0,29586.0,94720.0,42921.0,160516.0,22212.5,19610.0,22941.5,22662.0,22666.0
75%,57136.5,25161.75,25578.5,24936.5,24207.5,471400.5,102504.25,24979.0,24708.0,24681.5,...,304258.0,248669.2,176723.0,70133.5,754135.8,25170.75,25199.0,25177.75,25696.75,25734.0
max,207049.0,45531.0,54365.0,58437.0,46775.0,688020.0,522133.0,73019.0,78592.0,50882.0,...,2103855.0,1393036.0,925199.0,374915.0,2110972.0,1762407.0,2131480.0,4236366.0,2734244.0,2100592.0
