# Objective

Generate a report on the pricing analysis and save the figures to the `reports/figures` folder.

# Code

## Load libs

In [None]:
import sys
sys.path.append('..')

import random
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from src.data.helpers import load_aws_dataset

## Input params

In [None]:
# Folder and file name of the interim dataset read by this notebook.
interim_dir = "../data/interim"
in_fname = "step_1_aws_filtered_sample.csv.zip"
compression = "zip"

# Folder that receives the figures saved by the plotting cells.
report_dir = "../reports/figures/"

In [None]:
# Papermill parameters injection ... do not delete!

## Load data

In [None]:
# Join the interim folder and the input file name into the dataset path,
# then load it through the project helper.
file = '/'.join((interim_dir, in_fname))
data = load_aws_dataset(file)

# Print the dimensions and show the first rows as the cell output.
print(data.shape)
data.head()

## Filter data

For now, keep only the data for the `us-east-1a` availability zone.

In [None]:
%%time

df = data.query('AvailabilityZone == "us-east-1a"')\
         .drop('AvailabilityZone', axis=1)

print(df.shape)

# Pivot table to change a wide format for the data. Thus, we can remove
# instances that do not have any price update.
# Dropping MultiIndex column 'SpotPrice' as there is no use for it.
pvt = df.pivot_table(index=['Timestamp'], 
                     columns=['InstanceType'])\
        .droplevel(0, axis=1)

pvt.head()

## Plotting

### Most volatile instances

Conclusion: these are the instances with the most price changes, meaning that the user runs a higher risk of losing them to eviction triggered by price updates.

In [None]:
# Count the recorded prices (non-null cells) per instance type and keep the
# 10 instance types with the highest counts.
most_volatiles = pvt.count().sort_values(ascending=False).nlargest(10)
fig, ax = plt.subplots(figsize=(12, 6))

# Drop timestamps where none of the selected instances has a price, then fill
# each remaining gap with the next known price before plotting.
# `.bfill()` replaces `fillna(method='bfill')`, deprecated since pandas 2.1.
(
    pvt.loc[:, most_volatiles.index.to_list()]
    .dropna(how='all', axis=0)
    .bfill()
    .plot(ax=ax)
)

ax.set_title('Top 10 most volatile instances')
ax.set_ylabel('Hourly Price (USD)')
# Anchor the legend below the axes, laid out in five columns.
ax.legend(loc='lower center', ncol=5, bbox_to_anchor=(0.5, -0.35))

plt.tight_layout()
plt.savefig(f'{report_dir}/plot_step_4_most_volatile_instances.png', dpi=300)

### Least volatile instances

Conclusion: the least volatile instances simply don't have price changes, which makes them a good pick for use as spot instances.

In [None]:
# Count the recorded prices (non-null cells) per instance type and keep the
# 10 instance types with the lowest counts.
least_volatiles = pvt.count().sort_values(ascending=False).nsmallest(10)

fig, ax = plt.subplots(figsize=(12, 6))

# Drop timestamps where none of the selected instances has a price, then fill
# each remaining gap with the next known price before plotting.
# `.bfill()` replaces `fillna(method='bfill')`, deprecated since pandas 2.1.
(
    pvt.loc[:, least_volatiles.index.to_list()]
    .dropna(how='all', axis=0)
    .bfill()
    .plot(ax=ax)
)

ax.set_title('Top 10 least volatile instances')
ax.set_ylabel('Hourly Price (USD)')
# Anchor the legend below the axes, laid out in five columns.
ax.legend(loc='lower center', ncol=5, bbox_to_anchor=(0.5, -0.35))

plt.tight_layout()
plt.savefig(f'{report_dir}/plot_step_4_least_volatile_instances.png', dpi=300)