In [1]:
import pandas as pd
import numpy as np

In [2]:
# Azure
# Load the Azure compute frame from an xz-compressed pickle under ./data.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
azure_compute_src = './data/azure.merged.compute.xz.pkl' 
azure_compute_data = pd.read_pickle(azure_compute_src, compression="xz")

In [3]:
# AWS
# Load the AWS compute frame from an xz-compressed pickle under ./data.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
aws_compute_src = './data/aws.src.compute.xz.pkl' 
aws_compute_data = pd.read_pickle(aws_compute_src, compression="xz")

In [4]:
# GCP
# Load the GCP compute costs from a plain CSV under ./data (default read_csv parsing).
gcp_compute_src = './data/gcp_compute_costs.csv'
gcp_compute_data = pd.read_csv(gcp_compute_src)

## Inspect AWS

In [5]:
aws_compute_data.columns

Index(['sku', 'offercode', 'location', 'instance_type', 'instance_family',
       'vcpu', 'clock_speed', 'memory', 'storage', 'network_performance',
       'processor_architecture', 'tenancy', 'os', 'usagetype', 'operation',
       'pd_key', 'price_description', 'unit', 'price'],
      dtype='object')

In [6]:
aws_compute_data['price_description']

0         $0.00 per Dedicated Reservation RHEL m4.10xlar...
1         $0.00 per RHEL with HA and SQL Enterprise i3.8...
2         $0.00 per Reservation RHEL c5a.large Instance ...
3         $7.293 per On Demand Linux with SQL Web inf1.2...
4         $7.234 per Unused Reservation RHEL with SQL We...
                                ...                        
505796    $2.046 per Unused Reservation Windows with SQL...
505797    $0.00 per Dedicated Reservation RHEL with HA a...
505798    $16.704 per On Demand Windows with SQL Server ...
505799    $0.1784 per Unused Reservation RHEL r6g.large ...
505800    $0.511 per Dedicated SUSE i3.xlarge Instance Hour
Name: price_description, Length: 505801, dtype: object

In [7]:
# AWS columns that are dropped before the comparison with the other providers.
aws_unused_columns = [
    'offercode', 'processor_architecture', 'usagetype', 'operation',
    'pd_key', 'price_description', 'unit', 'network_performance',
]
aws_compute_data_trimmed = aws_compute_data.drop(columns=aws_unused_columns)

In [8]:
aws_compute_data_trimmed.columns

Index(['sku', 'location', 'instance_type', 'instance_family', 'vcpu',
       'clock_speed', 'memory', 'storage', 'tenancy', 'os', 'price'],
      dtype='object')

In [9]:
# Keep only the rows whose `os` value is exactly 'Windows'.
is_windows = aws_compute_data_trimmed['os'] == 'Windows'
aws_compute_data_windows = aws_compute_data_trimmed[is_windows]

In [10]:
aws_compute_data_windows.head()

Unnamed: 0,sku,location,instance_type,instance_family,vcpu,clock_speed,memory,storage,tenancy,os,price
7,FMEJYTPF3M8YDBYD,Asia Pacific (Osaka),r3.8xlarge,Memory optimized,32.0,2.5 GHz,244 GiB,2 x 320 SSD,Host,Windows,0.0
8,B9QWBFQZHCY7X3HM,EU (Ireland),r5ad.2xlarge,Memory optimized,8.0,2.5 GHz,64 GiB,1 x 300 NVMe SSD,Shared,Windows,0.0
11,SA8JPXVV8GR6B4XS,US West (Oregon),g4dn.xlarge,GPU instance,4.0,2.5 GHz,16 GiB,125 GB NVMe SSD,Host,Windows,0.0
18,PGGUDD5Q3K4SP2K5,EU (Ireland),g4ad.4xlarge,GPU instance,16.0,2.8 GHz,64 GiB,600 GB NVMe SSD,Shared,Windows,1.704
19,4CR6D9AX3JUCVGKK,EU (Ireland),m1.large,General purpose,2.0,,7.5 GiB,2 x 420 SSD,Shared,Windows,0.19


## Inspect GCP

In [11]:
gcp_compute_data.columns

Index(['region', 'family', 'vm type', 'regional_disk', 'sole_tenant',
       'nested_virtualization', 'local_ssd', 'cores', 'gpu', 'cpu', 'memory',
       'network_egress', 'on demand price', 'preemptible price'],
      dtype='object')

In [13]:
gcp_compute_data.tail(10)

Unnamed: 0,region,family,vm type,regional_disk,sole_tenant,nested_virtualization,local_ssd,cores,gpu,cpu,memory,network_egress,on demand price,preemptible price
3730,asia-northeast3,a2,a2-megagpu-16g,1,-1,-1,1,96,1,['Cascade Lake'],1360.0,100,0.0,0.0
3731,asia-southeast,a2,a2-megagpu-16g,1,-1,-1,1,96,1,['Cascade Lake'],1360.0,100,10.85126,2.63909
3732,asia-southeast1,a2,a2-megagpu-16g,1,-1,-1,1,96,1,['Cascade Lake'],1360.0,100,10.85126,2.63909
3733,australia-southeast1,a2,a2-megagpu-16g,1,-1,-1,1,96,1,['Cascade Lake'],1360.0,100,0.0,0.0
3734,australia-southeast2,a2,a2-megagpu-16g,1,-1,-1,1,96,1,['Cascade Lake'],1360.0,100,0.0,0.0
3735,australia,a2,a2-megagpu-16g,1,-1,-1,1,96,1,['Cascade Lake'],1360.0,100,0.0,0.0
3736,southamerica-east1,a2,a2-megagpu-16g,1,-1,-1,1,96,1,['Cascade Lake'],1360.0,100,0.0,0.0
3737,asia-south1,a2,a2-megagpu-16g,1,-1,-1,1,96,1,['Cascade Lake'],1360.0,100,0.0,0.0
3738,asia-southeast2,a2,a2-megagpu-16g,1,-1,-1,1,96,1,['Cascade Lake'],1360.0,100,0.0,0.0
3739,asia-south2,a2,a2-megagpu-16g,1,-1,-1,1,96,1,['Cascade Lake'],1360.0,100,0.0,0.0


In [23]:
gcp_compute_data.cpu.unique()

array(["['N/A']",
       "['Skylake', 'Broadwell', 'Haswell', 'Sandy Bridge', 'Ivy Bridge']",
       "['Cascade Lake']", "['AMD EPYC Rome']",
       "['Skylake', 'Broadwell', 'Haswell', 'AMD EPYC Rome (coming soon)']",
       "['Skylake', 'Broadwell E5']", "['Cascade Lake', 'Broadwell E7']"],
      dtype=object)

In [18]:
# Work on a copy so that columns added to df_gcp do not touch the frame loaded from CSV.
df_gcp = gcp_compute_data.copy()

In [34]:
df_gcp.family.unique()

array(['f1', 'g1', 'n1', 'n2', 'n2d', 'e2', 'c2', 'm1', 'm2', 'a2'],
      dtype=object)

#### Clock speed
Based on: https://cloud.google.com/compute/docs/cpu-platforms
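TODO: compute the clock speed for every CPU platform in the GCP notebook and merge the result here; the cells below only hard-code a default value and the C2 / Cascade Lake case.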

In [27]:
# Default clock value written to every GCP row (presumably GHz - confirm against
# the CPU platforms page linked above). A later cell overrides specific rows.
DEFAULT_GCP_CLOCK = 2.2
df_gcp['clock'] = DEFAULT_GCP_CLOCK

In [44]:
df_gcp[(df_gcp.cpu == "['Cascade Lake']") & (df_gcp.family == 'c2')]

Unnamed: 0,region,family,vm type,regional_disk,sole_tenant,nested_virtualization,local_ssd,cores,gpu,cpu,memory,network_egress,on demand price,preemptible price,clock
3196,us,c2,c2-standard-4,0,-1,-1,1,4,0,['Cascade Lake'],16.0,10,0.20873,0.05052,2.8
3197,us-central1,c2,c2-standard-4,0,-1,-1,1,4,0,['Cascade Lake'],16.0,10,0.20873,0.05052,2.8
3198,us-east1,c2,c2-standard-4,0,-1,-1,1,4,0,['Cascade Lake'],16.0,10,0.20881,0.05052,2.8
3199,us-east4,c2,c2-standard-4,0,-1,-1,1,4,0,['Cascade Lake'],16.0,10,0.23516,0.05406,2.8
3200,us-west4,c2,c2-standard-4,0,-1,-1,1,4,0,['Cascade Lake'],16.0,10,0.23516,0.05406,2.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3361,australia,c2,c2-standard-60,0,-1,-1,1,60,0,['Cascade Lake'],240.0,32,4.44408,1.02168,2.8
3362,southamerica-east1,c2,c2-standard-60,0,-1,-1,1,60,0,['Cascade Lake'],240.0,32,4.9716,1.1433,2.8
3363,asia-south1,c2,c2-standard-60,0,-1,-1,1,60,0,['Cascade Lake'],240.0,32,3.76188,0.9117,2.8
3364,asia-southeast2,c2,c2-standard-60,0,-1,-1,1,60,0,['Cascade Lake'],240.0,32,1.46967,0.96805,2.8


In [45]:
# Override the clock for C2-family rows whose cpu list is exactly Cascade Lake.
# NOTE(review): the saved outputs next to this cell still show 2.8 for these rows,
# so they come from an earlier run; re-run the notebook to refresh them.
is_c2_cascade_lake = (df_gcp['cpu'] == "['Cascade Lake']") & (df_gcp['family'] == 'c2')
df_gcp.loc[is_c2_cascade_lake, 'clock'] = 3.1

In [29]:
df_gcp.clock.unique()

array([2.2, 2.8])

## Map Schemas

In [None]:
# AWS is the smaller data set, so it serves as the base for the merged frame.
# The `os` column is dropped (only Windows rows remain) and every row is tagged
# with its provider.
df_aws = aws_compute_data_windows.drop(columns=['os']).assign(provider='AWS')

In [None]:
df_aws.columns

In [None]:
azure_compute_data.columns

In [None]:
# Azure columns that have a counterpart in the trimmed AWS frame.
azure_source_columns = [
    'Machine Image', 'location', 'armSkuName', 'Purpose', 'vCPU(s)',
    'Clock', 'RAM', 'Temporary storage', 'unitPrice', 'Single Customer',
]
df_azure = azure_compute_data[azure_source_columns]

In [None]:
# Azure column name -> AWS column name, so both frames share one schema.
azure_to_aws_names = {
    'Machine Image': 'sku',
    'armSkuName': 'instance_type',
    'Purpose': 'instance_family',
    'vCPU(s)': 'vcpu',
    'Clock': 'clock_speed',
    'RAM': 'memory',
    'Temporary storage': 'storage',
    'Single Customer': 'tenancy',
    'unitPrice': 'price',
}
# Rename, then tag every row with its provider.
df_azure = df_azure.rename(columns=azure_to_aws_names).assign(provider='Azure')

In [None]:
df_azure

## Align Values

In [None]:
def compare_col_vals(df1, df2, col_name):
    """Print each distinct value of ``df1[col_name]`` that has no match in ``df2[col_name]``.

    A value from ``df1`` counts as matched when some distinct value from ``df2``
    is contained in it as a substring (both strings), e.g. ``'General'`` in
    ``df2`` matches ``'General purpose'`` in ``df1``. Any pair involving a
    non-string value (numbers, booleans, NaN/None) is compared by equality
    instead, so numeric columns and columns with missing values do not raise
    ``TypeError``.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames that both contain ``col_name``.
    col_name : str
        Name of the column whose distinct values are compared.

    Returns
    -------
    None
        One ``"<value> mismatched"`` line is printed per unmatched ``df1`` value.
    """
    def _matches(candidate, item):
        # Substring containment only makes sense for a str/str pair.
        if isinstance(candidate, str) and isinstance(item, str):
            return candidate in item
        try:
            return bool(candidate == item)
        except (TypeError, ValueError):
            # Comparisons with an ambiguous truth value (e.g. pd.NA) are not a match.
            return False

    d1_u = df1[col_name].unique()
    d2_u = df2[col_name].unique()

    for item in d1_u:
        if not any(_matches(x, item) for x in d2_u):
            print("{} mismatched".format(item))


### Instance Family

In [None]:
aws_instances = df_aws['instance_family'].unique()
aws_instances

In [None]:
# Drop AWS rows that have no instance_family (presumably so the value
# comparison in the next cells only sees real labels - confirm).
df_aws = df_aws.dropna(subset=['instance_family'])

In [None]:
azure_instances = df_azure['instance_family'].unique()
azure_instances

In [None]:
compare_col_vals(df_aws,df_azure,'instance_family')

In [None]:
# Rows with a missing Azure instance_family are not handled here; the earlier
# attempt is kept below, disabled, together with its original remark.
#df_azure[df_azure['instance_family'].isnull()] = 'General purpose' # need to reload data

# Old label -> shared label, per provider.
aws_family_renames = {
    'FPGA Instances': 'High Performance',
    'GPU instance': 'GPU optimized',
}
azure_family_renames = {
    'GPU Optimized': 'GPU optimized',
    'Storage Optimized': 'Storage optimized',
    'General': 'General purpose',
}

for old_label, new_label in aws_family_renames.items():
    df_aws.loc[df_aws['instance_family'] == old_label, 'instance_family'] = new_label

for old_label, new_label in azure_family_renames.items():
    df_azure.loc[df_azure['instance_family'] == old_label, 'instance_family'] = new_label

# AWS 'Micro instances' rows are removed from the comparison.
not_micro = df_aws['instance_family'] != 'Micro instances'
df_aws = df_aws[not_micro]

In [None]:
compare_col_vals(df_aws,df_azure,'instance_family')

### vcpu

In [None]:
#compare_col_vals(df_aws, df_azure,'vcpu')

col = 'vcpu'
aws_u = df_aws[col].unique()
aws_u

In [None]:
azu_u = df_azure[col].unique()
azu_u

### clock_speed

In [None]:
#compare_col_vals(df_aws,df_azure,'clock_speed')

In [None]:
col = 'clock_speed'
aws_u = df_aws[col].unique()
aws_u

In [None]:
azu_u = df_azure[col].unique()
azu_u

### memory

In [None]:
compare_col_vals(df_aws,df_azure,'memory')

In [None]:
col = 'memory'
aws_u = df_aws[col].unique()
aws_u

In [None]:
azu_u = df_azure[col].unique()
azu_u

### storage

In [None]:
col = 'storage'
aws_u = df_aws[col].unique()
aws_u

In [None]:
azu_u = df_azure[col].unique()
azu_u

### tenancy

In [None]:
col = 'tenancy'
aws_u = df_aws[col].unique()
aws_u

In [None]:
azu_u = df_azure[col].unique()
azu_u

In [None]:
# Turn Azure's True/False tenancy flag (renamed from 'Single Customer') into
# the string labels 'Dedicated' / 'Shared'.
# The column is named explicitly instead of being read from the notebook-global
# `col`: if this cell were re-run after `col` had been rebound (e.g. to 'price'),
# values equal to 0 or 1 in that other column would be overwritten, because
# 0 == False and 1 == True.
# Casting to object first avoids writing strings into a bool-typed column, which
# newer pandas versions deprecate; it is a no-op if the column is already object.
df_azure['tenancy'] = df_azure['tenancy'].astype(object)
df_azure.loc[df_azure['tenancy'] == False, 'tenancy'] = 'Shared'
df_azure.loc[df_azure['tenancy'] == True, 'tenancy'] = 'Dedicated'

In [None]:
azu_u = df_azure[col].unique()
azu_u

In [None]:
df_azure[df_azure[col]=='Dedicated']

### price

In [None]:
col = 'price'
aws_u = df_aws[col].unique()
aws_u

In [None]:
azu_u = df_azure[col].unique()
azu_u

In [None]:
# Good enough..

## Merge AWS and Azure dataframes

In [None]:
# Stack the AWS rows on top of the Azure rows and renumber the index from 0.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0; `pd.concat`
# is the supported equivalent, and `ignore_index=True` replaces the separate
# in-place `reset_index(drop=True)` call.
df_merged = pd.concat([df_aws, df_azure], ignore_index=True)

In [None]:
df_merged

In [None]:
# Persist the merged frame as an xz-compressed pickle under ./data.
df_merged.to_pickle('./data/all.merged.compute.xz.pkl', compression='xz')

In [None]:
# Also export the merged frame as CSV. With the default `index=True`, the row
# index is written as an unnamed first column.
df_merged.to_csv('./data/all.merged.csv')