In [1]:
import pandas as pd
import numpy as np

In [2]:
# Azure
azure_compute_src = './data/azure.merged.compute.xz.pkl' 
azure_compute_data = pd.read_pickle(azure_compute_src, compression="xz")

In [3]:
# AWS
aws_compute_src = './data/aws.src.compute.xz.pkl' 
aws_compute_data = pd.read_pickle(aws_compute_src, compression="xz")

In [4]:
# GCP
gcp_compute_src = './data/gcp.compute.xz.pkl'
df_gcp = pd.read_pickle(gcp_compute_src, compression="xz")
df_gcp['provider'] = 'GCP'

## Inspect AWS

In [5]:
aws_compute_data.columns

Index(['sku', 'offercode', 'location', 'instance_type', 'instance_family',
       'vcpu', 'clock_speed', 'memory', 'storage', 'network_performance',
       'processor_architecture', 'tenancy', 'os', 'usagetype', 'operation',
       'pd_key', 'price_description', 'unit', 'price'],
      dtype='object')

In [6]:
aws_compute_data['price_description']

0         $0.00 per Dedicated Reservation RHEL m4.10xlar...
1         $0.00 per RHEL with HA and SQL Enterprise i3.8...
2         $0.00 per Reservation RHEL c5a.large Instance ...
3         $7.293 per On Demand Linux with SQL Web inf1.2...
4         $7.234 per Unused Reservation RHEL with SQL We...
                                ...                        
505796    $2.046 per Unused Reservation Windows with SQL...
505797    $0.00 per Dedicated Reservation RHEL with HA a...
505798    $16.704 per On Demand Windows with SQL Server ...
505799    $0.1784 per Unused Reservation RHEL r6g.large ...
505800    $0.511 per Dedicated SUSE i3.xlarge Instance Hour
Name: price_description, Length: 505801, dtype: object

In [7]:
aws_compute_data_trimmed = aws_compute_data.drop(columns=['sku','offercode','processor_architecture','usagetype','operation',
                                                          'pd_key','price_description','unit','network_performance'])

In [8]:
aws_compute_data_trimmed.columns

Index(['location', 'instance_type', 'instance_family', 'vcpu', 'clock_speed',
       'memory', 'storage', 'tenancy', 'os', 'price'],
      dtype='object')

In [9]:
# windows only
aws_compute_data_windows = aws_compute_data_trimmed[aws_compute_data_trimmed['os'] == 'Windows']

In [10]:
aws_compute_data_windows.head()

Unnamed: 0,location,instance_type,instance_family,vcpu,clock_speed,memory,storage,tenancy,os,price
7,Asia Pacific (Osaka),r3.8xlarge,Memory optimized,32.0,2.5 GHz,244 GiB,2 x 320 SSD,Host,Windows,0.0
8,EU (Ireland),r5ad.2xlarge,Memory optimized,8.0,2.5 GHz,64 GiB,1 x 300 NVMe SSD,Shared,Windows,0.0
11,US West (Oregon),g4dn.xlarge,GPU instance,4.0,2.5 GHz,16 GiB,125 GB NVMe SSD,Host,Windows,0.0
18,EU (Ireland),g4ad.4xlarge,GPU instance,16.0,2.8 GHz,64 GiB,600 GB NVMe SSD,Shared,Windows,1.704
19,EU (Ireland),m1.large,General purpose,2.0,,7.5 GiB,2 x 420 SSD,Shared,Windows,0.19


## Map Schemas

In [11]:
# Since a smaller set of data, I'm going to use AWS as the base
df_aws = aws_compute_data_windows.drop(columns=['os'])
df_aws['provider'] = 'AWS'

In [12]:
df_aws.columns

Index(['location', 'instance_type', 'instance_family', 'vcpu', 'clock_speed',
       'memory', 'storage', 'tenancy', 'price', 'provider'],
      dtype='object')

In [13]:
azure_compute_data.columns

Index(['currencyCode', 'tierMinimumUnits', 'retailPrice', 'unitPrice',
       'armRegionName', 'location', 'effectiveStartDate', 'meterId',
       'meterName', 'productId', 'skuId', 'productName', 'skuName',
       'serviceName', 'serviceId', 'serviceFamily', 'unitOfMeasure', 'type',
       'isPrimaryMeterRegion', 'armSkuName', 'effectiveEndDate',
       'reservationTerm', 'Instance', 'vCPU(s)', 'RAM', 'Temporary storage',
       'Supports Premium SSD Storage', 'Clock', 'Burst', 'Purpose',
       'Single Customer', 'Constrained CPU', 'NVMe Disk', 'GPU', 'GPU Model',
       'NVIDIA GRID Supported', 'Machine Image'],
      dtype='object')

In [14]:
df_azure = azure_compute_data[['location','armSkuName','Purpose','vCPU(s)','Clock',
                                     'RAM','Temporary storage','unitPrice','Single Customer']]

In [15]:
df_azure = df_azure.rename(columns={'armSkuName':'instance_type',
                                                'Purpose':'instance_family','vCPU(s)':'vcpu','Clock':'clock_speed',
                                               'RAM':'memory','Temporary storage':'storage',
                                               'Single Customer':'tenancy', 'unitPrice':'price'})
df_azure['provider'] = 'Azure'

In [16]:
df_azure

Unnamed: 0,location,instance_type,instance_family,vcpu,clock_speed,memory,storage,price,tenancy,provider
0,US North Central,Standard_DS14_v2_Promo,Memory,16,2.1,112 GiB,224 GiB,1.482,False,Azure
1,NO West,Standard_DS14_v2,Memory,16,2.1,112 GiB,224 GiB,2.171,False,Azure
2,NO West,Standard_DS14_v2,Memory,16,2.1,112 GiB,224 GiB,2.907,False,Azure
3,CH North,Standard_DS14_v2,Memory,16,2.1,112 GiB,224 GiB,1.824,False,Azure
4,US Gov AZ,Standard_DS14_v2,Memory,16,2.1,112 GiB,224 GiB,1.491,False,Azure
...,...,...,...,...,...,...,...,...,...,...
39219,US West 2,Standard_ND40rs_v2,GPU Optimized,40,,672 GiB,"2,900 GiB",23.872,False,Azure
39220,US West 2,Standard_ND40rs_v2,GPU Optimized,40,,672 GiB,"2,900 GiB",22.032,False,Azure
39221,US South Central,Standard_ND40rs_v2,GPU Optimized,40,,672 GiB,"2,900 GiB",28.278,False,Azure
39222,US South Central,Standard_ND40rs_v2,GPU Optimized,40,,672 GiB,"2,900 GiB",26.438,False,Azure


## Align Values

In [17]:
def compare_col_vals(df1, df2, col_name):
    d1_u = df1[col_name].unique()
    d2_u = df2[col_name].unique()
    
    for item in d1_u:
        if not any([x in item for x in d2_u]):
            print ("{} mismatched".format(item))


### Instance Family

In [18]:
aws_instances = df_aws['instance_family'].unique()
aws_instances

array(['Memory optimized', 'GPU instance', 'General purpose',
       'Compute optimized', 'Storage optimized', nan, 'Micro instances',
       'FPGA Instances'], dtype=object)

In [19]:
df_aws = df_aws.dropna(subset=['instance_family'])

In [20]:
azure_instances = df_azure['instance_family'].unique()
azure_instances

array(['Memory', 'General', 'High Performance', 'Compute',
       'GPU Optimized', 'Storage Optimized'], dtype=object)

In [21]:
compare_col_vals(df_aws,df_azure,'instance_family')

GPU instance mismatched
Storage optimized mismatched
Micro instances mismatched
FPGA Instances mismatched


In [22]:
#df_azure[df_azure['instance_family'].isnull()] = 'General purpose' # need to reload data
df_aws.loc[df_aws['instance_family']     == 'FPGA Instances'   , 'instance_family'] = 'High Performance'
df_aws.loc[df_aws['instance_family']     == 'GPU instance'     , 'instance_family'] = 'GPU optimized'
df_azure.loc[df_azure['instance_family'] == 'Memory'           , 'instance_family'] = 'Memory optimized'
df_azure.loc[df_azure['instance_family'] == 'GPU Optimized'    , 'instance_family'] = 'GPU optimized'
df_azure.loc[df_azure['instance_family'] == 'Storage Optimized', 'instance_family'] = 'Storage optimized'
df_azure.loc[df_azure['instance_family'] == 'General'          , 'instance_family'] = 'General purpose'
df_azure.loc[df_azure['instance_family'] == 'Compute'          , 'instance_family'] = 'Compute optimized'

df_aws = df_aws[df_aws['instance_family'] != 'Micro instances']

In [23]:
compare_col_vals(df_aws,df_azure,'instance_family')

In [24]:
compare_col_vals(df_azure, df_aws,'instance_family')

In [25]:
compare_col_vals(df_gcp, df_azure,'instance_family')

Shared core mismatched


In [26]:
compare_col_vals(df_azure, df_gcp,'instance_family')

High Performance mismatched
Storage optimized mismatched


### vcpu

In [27]:
#compare_col_vals(df_aws, df_azure,'vcpu')

col = 'vcpu'
aws_u = df_aws[col].unique()
aws_u

array([ 32.,   8.,   4.,  16.,   2.,  96.,  48.,  40.,  64.,   1.,  12.,
        72.,  36., 448., 128.,  24., 224.])

In [28]:
azu_u = df_azure[col].unique()
azu_u

array(['16', '64', '2', '8', '4 / 16', '8 / 16', '48', '32', '8 / 32',
       '16 / 32', '4 / 8', '2 / 8', '72', '416', '16 / 64', '32 / 64',
       '20', '2 / 4', '4', '1 / 2', '24', '1', '208', '96', '128',
       '1 / 4', '120', '12', '80', '64 / 128', '32 / 128', '6', '44',
       '40'], dtype=object)

In [29]:
df_gcp[col].unique()

array([0.25, 0.5, '16', '2', '32', '4', '64', '8', '96', '1', '48', '80',
       '128', '224', 1, '30', '60', '40', '160', '208', '416', '12', '24'],
      dtype=object)

### clock_speed

In [30]:
#compare_col_vals(df_aws,df_azure,'clock_speed')

In [31]:
col = 'clock_speed'
aws_u = df_aws[col].unique()
aws_u

array(['2.5 GHz', '2.8 GHz', nan, '3.1 GHz', '2.3 GHz', '2.4 GHz',
       '3 GHz', '4 GHz', 'Up to 3.3 GHz', '4.5 GHz', '2.9 GHz', '2 GHz',
       '2.6 GHz'], dtype=object)

In [32]:
azu_u = df_azure[col].unique()
azu_u

array([2.1 , 2.5 , 2.  ,  nan, 3.2 , 2.6 , 2.35, 3.7 , 2.55])

In [33]:
df_gcp[col].unique()

array([nan, 2. , 2.8, 3.1, 2.5, 2.2])

### memory

In [34]:
compare_col_vals(df_aws,df_azure,'memory')

7.5 GiB mismatched
61 GiB mismatched
30.5 GiB mismatched
15.25 GiB mismatched
21 GiB mismatched
15 GiB mismatched
5.25 GiB mismatched
3.75 GiB mismatched
30 GiB mismatched
976 GiB mismatched
10.5 GiB mismatched
60 GiB mismatched
1 GiB mismatched
0.5 GiB mismatched
60.5 GiB mismatched
17.1 GiB mismatched
24576 GiB mismatched


In [35]:
col = 'memory'
aws_u = df_aws[col].unique()
aws_u

array(['244 GiB', '64 GiB', '16 GiB', '7.5 GiB', '192 GiB', '256 GiB',
       '128 GiB', '384 GiB', '68.4 GiB', '61 GiB', '30.5 GiB', '8 GiB',
       '160 GiB', '15.25 GiB', '122 GiB', '512 GiB', '32 GiB', '1.7 GiB',
       '96 GiB', '4 GiB', '144 GiB', '768 GiB', '732 GiB', '21 GiB',
       '15 GiB', '5.25 GiB', '1952 GiB', '3.75 GiB', '488 GiB',
       '9216 GiB', '48 GiB', '72 GiB', '3904 GiB', '30 GiB', '976 GiB',
       '10.5 GiB', '12288 GiB', '2 GiB', '60 GiB', '1 GiB', '117 GiB',
       '0.5 GiB', '42 GiB', '6144 GiB', '7 GiB', '60.5 GiB', '17.1 GiB',
       '34.2 GiB', '18432 GiB', '24576 GiB'], dtype=object)

In [36]:
azu_u = df_azure[col].unique()
azu_u

array(['112 GiB', '256 GiB', '504 GiB', '28 GiB', '64 GiB', '437.5 GiB',
       '96 GiB', '56 GiB', '16 GiB', '144 GiB', '128 GiB', '192 GiB',
       '5,700 GiB', '432 GiB', '218.75 GiB', '160 GiB', '32 GiB',
       '14 GiB', '4 GiB', '8 GiB', '224 GiB', '2 GiB', '140 GiB',
       '2,850 GiB', '1,750 GiB', '512 GiB', '672 GiB', '384 GiB',
       '2,000 GiB', '7 GiB', '456 GiB', '448 GiB', '11,400 GiB',
       '875 GiB', '1,000 GiB', '110 GiB', '3,800 GiB', '3.5 GiB',
       '440 GiB', '640 GiB', '352 GiB', '900 GiB'], dtype=object)

In [37]:
df_gcp[col].unique()

array([6.0000e-01, 1.7000e+00, 1.4400e+01, 1.8000e+00, 2.8800e+01,
       3.6000e+00, 5.7600e+01, 7.2000e+00, 8.6400e+01, 1.0400e+02,
       1.3000e+01, 2.0800e+02, 2.6000e+01, 4.1600e+02, 5.2000e+01,
       6.2400e+02, 3.7500e+00, 6.0000e+01, 7.5000e+00, 1.2000e+02,
       1.5000e+01, 2.4000e+02, 3.0000e+01, 3.6000e+02, 8.0000e+00,
       1.6000e+01, 3.2000e+01, 6.4000e+01, 1.2800e+02, 1.9200e+02,
       2.5600e+02, 3.2000e+02, 3.8400e+02, 5.1200e+02, 6.4000e+02,
       2.0000e+00, 4.0000e+00, 4.8000e+01, 8.0000e+01, 8.9600e+02,
       7.6800e+02, 9.6000e+01, 2.2400e+02, 1.0000e+00, 9.6100e+02,
       1.9220e+03, 3.8440e+03, 1.4336e+03, 5.8880e+03, 1.1776e+04,
       8.5000e+01, 1.7000e+02, 3.4000e+02, 6.8000e+02, 1.3600e+03])

In [38]:
df_gcp_str = df_gcp.copy()
df_gcp_str[col] = df_gcp[col].astype(str) + ' GiB'
df_gcp_str[col] = df_gcp_str[col].str.replace('.0', ' ')

  This is separate from the ipykernel package so we can avoid doing imports until


In [39]:
df_gcp_str[col].unique()

array(['0.6 GiB', '1.7 GiB', '14.4 GiB', '1.8 GiB', '28.8 GiB', '3.6 GiB',
       '57.6 GiB', '7.2 GiB', '86.4 GiB', ' 4  GiB', '13  GiB', ' 8  GiB',
       '26  GiB', '416  GiB', '52  GiB', '624  GiB', '3.75 GiB', '   GiB',
       '7.5 GiB', '1   GiB', '15  GiB', '2   GiB', '3   GiB', '8  GiB',
       '16  GiB', '32  GiB', '64  GiB', '128  GiB', '192  GiB',
       '256  GiB', '384  GiB', '512  GiB', '6   GiB', '2  GiB', '4  GiB',
       '48  GiB', '896  GiB', '768  GiB', '96  GiB', '224  GiB', '1  GiB',
       '961  GiB', '1922  GiB', '3844  GiB', '1433.6 GiB', '5888  GiB',
       '11776  GiB', '85  GiB', '13   GiB'], dtype=object)

### storage

In [40]:
col = 'storage'
aws_u = df_aws[col].unique()
aws_u

array(['2 x 320 SSD', '1 x 300 NVMe SSD', '125 GB NVMe SSD',
       '600 GB NVMe SSD', '2 x 420 SSD', '2 x 1900 NVMe SSD',
       '2 x 600 NVMe SSD', 'EBS only', '2 x 900 NVMe SSD', '2 x 840 SSD',
       '1 x 80 SSD', '4 x 600 NVMe SSD', '4 x 900 NVMe SSD',
       '1 x 2500 NVMe SSD', '1 x 160 SSD', '1 x 7500 NVMe SSD',
       '4 x 1900 NVMe SSD', '1 x 1250 NVMe SSD', '1 x 950 NVMe SSD',
       '1 x 150 NVMe SSD', '2 x 2500 NVMe SSD', '2 x 800 SSD',
       '24 x 2000 HDD', '2 x 40 SSD', '1 x 50 NVMe SSD', '1 x 1920 SSD',
       '2 x 80 SSD', '8 x 1900 NVMe SSD', '2 x 300 NVMe SSD',
       '1 x 400 NVMe SSD', '1 x 900 NVMe SSD', '2 x 1920 SSD',
       '1 x 960 SSD', '8 x 7500 NVMe SSD', '225 GB NVMe SSD',
       '1 x 480 SSD', '1 x 475 NVMe SSD', '1 x 1900 NVMe SSD',
       '2 x 900 GB NVMe SSD', '2 x 160 SSD', '2 x 7500 NVMe SSD',
       '1 x 75 NVMe SSD', '1 x 200 NVMe SSD', '4 x 7500 NVMe SSD',
       '900 GB NVMe SSD', '1 x 100 NVMe SSD', '1 x 450 NVMe SSD',
       '3 x 2000 HDD', '

In [41]:
azu_u = df_azure[col].unique()
azu_u

array(['224 GiB', '800 GiB', nan, '384 GiB', '80 GiB', '256 GiB',
       '512 GiB', '1,000 GiB', '112 GiB', '400 GiB', '1,200 GiB',
       '576 GiB', '8,192 GiB', '32 GiB', '50 GiB', '360 GiB', '2,000 GiB',
       '864 GiB', '1,600 GiB', '600 GiB', '64 GiB', '100 GiB', '28 GiB',
       '8 GiB', '16 GiB', '1,440 GiB', '75 GiB', '10 GiB', '280 GiB',
       '200 GiB', '128 GiB', '750 GiB', '4,096 GiB', '150 GiB',
       '1,800 GiB', '2,048 GiB', '40 GiB', '2,400 GiB', '300 GiB',
       '20 GiB', '768 GiB', '56 GiB', '88 GiB', '900 GiB', '678 GiB',
       '2,948 GiB', '672 GiB', '480 GiB', '320 GiB', '14 GiB', '4 GiB',
       '1,024 GiB', '1,344 GiB', '352 GiB', '1,536 GiB', '500 GiB',
       '680 GiB', '700 GiB', '5,630 GiB', '7 GiB', '160 GiB', '640 GiB',
       '1,474 GiB', '736 GiB', '2,807 GiB', '336 GiB', '340 GiB',
       '2,880 GiB', '6,144 GiB', '1,388 GiB', '176 GiB', '3,072 GiB',
       '180 GiB', '6,500 GiB', '2,900 GiB'], dtype=object)

In [42]:
df_gcp_storage = df_gcp_str.copy()
df_gcp_storage[col] = np.nan
df_gcp_storage[col].unique()

array([nan])

### tenancy

In [43]:
col = 'tenancy'
aws_u = df_aws[col].unique()
aws_u

array(['Host', 'Shared', 'Dedicated'], dtype=object)

In [44]:
azu_u = df_azure[col].unique()
azu_u

array([False,  True])

In [45]:
df_azure.loc[df_azure[col]==False, col] = 'Shared'
df_azure.loc[df_azure[col]==True, col] = 'Dedicated'

In [46]:
azu_u = df_azure[col].unique()
azu_u

array(['Shared', 'Dedicated'], dtype=object)

In [47]:
df_azure[df_azure[col]=='Dedicated']

Unnamed: 0,location,instance_type,instance_family,vcpu,clock_speed,memory,storage,price,tenancy,provider
863,EU West,Standard_G1,Memory optimized,2,2.0,28 GiB,384 GiB,0.7700,Dedicated,Azure
864,EU West,Standard_G1,Memory optimized,2,2.0,28 GiB,384 GiB,0.7000,Dedicated,Azure
865,US Gov TX,Standard_G1,Memory optimized,2,2.0,28 GiB,384 GiB,0.7625,Dedicated,Azure
866,US West Central,Standard_G1,Memory optimized,2,2.0,28 GiB,384 GiB,0.6100,Dedicated,Azure
867,US West Central,Standard_G1,Memory optimized,2,2.0,28 GiB,384 GiB,0.5500,Dedicated,Azure
...,...,...,...,...,...,...,...,...,...,...
38813,US West 2,Standard_G4,Memory optimized,16,2.0,224 GiB,"3,072 GiB",3.9220,Dedicated,Azure
38814,US West 2,Standard_G4,Memory optimized,16,2.0,224 GiB,"3,072 GiB",4.4750,Dedicated,Azure
38815,US Gov Virginia,Standard_G4,Memory optimized,16,2.0,224 GiB,"3,072 GiB",4.4000,Dedicated,Azure
38816,JA East,Standard_G4,Memory optimized,16,2.0,224 GiB,"3,072 GiB",4.6020,Dedicated,Azure


In [48]:
df_azure

Unnamed: 0,location,instance_type,instance_family,vcpu,clock_speed,memory,storage,price,tenancy,provider
0,US North Central,Standard_DS14_v2_Promo,Memory optimized,16,2.1,112 GiB,224 GiB,1.482,Shared,Azure
1,NO West,Standard_DS14_v2,Memory optimized,16,2.1,112 GiB,224 GiB,2.171,Shared,Azure
2,NO West,Standard_DS14_v2,Memory optimized,16,2.1,112 GiB,224 GiB,2.907,Shared,Azure
3,CH North,Standard_DS14_v2,Memory optimized,16,2.1,112 GiB,224 GiB,1.824,Shared,Azure
4,US Gov AZ,Standard_DS14_v2,Memory optimized,16,2.1,112 GiB,224 GiB,1.491,Shared,Azure
...,...,...,...,...,...,...,...,...,...,...
39219,US West 2,Standard_ND40rs_v2,GPU optimized,40,,672 GiB,"2,900 GiB",23.872,Shared,Azure
39220,US West 2,Standard_ND40rs_v2,GPU optimized,40,,672 GiB,"2,900 GiB",22.032,Shared,Azure
39221,US South Central,Standard_ND40rs_v2,GPU optimized,40,,672 GiB,"2,900 GiB",28.278,Shared,Azure
39222,US South Central,Standard_ND40rs_v2,GPU optimized,40,,672 GiB,"2,900 GiB",26.438,Shared,Azure


### price

In [49]:
col = 'price'
aws_u = df_aws[col].unique()
aws_u

array([ 0.   ,  1.704,  0.19 , ..., 27.289,  0.053, 15.292])

In [50]:
azu_u = df_azure[col].unique()
azu_u

array([ 1.482,  2.171,  2.907, ..., 29.366, 27.526, 28.278])

In [51]:
df_gcp_price = df_gcp_storage.rename(columns={'on demand price':'price'})
df_gcp_price[col].unique()

array([7.600000e-03, 8.600000e-03, 9.100000e-03, ..., 8.796980e+00,
       9.684030e+00, 1.085126e+01], dtype=float32)

In [52]:
df_gcp_price_drop = df_gcp_price.drop(columns=['local_ssd'])

In [53]:
# Good enough..

## Merge AWS and Azure dataframes

In [54]:
df_merged =  df_aws.append(df_azure)
df_merged.reset_index(drop=True, inplace=True)

## Merge GCP

In [55]:
df_full_merge =  df_merged.append(df_gcp_price_drop)
df_full_merge.reset_index(drop=True, inplace=True)

In [56]:
df_full_merge

Unnamed: 0,location,instance_type,instance_family,vcpu,clock_speed,memory,storage,tenancy,price,provider
0,Asia Pacific (Osaka),r3.8xlarge,Memory optimized,32.0,2.5 GHz,244 GiB,2 x 320 SSD,Host,0.00000,AWS
1,EU (Ireland),r5ad.2xlarge,Memory optimized,8.0,2.5 GHz,64 GiB,1 x 300 NVMe SSD,Shared,0.00000,AWS
2,US West (Oregon),g4dn.xlarge,GPU optimized,4.0,2.5 GHz,16 GiB,125 GB NVMe SSD,Host,0.00000,AWS
3,EU (Ireland),g4ad.4xlarge,GPU optimized,16.0,2.8 GHz,64 GiB,600 GB NVMe SSD,Shared,1.70400,AWS
4,EU (Ireland),m1.large,General purpose,2.0,,7.5 GiB,2 x 420 SSD,Shared,0.19000,AWS
...,...,...,...,...,...,...,...,...,...,...
161266,asia-southeast1,a2-highgpu-8g,GPU optimized,96,2.2,6 GiB,,Shared,7.29758,GCP
161267,us-central1,a2-megagpu-16g,GPU optimized,96,2.2,13 GiB,,Shared,8.79698,GCP
161268,europe-west4,a2-megagpu-16g,GPU optimized,96,2.2,13 GiB,,Shared,9.68403,GCP
161269,asia-southeast,a2-megagpu-16g,GPU optimized,96,2.2,13 GiB,,Shared,10.85126,GCP


In [57]:
df_full_merge.to_pickle('./data/all.merged.compute.xz.pkl', compression='xz')

In [58]:
df_full_merge.to_csv('./data/all.merged.compute.csv')

In [60]:
df_full_merge.sort_values(by='price',ascending=False).head(10)


Unnamed: 0,location,instance_type,instance_family,vcpu,clock_speed,memory,storage,tenancy,price,provider
109412,EU (Frankfurt),u-12tb1.112xlarge,Memory optimized,448.0,,12288 GiB,EBS only,Dedicated,320.341,AWS
104461,Asia Pacific (Singapore),u-12tb1.112xlarge,Memory optimized,448.0,,12288 GiB,EBS only,Shared,320.341,AWS
56778,EU (Frankfurt),u-12tb1.112xlarge,Memory optimized,448.0,,12288 GiB,EBS only,Dedicated,320.341,AWS
106281,Asia Pacific (Singapore),u-12tb1.112xlarge,Memory optimized,448.0,,12288 GiB,EBS only,Dedicated,320.341,AWS
39518,EU (Frankfurt),u-12tb1.112xlarge,Memory optimized,448.0,,12288 GiB,EBS only,Shared,320.341,AWS
94454,Asia Pacific (Singapore),u-12tb1.112xlarge,Memory optimized,448.0,,12288 GiB,EBS only,Dedicated,320.341,AWS
109822,Asia Pacific (Singapore),u-12tb1.112xlarge,Memory optimized,448.0,,12288 GiB,EBS only,Shared,320.341,AWS
32528,EU (Frankfurt),u-12tb1.112xlarge,Memory optimized,448.0,,12288 GiB,EBS only,Shared,320.341,AWS
104457,AWS GovCloud (US-West),u-12tb1.112xlarge,Memory optimized,448.0,,12288 GiB,EBS only,Shared,319.475,AWS
102687,AWS GovCloud (US-West),u-12tb1.112xlarge,Memory optimized,448.0,,12288 GiB,EBS only,Shared,319.475,AWS


In [61]:
df_azure.sort_values(by='price',ascending=False).head(10)

Unnamed: 0,location,instance_type,instance_family,vcpu,clock_speed,memory,storage,price,tenancy,provider
22914,BR South,Standard_M416ms_v2,Memory optimized,416,2.5,"11,400 GiB","8,192 GiB",212.48,Shared,Azure
22865,BR South,Standard_M416ms_v2,Memory optimized,416,2.5,"11,400 GiB","8,192 GiB",193.34,Shared,Azure
22915,BR South,Standard_M416ms_v2,Memory optimized,416,2.5,"11,400 GiB","8,192 GiB",193.34,Shared,Azure
22873,AE Central,Standard_M416ms_v2,Memory optimized,416,2.5,"11,400 GiB","8,192 GiB",173.81,Shared,Azure
22866,AP East,Standard_M416ms_v2,Memory optimized,416,2.5,"11,400 GiB","8,192 GiB",172.82,Shared,Azure
22889,JA West,Standard_M416ms_v2,Memory optimized,416,2.5,"11,400 GiB","8,192 GiB",172.82,Shared,Azure
22894,IN South,Standard_M416ms_v2,Memory optimized,416,2.5,"11,400 GiB","8,192 GiB",168.85,Shared,Azure
22899,AU Southeast,Standard_M416ms_v2,Memory optimized,416,2.5,"11,400 GiB","8,192 GiB",162.9,Shared,Azure
22929,AU East,Standard_M416ms_v2,Memory optimized,416,2.5,"11,400 GiB","8,192 GiB",162.9,Shared,Azure
22935,AU Central 2,Standard_M416ms_v2,Memory optimized,416,2.5,"11,400 GiB","8,192 GiB",162.9,Shared,Azure
