In [1]:
import pandas as pd
import numpy as np

In [2]:
# Input locations: an xz-compressed pickle and a scraped CSV.
azure_src = './data/azure.src.compute.xz.pkl'
azure_scraped = './data/azure.src.scraped.compute.csv'

# Load both inputs into DataFrames.
# NOTE: unpickling can execute arbitrary code -- only read pickle files from a trusted source.
df_azure_api = pd.read_pickle(azure_src, compression="xz")
df_azure_scraped = pd.read_csv(azure_scraped)

## Inspect

In [3]:
# Preview the first three rows of the frame loaded from the pickle file.
df_azure_api.head(3)

Unnamed: 0,currencyCode,tierMinimumUnits,retailPrice,unitPrice,armRegionName,location,effectiveStartDate,meterId,meterName,productId,...,skuName,serviceName,serviceId,serviceFamily,unitOfMeasure,type,isPrimaryMeterRegion,armSkuName,effectiveEndDate,reservationTerm
0,USD,0.0,1.025809,1.025809,southindia,IN South,2021-06-01T00:00:00Z,000009d0-057f-5f2b-b7e9-9e26add324a8,D14/DS14 Spot,DZH318Z0BPVW,...,D14 Spot,Virtual Machines,DZH313Z7MMC8,Compute,1 Hour,Consumption,True,Standard_D14,,
1,USD,0.0,0.41055,0.41055,southindia,IN South,2021-06-01T00:00:00Z,000009d0-057f-5f2b-b7e9-9e26add324a8,D14/DS14 Spot,DZH318Z0BPVW,...,D14 Spot,Virtual Machines,DZH313Z7MMC8,Compute,1 Hour,DevTestConsumption,True,Standard_D14,,
2,USD,0.0,0.4224,0.4224,eastasia,AP East,2021-04-15T00:00:00Z,00014e7d-fff9-54dd-962e-4e992887ad3c,D64s v5 Spot,DZH318Z08M9T,...,Standard_D64s_v5 Spot,Virtual Machines,DZH313Z7MMC8,Compute,1 Hour,DevTestConsumption,True,Standard_D64s_v5,,


In [4]:
# Keep only the rows whose reservationTerm is missing.
# .copy() makes the filtered result an independent frame: a new column is
# assigned to df_azure_api later in the notebook, and doing that on a bare
# boolean-mask slice raises SettingWithCopyWarning.
df_azure_api = df_azure_api[df_azure_api['reservationTerm'].isnull()].copy()

In [5]:
# Preview the first three rows of the frame loaded from the scraped CSV.
df_azure_scraped.head(3)

Unnamed: 0,Instance,vCPU(s),RAM,Temporary storage,Pay as you go with AHB,1 year reserved with AHB,3 year reserved with AHB,Spot with AHB,Add to estimate,Premium SSD Storage,Clock,Burst,Purpose,Single Customer,Constrained CPU,NVMe Disk,GPU,GPU Model,NVIDIA GRID Supported
0,D2a v4,2,8 GiB,50 GiB,$0.096/hour,$0.0572/hour,$0.0369/hour,$0.0361/hour,,No,2.35,3.35,General,,,,,,
1,D4a v4,4,16 GiB,100 GiB,$0.192/hour,$0.1144/hour,$0.0737/hour,$0.0722/hour,,No,2.35,3.35,General,,,,,,
2,D8a v4,8,32 GiB,200 GiB,$0.384/hour,$0.2288/hour,$0.1474/hour,$0.1444/hour,,No,2.35,3.35,General,,,,,,


## Trim

In [6]:
# Build the cleaned scraped frame in one pass:
#   1. discard the price / estimate columns,
#   2. discard rows that have no Instance value,
#   3. relabel the Premium SSD column.
df_azure_scraped_clean = (
    df_azure_scraped
    .drop(columns=[
        'Pay as you go with AHB',
        '1 year reserved with AHB',
        '3 year reserved with AHB',
        'Spot with AHB',
        'Add to estimate',
    ])
    .dropna(subset=['Instance'])
    .rename(columns={'Premium SSD Storage': 'Supports Premium SSD Storage'})
)
df_azure_scraped_clean.head(3)

Unnamed: 0,Instance,vCPU(s),RAM,Temporary storage,Supports Premium SSD Storage,Clock,Burst,Purpose,Single Customer,Constrained CPU,NVMe Disk,GPU,GPU Model,NVIDIA GRID Supported
0,D2a v4,2,8 GiB,50 GiB,No,2.35,3.35,General,,,,,,
1,D4a v4,4,16 GiB,100 GiB,No,2.35,3.35,General,,,,,,
2,D8a v4,8,32 GiB,200 GiB,No,2.35,3.35,General,,,,,,


## Convert boolean cols to True/False

In [7]:
def set_boolean(df, col):
    """Return a new frame in which the yes/no markers of one column are booleans.

    Within column ``col`` only, the values 'Yes', 'True' and 'TRUE' become
    ``True``; 'No', 'False', 'FALSE' and missing values (NaN) become ``False``.
    Any other value is left as it is, and all other columns are untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert; it is not modified.
    col : str
        Label of the column whose markers are replaced.

    Returns
    -------
    pandas.DataFrame
        The result of ``df.replace`` with the mapping applied to ``col``.
    """
    markers = {
        'Yes': True,
        'No': False,
        np.nan: False,   # a missing flag is treated as "not set"
        'True': True,
        'TRUE': True,
        'False': False,
        'FALSE': False,  # upper-case counterpart of 'TRUE'
    }
    # Nested dict form: {column label: {old value: new value}} limits the
    # replacement to the requested column.
    new_df = df.replace({col: markers})
    return new_df

In [8]:
# List the column labels of the cleaned scraped frame.
df_azure_scraped_clean.columns

Index(['Instance', 'vCPU(s)', 'RAM', 'Temporary storage',
       'Supports Premium SSD Storage', 'Clock', 'Burst', 'Purpose',
       'Single Customer', 'Constrained CPU', 'NVMe Disk', 'GPU', 'GPU Model',
       'NVIDIA GRID Supported'],
      dtype='object')

In [9]:
# Columns that hold yes/no style flags.
cols_to_clean = [
    'Supports Premium SSD Storage',
    'Single Customer',
    'Constrained CPU',
    'NVIDIA GRID Supported',
]

# Gather the distinct values of every flag column. Each array is wrapped in a
# one-element list so the resulting DataFrame has a single row with one cell
# per column.
uniques = dict(
    (col, [df_azure_scraped_clean[col].unique()]) for col in cols_to_clean
)

df_unique = pd.DataFrame(uniques)
df_unique

Unnamed: 0,Supports Premium SSD Storage,Single Customer,Constrained CPU,NVIDIA GRID Supported
0,"[No, Yes]","[nan, TRUE]","[nan, True]","[nan, True]"


In [10]:
# Convert each flag column in turn. set_boolean returns a new frame rather
# than modifying its argument, so the result is rebound on every iteration.
for col in cols_to_clean:
    df_azure_scraped_clean = set_boolean(df_azure_scraped_clean, col)

In [11]:
# Collect the distinct values of the flag columns again, after the boolean
# conversion, as a one-row summary frame.
uniques = {
    label: [values.unique()]
    for label, values in df_azure_scraped_clean[cols_to_clean].items()
}
df_unique = pd.DataFrame(uniques)
df_unique

Unnamed: 0,Supports Premium SSD Storage,Single Customer,Constrained CPU,NVIDIA GRID Supported
0,"[False, True]","[False, True]","[False, True]","[False, True]"


In [12]:
# Spot-check the first five rows after the boolean conversion.
df_azure_scraped_clean.head(5)

Unnamed: 0,Instance,vCPU(s),RAM,Temporary storage,Supports Premium SSD Storage,Clock,Burst,Purpose,Single Customer,Constrained CPU,NVMe Disk,GPU,GPU Model,NVIDIA GRID Supported
0,D2a v4,2,8 GiB,50 GiB,False,2.35,3.35,General,False,False,,,,False
1,D4a v4,4,16 GiB,100 GiB,False,2.35,3.35,General,False,False,,,,False
2,D8a v4,8,32 GiB,200 GiB,False,2.35,3.35,General,False,False,,,,False
4,D16a v4,16,64 GiB,400 GiB,False,2.35,3.35,General,False,False,,,,False
5,D32a v4,32,128 GiB,800 GiB,False,2.35,3.35,General,False,False,,,,False


## Merge datasets

In [13]:
# Inspect the row labels; rows were removed by dropna() earlier, so gaps in
# the numbering are expected.
df_azure_scraped_clean.index

Int64Index([  0,   1,   2,   4,   5,   6,   7,   8,   9,  10,
            ...
            700, 702, 704, 706, 708, 710, 712, 714, 716, 718],
           dtype='int64', length=419)

In [14]:
# The join below calls set_index() on the key columns ('Instance' on the
# scraped frame, 'skuName' on the API frame), which moves them out of the
# regular columns. Copy each key under a second name first so its value is
# still available as an ordinary column after the join.
df_azure_scraped_clean['Machine Image'] = df_azure_scraped_clean['Instance']
df_azure_api['API skuName'] = df_azure_api['skuName']
df_azure_scraped_clean.tail(10)

Unnamed: 0,Instance,vCPU(s),RAM,Temporary storage,Supports Premium SSD Storage,Clock,Burst,Purpose,Single Customer,Constrained CPU,NVMe Disk,GPU,GPU Model,NVIDIA GRID Supported,Machine Image
700,NV4as v4,4,14 GiB,88 GiB,False,,,GPU Optimized,False,False,,1/8th MI25 (2GB VRAM),AMD Radeon Instinct MI25 GPU,False,NV4as v4
702,NV8as v4,8,28 GiB,176 GiB,False,,,GPU Optimized,False,False,,1/4th MI25 (4GB VRAM),AMD Radeon Instinct MI25 GPU,False,NV8as v4
704,NV16as v4,16,56 GiB,352 GiB,False,,,GPU Optimized,False,False,,1/2 MI25 (8GB VRAM),AMD Radeon Instinct MI25 GPU,False,NV16as v4
706,NV32as v4,32,112 GiB,700 GiB,False,,,GPU Optimized,False,False,,1x MI25 (16GB VRAM),AMD Radeon Instinct MI25 GPU,False,NV32as v4
708,ND6s,6,112 GiB,336 GiB,False,,,GPU Optimized,False,False,,1X P40,NVIDIA Tesla P40,False,ND6s
710,ND12s,12,224 GiB,672 GiB,False,,,GPU Optimized,False,False,,2X P40,NVIDIA Tesla P40,False,ND12s
712,ND24rs,24,448 GiB,"1,344 GiB",False,,,GPU Optimized,False,False,,4X P40,NVIDIA Tesla P40,False,ND24rs
714,ND24s,24,448 GiB,"1,344 GiB",False,,,GPU Optimized,False,False,,4X P40,NVIDIA Tesla P40,False,ND24s
716,ND40rs v2,40,672 GiB,"2,900 GiB",False,,,GPU Optimized,False,False,,8X V100 (NVlink),NVIDIA V100 Tensor Core,False,ND40rs v2
718,ND96asr A100 v4,96,900 GiB,"6,500 GiB",False,,,GPU Optimized,False,False,,8x A100 (NVlink),NVIDIA Ampere A100 Tensor Core,False,ND96asr A100 v4


In [15]:
# Left outer join on the SKU name: every API row is kept, and the scraped
# columns are filled in wherever an Instance label equals the skuName.
df_join = df_azure_api.set_index('skuName').join(
    df_azure_scraped_clean.set_index('Instance'),
    how='left',
)
df_join.columns

Index(['currencyCode', 'tierMinimumUnits', 'retailPrice', 'unitPrice',
       'armRegionName', 'location', 'effectiveStartDate', 'meterId',
       'meterName', 'productId', 'skuId', 'productName', 'serviceName',
       'serviceId', 'serviceFamily', 'unitOfMeasure', 'type',
       'isPrimaryMeterRegion', 'armSkuName', 'effectiveEndDate',
       'reservationTerm', 'API skuName', 'vCPU(s)', 'RAM', 'Temporary storage',
       'Supports Premium SSD Storage', 'Clock', 'Burst', 'Purpose',
       'Single Customer', 'Constrained CPU', 'NVMe Disk', 'GPU', 'GPU Model',
       'NVIDIA GRID Supported', 'Machine Image'],
      dtype='object')

In [16]:
# Number of rows produced by the left join.
df_join.shape[0]

159393

In [17]:
# Inner join: keep only the API rows whose skuName equals an Instance label
# in the scraped frame. Both key columns remain in the result.
df_merged = pd.merge(
    df_azure_api,
    df_azure_scraped_clean,
    left_on='skuName',
    right_on='Instance',
    how='inner',
)
df_merged.columns

Index(['currencyCode', 'tierMinimumUnits', 'retailPrice', 'unitPrice',
       'armRegionName', 'location', 'effectiveStartDate', 'meterId',
       'meterName', 'productId', 'skuId', 'productName', 'skuName',
       'serviceName', 'serviceId', 'serviceFamily', 'unitOfMeasure', 'type',
       'isPrimaryMeterRegion', 'armSkuName', 'effectiveEndDate',
       'reservationTerm', 'API skuName', 'Instance', 'vCPU(s)', 'RAM',
       'Temporary storage', 'Supports Premium SSD Storage', 'Clock', 'Burst',
       'Purpose', 'Single Customer', 'Constrained CPU', 'NVMe Disk', 'GPU',
       'GPU Model', 'NVIDIA GRID Supported', 'Machine Image'],
      dtype='object')

In [18]:
# Number of rows produced by the inner merge.
df_merged.shape[0]

39224

In [19]:
# Rows of the left join that found no scraped match have no 'Machine Image'.
unmatched_rows = df_join['Machine Image'].isna()
df_join_nans = df_join.loc[unmatched_rows]

# Distinct API SKU names among those unmatched rows, and how many there are.
missing = df_join_nans['API skuName'].unique()
missing.size

1343

In [20]:
# Show the last 20 entries of the unmatched SKU names.
missing[-20:]

array(['Standard_FX36mds', 'Standard_FX36mds Low Priority',
       'Standard_FX36mds Spot', 'Standard_FX48mds',
       'Standard_FX48mds Low Priority', 'Standard_FX48mds Spot',
       'Standard_FX4mds', 'Standard_FX4mds Low Priority',
       'Standard_FX4mds Spot', 'Standard_ND96amsr_A100_v4',
       'Standard_ND96amsr_A100_v4 Low Priority',
       'Standard_ND96amsr_A100_v4 Spot', 'Storage', 'Storage DR', 'VS20',
       'VS36', 'Virtustream', 'Wildcard SSL - 1 Year', 'XenDesktop',
       'vCore'], dtype=object)

In [21]:
# Look at the 'Machine Image' value of the row at position 10.
# df_join is indexed by skuName strings, so a bare [10] only worked through
# the integer-as-position fallback of Series.__getitem__, which is deprecated;
# .iloc states the positional intent explicitly.
df_join['Machine Image'].iloc[10]

nan

In [22]:
# Preview the first ten rows of the inner-merge result.
df_merged.head(10)

Unnamed: 0,currencyCode,tierMinimumUnits,retailPrice,unitPrice,armRegionName,location,effectiveStartDate,meterId,meterName,productId,...,Clock,Burst,Purpose,Single Customer,Constrained CPU,NVMe Disk,GPU,GPU Model,NVIDIA GRID Supported,Machine Image
0,USD,0.0,1.482,1.482,northcentralus,US North Central,2018-09-01T00:00:00Z,0008a792-d49f-4f13-a461-9c9f24e92ccf,DS14 v2 - Expired,DZH318Z0BPSQ,...,2.1,2.1,Memory,False,False,,,,False,DS14 v2
1,USD,0.0,2.171,2.171,norwaywest,NO West,2019-11-19T00:00:00Z,0050a04d-4be2-45dd-b907-2c48f1c671a7,DS14 v2,DZH318Z0BQPW,...,2.1,2.1,Memory,False,False,,,,False,DS14 v2
2,USD,0.0,2.907,2.907,norwaywest,NO West,2019-11-19T00:00:00Z,0050a04d-4be2-45dd-b907-2c48f1c671a7,DS14 v2,DZH318Z0BQPW,...,2.1,2.1,Memory,False,False,,,,False,DS14 v2
3,USD,0.0,1.824,1.824,switzerlandnorth,CH North,2019-07-01T00:00:00Z,05d2c93d-d31c-4ae1-9431-964a151aa174,DS14 v2,DZH318Z0BQ4C,...,2.1,2.1,Memory,False,False,,,,False,DS14 v2
4,USD,0.0,1.491,1.491,usgovarizona,US Gov AZ,2018-06-01T00:00:00Z,05d40b3a-6dcf-40c1-b55f-9f5545203385,DS14 v2,DZH318Z0BQ4C,...,2.1,2.1,Memory,False,False,,,,False,DS14 v2
5,USD,0.0,1.866,1.866,uaecentral,AE Central,2019-02-12T00:00:00Z,0672c0d9-aa6e-494e-86c6-c859f1d9fcb1,DS14 v2,DZH318Z0BQ4C,...,2.1,2.1,Memory,False,False,,,,False,DS14 v2
6,USD,0.0,2.248,2.248,uksouth2,UK South 2,2019-06-04T00:00:00Z,08986628-6f1a-4cd8-b769-93f7d54644a6,DS14 v2,DZH318Z0BQPW,...,2.1,2.1,Memory,False,False,,,,False,DS14 v2
7,USD,0.0,2.952,2.952,uksouth2,UK South 2,2016-10-01T00:00:00Z,08986628-6f1a-4cd8-b769-93f7d54644a6,DS14 v2,DZH318Z0BQPW,...,2.1,2.1,Memory,False,False,,,,False,DS14 v2
8,USD,0.0,1.491,1.491,usgovvirginia,US Gov Virginia,2018-06-01T00:00:00Z,0a577a8d-8dd9-4592-9710-a6e6dc794eaf,DS14 v2,DZH318Z0BQ4C,...,2.1,2.1,Memory,False,False,,,,False,DS14 v2
9,USD,0.0,1.726,1.726,germanynorth,DE North,2020-03-01T00:00:00Z,0a9a0e22-1e77-441f-9d1e-088ebea850c3,DS14 v2,DZH318Z0BQ4C,...,2.1,2.1,Memory,False,False,,,,False,DS14 v2


In [23]:
# The helper copy of the SKU name is not needed in the output; drop it.
df_final = df_merged.drop('API skuName', axis='columns')

In [24]:
# Persist the final frame as an xz-compressed pickle.
df_final.to_pickle('./data/azure.merged.compute.xz.pkl', compression='xz')