In [1]:
import pandas as pd
import numpy as np

In [2]:
# GCP: read the raw compute price sheet from the local data directory.
gcp_compute_src = './data/gcp_compute_costs.csv'
gcp_compute_data = pd.read_csv(filepath_or_buffer=gcp_compute_src)

In [3]:
# Show which columns the raw price sheet provides.
gcp_compute_data.columns

Index(['region', 'family', 'vm type', 'regional_disk', 'sole_tenant',
       'nested_virtualization', 'local_ssd', 'cores', 'gpu', 'cpu', 'memory',
       'network_egress', 'on demand price', 'preemptible price'],
      dtype='object')

In [4]:
# List the distinct values of the 'family' column.
gcp_compute_data.family.unique()

array(['f1', 'g1', 'n1', 'n2', 'n2d', 'e2', 'c2', 'm1', 'm2', 'a2'],
      dtype=object)

#### Clean out unwanted columns

In [5]:
# Preview the raw frame before any columns are removed.
gcp_compute_data

Unnamed: 0,region,family,vm type,regional_disk,sole_tenant,nested_virtualization,local_ssd,cores,gpu,cpu,memory,network_egress,on demand price,preemptible price
0,us,f1,f1-micro,Null,0,0,0,shared,0,['N/A'],0.6,Null,0.0076,0.0035
1,us-central1,f1,f1-micro,Null,0,0,0,shared,0,['N/A'],0.6,Null,0.0076,0.0035
2,us-east1,f1,f1-micro,Null,0,0,0,shared,0,['N/A'],0.6,Null,0.0076,0.0035
3,us-east4,f1,f1-micro,Null,0,0,0,shared,0,['N/A'],0.6,Null,0.0086,0.00375
4,us-west4,f1,f1-micro,Null,0,0,0,shared,0,['N/A'],0.6,Null,0.0086,0.00375
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3735,australia,a2,a2-megagpu-16g,1,-1,-1,1,96,1,['Cascade Lake'],1360.0,100,0,0
3736,southamerica-east1,a2,a2-megagpu-16g,1,-1,-1,1,96,1,['Cascade Lake'],1360.0,100,0,0
3737,asia-south1,a2,a2-megagpu-16g,1,-1,-1,1,96,1,['Cascade Lake'],1360.0,100,0,0
3738,asia-southeast2,a2,a2-megagpu-16g,1,-1,-1,1,96,1,['Cascade Lake'],1360.0,100,0,0


In [6]:
# Remove the columns this analysis does not use. `drop` returns a new frame,
# so `gcp_compute_data` itself is left untouched. Each label is listed once.
df_gcp = gcp_compute_data.drop(columns=['regional_disk', 'nested_virtualization', 'network_egress'])

### Instance Family

In [7]:
# Label every VM type with a coarse instance family, based on keywords that
# appear in the 'vm type' name.
#
# The column is created with object dtype so that writing string labels into
# it does not rely on an implicit float64 -> object upcast, which recent
# pandas releases deprecate. Rows that match no keyword stay NaN.
df_gcp['instance_family'] = pd.Series(np.nan, index=df_gcp.index, dtype=object)

# Insertion order matters: keywords are applied top to bottom, so if a name
# matched more than one keyword the later entry would win.
family_by_keyword = {
    'standard': 'General purpose',
    'highmem':  'Memory optimized',
    'ultramem': 'Memory optimized',
    'megamem':  'Memory optimized',
    'highcpu':  'Compute optimized',
    'highgpu':  'GPU optimized',
    'megagpu':  'GPU optimized',
    'micro':    'Shared core',
    'small':    'Shared core',
    'medium':   'Shared core',
}

for keyword, family_label in family_by_keyword.items():
    # Plain substring match (no regex); a missing 'vm type' counts as "no match"
    # instead of breaking the boolean mask.
    name_has_keyword = df_gcp['vm type'].str.contains(keyword, regex=False, na=False)
    df_gcp.loc[name_has_keyword, 'instance_family'] = family_label

df_gcp.instance_family.unique()

array(['Shared core', 'Compute optimized', 'Memory optimized',
       'General purpose', 'GPU optimized'], dtype=object)

### Small Cores

In [8]:
# Give shared-core VM types a fractional core count.
#
# The raw 'cores' column contains the text marker 'shared' for these machines,
# so the column as loaded from CSV is not numeric. Convert it to floats first
# (non-numeric entries become NaN), then fill in the shared-core sizes. This
# leaves one consistently numeric column instead of a mix of strings and floats.
shared_core_sizes = {'micro': 0.25, 'small': 0.5, 'medium': 1}

cores_numeric = pd.to_numeric(df_gcp['cores'], errors='coerce').astype(float)
for size_keyword, core_count in shared_core_sizes.items():
    # Plain substring match on the VM name; rows without a name are skipped.
    is_size = df_gcp['vm type'].str.contains(size_keyword, regex=False, na=False)
    cores_numeric.loc[is_size] = core_count

df_gcp['cores'] = cores_numeric

### Clock speed

Based on: https://cloud.google.com/compute/docs/cpu-platforms
TODO: the clock speeds should probably all be processed in the GCP notebook and then merged in here.

In [9]:
# Start with an all-NaN clock speed column; it is filled in per CPU platform
# and VM family further down.
df_gcp['clock_speed'] = float('nan')

In [10]:
# List the distinct 'cpu' values; each one is a stringified list of platform names.
df_gcp.cpu.unique()

array(["['N/A']",
       "['Skylake', 'Broadwell', 'Haswell', 'Sandy Bridge', 'Ivy Bridge']",
       "['Cascade Lake']", "['AMD EPYC Rome']",
       "['Skylake', 'Broadwell', 'Haswell', 'AMD EPYC Rome (coming soon)']",
       "['Skylake', 'Broadwell E5']", "['Cascade Lake', 'Broadwell E7']"],
      dtype=object)

#### Many of the CPU lists above start with either Skylake or Cascade Lake. Since each list covers a range of chips, I'm just going to select whichever of these two platforms it contains.

In [11]:
# Lookup table of clock speed per (CPU platform, VM family) pair.
cpu_vms = pd.read_csv(filepath_or_buffer='./data/gcp.cpu_vm_clock.csv')

In [12]:
# Show the full clock speed lookup table as loaded.
cpu_vms

Unnamed: 0,cpu,vm,clock
0,Cascade Lake,n2,2.8
1,Cascade Lake,c2,3.1
2,Cascade Lake,m2,2.5
3,Cascade Lake,a2,2.2
4,Skylake,e2,2.0
5,Skylake,m1,2.0
6,Skylake,n1,2.0
7,Broadwell E7,m1,2.2
8,Broadwell E5,e2,2.2
9,Broadwell E6,n1,2.2


In [13]:
# Keep only the lookup rows for the two platforms used to pick a clock speed.
cpu_vms = cpu_vms[cpu_vms['cpu'].isin(['Cascade Lake', 'Skylake'])]

In [14]:
# Show the lookup table after filtering to Cascade Lake and Skylake.
cpu_vms

Unnamed: 0,cpu,vm,clock
0,Cascade Lake,n2,2.8
1,Cascade Lake,c2,3.1
2,Cascade Lake,m2,2.5
3,Cascade Lake,a2,2.2
4,Skylake,e2,2.0
5,Skylake,m1,2.0
6,Skylake,n1,2.0


In [15]:
# For every (platform, family) pair in the lookup table, write its clock speed
# onto the rows whose 'cpu' text mentions that platform and whose 'family'
# equals the lookup's VM family.
for lookup in cpu_vms.itertuples(index=False):
    mentions_platform = df_gcp['cpu'].str.contains(lookup.cpu)
    same_family = df_gcp['family'] == lookup.vm
    df_gcp.loc[mentions_platform & same_family, 'clock_speed'] = lookup.clock

In [16]:
# Check which clock speeds were assigned; NaN marks rows with no lookup match.
df_gcp.clock_speed.unique()

array([nan, 2. , 2.8, 3.1, 2.5, 2.2])

In [17]:
# Build the priced frame in a single chain. Every step returns a new frame, so
# no column is ever assigned on a filtered slice (which can emit pandas'
# SettingWithCopyWarning) and `df_gcp` itself is not modified.
df_gcp_drop = (
    df_gcp
    .drop(columns=['family', 'cpu', 'preemptible price'])
    .rename(columns={'cores': 'vcpu'})
    # on demand price: remove missing values and the literal "Null" marker
    .dropna(subset=['on demand price'])
    .loc[lambda frame: frame['on demand price'] != "Null"]
    # convert the remaining prices to a (downcast) float column
    .assign(**{
        'on demand price': lambda frame: pd.to_numeric(frame['on demand price'], downcast="float")
    })
    # keep only rows that actually have a positive price
    .loc[lambda frame: frame['on demand price'] > 0]
)
df_gcp_drop

Unnamed: 0,region,vm type,sole_tenant,local_ssd,vcpu,gpu,memory,on demand price,instance_family,clock_speed
0,us,f1-micro,0,0,0.25,0,0.6,0.00760,Shared core,
1,us-central1,f1-micro,0,0,0.25,0,0.6,0.00760,Shared core,
2,us-east1,f1-micro,0,0,0.25,0,0.6,0.00760,Shared core,
3,us-east4,f1-micro,0,0,0.25,0,0.6,0.00860,Shared core,
4,us-west4,f1-micro,0,0,0.25,0,0.6,0.00860,Shared core,
...,...,...,...,...,...,...,...,...,...,...
3698,asia-southeast1,a2-highgpu-8g,-1,1,96,1,680.0,7.29758,GPU optimized,2.2
3707,us-central1,a2-megagpu-16g,-1,1,96,1,1360.0,8.79698,GPU optimized,2.2
3719,europe-west4,a2-megagpu-16g,-1,1,96,1,1360.0,9.68403,GPU optimized,2.2
3731,asia-southeast,a2-megagpu-16g,-1,1,96,1,1360.0,10.85126,GPU optimized,2.2


In [18]:
# List the distinct sole_tenant codes that need to be mapped to tenancy labels.
df_gcp_drop.sole_tenant.unique()

array([ 0,  1, -1])

In [19]:
# Rename sole_tenant to tenancy and translate its numeric codes into labels.
# The whole column is converted in one pass, so strings are never written into
# an integer column piecemeal (an implicit upcast that recent pandas releases
# deprecate). Codes missing from the table are passed through unchanged.
tenancy_labels = {0: 'Shared', 1: 'Dedicated', -1: 'Shared'}

df_gcp_sole = (
    df_gcp_drop
    .rename(columns={'sole_tenant': 'tenancy'})
    .assign(tenancy=lambda frame: frame['tenancy'].map(
        lambda code: tenancy_labels.get(code, code)
    ))
)
df_gcp_sole

Unnamed: 0,region,vm type,tenancy,local_ssd,vcpu,gpu,memory,on demand price,instance_family,clock_speed
0,us,f1-micro,Shared,0,0.25,0,0.6,0.00760,Shared core,
1,us-central1,f1-micro,Shared,0,0.25,0,0.6,0.00760,Shared core,
2,us-east1,f1-micro,Shared,0,0.25,0,0.6,0.00760,Shared core,
3,us-east4,f1-micro,Shared,0,0.25,0,0.6,0.00860,Shared core,
4,us-west4,f1-micro,Shared,0,0.25,0,0.6,0.00860,Shared core,
...,...,...,...,...,...,...,...,...,...,...
3698,asia-southeast1,a2-highgpu-8g,Shared,1,96,1,680.0,7.29758,GPU optimized,2.2
3707,us-central1,a2-megagpu-16g,Shared,1,96,1,1360.0,8.79698,GPU optimized,2.2
3719,europe-west4,a2-megagpu-16g,Shared,1,96,1,1360.0,9.68403,GPU optimized,2.2
3731,asia-southeast,a2-megagpu-16g,Shared,1,96,1,1360.0,10.85126,GPU optimized,2.2


In [20]:
# Final output schema: remove the gpu column and switch to the output column names.
output_column_names = {
    'vm type': 'instance_type',
    'on demand price': 'price',
    'region': 'location',
}
df_gcp_final = (
    df_gcp_sole
    .drop(columns=['gpu'])
    .rename(columns=output_column_names)
)
df_gcp_final

Unnamed: 0,location,instance_type,tenancy,local_ssd,vcpu,memory,price,instance_family,clock_speed
0,us,f1-micro,Shared,0,0.25,0.6,0.00760,Shared core,
1,us-central1,f1-micro,Shared,0,0.25,0.6,0.00760,Shared core,
2,us-east1,f1-micro,Shared,0,0.25,0.6,0.00760,Shared core,
3,us-east4,f1-micro,Shared,0,0.25,0.6,0.00860,Shared core,
4,us-west4,f1-micro,Shared,0,0.25,0.6,0.00860,Shared core,
...,...,...,...,...,...,...,...,...,...
3698,asia-southeast1,a2-highgpu-8g,Shared,1,96,680.0,7.29758,GPU optimized,2.2
3707,us-central1,a2-megagpu-16g,Shared,1,96,1360.0,8.79698,GPU optimized,2.2
3719,europe-west4,a2-megagpu-16g,Shared,1,96,1360.0,9.68403,GPU optimized,2.2
3731,asia-southeast,a2-megagpu-16g,Shared,1,96,1360.0,10.85126,GPU optimized,2.2


In [21]:
# Persist the cleaned frame as an xz-compressed pickle. The compression is
# passed explicitly because the file name ends in '.pkl', not '.xz'.
df_gcp_final.to_pickle(path='./data/gcp.compute.xz.pkl', compression='xz')