In [1]:
import pandas as pd
import numpy as np

In [2]:
# GCP: read the compute price list from the local CSV export into a DataFrame.
gcp_compute_src = './data/gcp_compute_costs.csv'
gcp_compute_data = pd.read_csv(filepath_or_buffer=gcp_compute_src)

In [3]:
# Inspect the column labels of the freshly loaded GCP table.
gcp_compute_data.columns

Index(['region', 'family', 'vm type', 'regional_disk', 'sole_tenant',
       'nested_virtualization', 'local_ssd', 'cores', 'gpu', 'cpu', 'memory',
       'network_egress', 'on demand price', 'preemptible price'],
      dtype='object')

In [4]:
# Distinct values of the `family` column; these are matched against the
# clock-speed lookup table further down.
gcp_compute_data.family.unique()

array(['f1', 'g1', 'n1', 'n2', 'n2d', 'e2', 'c2', 'm1', 'm2', 'a2'],
      dtype=object)

#### Clean out unwanted columns

In [19]:
# Display the raw table (pandas truncates the middle rows).
# NOTE(review): this cell's execution count is out of sequence and its stored
# output shows a `clock` column that the earlier `.columns` listing does not
# contain -- re-run from a fresh kernel to confirm the output is current.
gcp_compute_data

Unnamed: 0,region,family,vm type,regional_disk,sole_tenant,nested_virtualization,local_ssd,cores,gpu,cpu,memory,network_egress,on demand price,preemptible price,clock
0,us,f1,f1-micro,Null,0,0,0,shared,0,['N/A'],0.6,Null,0.0076,0.0035,
1,us-central1,f1,f1-micro,Null,0,0,0,shared,0,['N/A'],0.6,Null,0.0076,0.0035,
2,us-east1,f1,f1-micro,Null,0,0,0,shared,0,['N/A'],0.6,Null,0.0076,0.0035,
3,us-east4,f1,f1-micro,Null,0,0,0,shared,0,['N/A'],0.6,Null,0.0086,0.00375,
4,us-west4,f1,f1-micro,Null,0,0,0,shared,0,['N/A'],0.6,Null,0.0086,0.00375,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3735,australia,a2,a2-megagpu-16g,1,-1,-1,1,96,1,['Cascade Lake'],1360.0,100,0,0,
3736,southamerica-east1,a2,a2-megagpu-16g,1,-1,-1,1,96,1,['Cascade Lake'],1360.0,100,0,0,
3737,asia-south1,a2,a2-megagpu-16g,1,-1,-1,1,96,1,['Cascade Lake'],1360.0,100,0,0,
3738,asia-southeast2,a2,a2-megagpu-16g,1,-1,-1,1,96,1,['Cascade Lake'],1360.0,100,0,0,


In [34]:
# Remove the columns that are not used downstream. Each label is listed exactly
# once (the earlier list repeated 'regional_disk'); the source frame itself is
# left untouched because `drop` returns a new DataFrame.
unwanted_columns = ['regional_disk', 'nested_virtualization', 'network_egress']
df_gcp = gcp_compute_data.drop(columns=unwanted_columns)

In [35]:
# Start every row with a missing clock speed; matching rows are filled in later.
df_gcp['clock'] = float('nan')

In [36]:
# Distinct `cpu` values. As the output shows, each one is a string that holds a
# bracketed list of CPU platform names rather than a single name.
df_gcp.cpu.unique()

array(["['N/A']",
       "['Skylake', 'Broadwell', 'Haswell', 'Sandy Bridge', 'Ivy Bridge']",
       "['Cascade Lake']", "['AMD EPYC Rome']",
       "['Skylake', 'Broadwell', 'Haswell', 'AMD EPYC Rome (coming soon)']",
       "['Skylake', 'Broadwell E5']", "['Cascade Lake', 'Broadwell E7']"],
      dtype=object)

#### Most of the CPU lists above start with either Skylake or Cascade Lake. Since each entry covers a range of chips, I'm going to use just one of these two when looking up a clock speed.

In [37]:
# Lookup table relating a CPU name and a VM family to a clock value.
cpu_vm_clock_src = './data/gcp.cpu_vm_clock.csv'
cpu_vms = pd.read_csv(cpu_vm_clock_src)

In [38]:
# Show the full lookup table before filtering.
cpu_vms

Unnamed: 0,cpu,vm,clock
0,Cascade Lake,n2,2.8
1,Cascade Lake,c2,3.1
2,Cascade Lake,m2,2.5
3,Cascade Lake,a2,2.2
4,Skylake,e2,2.0
5,Skylake,m1,2.0
6,Skylake,n1,2.0
7,Broadwell E7,m1,2.2
8,Broadwell E5,e2,2.2
9,Broadwell E6,n1,2.2


In [39]:
# Keep only the lookup rows for the two CPU platforms chosen above.
selected_cpus = ['Cascade Lake', 'Skylake']
cpu_vms = cpu_vms[cpu_vms.cpu.isin(selected_cpus)]

In [40]:
# Show the lookup table after restricting it to Cascade Lake and Skylake.
cpu_vms

Unnamed: 0,cpu,vm,clock
0,Cascade Lake,n2,2.8
1,Cascade Lake,c2,3.1
2,Cascade Lake,m2,2.5
3,Cascade Lake,a2,2.2
4,Skylake,e2,2.0
5,Skylake,m1,2.0
6,Skylake,n1,2.0


In [41]:
# For each (cpu, vm family, clock) lookup entry, write the clock value into
# every df_gcp row whose family equals the entry's family and whose cpu string
# mentions the entry's CPU name.
for entry in cpu_vms.itertuples(index=False):
    # regex=False: the CPU name is literal text, so characters such as
    # parentheses in a name are never interpreted as a pattern.
    # na=False: a missing cpu value counts as "no match" instead of putting a
    # NaN into the boolean mask.
    cpu_matches = df_gcp.cpu.str.contains(entry.cpu, regex=False, na=False)
    family_matches = df_gcp.family == entry.vm
    df_gcp.loc[cpu_matches & family_matches, 'clock'] = entry.clock

In [42]:
# Check which clock values were assigned; NaN marks rows that no lookup entry matched.
df_gcp.clock.unique()

array([nan, 2. , 2.8, 3.1, 2.5, 2.2])

In [43]:
# `family` and `cpu` were only needed to look up the clock value, so they are
# removed from the frame that gets saved; the result is then displayed.
lookup_only_columns = ['family', 'cpu']
df_gcp_drop = df_gcp.drop(lookup_only_columns, axis='columns')
df_gcp_drop

Unnamed: 0,region,vm type,sole_tenant,local_ssd,cores,gpu,memory,on demand price,preemptible price,clock
0,us,f1-micro,0,0,shared,0,0.6,0.0076,0.0035,
1,us-central1,f1-micro,0,0,shared,0,0.6,0.0076,0.0035,
2,us-east1,f1-micro,0,0,shared,0,0.6,0.0076,0.0035,
3,us-east4,f1-micro,0,0,shared,0,0.6,0.0086,0.00375,
4,us-west4,f1-micro,0,0,shared,0,0.6,0.0086,0.00375,
...,...,...,...,...,...,...,...,...,...,...
3735,australia,a2-megagpu-16g,-1,1,96,1,1360.0,0,0,2.2
3736,southamerica-east1,a2-megagpu-16g,-1,1,96,1,1360.0,0,0,2.2
3737,asia-south1,a2-megagpu-16g,-1,1,96,1,1360.0,0,0,2.2
3738,asia-southeast2,a2-megagpu-16g,-1,1,96,1,1360.0,0,0,2.2


In [44]:
# Distinct `sole_tenant` codes present in the final frame.
# NOTE(review): the meaning of -1 is not defined in this notebook -- confirm
# against the data source before relying on it.
df_gcp_drop.sole_tenant.unique()

array([ 0,  1, -1])

In [45]:
# Persist the cleaned frame as an xz-compressed pickle.
gcp_pickle_path = './data/gcp.compute.xz.pkl'
df_gcp_drop.to_pickle(gcp_pickle_path, compression='xz')