In [1]:
import pandas as pd

In [2]:
# Relative base directory that is prepended to every CSV path read in this notebook.
path = "../data/"

# 1.1 Data preparation

In [3]:
# Price table keyed by appliance 'size'; data_usage() left-joins it onto the metrics.
# NOTE(review): presumably this is where 'cost_per_hour' comes from - confirm against the CSV.
pricing = pd.read_csv(path + 'datathon/pricing.csv')

In [4]:

def department():
    """Load ``datathon/department.csv`` with a cleaned ``department`` column.

    Returns:
        pandas.DataFrame: The CSV contents, with leading and trailing
        whitespace stripped from the values of the ``department`` column.
    """
    departments = pd.read_csv(path + 'datathon/department.csv')
    return departments.assign(
        department=departments['department'].str.strip()
    )

In [5]:
def main_data():
    """Build the main analysis table from the metrics and department files.

    Reads ``datathon/metrics.csv``, parses its date columns through
    ``process_date_columns`` and left-joins the department table on ``id``.

    Returns:
        pandas.DataFrame: Metrics rows enriched with the department columns;
        metrics rows without a matching ``id`` are kept (left join).
    """
    metrics = process_date_columns(pd.read_csv(path + 'datathon/metrics.csv'))
    return metrics.merge(department(), how='left', on='id')

In [6]:
def process_date_columns(df):
    """Return a copy of ``df`` whose timestamp columns are parsed to datetimes.

    The columns ``data_timestamp``, ``created_at``, ``updated_at`` and
    ``last_patch`` are converted with ``pandas.to_datetime``. The frame passed
    in is not modified, so re-running a cell that calls this function cannot
    silently change a frame the caller still holds.

    Args:
        df (pandas.DataFrame): Frame containing the four columns listed above.

    Returns:
        pandas.DataFrame: New frame with the same columns in the same order.

    Raises:
        KeyError: If one of the four date columns is missing from ``df``.
    """
    date_columns = ('data_timestamp', 'created_at', 'updated_at', 'last_patch')
    # assign() works on a copy, leaving the caller's frame untouched.
    return df.assign(
        **{column: pd.to_datetime(df[column]) for column in date_columns}
    )

In [7]:
def text_columns(df):
    """Reduce the table to its distinct descriptive attribute combinations.

    Args:
        df (pandas.DataFrame): Frame with at least the columns ``id``,
            ``department``, ``disk_size``, ``type`` and ``size``.

    Returns:
        pandas.DataFrame: The five columns above with duplicate rows removed;
        the first occurrence of each combination keeps its original index.
    """
    attribute_columns = ['id', 'department', 'disk_size', 'type', 'size']
    return df.loc[:, attribute_columns].drop_duplicates()

In [8]:
# Full metrics table (parsed dates, joined with departments).
df = main_data()
# Distinct (id, department, disk_size, type, size) combinations taken from df.
df_text = text_columns(df)

In [9]:
# Preview the first rows of the deduplicated attribute table.
df_text.head()

Unnamed: 0,id,department,disk_size,type,size
0,i-0a16592fdb1239d51,Marketing,240,deeplearning,t3.xlarge
9846,i-080c53b1618bf299a,IT,90,rstudio,r5a.2xlarge
36057,i-0b464aceb0dd373fc,Sales,240,deeplearning,r5.2xlarge
69465,i-03176e6ab2d9d413b,Marketing,60,matlab,t3.large
72777,i-0b981c18a1e484b20,Sales,120,matlab,g3s.xlarge


# 1.2 How many departments use the appliances of the Data Platform?

In [10]:
def number_of_department(df):
    """Count ``id`` entries per department, largest count first.

    Args:
        df (pandas.DataFrame): Frame with the columns ``id`` and ``department``.

    Returns:
        pandas.DataFrame: Indexed by ``department`` with a single ``id``
        column holding the number of non-null ids, sorted descending.
    """
    per_department = (
        df.loc[:, ['id', 'department']]
        .groupby('department')
        .count()
    )
    return per_department.sort_values('id', ascending=False)

In [11]:
# Count of df_text rows per department, largest first.
number_of_department(df_text)

Unnamed: 0_level_0,id
department,Unnamed: 1_level_1
Sales,10
HR,6
IT,6
Engineering,5
Marketing,3
Operations,3


In [12]:
# Number of distinct departments = number of rows in the per-department count table.
len(number_of_department(df_text))

6

# 1.3 What is the most popular appliance size used by all departments? And how many of those popular sizes did you find in the whole dataset?

In [13]:
def appliance_size(df):
    """Count ``id`` entries per appliance size, most frequent size first.

    Args:
        df (pandas.DataFrame): Frame with the columns ``id`` and ``size``.

    Returns:
        pandas.DataFrame: Indexed by ``size`` with a single ``id`` column
        holding the number of non-null ids, sorted descending.
    """
    id_and_size = df[['id', 'size']]
    per_size = id_and_size.groupby('size').count()
    ranked = per_size.sort_values(by='id', ascending=False)
    return ranked

In [14]:
# Count of df_text rows per appliance size, most frequent first.
appliance_size(df_text)

Unnamed: 0_level_0,id
size,Unnamed: 1_level_1
t3a.medium,21
c5.4xlarge,2
r5a.2xlarge,2
t3a.2xlarge,2
g3s.xlarge,1
m5d.xlarge,1
r5.2xlarge,1
t3.large,1
t3.xlarge,1
t3a.xlarge,1


# 2.1 Which is the most popular appliance type per department?

In [15]:
def type_by_dept(df):
    """Count ``id`` entries per (department, type) pair.

    Args:
        df (pandas.DataFrame): Frame with the columns ``id``, ``department``
            and ``type``.

    Returns:
        pandas.DataFrame: Indexed by ``(department, type)`` with a single
        ``id`` count column, ordered by department ascending and, within a
        department, by count descending.
    """
    group_keys = ['department', 'type']
    counts = df[['id'] + group_keys].groupby(group_keys).count()
    return counts.sort_values(by=['department', 'id'], ascending=[True, False])

In [16]:
# Count of df_text rows per (department, type); the top row of each department is its most common type.
type_by_dept(df_text)

Unnamed: 0_level_0,Unnamed: 1_level_0,id
department,type,Unnamed: 2_level_1
Engineering,jupyter,3
Engineering,knime,1
Engineering,rstudio,1
HR,knime,5
HR,deeplearning,1
IT,jupyter,3
IT,rstudio,2
IT,knime,1
Marketing,matlab,2
Marketing,deeplearning,1


# 2.2 Which appliance size had the lowest vCPU utilization over the full time range of the dataset based on the listed metrics? Calculate a value with 6 digits after the decimal point for each metric:

In [17]:
def vcpu_by_size(df):
    """Summarise vCPU readings per appliance size (min, median, mean).

    Rows are first deduplicated on ``(id, data_timestamp, size, vcpu)`` so a
    reading that appears more than once in ``df`` is only counted once.

    The grouping deliberately keeps ``size`` as the index of the result.
    Combining ``as_index=False`` with a list of aggregations gives
    version-dependent output in pandas (index vs. column), and the cells that
    consume this table select ``['vcpu', <stat>]`` and expect the resulting
    Series to be labelled by size.

    Args:
        df (pandas.DataFrame): Frame with the columns ``id``,
            ``data_timestamp``, ``size`` and ``vcpu``.

    Returns:
        pandas.DataFrame: Indexed by ``size``, with MultiIndex columns
        ``('vcpu', 'min')``, ``('vcpu', 'median')`` and ``('vcpu', 'mean')``.
    """
    readings = df[['id', 'data_timestamp', 'size', 'vcpu']].drop_duplicates()
    return (
        readings[['size', 'vcpu']]
        .groupby('size')
        .agg(['min', 'median', 'mean'])
    )

In [18]:
# Min / median / mean of the vcpu column per appliance size over the whole table.
vcpu_by_size(df)

Unnamed: 0_level_0,vcpu,vcpu,vcpu
Unnamed: 0_level_1,min,median,mean
size,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
c5.4xlarge,0.105102,0.994667,6.32923
g3s.xlarge,0.391572,0.425,0.450599
m5d.xlarge,0.641,12.194167,15.238145
r5.2xlarge,0.302333,0.652333,1.507907
r5a.2xlarge,0.000165,0.216333,0.545604
t3.large,0.947409,0.98833,1.075165
t3.xlarge,0.256002,0.820327,2.191348
t3a.2xlarge,0.212592,0.480333,0.53795
t3a.medium,0.01063,1.443338,11.02783
t3a.xlarge,0.368,0.97033,1.903183


In [19]:
# Sizes ordered by their minimum vcpu reading, lowest first.
vcpu_by_size(df)['vcpu', 'min'].sort_values()

size
r5a.2xlarge    0.000165
t3a.medium     0.010630
c5.4xlarge     0.105102
t3a.2xlarge    0.212592
t3.xlarge      0.256002
r5.2xlarge     0.302333
t3a.xlarge     0.368000
g3s.xlarge     0.391572
m5d.xlarge     0.641000
t3.large       0.947409
Name: (vcpu, min), dtype: float64

In [20]:
# Sizes ordered by their median vcpu reading, lowest first.
vcpu_by_size(df)['vcpu', 'median'].sort_values()

size
r5a.2xlarge     0.216333
g3s.xlarge      0.425000
t3a.2xlarge     0.480333
r5.2xlarge      0.652333
t3.xlarge       0.820327
t3a.xlarge      0.970330
t3.large        0.988330
c5.4xlarge      0.994667
t3a.medium      1.443338
m5d.xlarge     12.194167
Name: (vcpu, median), dtype: float64

In [21]:
# Sizes ordered by their mean vcpu reading, lowest first.
vcpu_by_size(df)['vcpu', 'mean'].sort_values()

size
g3s.xlarge      0.450599
t3a.2xlarge     0.537950
r5a.2xlarge     0.545604
t3.large        1.075165
r5.2xlarge      1.507907
t3a.xlarge      1.903183
t3.xlarge       2.191348
c5.4xlarge      6.329230
t3a.medium     11.027830
m5d.xlarge     15.238145
Name: (vcpu, mean), dtype: float64

# 2.3 Which department has used the most appliances between 15.12.2022 and 16.01.2023? How many appliances did they use in this time range?

In [22]:
def data_in_periods(df, start='2022-12-15', end='2023-01-16'):
    """Select the rows whose ``data_timestamp`` lies in ``[start, end)``.

    The lower bound is inclusive so that a sample stamped exactly at ``start``
    (e.g. ``2022-12-15 00:00:00``) belongs to the window. The upper bound is
    exclusive: with the default ``end`` no sample from midnight of 2023-01-16
    onwards is returned. Pass a later ``end`` if that day should be covered.

    Args:
        df (pandas.DataFrame): Frame with a datetime ``data_timestamp`` column.
        start: First instant of the window; anything ``pandas.to_datetime``
            accepts. Defaults to ``'2022-12-15'``.
        end: First instant after the window; anything ``pandas.to_datetime``
            accepts. Defaults to ``'2023-01-16'``.

    Returns:
        pandas.DataFrame: The matching rows of ``df`` with their original
        index; empty if no row falls inside the window.
    """
    window_start = pd.to_datetime(start)
    window_end = pd.to_datetime(end)
    timestamps = df['data_timestamp']
    in_window = (timestamps >= window_start) & (timestamps < window_end)
    return df.loc[in_window]

In [23]:
def appliance_in_periods(df):
    """Count distinct appliances per department inside the analysis window.

    The rows are restricted with ``data_in_periods``, reduced to distinct
    ``(id, department)`` pairs and counted per department.

    Args:
        df (pandas.DataFrame): Frame with the columns ``id``, ``department``
            and ``data_timestamp``.

    Returns:
        pandas.DataFrame: Indexed by ``department`` with a single ``id``
        count column, sorted by count descending.
    """
    active_pairs = data_in_periods(df)[['id', 'department']].drop_duplicates()
    per_department = active_pairs.groupby('department').count()
    return per_department.sort_values('id', ascending=False)

In [24]:
# Distinct appliance ids per department within the date window, largest first.
appliance_in_periods(df)

Unnamed: 0_level_0,id
department,Unnamed: 1_level_1
Sales,6
Engineering,4
HR,3
IT,3
Marketing,1
Operations,1


# 2.4 What is the most expensive size of an appliance used in the Data Platform in terms of hours used per department?

In [25]:
def data_usage(df):
    """Count distinct sample timestamps per department, size and hourly price.

    The module-level ``pricing`` table is left-joined on ``size``; the result
    is reduced to distinct ``(department, size, data_timestamp,
    cost_per_hour)`` rows and the timestamps are counted per group.

    NOTE(review): ``id`` is not part of the deduplication key, so several
    appliances of the same size in the same department that share a timestamp
    are counted once - confirm this is the intended definition of usage.

    Args:
        df (pandas.DataFrame): Frame with the columns ``department``, ``size``
            and ``data_timestamp``.

    Returns:
        pandas.DataFrame: Columns ``department``, ``size``, ``cost_per_hour``
        and ``data_timestamp``, the latter holding the per-group count.
    """
    priced = df.merge(pricing, how='left', on='size')
    group_keys = ['department', 'size', 'cost_per_hour']
    distinct_samples = priced[
        ['department', 'size', 'data_timestamp', 'cost_per_hour']
    ].drop_duplicates()
    return distinct_samples.groupby(group_keys, as_index=False).count()

In [26]:
def calc_cost(df, samples_per_hour=12):
    """Estimate the cost per department and appliance size.

    The sample counts from ``data_usage`` are converted to hours by dividing
    by ``samples_per_hour`` and multiplied by ``cost_per_hour``.

    Args:
        df (pandas.DataFrame): Frame accepted by ``data_usage``.
        samples_per_hour (int | float): Number of metric samples that make up
            one hour of usage. Defaults to 12.
            NOTE(review): 12 presumably reflects a 5-minute sampling interval
            of the metrics - confirm against the data.

    Returns:
        pandas.DataFrame: The ``data_usage`` table with an added ``cost``
        column, sorted by ``department`` and then ``cost``, both descending.
    """
    usage = data_usage(df)
    hours_used = usage['data_timestamp'] / samples_per_hour
    usage['cost'] = hours_used * usage['cost_per_hour']
    return usage.sort_values(['department', 'cost'], ascending=False)

In [27]:
# Cost per (department, size); within each department the most expensive size is listed first.
calc_cost(df)

Unnamed: 0,department,size,cost_per_hour,data_timestamp,cost
13,Sales,g3s.xlarge,0.75,15864,991.5
15,Sales,r5.2xlarge,0.504,16704,701.568
14,Sales,m5d.xlarge,0.226,14128,266.077333
16,Sales,t3a.medium,0.0376,3423,10.7254
11,Operations,t3a.2xlarge,0.3008,916,22.961067
12,Operations,t3a.medium,0.0376,47,0.147267
9,Marketing,t3.xlarge,0.1664,3282,45.5104
8,Marketing,t3.large,0.0832,1104,7.6544
10,Marketing,t3a.medium,0.0376,13,0.040733
6,IT,r5a.2xlarge,0.452,8737,329.093667


# 3.1 Which fields are important to find out if an appliance is idle - meaning that an appliance is running but no action is performed on it? Sort the correct values in alphabetic order, before submitting your response.

# 3.2 Which appliances were idle and when?

In [28]:
def check_idle(df):
    """Flag every row as idle or not from its vCPU and network readings.

    The per-size ``net_max`` from ``maximum_network`` is left-joined on
    ``size``; then three columns are derived row by row, in this order:
    ``network_idle`` (``check_network``), ``vcpu_idle`` (``check_vcpu``) and
    ``idle`` (``both_idle``, which reads the two previous columns).

    Args:
        df (pandas.DataFrame): Frame with the columns ``size``, ``type``,
            ``vcpu``, ``net_in`` and ``net_out``.

    Returns:
        pandas.DataFrame: The joined frame with the columns ``net_max``,
        ``network_idle``, ``vcpu_idle`` and ``idle`` appended.
    """
    with_capacity = df.merge(maximum_network(df), how="left", on="size")
    # assign() evaluates its keyword arguments in order, so the 'idle' lambda
    # already sees the two flag columns computed before it.
    return with_capacity.assign(
        network_idle=lambda frame: frame.apply(check_network, axis=1),
        vcpu_idle=lambda frame: frame.apply(check_vcpu, axis=1),
        idle=lambda frame: frame.apply(both_idle, axis=1),
    )

In [29]:
def both_idle(row):
    """Combine the vCPU and network flags of one row into a yes/no verdict.

    Args:
        row: Mapping-like row providing ``vcpu_idle`` and ``network_idle``.

    Returns:
        str: ``'yes'`` when both flags equal ``'idle'``, otherwise ``'no'``.
    """
    # The generator is evaluated lazily: 'network_idle' is only read when
    # 'vcpu_idle' already matched.
    flags_match = all(
        row[flag] == 'idle' for flag in ('vcpu_idle', 'network_idle')
    )
    return 'yes' if flags_match else 'no'

In [30]:
def check_vcpu(row):
    """Flag one row as vCPU-idle based on a per-type threshold.

    Rows whose ``type`` is ``'deeplearning'`` count as idle below a ``vcpu``
    value of 10; every other type counts as idle below 5.

    Args:
        row: Mapping-like row providing ``type`` and ``vcpu``.

    Returns:
        str: ``'idle'`` when ``vcpu`` is below the threshold, otherwise ``''``.
    """
    threshold = 10 if row['type'] == 'deeplearning' else 5
    return 'idle' if row['vcpu'] < threshold else ''

In [31]:
def check_network(row):
    """Flag one row as network-idle relative to its ``net_max`` reference.

    A row is idle when ``net_in + net_out`` is strictly below 2% of
    ``net_max``.

    Args:
        row: Mapping-like row providing ``net_in``, ``net_out`` and
            ``net_max``.

    Returns:
        str: ``'idle'`` when the traffic is below the limit, otherwise ``''``.
    """
    traffic = row['net_in'] + row['net_out']
    idle_limit = row['net_max'] * 0.02
    return 'idle' if traffic < idle_limit else ''

In [32]:
def maximum_network(df):
    """Compute a per-size network reference value ``net_max``.

    For every ``size`` the maximum ``net_in`` and the maximum ``net_out`` are
    taken independently and added together; the two maxima need not come from
    the same row.

    Args:
        df (pandas.DataFrame): Frame with the columns ``size``, ``net_in``
            and ``net_out``.

    Returns:
        pandas.DataFrame: One row per size with the columns ``size`` and
        ``net_max``.
    """
    peaks = (
        df[['size', 'net_in', 'net_out']]
        .groupby('size', as_index=False)
        .agg("max")
    )
    peaks = peaks.assign(net_max=peaks['net_in'] + peaks['net_out'])
    return peaks[['size', 'net_max']]

In [33]:
def export_idle_verbose(df):
    """Return the idle verdict per appliance and timestamp with its inputs.

    Runs ``check_idle`` and keeps the verdict together with the readings it
    was derived from, removing duplicate rows.

    Args:
        df (pandas.DataFrame): Frame accepted by ``check_idle`` that also has
            the columns ``id`` and ``data_timestamp``.

    Returns:
        pandas.DataFrame: Columns ``id``, ``data_timestamp``, ``idle``,
        ``vcpu_idle``, ``vcpu``, ``network_idle``, ``net_in``, ``net_out`` and
        ``net_max``, sorted by ``data_timestamp`` and then ``id``, both
        descending.
    """
    report_columns = [
        'id', 'data_timestamp', 'idle',
        'vcpu_idle', 'vcpu',
        'network_idle', 'net_in', 'net_out', 'net_max',
    ]
    report = check_idle(df)[report_columns].drop_duplicates()
    return report.sort_values(['data_timestamp', 'id'], ascending=False)

In [34]:
def export_idle_final(df):
    """Return only the idle verdict per appliance and timestamp.

    Args:
        df (pandas.DataFrame): Frame accepted by ``export_idle_verbose``.

    Returns:
        pandas.DataFrame: The ``id``, ``data_timestamp`` and ``idle`` columns
        of the verbose report, in the same row order.
    """
    verbose_report = export_idle_verbose(df)
    return verbose_report[['id', 'data_timestamp', 'idle']]

In [35]:
# Preview the idle report including the readings behind each verdict (latest timestamps first).
export_idle_verbose(df).head()

Unnamed: 0,id,data_timestamp,idle,vcpu_idle,vcpu,network_idle,net_in,net_out,net_max
72777,i-0b981c18a1e484b20,2023-01-23 11:55:00,yes,idle,0.45,idle,42345.8,28643.8,18322940.0
36057,i-0b464aceb0dd373fc,2023-01-23 11:55:00,no,idle,6.535667,,1923002000.0,2217275000.0,6278642000.0
0,i-0a16592fdb1239d51,2023-01-23 11:55:00,yes,idle,0.827669,idle,30741.2,24289.8,348561200.0
94170,i-091151b2c9f6411a5,2023-01-23 11:55:00,yes,idle,1.148337,idle,31561.2,22434.6,463863000.0
9846,i-080c53b1618bf299a,2023-01-23 11:55:00,yes,idle,0.396667,idle,193299.8,489680.4,3142678000.0


In [36]:
# Preview the condensed idle report: id, timestamp and verdict only.
export_idle_final(df).head()

Unnamed: 0,id,data_timestamp,idle
72777,i-0b981c18a1e484b20,2023-01-23 11:55:00,yes
36057,i-0b464aceb0dd373fc,2023-01-23 11:55:00,no
0,i-0a16592fdb1239d51,2023-01-23 11:55:00,yes
94170,i-091151b2c9f6411a5,2023-01-23 11:55:00,yes
9846,i-080c53b1618bf299a,2023-01-23 11:55:00,yes


# 3.3.1 How much cost did the appliances generate in the idle state?

# 3.3.2 Compared to the total cost generated overall, what percentage is attributable to the idle appliances?