In [5]:
# In Terminal, "pip install ibis-framework[duckdb]"
import os
import ibis
# Column selectors; used below as s.matches(...) to drop duplicated join keys.
from ibis import selectors as s
# Deferred table reference; used below inside filter/aggregate/mutate expressions.
from ibis import _
# Interactive mode (per the ibis docs): table expressions are evaluated and
# rendered when a cell displays them.
ibis.options.interactive = True


In [6]:
# Paths: `path` is the project root relative to this notebook. `data_path`
# keeps its trailing slash because later cells append file names with `+`.
path = "../"
# expanduser only rewrites a leading "~", so a relative root passes through
# unchanged; it is kept so that `path` can be switched to a "~/..." location.
data_path = os.path.expanduser(path + "data/datathon/")

In [7]:
# Show the resolved location of the parquet file as a quick sanity check
f"{data_path}vgsales.parquet"

'../data/datathon/vgsales.parquet'

In [8]:
# Source tables: video-game sales, once from CSV and once from parquet
vgsales_original = ibis.read_csv(f"{data_path}vgsales.csv")
vgsales_long = ibis.read_parquet(f"{data_path}vgsales.parquet")

In [9]:
# Source tables for the Data Platform questions, one parquet file per table
pricing, department, metrics = (
    ibis.read_parquet(f"{data_path}{table_name}.parquet")
    for table_name in ("pricing", "department", "metrics")
)

In [10]:
# Peek at the first three pricing rows
pricing.limit(3)

### 1.2 How many departments use the appliances of the Data Platform?

In [11]:
# Attach department attributes to every metrics row (left join on `id`),
# then drop the duplicated join key coming from the right-hand table.
metrics_department = (
    metrics
    .join(department, "id", how="left")
    .select(~s.matches("id_right"))
)
metrics_department.head(3)

In [12]:
# One row per distinct (id, department, disk_size, type, size) combination
metrics_unique = (
    metrics_department
    .select("id", "department", "disk_size", "type", "size")
    .distinct()
)
metrics_unique.head(3)

In [13]:
# Number of distinct appliance configurations per department, largest first
t = (
    metrics_unique
    .group_by("department")
    .aggregate(count_department=_["department"].count())
)
t.order_by(_["count_department"].desc())

### 1.3 What is the most popular appliance size used by all departments? And how many of those popular sizes did you find in the whole dataset?

In [14]:
# Number of distinct appliance configurations per size, largest first
t = (
    metrics_unique
    .group_by("size")
    .aggregate(count_size=_["size"].count())
)
t.order_by(_["count_size"].desc())

### 2.1 Which is the most popular appliance type per department?

In [15]:
# Count appliance configurations per (department, type); within each
# department the most frequent type is listed first.
t = (
    metrics_unique
    .group_by(["department", "type"])
    .aggregate(type_per_department=_["type"].count())
)
t.order_by(["department", ibis.desc("type_per_department")])

### 2.2 Which appliance size had the lowest vCPU utilization over the full time range of the dataset, based on the listed metrics? Calculate a value with 6 digits after the decimal point for each metric:

In [16]:
# De-duplicate vCPU readings: one row per (id, data_timestamp, size, vcpu)
vcpu_by_size = (
    metrics
    .select("id", "data_timestamp", "size", "vcpu")
    .distinct()
)

In [17]:
# Minimum, mean and median vCPU utilisation per appliance size,
# listed from the lowest median upwards.
t = (
    vcpu_by_size
    .group_by("size")
    .aggregate(
        minimum=_["vcpu"].min(),
        average=_["vcpu"].mean(),
        median=_["vcpu"].median(),
    )
)
t.order_by(ibis.asc("median"))

### 2.3 Which department has used the most appliances between 15.12.2022 and 16.01.2023?  How many appliances did they use in this time range?

In [18]:
# Appliances per department between 15.12.2022 and 16.01.2023, both days included.
# The window is half-open [start, day after end): `>=` keeps a sample taken
# exactly at midnight on the first day, and the exclusive upper bound on
# 2023-01-17 keeps every sample taken during 16.01.2023.
# NOTE(review): assumes the end day is meant to be inclusive - confirm.
t = metrics_department.filter(
    (_.data_timestamp >= "2022-12-15") & (_.data_timestamp < "2023-01-17")
)
# Reduce to one row per appliance (id) and department before counting.
t = t.group_by(["id", "department"]).aggregate()
t = t.group_by("department").aggregate(appliance_count=_.id.count())
t.order_by(ibis.desc("appliance_count"))

In [19]:
# Bracket-style (`_["col"]`) variant of the department ranking for the window
# 15.12.2022 - 16.01.2023, both days included.
# The window is half-open [start, day after end): `>=` keeps a sample taken
# exactly at midnight on the first day, and the exclusive upper bound on
# 2023-01-17 keeps every sample taken during 16.01.2023.
# NOTE(review): assumes the end day is meant to be inclusive - confirm.
t = metrics_department.filter(
    (_["data_timestamp"] >= "2022-12-15") & (_["data_timestamp"] < "2023-01-17")
)
# Reduce to one row per appliance (id) and department before counting.
t = t.group_by(["id", "department"]).aggregate()
t = t.group_by("department").aggregate(appliance_count=_["id"].count())
t.order_by(ibis.desc("appliance_count"))

### 2.4 What is the most expensive size of an appliance used in the Data Platform in terms of hours used per department?

In [20]:
# Add pricing columns to each metrics/department row (left join on `size`),
# dropping the duplicated `size` key from the pricing side.
metrics_extended = (
    metrics_department
    .join(pricing, "size", how="left")
    .select(~s.matches("size_right"))
)
metrics_extended.head(3)

In [21]:
# Step 1: keep one row per (department, size, cost_per_hour, data_timestamp).
# Step 2: count how many timestamps remain for each (department, size, cost_per_hour).
data_usage = (
    metrics_extended
    .select("department", "size", "cost_per_hour", "data_timestamp")
    .distinct()
    .group_by(["department", "size", "cost_per_hour"])
    .aggregate(timestamp_count=_["data_timestamp"].count())
)
data_usage.order_by(ibis.desc("timestamp_count"))

In [22]:
# Convert timestamp counts into hours and multiply by the hourly price.
# NOTE(review): the divisor 12 presumably means 12 samples per hour
# (5-minute sampling) - confirm against the metrics source.
calc_cost = data_usage.mutate(
    cost=_["timestamp_count"] / 12 * _["cost_per_hour"]
)
calc_cost.order_by([ibis.desc("department"), ibis.desc("cost")])

### 3.1 Which fields are important to find out if an appliance is idle, meaning that the appliance is running but no action is performed on it? Sort the correct values in alphabetical order before submitting your response.

### 3.2 Which appliances were idle and when?

In [23]:
# Peak inbound and outbound network traffic per appliance size; `max_network`
# is the sum of both peaks and is used later as the reference level.
r1 = metrics.select("size", "net_in", "net_out")
r2 = r1.group_by("size").aggregate(
    max_net_in=_["net_in"].max(),
    max_net_out=_["net_out"].max(),
)
max_network = r2.mutate(max_network=_["max_net_in"] + _["max_net_out"])
max_network.head(3)

In [24]:
# Attach the per-size network peaks to every metrics row and compute the
# row's own total network traffic (`net_usage` = inbound + outbound).
r1 = (
    metrics
    .join(max_network, "size", how="left")
    .select(~s.matches("size_right"))
    .mutate(net_usage=_["net_in"] + _["net_out"])
)
r1.head(3)

In [25]:
# Classify every metrics row as "idle" or "running" on two signals:
#   * network: total traffic below 2 % of the peak observed for that size
#   * vCPU:    below 10 for "deeplearning" appliances, below 5 for all others
r2 = r1.mutate(
    check_network=(
        ibis.case()
        .when(r1.net_usage < r1.max_network * 0.02, "idle")
        .else_("running")
        .end()
    ),
    check_vcpu=(
        ibis.case()
        .when((r1.type == "deeplearning") & (r1.vcpu < 10), "idle")
        .when(r1.vcpu < 5, "idle")
        .else_("running")
        .end()
    ),
)
# A row counts as idle only when BOTH signals say idle.
r3 = r2.mutate(
    check_idle=(
        ibis.case()
        .when((r2.check_network == "idle") & (r2.check_vcpu == "idle"), "idle")
        .else_("running")
        .end()
    )
)
# The question asks which appliances were idle and when, so keep the idle rows
# (the previous filter kept the "running" rows instead).
r3.filter(_.check_idle == "idle").select(
    _.id, _.data_timestamp, _.check_network, _.check_vcpu, _.check_idle
)