In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import json

In [17]:
filename = 'data/stressng_kepler_query.json'
# Use a context manager so the file handle is closed even if JSON parsing fails.
with open(filename) as file:
    data = json.load(file)

In [18]:
# Flatten the parsed query result into one long DataFrame per metric of interest.
# Each series carries its labels under 'metric' and its samples under 'values';
# the first two elements of every sample become the 'time' and 'value' columns.
result = {}

for metric in data:
    temp = []
    if metric == 'kepler_container_joules_total':
        for container_metric in data[metric]:
            labels = container_metric['metric']
            tmp = [
                [labels['container_id'], labels['instance'], values[0], values[1]]
                for values in container_metric['values']
            ]
            temp.append(pd.DataFrame(tmp, columns=['container_id', 'node', 'time', 'value']))
    elif metric == 'node_cpu_seconds_total':
        for node_metric in data[metric]:
            labels = node_metric['metric']
            # Series whose mode is "idle" are excluded; every other mode is kept.
            if labels['mode'] == "idle":
                continue
            tmp = [
                [labels['instance'], labels['cpu'], values[0], values[1]]
                for values in node_metric['values']
            ]
            temp.append(pd.DataFrame(tmp, columns=['node', 'cpu', 'time', 'value']))
    elif metric == 'kepler_node_core_joules_total':
        for node_metric in data[metric]:
            labels = node_metric['metric']
            # The 'package' label is stored in the 'cpu' column so this frame has
            # the same column layout as the node_cpu_seconds_total frame.
            tmp = [
                [labels['instance'], labels['package'], values[0], values[1]]
                for values in node_metric['values']
            ]
            temp.append(pd.DataFrame(tmp, columns=['node', 'cpu', 'time', 'value']))
    else:
        # Metrics other than the three above are ignored.
        continue
    if not temp:
        # pd.concat raises ValueError on an empty list (e.g. every series was
        # skipped), so a metric with nothing to concatenate is left out of `result`.
        continue
    t = pd.concat(temp)
    # Sample values are converted to float before any arithmetic is done on them.
    t['value'] = t['value'].astype(float)
    result[metric] = t.reset_index(drop=True)

In [19]:
def get_node_dataframe(data, node_label='172.19.0.2:9100'):
    """Merge the three per-metric frames into one frame keyed by (time, node).

    Parameters
    ----------
    data : dict
        Maps metric name to a DataFrame with at least 'node', 'time' and
        'value' columns. The keys 'kepler_container_joules_total',
        'node_cpu_seconds_total' and 'kepler_node_core_joules_total' are read.
    node_label : str, optional
        Value written into the 'node' column of the two kepler frames before
        merging, so that they join against the 'node' values of the
        node_cpu_seconds_total frame. Defaults to the address that was
        previously hard-coded here.

    Returns
    -------
    pandas.DataFrame
        Columns 'node', 'time', 'util', 'energy', 'power', where 'util' is the
        summed node_cpu_seconds_total value, 'energy' the summed
        kepler_container_joules_total value and 'power' the summed
        kepler_node_core_joules_total value for each (node, time) pair.
        Only pairs present in all three frames are kept (pd.merge default join).
    """
    def _sum_per_node_and_time(frame):
        # Collapse all series sharing a (node, time) pair into a single row.
        return frame.groupby(['node', 'time'], as_index=False)['value'].sum()

    energy_agg = _sum_per_node_and_time(data['kepler_container_joules_total'])
    cpu_util_agg = _sum_per_node_and_time(data['node_cpu_seconds_total'])
    node_power = _sum_per_node_and_time(data['kepler_node_core_joules_total'])

    # The groupby results are new frames, so relabelling them does not touch
    # the caller's inputs.
    node_power['node'] = node_label
    energy_agg['node'] = node_label

    final = pd.merge(cpu_util_agg, energy_agg, on=['time', 'node'])
    final = final.rename(columns={'value_x': 'util', 'value_y': 'energy'})
    final = final.sort_values('time')

    final = pd.merge(final, node_power, on=['time', 'node'])
    return final.rename(columns={'value': 'power'})


In [54]:
# Merge the per-metric frames into one frame with util / energy / power columns.
node_df = get_node_dataframe(result)

In [55]:
# import model

In [56]:
# m = model.UtilisationPowerModel()
# node_power_models = {}
# for node in final['node'].unique():
#     node_df = final[final['node'] == node]
#     node_df['util'] = node_df['util'].diff()
#     node_df['energy'] = node_df['energy'].diff()
#     node_df['power'] = node_df['power'].diff()
#     node_df.fillna(0, inplace=True)
#     node_df = node_df[(node_df[['util']] != 0).all(axis=1)]
#     max_util = max(node_df['util'])
#     node_df['util'] = node_df['util']/max_util
#     node_power_models[node] = m.get_model(node_df[['util', 'power']])

In [57]:
# Work on a copy: the following cell modifies `data` in place, and without the
# copy those edits would also change node_df and make the cells non-rerunnable.
# Note that this rebinds `data`, which held the parsed JSON in earlier cells.
data = node_df.copy()

In [58]:
# Turn the summed 'util' values into per-sample differences and scale them so
# the largest difference is 1. Everything is derived from node_df without
# modifying it, so this cell gives the same result each time it is run.
data = node_df.assign(util=node_df['util'].diff()).fillna(0)
# Drop rows whose difference is zero (this includes the first row, whose NaN
# difference was filled with 0). The explicit copy makes the filtered frame
# independent, so the assignment below does not write into a slice.
data = data[data['util'] != 0].copy()
max_util = data['util'].max()
data['util'] = data['util'] / max_util

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[:,'util'] = data.loc[:,'util']/max_util


In [59]:
# Inspect the 'util' column after scaling by its maximum.
data.loc[:,"util"]

4       0.181533
7       0.178549
10      0.187029
14      0.179491
17      0.181062
          ...   
7187    0.027324
7190    0.023555
7194    0.024497
7197    0.025911
7200    0.023241
Name: util, Length: 2160, dtype: float64

In [61]:
# Display the filtered frame (rows with a non-zero 'util' difference).
data

Unnamed: 0,node,time,util,energy,power
4,172.19.0.2:9100,1699436712,0.181533,202347.903,560187.288
7,172.19.0.2:9100,1699436721,0.178549,202392.678,560302.758
10,172.19.0.2:9100,1699436730,0.187029,202428.096,560394.099
14,172.19.0.2:9100,1699436742,0.179491,202487.826,560548.161
17,172.19.0.2:9100,1699436751,0.181062,202529.511,560655.657
...,...,...,...,...,...
7187,172.19.0.2:9100,1699458261,0.027324,245354.406,1104962.292
7190,172.19.0.2:9100,1699458270,0.023555,245376.498,1105019.226
7194,172.19.0.2:9100,1699458282,0.024497,245406.747,1105097.130
7197,172.19.0.2:9100,1699458291,0.025911,245429.421,1105155.576


In [62]:
def roundup(i):
    """Round `i` to the nearest multiple of 10, with ties going upwards.

    Examples: 14 -> 10, 15 -> 20, 18.2 -> 20, -3 -> 0, -7 -> -10.

    Parameters
    ----------
    i : int or float
        Value to round.

    Returns
    -------
    int
        The multiple of 10 closest to `i`.
    """
    # Index of the width-5 interval containing i. math.floor (rather than int(),
    # which truncates toward zero) keeps the even/odd test correct for negative i.
    r = math.floor(i / 5)
    if r % 2 == 0:
        # Lower half of a decade: round down.
        return math.floor(i / 10) * 10
    else:
        # Upper half of a decade: round up.
        return math.ceil(i / 10) * 10

In [63]:
# Count how many samples fall into each bucket 0, 10, ..., 100, where a sample's
# bucket is its 'util' value times 100 passed through roundup().
hist = dict.fromkeys(range(0, 101, 10), 0)
for util_fraction in data['util']:
    bucket = roundup(util_fraction*100)
    hist[bucket] = hist[bucket] + 1

In [64]:
# Show the number of samples counted in each bucket.
hist

{0: 248,
 10: 32,
 20: 805,
 30: 242,
 40: 52,
 50: 322,
 60: 12,
 70: 156,
 80: 104,
 90: 34,
 100: 153}

In [72]:
# Threshold used to flag over-populated buckets: 1.7 times the mean bucket count.
mean_bucket_count = sum(hist.values())/len(hist)
degree_of_imbalance = mean_bucket_count*1.7
degree_of_imbalance

333.8181818181818

In [73]:
# Find oversampled region
over_sampled_region = []
for i in hist:
    if hist[i] > degree_of_imbalance:
        over_sampled_region.append(i)
over_sampled_region

[20]

In [74]:
def reduce_data(data, size):
    """Return the first `size` rows of `data`, selected by position.

    Positional slicing is used instead of `.loc[:size]`, which slices by index
    label (inclusive) and therefore returns an index-dependent number of rows
    once rows have been filtered out and the index is no longer 0..n-1.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Object to truncate.
    size : int
        Maximum number of rows to keep.

    Returns
    -------
    Same type as `data`
        At most `size` leading rows, with their original index labels.
    """
    return data.iloc[:size]

In [None]:
for i in over_sampled_region:
    