In [6]:
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines){
    return false;
}

<IPython.core.display.Javascript object>

In [7]:
# ignoring warnings

import warnings
warnings.filterwarnings('ignore')

In [12]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import random
from random import randrange
import progressbar
from sklearn.cluster import AgglomerativeClustering as agc
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from statsmodels.tsa.arima_model import ARIMA

SELECT_NUMBER = 1000

random.seed(1)
x_axis = np.arange(0, 29-1/288, 1/288).tolist()
np_cpu = np.load("dataset//google-cpu-full.npy")

In [15]:
selected_machines = pd.read_csv("derived_dataset//df_updated_selected_machines.csv", header = None)
selected_machines = selected_machines[1]

# df_cpu is a dataframe with all the cpu utilisation data (columns = machine; rows = time)
# dropping last 32 lines with corrupted data

df_cpu = pd.DataFrame(np_cpu).transpose()
df_selected_machines = df_cpu[selected_machines].drop(df_cpu.tail(32).index)

In [16]:
def kmeans_cluster_map(cpu_t, number_of_cluster = 50):
    # takes in cpu at a single time index and creates a map of cluster index to list of machine index in the cluster
    # cpu_t is a pd.series with values of cpu at time t 
    
    # get machine index
    machine_index = cpu_t.columns
    number_of_machine = len(machine_index)
    
    # converting cpu_t to appropriate data structure
    cpu_t = cpu_t.mean().values.reshape(-1,1)
    
    # computing the clusters
    kmeans = KMeans(n_clusters=number_of_cluster, random_state = 0)
    clustering_output = kmeans.fit(cpu_t).labels_
    
    # initialising a map for clusters
    cluster_map = {}
    for x in range(0,number_of_cluster):
        cluster_map[x] = []

    # get a map for each cluster
    # key is cluster index; values is list of machines in that cluster
    for x in range(0,number_of_machine):
        cluster_map[clustering_output[x]].append(machine_index[x])
        
    return cluster_map

def arima_predictions(timeseries, input_arima_order = (3,0,0)):
    # takes in a timeseries
    # outputs a single prediction in the next timestep
    
    timeseries = timeseries.values
    model = ARIMA(timeseries, order = input_arima_order)
    model_fit = model.fit(disp=0)
    prediction = model_fit.forecast()[0][0]
    
    return prediction

def mse_cluster_prediction(cpu_data, input_arima_order, past_error):
    # generate predictions for all machines in cluster at next single timestep
    # returns a map of machine to predictions
    
    # get generalisation model (analogous to best_machine in correlation_prediction)
    general_model = cpu_data.mean(axis = 1)
    
    # make predictions on best machine
    general_model_prediction = arima_predictions(general_model, input_arima_order = input_arima_order)
    
    # get the rolling_error
    rolling_error = past_error.mean()
    
    # scale general_model to fit all the other machines
    cluster_prediction = dict()
    for index, machine_index in enumerate(cpu_data.columns):
        cluster_prediction[machine_index] = general_model_prediction + rolling_error[machine_index]
        
    return cluster_prediction    

def mse_predictions(cpu_data, number_of_cluster = 50, start_time = 288, end_time = None, past_error_range = 5, rolling_cluster_window = 5):
    ''' high level function for making predictons using the tuor framework

        param:  cpu_data:   df with CPU index as columns and time-step as rows, must contain time-step for at least [start_time-3:end_time-2]
                            ARIMA models are trained with all time-steps before the "prediction time-step" i.e. prediction x[n] is made using data from x[:n]
                            Predictions are made for all machines index included in cpu_data, to exclude predicting certain machine index, drop from cpu_data
        param:  rolling_cluster_window:     number of past values to be considered for clustering at each time-step
        param:  number_of_cluster : number of clusters used for clustering per time-step
        param:  start_time, end_time :  prediction is made for from start_time to (end_time - 1) inclusive
                                     :  keep end_time = None to make predictions for the entire length of cpu_data i.e. predictions made from x[start_time:]
        param: past_error_range  :  predictions are offset using a rolling error window of size (defined here) using past errors   

    returns : df matrix of prediction with columns as machine and rows as time index
    '''
    
    arima_order = (3,0,0)
    machine_index = cpu_data.columns
    number_of_machine = len(cpu_data.columns)
    all_predictions = pd.DataFrame(columns = cpu_data.columns)
    df_past_error = pd.DataFrame(0, columns = cpu_data.columns, index = np.arange(0,past_error_range))
    
    if end_time == None:
        end_time = len(cpu_data.index) - 1
    
    with progressbar.ProgressBar(max_value = end_time-start_time) as bar:
        for current_time in range(start_time, end_time):
    
            bar.update(current_time-start_time)
        
            # reset current prediction map
            curr_all_machine_pred = {}

            # perform clustering at current time index
            # note clustering is only done using average of last "rolling_cluster_window" points
            cluster = kmeans_cluster_map(cpu_data.iloc[current_time - rolling_cluster_window + 1: current_time + 1], 
                                         number_of_cluster = number_of_cluster)

            # initialise dict for machine to prediction in current timestep
            curr_all_machine_pred = dict()

            for ls_machine_in_cluster in cluster.values():
#                 print("ls_machine_in_cluster : ", ls_machine_in_cluster)
                
                # make predictions for all machine in each cluster
                cluster_predictions = mse_cluster_prediction(cpu_data = cpu_data[ls_machine_in_cluster], 
                                                             input_arima_order = (3,0,0), 
                                                             past_error = df_past_error[ls_machine_in_cluster])
            
#                 print("cluster_predictions : ", cluster_predictions)
                
                # curr_all_machine_pred is a dict with key = machine, value = current timestep prediction
                curr_all_machine_pred = {**curr_all_machine_pred, **cluster_predictions}
                
#             print("curr_all_machine_pred : ", curr_all_machine_pred)

            # update df_past_error with most updated time index
            # start by getting the current error
            curr_all_machine_error = cpu_data.iloc[current_time+1] - pd.DataFrame(curr_all_machine_pred, index = [current_time+1])
#             print("curr_all_machine_error : ", curr_all_machine_error)
            
            df_past_error = df_past_error.append(curr_all_machine_error, 
                                                 ignore_index = True)
            df_past_error = df_past_error.drop(0).reset_index().drop("index", axis = 1)
#             print("df_past_error : ", df_past_error)
            
            # append the current predicition to all the predictions
            current_df = pd.DataFrame(curr_all_machine_pred, index = [current_time+1])
#             print("current_df : ", current_df)
            
            all_predictions = all_predictions.append(current_df, sort = True)
        
    return all_predictions

In [17]:
rolling_5 = mse_predictions(df_selected_machines, 
                                   number_of_cluster = 50, 
                                   start_time = 288, 
                                   end_time = 288+72,
                                   past_error_range = 5, 
                                   rolling_cluster_window = 5)

rolling_3 = mse_predictions(df_selected_machines, 
                                   number_of_cluster = 50, 
                                   start_time = 288, 
                                   end_time = 288+72,
                                   past_error_range = 3, 
                                   rolling_cluster_window = 3)

rolling_7 = mse_predictions(df_selected_machines, 
                                   number_of_cluster = 50, 
                                   start_time = 288, 
                                   end_time = 288+72,
                                   past_error_range = 7, 
                                   rolling_cluster_window = 7)

100% (72 of 72) |########################| Elapsed Time: 0:17:02 Time:  0:17:02
100% (72 of 72) |########################| Elapsed Time: 0:16:55 Time:  0:16:55
100% (72 of 72) |########################| Elapsed Time: 0:17:12 Time:  0:17:12


In [25]:
((rolling_3.reset_index() - df_selected_machines[289:361].reset_index())**2).mean().mean()

0.014649238160274419

In [21]:
((rolling_5 - df_selected_machines[289:361])**2).mean().mean()

0.014942982578305135

In [22]:
((rolling_7 - df_selected_machines[289:361])**2).mean().mean()

0.01534071651013925