In [1]:
import numpy
import math
import sys
import os
import parser as p
import math_utils

# Load and parse the job traces from the dataset.
# The result is bound to `google_traces_RDD`, the name that this cell prints and that
# the following cells read (it was previously bound to `google_traces`, which left
# `google_traces_RDD` undefined on a fresh kernel).
# NOTE(review): the recorded output of this cell is
#   AttributeError: 'module' object has no attribute 'hello'
# so `hello` is not a function of the module imported as `p`. Replace it with the real
# loading function of the project parser (not visible from this notebook). Possibly
# `import parser` resolved to a different module named `parser` -- TODO confirm.
google_traces_RDD = p.hello()
print(google_traces_RDD.take(1))

AttributeError: 'module' object has no attribute 'hello'

In [None]:
# Remove the traces that occurred before the beginning of the trace window (timestamp = 0)
# and those that occurred after the end of the trace window (timestamp = 2**63 - 1),
# then sort the remaining ones in ascending order of timestamp.
# NOTE: in Python `^` is bitwise XOR (2^63 - 1 evaluates to 2 ^ 62 == 60), so the
# end-of-window marker has to be written with the power operator `**`.
TRACE_WINDOW_START = 0
TRACE_WINDOW_END = 2**63 - 1

# Re-key every trace as (timestamp, (elem[1], elem[2])) so it can be sorted by timestamp
filtered_google_traces_RDD = google_traces_RDD.map(lambda elem: (elem[0], (elem[1], elem[2])))\
    .filter(lambda elem: elem[0] != TRACE_WINDOW_START and elem[0] != TRACE_WINDOW_END)\
    .sortByKey(ascending=True, numPartitions=1)

# Report how many traces the filter discarded
removed_count = google_traces_RDD.count() - filtered_google_traces_RDD.count()
print("%s Traces were removed" % removed_count)

In [None]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook

# This function takes as input 
# event_type: int, corresponding to the event type found in the traces
# init_time: int (in seconds!), it is the time from which we want to evaluate the model
# finish_time: int (in seconds!), it is the time when we want to stop the evaluation
# granularity: int (in seconds!), define the level of granularity for plotting the results of the model
# E.G. over a window of 200 seconds we may have a granularity of 10 seconds
# which means that the derived traces will be clustered, based on the timestamps, in groups following
# that granularity
# For example assume to start from time = 0 till time = 200
# cluster 1: time interval 0-10
# cluster 2: time interval 10-20
# ...
# cluster 20: time interval 190-200

def eval_time_window(event_type, init_time, finish_time, granularity):
    """Plot the mean inter-arrival time of one event type over a time window.

    The window [init_time, finish_time) is split into consecutive clusters of
    width `granularity` (the last one is shortened so that it ends at
    finish_time). For every cluster the timestamps of the selected traces are
    passed to math_utils.mean_time_evaluation, NaN results are replaced by 0,
    and the values are drawn as a Bokeh line (cluster midpoint on the x axis,
    mean on the y axis).

    Reads the notebook globals `filtered_google_traces_RDD`, whose elements are
    indexed as (timestamp, (_, event_type)), and `sc` (presumably the
    SparkContext -- it is only forwarded to mean_time_evaluation).

    event_type: int, the event type to select from the traces
    init_time: start of the window (inclusive), in the same unit as the trace
        timestamps (described as seconds by the notebook -- TODO confirm)
    finish_time: end of the window (exclusive), same unit as init_time
    granularity: width of each cluster, same unit as init_time; must be > 0

    Raises ValueError if granularity is not positive.
    Returns None; the plot is shown in the notebook output.
    """
    if granularity <= 0:
        raise ValueError("granularity must be a positive number")

    # Keep only the timestamps of the requested event type that fall inside the window,
    # and collect them into a plain python list
    eval_traces_list = filtered_google_traces_RDD\
        .filter(lambda elem: elem[1][1] == event_type)\
        .filter(lambda elem: init_time <= elem[0] < finish_time)\
        .map(lambda elem: elem[0])\
        .collect()

    # Number of clusters needed to cover the window: round up so that a trailing
    # partial cluster is kept, without adding an extra empty cluster past finish_time
    # when the window is an exact multiple of the granularity
    n_cluster = int(math.ceil((finish_time - init_time) / float(granularity)))

    x_axis = []
    y_axis = []
    for i in range(n_cluster):
        # Bounds are derived from the index instead of being accumulated, so no
        # floating point error builds up across clusters
        lower_g = init_time + i * granularity
        upper_g = min(lower_g + granularity, finish_time)
        cluster_traces = [timestamp for timestamp in eval_traces_list
                          if lower_g <= timestamp < upper_g]

        # `math_utils` is the module imported at the top of the notebook; the alias
        # `mu` used previously was never defined
        mean_time = math_utils.mean_time_evaluation(sc, cluster_traces)
        # Replace the NaNs with 0s so that the line can be drawn
        if math.isnan(mean_time):
            mean_time = 0

        x_axis.append((lower_g + upper_g) / 2.0)
        y_axis.append(mean_time)

    # Plot the graph; the figure is named `plot` so that it does not shadow the
    # module alias `p`
    output_notebook()
    plot = figure(title="Mean inter-arrival time", x_axis_label='Time Window (seconds)', y_axis_label='Mean Time (seconds)')
    # NOTE(review): recent Bokeh releases replace `legend=` with `legend_label=`;
    # kept as is for compatibility with the Bokeh version this notebook targets
    plot.line(x_axis, y_axis, legend="Time", line_width=1.5)

    # Show the results
    show(plot)

In [None]:
# Evaluate event type 0 over the window [6e+8, 21e+8) using clusters of width 3e+7
eval_time_window(event_type=0, init_time=6e+8, finish_time=21e+8, granularity=3e+7)