Job Events Inter-arrivals
==
(Run every cell above the **Usage** section before running the usage examples)
--

In [None]:
import numpy
import math
from utils import math_utils as mu
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
from utils import plot_utils as pu
from utils import output_utils as ou

output_notebook()

# Read and parse the job-event traces of the dataset.
# Columns of each raw CSV record:
#   0: timestamp
#   1: missing info
#   2: job ID
#   3: event type
#   4: user name
#   5: scheduling class
#   6: job name
#   7: logical job name

# Every gzip archive found in the directory is loaded
j_google_traces = sc.textFile("../job_events/*.gz")

# Column 1 ("missing info") is dropped for the sake of simplicity, so each
# element of the parsed RDD is a tuple laid out as follows:
#   0: timestamp (int)
#   1: job ID (int)
#   2: event type (int)
#   3: user name
#   4: scheduling class (int)
#   5: job name
#   6: logical job name
j_google_traces_RDD = (
    j_google_traces
    .map(lambda record: record.split(","))
    .map(lambda fields: (
        int(fields[0]),
        int(fields[2]),
        int(fields[3]),
        fields[4],
        int(fields[5]),
        fields[6],
        fields[7],
    ))
    .cache()
)

# Show one parsed record as a sanity check
print(j_google_traces_RDD.take(1))

In [None]:
# Drop the traces that fall outside the trace window: those that occurred before
# the beginning of the window (timestamp = 0) and those that occurred after the
# end of the window (timestamp = 2^63 - 1).
# The surviving traces are keyed by timestamp, as (timestamp, (job ID, event type)),
# and sorted in ascending timestamp order.
#
# NOTE: in Python `^` is the bitwise XOR operator, not exponentiation, so the
# sentinel has to be written with `**` (`2^63 - 1` evaluates to 60).
J_TRACE_END_TIMESTAMP = 2**63 - 1

j_filtered_google_traces_RDD = j_google_traces_RDD\
    .map(lambda elem: (elem[0], (elem[1], elem[2])))\
    .filter(lambda elem: elem[0] != 0 and elem[0] != J_TRACE_END_TIMESTAMP)\
    .sortByKey(ascending=True, numPartitions=1)

# Report how many traces the filter discarded, then show the earliest one kept
j_removed_traces = j_google_traces_RDD.count() - j_filtered_google_traces_RDD.count()
print("%d Traces were removed" % j_removed_traces)
print(j_filtered_google_traces_RDD.take(1))

Main Functions
---

In [None]:
# Evaluate the job-event inter-arrival times over a time window, plot the
# resulting statistics and write the inter-arrival values to a CSV file.
#
# This function takes as input
# event_type: int or None, the event type found in the traces to select;
#             when it is None the traces of every event type are kept
# init_time: int (in seconds!), it is the time from which we want to evaluate the model (inclusive)
# finish_time: int (in seconds!), it is the time when we want to stop the evaluation (exclusive)
# granularity: int (in seconds!), defines the level of granularity for plotting the results of the model
# E.G. over a window of 200 seconds we may have a granularity of 10 seconds,
# which means that the derived traces will be clustered, based on their timestamps,
# in groups following that granularity
# For example, assume to start from time = 0 till time = 200
# cluster 1: time interval 0-10
# cluster 2: time interval 10-20
# ...
# cluster 20: time interval 190-200
#
# The function returns nothing: its results are the plots and the CSV file.

def j_eval_time_window(event_type, init_time, finish_time, granularity):
    # NOTE(review): presumably converts the inputs from seconds to the unit used by
    # the trace timestamps - confirm against utils.math_utils.adjust_values
    init_time, finish_time, granularity = mu.adjust_values(init_time, finish_time, granularity)

    # First of all we select only the traces that correspond to the event_type in input.
    # The time-window filter below must be chained on this result: starting again from
    # the unfiltered RDD would silently discard the event-type selection.
    j_selected_traces_RDD = j_filtered_google_traces_RDD
    if event_type is not None:
        j_selected_traces_RDD = j_selected_traces_RDD.filter(lambda elem: elem[1][1] == event_type)

    # Keep only the timestamps that fall inside [init_time, finish_time)
    j_eval_traces_RDD = j_selected_traces_RDD\
        .filter(lambda elem: init_time <= elem[0] < finish_time)\
        .map(lambda elem: elem[0])

    # Collect the RDD to get a python list
    j_eval_traces_list = j_eval_traces_RDD.collect()
    j_evaluated_means_list = []
    # This value always contains the lower bound of the current cluster
    # Its initial value is obviously the init_time
    j_lower_g = init_time
    # Number of clusters needed to cover the window: the ceiling of window/granularity.
    # A window that is an exact multiple of the granularity gets no extra, always-empty
    # cluster past finish_time; a partial trailing interval still gets its own cluster.
    j_n_cluster = int(math.ceil(float(finish_time - init_time) / granularity))

    j_interval_values_list = []
    # We iterate to create each cluster
    for _ in range(j_n_cluster):
        j_upper_g = j_lower_g + granularity
        # Define the cluster by filtering the derived traces
        j_cluster_traces = [timestamp for timestamp in j_eval_traces_list
                            if j_lower_g <= timestamp < j_upper_g]
        # We then append to our list an entry of this format
        # [cluster_lower_bound, cluster_upper_bound, mean_time_between_jobs]
        j_evaluated_means_list.append([j_lower_g, j_upper_g, mu.mean_time_evaluation(sc, j_cluster_traces)])

        # j_interval_values_list contains all the inter-arrival times between
        # subsequent events among the intervals (clusters)
        j_interval_values_list.extend(mu.get_interval_values(sc, j_cluster_traces, j_lower_g, j_upper_g))
        # Move the lower bound to the next cluster
        j_lower_g = j_upper_g

    # Evaluate some metrics and plot them
    j_metrics_list = mu.evaluate_statistics(sc, j_interval_values_list)

    # Parameters for
    # plot_custom_metrics(sc, metrics_list, metric_id, trace_id, x_label, y_label, color)
    pu.plot_custom_metrics(sc, j_metrics_list, 0, "jobs", "time", "mean", "red")
    pu.plot_custom_metrics(sc, j_metrics_list, 1, "jobs", "time", "variance", "green")
    pu.plot_custom_metrics(sc, j_metrics_list, 2, "jobs", "time", "median", "blue")
    pu.plot_custom_metrics(sc, j_metrics_list, 3, "jobs", "time", "standard deviation", "orange")

    # Write on CSV
    ou.write_csv(j_interval_values_list, event_type, init_time, finish_time, granularity)
    # Remove the NaNs and substitute them with 0s
    j_evaluated_means_list = mu.removeNans(j_evaluated_means_list)

    # Calculate the average time for the whole period, reusing the list collected
    # above instead of triggering a second Spark job for the same data
    j_mean_time_whole_period = mu.mean_time_evaluation(sc, j_eval_traces_list)
    # Plot inter-arrivals
    pu.plot_inter_arrivals(sc, j_evaluated_means_list, j_mean_time_whole_period)
    
# j_eval_day(event_type, day(int), granularity(seconds))
# Evaluate a single day (24 h): the day is translated into a time window by
# mu.get_day_function_parameters and handed over to j_eval_time_window
def j_eval_day(event_type, day, granularity):
    init_time, finish_time = mu.get_day_function_parameters(day)
    # If either bound is None an error occurred, so there is nothing to evaluate
    if init_time is None or finish_time is None:
        return
    j_eval_time_window(event_type, init_time, finish_time, granularity)

# j_eval_days(event_type, init_day(int), finish_day(int), granularity(seconds))
# Evaluate multiple days: the day range is translated into a time window by
# mu.get_days_function_parameters and handed over to j_eval_time_window
def j_eval_days(event_type, init_day, finish_day, granularity):
    init_time, finish_time = mu.get_days_function_parameters(init_day, finish_day)
    # If either bound is None an error occurred, so there is nothing to evaluate
    if init_time is None or finish_time is None:
        return
    j_eval_time_window(event_type, init_time, finish_time, granularity)

Usage
==

In [None]:
# Evaluate the events of type 0 from day 1 to day 30, clustering the results
# with a granularity of 21600 seconds (6 hours)
# NOTE(review): event type 0 is presumably SUBMIT in the trace schema - confirm
j_eval_days(event_type=0, init_day=1, finish_day=30, granularity=21600)