### Sign table

In [1]:
from pyDOE2 import *
# Generate sign table for experiments using Plackett-Burman design
sign_table = pbdesign(3)
sign_table

array([[-1., -1.,  1.],
       [ 1., -1., -1.],
       [-1.,  1., -1.],
       [ 1.,  1.,  1.]])

### Data processing

In [2]:
import pandas as pd
import time
from calendar import timegm
from datetime import datetime
from sklearn.linear_model import LinearRegression
from scipy import stats
import researchpy as rp

date_format = "%Y/%m/%d:%H:%M:%S"

# Takes a string containing date and time and converts it to epoch time
def timestamp_converter(timestamp):
    utc_time = time.strptime(timestamp, date_format)
    return timegm(utc_time)
    

In [3]:
def load_service_and_arrival_times(experiment):
    if isinstance(experiment, int):      
        service_path ='data/service/exp' + str(experiment) + "/run" 
        arrivals_path ='data/arrivals/exp' + str(experiment) + "/run"
    else:
        service_path = 'data/service/' + str(experiment) + "/run" 
        arrivals_path = 'data/arrivals/' + str(experiment) + "/run"
        
    times = pd.DataFrame({}, columns = ['job', 'arrival_time', 'serivice_time'])
    times_per_run = {}# pd.DataFrame({}, columns = ['job', 'arrival_time', 'serivice_time'])
    run_times = []
    for run in range(1,4):
        # Load the arrivals and service time data for a single run for this experiment
        temp_service = pd.read_csv(service_path + str(run) + '.csv',  delimiter=' ')
        temp_arrivals = pd.read_csv(arrivals_path + str(run) + '.csv',  delimiter=' ')
        
        # Transform the date string values into epoch values
        temp_arrivals['arrival_time'] = temp_arrivals['arrival_time'].apply(lambda x: timestamp_converter(x))
            
        # Compute the average runtimes for this run
        run_times.append(np.mean(temp_service['end_time']-temp_arrivals['arrival_time']))
        
        # Merge the data into a single df
        merged_temp = pd.merge(temp_arrivals, temp_service, on='job')
        
        times_per_run[str(experiment) + '-' + str(run)]=merged_temp
        
        # Concatenate the df to the results of previous runs 
        if times.empty:
            times = merged_temp
        else:
            times = pd.concat([times, merged_temp])
        
    return times, run_times, times_per_run    
    

In [4]:
# Collecting the timing data for all experiments
all_timing_data = {}
all_runtimes = []
all_timing_data_per_run = {}

for exp in range(1,5):
    # Load the data, remove the job column and reset the row index
    exp_times, exp_runtimes, exp_times_per_run = load_service_and_arrival_times(exp)
    exp_times = exp_times.drop(columns = ['job']).reset_index(drop=True)
    
    # Compute and add the runtimes
    exp_times['runtime']=exp_times['end_time']-exp_times['arrival_time']
    
    # Add the timing data for this experiment to all_timing_data
    all_timing_data[exp] = exp_times
    
    # as well as the timing data per run
    all_timing_data_per_run.update(exp_times_per_run)
    
    # and the avg runtimes to all_runtimes
    all_runtimes.extend(exp_runtimes)
    
all_timing_data_per_run.get('1-1')
#all_runtimes

Unnamed: 0,job,arrival_time,service_time,end_time
0,0,1571314170,139.008,1571314000.0
1,1,1571314477,120.696,1571315000.0
2,2,1571314598,121.408,1571315000.0
3,3,1571314719,121.217,1571315000.0
4,4,1571314840,121.023,1571315000.0
5,5,1571314961,121.926,1571315000.0
6,6,1571315083,120.992,1571315000.0
7,7,1571315204,120.975,1571315000.0
8,8,1571315325,121.279,1571315000.0
9,9,1571315447,120.67,1571316000.0


### Regression model fit using sklearn

In [5]:
# Collecting the timing data for all experiments
all_timing_data = {}
all_runtimes = []

epochs = [5, 20, 5, 20]
cores = [4, 4, 8, 8]
batch_sizes = [256, 64, 64, 256]

for exp in range(1,5):
    # Load the data, remove the job column and reset the row index
    exp_times, exp_runtimes, exp_times_per_run = load_service_and_arrival_times(exp)
    exp_times = exp_times.reset_index(drop=True)
    
    # Compute and add the runtimes
    exp_times['runtime']=exp_times['end_time']-exp_times['arrival_time']
    
    # Add the timing data for this experiment to all_timing_data
    all_timing_data[exp] = exp_times
    
    # and the avg runtimes to all_runtimes
    all_runtimes.extend(exp_runtimes)

    #add factor values to the dataframe
    all_timing_data.get(exp)["epoch"] = epochs[exp-1]
    all_timing_data.get(exp)["cores"] = cores[exp-1]
    all_timing_data.get(exp)["batch_size"] = batch_sizes[exp-1]

    
df = pd.DataFrame.from_dict(all_timing_data.get(1))
for exp in range(2, 5):
    df = df.append(pd.DataFrame.from_dict(all_timing_data.get(exp)))

df = df.drop(columns=["job", "arrival_time", "service_time", "end_time"])

X = df.drop(columns=["runtime"])
y = df["runtime"]

clf = LinearRegression().fit(X, y)

for col, coef in zip(X.columns, clf.coef_):
    print(f"{col} has coefficient {coef}")
    

epoch has coefficient 36.411059957329535
cores has coefficient -64.96256885567598
batch_size has coefficient -2.5128532426916195


In [6]:
# Collect timing data for baseline runs
baseline_times, baseline_avg_runtimes, baseline_times_per_run = load_service_and_arrival_times("baseline")
baseline_times = baseline_times.drop(columns = ['job']).reset_index(drop=True)

# Compute and add the runtimes
baseline_times['runtime']=baseline_times['end_time']-baseline_times['arrival_time']

#add factor values to the dataframe
baseline_times["epoch"] = 10
baseline_times["cores"] = 6
baseline_times["batch"] = 128

print("Average runtime for baseline: " + str(np.average(baseline_avg_runtimes)))

Average runtime for baseline: 383.71768385392653


In [7]:
# Collect timing data for baseline runs
baseline_improved_times, baseline_improved_avg_runtimes, baseline_improved_times_per_run = load_service_and_arrival_times("improved-baseline")
#baseline_times = baseline_times.drop(columns = ['job']).reset_index(drop=True)

# Compute and add the runtimes
baseline_improved_times['runtime']=baseline_improved_times['end_time']-baseline_improved_times['arrival_time']

#add factor values to the dataframe
baseline_improved_times["epoch"] = 10
baseline_improved_times["cores"] = 6
baseline_improved_times["batch"] = 512

print("Average runtime for improved baseline: " + str(np.average(baseline_improved_avg_runtimes)))

Average runtime for improved baseline: 172.26622689815989


### Average jobs per experiment

In [8]:
for exp in range(1,5):
    print(f"Experiment {exp} avg jobs: {len(all_timing_data.get(exp))/3}")
    
#Average jobs for baselines
print(f"Baseline avg jobs: " + str((len(baseline_times["runtime"])/3)))
print(f"Baseline improved avg jobs: " + str((len(baseline_improved_times["runtime"])/3)))

Experiment 1 avg jobs: 19.0
Experiment 2 avg jobs: 3.0
Experiment 3 avg jobs: 10.0
Experiment 4 avg jobs: 8.333333333333334
Baseline avg jobs: 9.0
Baseline improved avg jobs: 19.0


### Baseline vs. improved baseline T-test

In [9]:
t2, p2 = stats.ttest_ind(baseline_times["runtime"],baseline_improved_times["runtime"])
print(f"T: {t2}, p2: {p2}")

T: 228.44555906383425, p2: 9.167919353383126e-117


In [10]:
# Compute the average runtimes
avg_runtimes = {}
for exp in range(1, 5):
    average_runtime = np.average(np.array(all_timing_data[exp].runtime))
    avg_runtimes[exp] = average_runtime
    print("Average runtime for experiment " + str(exp) + ": " + str(average_runtime))

Average runtime for experiment 1: 124.22859359205815
Average runtime for experiment 2: 1152.8623155487908
Average runtime for experiment 3: 346.8461407661438
Average runtime for experiment 4: 410.5442175292969


#### Accuracies

In [11]:
for exp in range(1,5):
    accuracy_path ='data/accuracies/exp' + str(exp) +".csv" 
    accuracies = pd.read_csv(accuracy_path, header=None)
    print("Average accuracy for experiment " +  str(exp) + ": "+ str(np.average(accuracies)))

Average accuracy for experiment 1: 0.891861403508772
Average accuracy for experiment 2: 0.9598454545454547
Average accuracy for experiment 3: 0.9434787878787879
Average accuracy for experiment 4: 0.944562962962963


In [12]:
baseline_accuracy = pd.read_csv('data/accuracies/baseline.csv', header=None)
print("Average accuracy for baseline: " + str(np.average(baseline_accuracy)))

Average accuracy for baseline: 0.94281


In [13]:
improved_baseline_accuracy = pd.read_csv('data/accuracies/improved-baseline.csv', header=None)
print("Average accuracy for improved baseline: " + str(np.average(improved_baseline_accuracy)))

Average accuracy for improved baseline: 0.8913315789473685


### Regression model based on sign-table

In [14]:
# Create/solve regression model
print(sign_table)

y_mean = np.array(list(avg_runtimes.values()))
regression_totals = [sum(avg_runtimes.values())]
for i in range(sign_table.shape[1]):
    regression_totals.append(sum(sign_table[:,i]*y_mean))

print(regression_totals)

# Calulate effect per factor
effect = [x / 4 for x in regression_totals]
print("\nEffect per factor: " + str(effect))

[[-1. -1.  1.]
 [ 1. -1. -1.]
 [-1.  1. -1.]
 [ 1.  1.  1.]]
[2034.4812674362897, 1092.3317987198857, -519.7005508454083, -964.9356451935796]

Effect per factor: [508.62031685907243, 273.0829496799714, -129.92513771135208, -241.2339112983949]


In [15]:
def toNaturalVariable(obtained_value, min_value, max_value):
    return obtained_value * ((min_value+max_value)/2) + (max_value-min_value)/2

epochs = [5, 20]
cores = [4, 8]
batch_sizes = [64, 256]

natural_epoch = toNaturalVariable(effect[3], 64, 256)
natural_epoch/(64+256)



-120.31695564919747

In [16]:
df = pd.DataFrame({'Runtime': np.array(all_runtimes),
                   'Epoch': np.repeat(sign_table[:,0], 3),
                   'Cores': np.repeat(sign_table[:,1], 3),
                   'Batch': np.repeat(sign_table[:,2], 3)})

rp.summary_cont(df['Runtime'])





Unnamed: 0,Variable,N,Mean,SD,SE,95% Conf.,Interval
0,Runtime,12.0,508.706567,404.330085,116.720042,251.807487,765.605647


In [17]:
stats.f_oneway(df['Runtime'][df['Cores'] == -1], 
             df['Runtime'][df['Cores'] == 1])


F_onewayResult(statistic=1.26751846331358, pvalue=0.2865263936107226)

In [18]:
from statsmodels.formula.api import ols
import statsmodels.api as sm

# ANOVA for Cores
rp.summary_cont(df.groupby(['Cores']))['Runtime']
cores_model = ols('Runtime ~ Cores', data=df).fit()
cores_model.summary()

aov_table = sm.stats.anova_lm(cores_model, typ=2)
aov_table





  "anyway, n=%i" % int(n))


Unnamed: 0,sum_sq,df,F,PR(>F)
Cores,202297.6,1.0,1.267518,0.286526
Residual,1596013.0,10.0,,


In [19]:
# ANOVA for Batch size
rp.summary_cont(df.groupby(['Batch']))['Runtime']
batch_model = ols('Runtime ~ Batch', data=df).fit()
batch_model.summary()

aov_table = sm.stats.anova_lm(batch_model, typ=2)
aov_table





  "anyway, n=%i" % int(n))


Unnamed: 0,sum_sq,df,F,PR(>F)
Batch,697826.3,1.0,6.341082,0.030484
Residual,1100485.0,10.0,,


In [20]:
# ANOVA for Training Epoch size
epoch_model = ols('Runtime ~ Epoch', data=df).fit()
epoch_model.summary()

aov_table = sm.stats.anova_lm(epoch_model, typ=2)
aov_table

  "anyway, n=%i" % int(n))


Unnamed: 0,sum_sq,df,F,PR(>F)
Epoch,895456.942631,1.0,9.91807,0.010345
Residual,902854.054602,10.0,,


In [21]:
# ANOVA for all factors
all_factor_model = ols('Runtime ~  Batch+Cores+Epoch', data=df).fit()
all_factor_model.summary()

# aov_table = sm.stats.anova_lm(interaction_model, typ=2)
# aov_table

  "anyway, n=%i" % int(n))


0,1,2,3
Dep. Variable:,Runtime,R-squared:,0.998
Model:,OLS,Adj. R-squared:,0.998
Method:,Least Squares,F-statistic:,1754.0
Date:,"Mon, 28 Oct 2019",Prob (F-statistic):,1.31e-11
Time:,08:47:46,Log-Likelihood:,-49.59
No. Observations:,12,AIC:,107.2
Df Residuals:,8,BIC:,109.1
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,508.7066,5.333,95.393,0.000,496.409,521.004
Batch,-241.1477,5.333,-45.220,0.000,-253.445,-228.850
Cores,-129.8389,5.333,-24.347,0.000,-142.136,-117.542
Epoch,273.1692,5.333,51.225,0.000,260.872,285.467

0,1,2,3
Omnibus:,6.776,Durbin-Watson:,3.042
Prob(Omnibus):,0.034,Jarque-Bera (JB):,3.36
Skew:,0.398,Prob(JB):,0.186
Kurtosis:,5.467,Cond. No.,1.0


In [22]:
overall_model = ols('Runtime ~ Batch*Cores*Epoch', data=df).fit()
baseline_model = ols('runtime ~batch*cores*epoch', data=baseline_times).fit()
improved_baseline_model = ols('runtime ~batch*cores*epoch', data=baseline_improved_times).fit()

f_val, p_val, _ = overall_model.compare_f_test(batch_model)
print(f_val, p_val, p_val<0.01)


1608.3829332433963 3.7876269235115543e-11 True


### G/G/1

In [23]:
# Simulate G/G/1 queue
def queuing_simulation(data):
    N = len(data)
    At = np.array(data.arrival_time)

    S = np.zeros(N) # -> Service start time
    C = np.zeros(N) # -> Complete time
    W = np.zeros(N) # -> response time of job i (waiting time) 

    S[0] = At[0]
    C[0] = S[0] + data.service_time[0]
    W[0] = C[0] - At[0]

    for i in range(1, N):
        S[i] = max(C[i-1], At[i])
        C[i] = S[i] + data.service_time[i]
        W[i] = C[i] - At[i]

    return np.average(W)

In [24]:
# Run the G/G/1 queuing simulation for all experiments, using only the first run
for i in range(1,5):
    key = str(i) + '-1'
    print(f"Exp {i}: {queuing_simulation(all_timing_data_per_run.get(key))}")
    
print(f"\nBaseline: {queuing_simulation(baseline_times_per_run.get('baseline-1'))}")
print(f"Baseline improved: {queuing_simulation(baseline_improved_times_per_run.get('improved-baseline-1'))}")

Exp 1: 121.88989477408559
Exp 2: 1148.0766666730244
Exp 3: 346.6546999454498
Exp 4: 416.1967500448227

Baseline: 382.3243333498637
Baseline improved: 173.73405265808105


In [25]:
all_timing_data_per_run.get('1-1').arrival_time
baseline_improved_times_per_run
# print(queuing_simulation(all_timing_data.get('1-1')))

{'improved-baseline-1':     job  arrival_time  service_time      end_time
 0     0    1571832868       192.507  1.571833e+09
 1     1    1571833175       172.454  1.571833e+09
 2     2    1571833348       172.943  1.571834e+09
 3     3    1571833521       169.337  1.571834e+09
 4     4    1571833690       176.415  1.571834e+09
 5     5    1571833866       171.775  1.571834e+09
 6     6    1571834038       173.522  1.571834e+09
 7     7    1571834212       169.170  1.571834e+09
 8     8    1571834381       171.934  1.571835e+09
 9     9    1571834553       171.848  1.571835e+09
 10   10    1571834725       173.006  1.571835e+09
 11   11    1571834898       172.782  1.571835e+09
 12   12    1571835071       174.316  1.571835e+09
 13   13    1571835245       173.258  1.571835e+09
 14   14    1571835474       173.050  1.571836e+09
 15   15    1571835659       175.623  1.571836e+09
 16   16    1571835835       172.829  1.571836e+09
 17   17    1571836014       171.607  1.571836e+09
 18   18