### Sign table

In [16]:
# Explicit imports instead of `from pyDOE2 import *`: a wildcard import hides
# where names come from. The rest of the notebook uses `np` without any other
# visible import (presumably it leaked in through the wildcard), so NumPy is
# imported here explicitly to keep Restart & Run All working.
import numpy as np
from pyDOE2 import pbdesign

# Generate sign table for experiments using Plackett-Burman design
# (3 factors -> 4 runs; one row per experiment, one column per factor)
sign_table = pbdesign(3)
sign_table

array([[-1., -1.,  1.],
       [ 1., -1., -1.],
       [-1.,  1., -1.],
       [ 1.,  1.,  1.]])

### Data processing

In [17]:
import pandas as pd
import time
from calendar import timegm
from datetime import datetime
from sklearn.linear_model import LinearRegression

date_format = "%Y/%m/%d:%H:%M:%S"


def timestamp_converter(timestamp):
    """Convert a date/time string to epoch seconds.

    `timestamp` must match `date_format` (e.g. "2019/10/17:14:57:10").
    The parsed fields are passed to `calendar.timegm`, i.e. they are
    interpreted as UTC and no local-timezone conversion is applied.
    Returns the epoch time as an int.
    """
    return timegm(time.strptime(timestamp, date_format))
    

In [18]:
# NOTE(review): `test` is not read anywhere in the visible notebook; this looks
# like a leftover scratch cell and can probably be removed - confirm first.
test = 1


In [19]:
def load_service_and_arrival_times(experiment, runs=range(1, 4)):
    """Load and merge the arrival and service logs of one experiment.

    For every run number in `runs` this reads
    ``data/service/exp<experiment>/run<run>.csv`` and
    ``data/arrivals/exp<experiment>/run<run>.csv`` (space-delimited),
    converts the arrival timestamps to epoch seconds and joins both files
    on their ``job`` column.

    Parameters
    ----------
    experiment : int or str
        Experiment identifier; it is only ever used via ``str(experiment)``
        when building the paths, so ints and strings are handled the same way.
    runs : iterable of int, optional
        Run numbers to load. Defaults to runs 1, 2 and 3.

    Returns
    -------
    times : pandas.DataFrame
        The merged rows of all runs, concatenated in run order. The row index
        is NOT reset, so index labels repeat across runs.
    run_times : list of float
        Mean of ``end_time - arrival_time`` for each run, in run order.
    """
    service_path = 'data/service/exp' + str(experiment) + "/run"
    arrivals_path = 'data/arrivals/exp' + str(experiment) + "/run"

    merged_runs = []
    run_times = []
    for run in runs:
        # Load the arrivals and service time data for a single run
        temp_service = pd.read_csv(service_path + str(run) + '.csv', delimiter=' ')
        temp_arrivals = pd.read_csv(arrivals_path + str(run) + '.csv', delimiter=' ')

        # Transform the date string values into epoch values
        temp_arrivals['arrival_time'] = temp_arrivals['arrival_time'].apply(timestamp_converter)

        # Join on the job id first, so each end time is paired with the arrival
        # of the same job rather than with whatever row shares its position.
        merged_temp = pd.merge(temp_arrivals, temp_service, on='job')

        # Average runtime (completion minus arrival) for this run
        run_times.append((merged_temp['end_time'] - merged_temp['arrival_time']).mean())
        merged_runs.append(merged_temp)

    if not merged_runs:
        # No runs requested: return an empty frame with the expected key columns
        return pd.DataFrame(columns=['job', 'arrival_time', 'service_time']), run_times

    # Concatenate once instead of growing the frame inside the loop
    times = pd.concat(merged_runs)
    return times, run_times
    

In [20]:
# Gather, for experiments 1-4, the per-job timing frame and the per-run
# average runtimes returned by the loader.
all_timing_data = {}
all_runtimes = []
for exp in range(1, 5):
    exp_times, exp_avg_runtimes = load_service_and_arrival_times(exp)

    # Drop the job id, renumber the rows, then derive the runtime
    # (completion time minus arrival time) for every job.
    exp_times = (
        exp_times
        .drop(columns=['job'])
        .reset_index(drop=True)
        .assign(runtime=lambda frame: frame['end_time'] - frame['arrival_time'])
    )

    all_timing_data[exp] = exp_times
    all_runtimes += exp_avg_runtimes

# Cell output: the flat list of per-run averages, experiment by experiment
all_runtimes

[122.27266749582793,
 123.81915235519409,
 126.59396092515243,
 1148.4932452837627,
 1190.928062359492,
 1119.1656390031178,
 346.76989209651947,
 345.7390785932541,
 348.02945160865784,
 416.36790174245834,
 414.0355792045593,
 402.26417673958673]

### Regression model fit using sklearn

In [21]:
# Collect the timing data for all experiments, tag every job with the factor
# levels of its experiment, and fit a linear model runtime ~ factors.
all_timing_data = {}
all_runtimes = []

# Factor levels used in experiments 1..4 (position i-1 belongs to experiment i)
epochs = [5, 20, 5, 20]
cores = [4, 4, 8, 8]
batch_sizes = [256, 64, 64, 256]

for exp in range(1, 5):
    # Load the data; the job column is kept here and dropped further below
    exp_times, exp_avg_runtimes = load_service_and_arrival_times(exp)

    # Compute and add the runtimes
    exp_times['runtime'] = exp_times['end_time'] - exp_times['arrival_time']

    # Add the factor values to the dataframe
    exp_times["epoch"] = epochs[exp - 1]
    exp_times["cores"] = cores[exp - 1]
    exp_times["batch_size"] = batch_sizes[exp - 1]

    # Add the timing data for this experiment to all_timing_data
    all_timing_data[exp] = exp_times

    # and the avg runtimes to all_runtimes
    all_runtimes.extend(exp_avg_runtimes)

# Stack all experiments. `DataFrame.append` is deprecated (removed in
# pandas 2.0), so build the frame with a single `pd.concat` instead.
df = pd.concat([all_timing_data[exp] for exp in range(1, 5)])

# Keep only the response (runtime) and the three factor columns
df = df.drop(columns=["job", "arrival_time", "service_time", "end_time"])

X = df.drop(columns=["runtime"])
y = df["runtime"]

clf = LinearRegression().fit(X, y)

for col, coef in zip(X.columns, clf.coef_):
    print(f"{col} has coefficient {coef}")
    

epoch has coefficient 36.411059957329535
cores has coefficient -64.96256885567598
batch_size has coefficient -2.5128532426916195


In [22]:
# Collect timing data for baseline runs
# NOTE(review): `exp` is not assigned in this cell - it is whatever an earlier
# loop left behind. After the `for exp in range(2, 5)` loop of the regression
# cell that is 4, so this call loads experiment 4 rather than a separate
# baseline data set. The saved output agrees with that reading: the printed
# average equals the mean of the three experiment-4 run averages.
# TODO: pass the intended baseline identifier/path explicitly.
baseline_times, baseline_avg_runtimes = load_service_and_arrival_times(exp)
baseline_times = baseline_times.drop(columns = ['job']).reset_index(drop=True)

# Compute and add the runtimes
baseline_times['runtime']=baseline_times['end_time']-baseline_times['arrival_time']


# Mean of the per-run averages (not the mean over all individual jobs)
print("Average runtime for baseline: " + str(np.average(baseline_avg_runtimes)))
baseline_times

Average runtime for baseline: 410.8892192288681


Unnamed: 0,arrival_time,service_time,end_time,runtime
0,1571324230,411.834,1571325000.0,412.149813
1,1571324642,414.206,1571325000.0,414.355975
2,1571325056,414.858,1571325000.0,415.214545
3,1571325471,414.619,1571326000.0,414.833795
4,1571325885,421.079,1571326000.0,421.912971
5,1571326306,417.981,1571327000.0,418.893661
6,1571326724,416.751,1571327000.0,417.644428
7,1571327141,415.294,1571328000.0,415.938026
8,1571328030,412.388,1571328000.0,412.723181
9,1571328442,416.78,1571329000.0,417.503261


In [23]:
# Mean job runtime per experiment, taken over all jobs of all runs
avg_runtimes = {}
for exp in range(1, 5):
    runtime_values = np.array(all_timing_data[exp].runtime)
    average_runtime = np.average(runtime_values)
    avg_runtimes[exp] = average_runtime
    print("Average runtime for experiment " + str(exp) + ": " + str(average_runtime))

Average runtime for experiment 1: 124.22859359205815
Average runtime for experiment 2: 1152.8623155487908
Average runtime for experiment 3: 346.8461407661438
Average runtime for experiment 4: 410.5442175292969


#### Accuracies

In [24]:
# Report the average over all values in each experiment's accuracy file
for exp in range(1, 5):
    accuracy_path = 'data/accuracies/exp' + str(exp) + ".csv"
    # The files have no header row
    accuracies = pd.read_csv(accuracy_path, header=None)
    mean_accuracy = np.average(accuracies)
    print("Average accuracy for experiment " + str(exp) + ": " + str(mean_accuracy))

Average accuracy for experiment 1: 0.891861403508772
Average accuracy for experiment 2: 0.9598454545454547
Average accuracy for experiment 3: 0.9434787878787879
Average accuracy for experiment 4: 0.944562962962963


In [25]:
# Average over all values in the (header-less) baseline accuracy file
baseline_accuracy = pd.read_csv('data/accuracies/baseline.csv', header=None)
baseline_mean_accuracy = np.average(baseline_accuracy)
print("Average accuracy for baseline: " + str(baseline_mean_accuracy))

Average accuracy for baseline: 0.94281


### Regression model based on sign-table

In [26]:
# Solve the regression model directly from the sign table
print(sign_table)

# Mean runtime of each experiment, in experiment order
y_mean = np.array(list(avg_runtimes.values()))

# First entry: plain total of the mean runtimes; after that one signed total
# per sign-table column (each column weights the means with -1 / +1).
regression_totals = [sum(avg_runtimes.values())] + [
    sum(sign_table[:, i] * y_mean) for i in range(sign_table.shape[1])
]

print(regression_totals)

# Calculate effect per factor: each total divided by the 4 experiments
effect = [total / 4 for total in regression_totals]
print("\nEffect per factor: " + str(effect))

[[-1. -1.  1.]
 [ 1. -1. -1.]
 [-1.  1. -1.]
 [ 1.  1.  1.]]
[2034.4812674362897, 1092.3317987198857, -519.7005508454083, -964.9356451935796]

Effect per factor: [508.62031685907243, 273.0829496799714, -129.92513771135208, -241.2339112983949]


In [35]:
def toNaturalVariable(obtained_value, min_value, max_value):
    """Map a coded factor value back to the factor's natural scale.

    A coded value of -1 corresponds to `min_value`, +1 to `max_value` and 0
    to the midpoint; values in between are interpolated linearly:

        natural = (min + max) / 2 + coded * (max - min) / 2

    The previous implementation multiplied by the centre and added the
    half-range instead, which maps -1 to `-min_value` rather than `min_value`.
    """
    center = (min_value + max_value) / 2
    half_range = (max_value - min_value) / 2
    return center + obtained_value * half_range

# epochs = [5, 20]
# cores = [4, 8]
# batch_sizes = [64, 256]

# natural_epoch = toNaturalVariable(effect[1], 5, 20)
# natural_epoch

In [8]:
import researchpy as rp
# One row per run: the run's average runtime plus the coded (-1/+1) level of
# each factor. Every sign-table row is repeated 3 times, once per run.
factor_levels = {
    name: np.repeat(sign_table[:, column], 3)
    for column, name in enumerate(['Epoch', 'Cores', 'Batch'])
}
df = pd.DataFrame({'Runtime': np.array(all_runtimes), **factor_levels})

rp.summary_cont(df['Runtime'])






Unnamed: 0,Variable,N,Mean,SD,SE,95% Conf.,Interval
0,Runtime,12.0,508.706567,404.330085,116.720042,251.807487,765.605647


In [9]:
from statsmodels.formula.api import ols
import statsmodels.api as sm

# One-way ANOVA: run-average runtime explained by the Cores factor alone
cores_group_summary = rp.summary_cont(df.groupby(['Cores']))['Runtime']
cores_model = ols('Runtime ~ Cores', data=df).fit()
cores_fit_summary = cores_model.summary()

# Type-II ANOVA table of the fitted model; this is the cell's displayed output
aov_table = sm.stats.anova_lm(cores_model, typ=2)
aov_table





  "anyway, n=%i" % int(n))


Unnamed: 0,sum_sq,df,F,PR(>F)
Cores,202297.6,1.0,1.267518,0.286526
Residual,1596013.0,10.0,,


In [10]:
# One-way ANOVA: run-average runtime explained by the Batch factor alone
batch_group_summary = rp.summary_cont(df.groupby(['Batch']))['Runtime']
batch_model = ols('Runtime ~ Batch', data=df).fit()
batch_fit_summary = batch_model.summary()

# Type-II ANOVA table of the fitted model; this is the cell's displayed output
aov_table = sm.stats.anova_lm(batch_model, typ=2)
aov_table





  "anyway, n=%i" % int(n))


Unnamed: 0,sum_sq,df,F,PR(>F)
Batch,697826.3,1.0,6.341082,0.030484
Residual,1100485.0,10.0,,


In [11]:
# ANOVA for the number of training epochs
# The model is stored as `epoch_model` (it was previously assigned to
# `batch_model`, which silently overwrote the Batch model of the cell above).
epoch_model = ols('Runtime ~ Epoch', data=df).fit()
epoch_model.summary()

# Type-II ANOVA table; displayed as the cell output
aov_table = sm.stats.anova_lm(epoch_model, typ=2)
aov_table





  "anyway, n=%i" % int(n))


Unnamed: 0,sum_sq,df,F,PR(>F)
Epoch,895456.942631,1.0,9.91807,0.010345
Residual,902854.054602,10.0,,


In [15]:
# ANOVA for interaction
# NOTE(review): the design has only four distinct factor combinations (the rows
# of the sign table). In that table Batch*Cores equals the Epoch column,
# Batch*Epoch equals Cores, Cores*Epoch equals Batch, and the three-way product
# is constant. The interaction terms are therefore completely aliased with the
# main effects / intercept and cannot be estimated separately; the identical
# sum_sq pairs in the table below are a symptom of this, not evidence of real
# interaction effects.
# The fit is stored as `interaction_model` (previously it reused the name
# `batch_model`).
interaction_model = ols('Runtime ~  Batch*Cores*Epoch', data=df).fit()
interaction_model.summary()

aov_table = sm.stats.anova_lm(interaction_model, typ=2)
aov_table

  "anyway, n=%i" % int(n))


Unnamed: 0,sum_sq,df,F,PR(>F)
Batch,697826.3,1.0,2044.850426,6.31636e-11
Cores,202297.6,1.0,592.795651,8.642458e-09
Batch:Cores,895456.9,1.0,2623.970215,2.336807e-11
Epoch,895456.9,1.0,2623.970215,2.336807e-11
Batch:Epoch,202297.6,1.0,592.795651,8.642458e-09
Cores:Epoch,697826.3,1.0,2044.850426,6.31636e-11
Batch:Cores:Epoch,3105388.0,1.0,9099.76397,1.628259e-13
Residual,2730.083,8.0,,


In [12]:
# Full-factorial formula fitted on the 12 run averages.
# NOTE(review): with the four design points of the sign table every interaction
# column coincides with a main-effect column or the intercept, so the design
# matrix is rank deficient (see "Df Model: 3" and the ~1e17 condition number in
# the summary). The duplicated coefficients should not be read as separate
# effects.
overall_model = ols('Runtime ~ Batch*Cores*Epoch', data=df).fit()
overall_model.summary()

  "anyway, n=%i" % int(n))


0,1,2,3
Dep. Variable:,Runtime,R-squared:,0.998
Model:,OLS,Adj. R-squared:,0.998
Method:,Least Squares,F-statistic:,1754.0
Date:,"Tue, 22 Oct 2019",Prob (F-statistic):,1.31e-11
Time:,12:04:37,Log-Likelihood:,-49.59
No. Observations:,12,AIC:,107.2
Df Residuals:,8,BIC:,109.1
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,254.3533,2.666,95.393,0.000,248.205,260.502
Batch,-120.5738,2.666,-45.220,0.000,-126.723,-114.425
Cores,-64.9194,2.666,-24.347,0.000,-71.068,-58.771
Batch:Cores,136.5846,2.666,51.225,0.000,130.436,142.733
Epoch,136.5846,2.666,51.225,0.000,130.436,142.733
Batch:Epoch,-64.9194,2.666,-24.347,0.000,-71.068,-58.771
Cores:Epoch,-120.5738,2.666,-45.220,0.000,-126.723,-114.425
Batch:Cores:Epoch,254.3533,2.666,95.393,0.000,248.205,260.502

0,1,2,3
Omnibus:,6.776,Durbin-Watson:,3.042
Prob(Omnibus):,0.034,Jarque-Bera (JB):,3.36
Skew:,0.398,Prob(JB):,0.186
Kurtosis:,5.467,Cond. No.,2.12e+17


### G/G/1

In [9]:
# Simulate G/G/1 queue
def queuing_simulation(data):
    """Replay a single-server queue and return the mean response time.

    Jobs are served one at a time in row order: a job starts when it has
    arrived and the previous job has completed.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``arrival_time`` and ``service_time``. Rows are
        read by position, so the frame's index labels do not matter (they
        may be non-contiguous or repeated, e.g. after concatenating runs
        without resetting the index).

    Returns
    -------
    float
        Average of (completion time - arrival time) over all jobs.

    Raises
    ------
    ValueError
        If `data` contains no rows.
    """
    # Positional arrays: indexing the Series with `[0]` / `[i]` is label based
    # and fails or returns several rows when the index is not exactly 0..N-1.
    At = np.asarray(data.arrival_time)
    service = np.asarray(data.service_time)
    N = len(At)
    if N == 0:
        raise ValueError("queuing_simulation requires at least one job")

    S = np.zeros(N) # -> Service start time
    C = np.zeros(N) # -> Complete time
    W = np.zeros(N) # -> Response time of job i (queueing delay + service)

    # The first job never waits
    S[0] = At[0]
    C[0] = S[0] + service[0]
    W[0] = C[0] - At[0]

    for i in range(1, N):
        # Start once the job has arrived AND the server is free again
        S[i] = max(C[i-1], At[i])
        C[i] = S[i] + service[i]
        W[i] = C[i] - At[i]

    return np.average(W)

In [10]:
# Print the simulated average response time of the G/G/1 queue for
# experiments 1-4, one value per line
for i in range(1, 5):
    experiment_data = all_timing_data.get(i)
    print(queuing_simulation(experiment_data))

123.94045613941394
1152.5422222879197
346.7322332938512
410.3491999912262
