In [79]:
from math import sqrt
from numpy import concatenate
from matplotlib import pyplot
import pandas as pd
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from datetime import datetime
from sklearn.preprocessing import Normalizer

In [81]:
# Function for converting series data to a supervised data of format, t-1, t, t+1
## Basically feeding in the (t-1)th data to predict the t data
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
	n_vars = 1 if type(data) is list else data.shape[1]
	df = DataFrame(data)
	cols, names = list(), list()
	# input sequence (t-n, ... t-1)
	for i in range(n_in, 0, -1):
		cols.append(df.shift(i))
		names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
	# forecast sequence (t, t+1, ... t+n)
	for i in range(0, n_out):
		cols.append(df.shift(-i))
		if i == 0:
			names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
		else:
			names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
	# put it all together
	agg = concat(cols, axis=1)
	agg.columns = names
	# drop rows with NaN values
	if dropnan:
		agg.dropna(inplace=True)
	return agg


In [82]:
def break_timestamp(dataset):
    temp_time = [datetime.strptime(d, "%Y-%m-%d %H:%M:%S.%f") for d in dataset["timestamp"]]
    dataset["hours"] = [dh.hour for dh in temp_time]
    dataset["minutes"] = [dm.minute for dm in temp_time]
    dataset["seconds"] = [ds.second for ds in temp_time]
    
    '''
    for index in range(0, len(dataset["time"])):
        time_val = dataset["time"].iloc[index]
        dataset["hours"] = time_val.hour
        dataset["minutes"] = time_val.minute
        dataset["seconds"] = time_val.second
    '''
    
        
    return dataset

In [54]:
def normalize(data_arr):
    data_max = data_arr.max()
    data_min = data_arr.min()
    
    data_arr = (data_arr - data_min)/(data_max - data_min)
    
    return data_arr

In [133]:
col_arr = ["timestamp", "time_since_last_recording", "ping_timestamp", "time_since_last_ping", "ping_success","ping_time"]
# load dataset
dataset = read_csv('../parse_tactics/normalized_tva_server_1_tactic_1_train.csv')
dataset = break_timestamp(dataset)
dataset = dataset.drop(columns=col_arr)#["ID"])
dataset = dataset[["hours","minutes","seconds","latency","cost","reliability"]]
norm_scaler = Normalizer().fit(dataset.iloc[:,0:3])
dataset.loc[:,0:3] = norm_scaler.transform(dataset.iloc[:,0:3])
values = dataset
'''
dataset["hours"] = normalize(dataset["hours"].values)
dataset["minutes"] = normalize(dataset["minutes"].values)
dataset["seconds"] = normalize(dataset["seconds"].values)
'''


#values = dataset.values
print(dataset)
## Load Validation
validation = read_csv('../parse_tactics/normalized_tva_server_1_tactic_1_test.csv')
validation = break_timestamp(validation)
validation = validation.drop(columns=col_arr)#["ID"])
validation = validation[["hours","minutes","seconds","latency","cost","reliability"]]
validation.loc[:,0:3] = norm_scaler.transform(validation.iloc[:,0:3])
values_validation = validation

'''
validation["hours"] = normalize(validation["hours"].values)
validation["minutes"] = normalize(validation["minutes"].values)
validation["seconds"] = normalize(validation["seconds"].values)


'''

          hours   minutes   seconds   latency      cost  reliability
0      0.272917  0.579948  0.767578  0.015102  0.193359            1
1      0.243969  0.579427  0.777652  0.015117  0.310547            1
2      0.241814  0.589423  0.770783  0.015297  0.169922            1
3      0.322198  0.865908  0.382611  0.014803  0.191406            1
4      0.296806  0.871867  0.389557  0.014817  0.167969            1
...         ...       ...       ...       ...       ...          ...
12152  0.080750  0.452201  0.888253  0.016195  0.175781            1
12153  0.078490  0.439544  0.894785  0.016146  0.181641            1
12154  0.115011  0.667063  0.736070  0.016246  0.166016            1
12155  0.152570  0.976445  0.152570  0.016016  0.167969            1
12156  0.109817  0.702830  0.702830  0.015651  0.173828            1

[12157 rows x 6 columns]


'\nvalidation["hours"] = normalize(validation["hours"].values)\nvalidation["minutes"] = normalize(validation["minutes"].values)\nvalidation["seconds"] = normalize(validation["seconds"].values)\n\n\n'

In [134]:
## Calling the function to do the preprocessing the data and removing unwanted columns


# frame as supervised learning
reframed = series_to_supervised(values, 1, 1)
reframed_validation = series_to_supervised(values_validation, 1, 1)
# drop columns we don't want to predict
reframed.drop(reframed.columns[[6,7,8]], axis=1, inplace=True)
reframed_validation.drop(reframed_validation.columns[[6,7,8]], axis=1, inplace=True)
print(reframed.head(3))

   var1(t-1)  var2(t-1)  var3(t-1)  var4(t-1)  var5(t-1)  var6(t-1)   var4(t)  \
1   0.272917   0.579948   0.767578   0.015102   0.193359        1.0  0.015117   
2   0.243969   0.579427   0.777652   0.015117   0.310547        1.0  0.015297   
3   0.241814   0.589423   0.770783   0.015297   0.169922        1.0  0.014803   

    var5(t)  var6(t)  
1  0.310547        1  
2  0.169922        1  
3  0.191406        1  


In [135]:
## Splitting the data into training and validation sets


test = reframed.values
train = reframed_validation.values
# split into input and outputs
train_X, train_y = train[:, :-3], train[:,-3:]
test_X, test_y = test[:, :-3], test[:,-3:]
# reshape input to be 3D [samples, timesteps, features]
train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))
print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)

(2604, 1, 6) (2604, 3) (12156, 1, 6) (12156, 3)


In [136]:
# flatten input
n_input = train_X.shape[1] * train_X.shape[2]
X = train_X.reshape((train_X.shape[0], n_input))

In [137]:
### Doing the same process for testing dataset

test_dataset = read_csv('../parse_tactics/normalized_tva_server_1_tactic_1_validation.csv')
test_dataset = break_timestamp(test_dataset)
test_dataset = test_dataset.drop(columns=col_arr)#["ID"])
test_dataset = test_dataset[["hours","minutes","seconds","latency","cost","reliability"]]
test_dataset.iloc[:,0:3] = norm_scaler.transform(test_dataset.iloc[:,0:3])
test_values = test_dataset#.values
reframed_test = series_to_supervised(test_dataset, 1, 1)
reframed_test.drop(reframed_test.columns[[6,7,8]], axis=1, inplace=True)
print(reframed_test.shape)
testset = reframed_test.values
testset_X, testset_y = testset[:, :-3], testset[:,-3:]
print(testset_X.shape, testset_y.shape)
testdataReshaped = testset_X.reshape((testset_X.shape[0], 1, testset_X.shape[1]))


(2605, 9)
(2605, 6) (2605, 3)


## SVR algorithm with RBF kernel. 

In [138]:
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
regressor = SVR(kernel='rbf')
# flatten input
n_input = testdataReshaped.shape[1] * testdataReshaped.shape[2]
X2 = testdataReshaped.reshape((testdataReshaped.shape[0], n_input))
regr = MultiOutputRegressor(regressor)

regr.fit(X,train_y)
out= regr.predict(X2)

rmse = sqrt(mean_squared_error(out,testset_y))
print('Test RMSE: %.3f' % rmse)

Test RMSE: 0.061




## SVR with Linear kernel 

In [139]:
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
regressor = SVR(kernel='linear')
# flatten input
n_input = testdataReshaped.shape[1] * testdataReshaped.shape[2]
X2 = testdataReshaped.reshape((testdataReshaped.shape[0], n_input))
regr = MultiOutputRegressor(regressor)

regr.fit(X,train_y)
out= regr.predict(X2)

rmse = sqrt(mean_squared_error(out,testset_y))
print('Test RMSE: %.3f' % rmse)

Test RMSE: 0.062


## KNN Regression

In [140]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.multioutput import MultiOutputRegressor
knn = KNeighborsRegressor()
regr_knn = MultiOutputRegressor(knn)

# flatten input
n_input = testdataReshaped.shape[1] * testdataReshaped.shape[2]
X2 = testdataReshaped.reshape((testdataReshaped.shape[0], n_input))

regr_knn.fit(X,train_y)
regr_knn.predict(testset_X)
out= regr_knn.predict(X2)

rmse = sqrt(mean_squared_error(out,testset_y))
print('Test RMSE: %.3f' % rmse)

Test RMSE: 0.052


# Saving to file

In [141]:
## Feeding the test dataset for predictions
import pandas as pd

dataset = pd.DataFrame({'predicted_Latency': out[:, 0], 'predicted_Cost': out[:, 1],
                       'predicted_Reliability': out[:, 2]})
dataset['predicted_Reliability'].loc[dataset['predicted_Reliability'] >0.5] = 1
dataset['predicted_Reliability'].loc[dataset['predicted_Reliability'] <.5] = 0

In [142]:
frames = [pd.DataFrame(data=test_dataset), pd.DataFrame(data=dataset)]
result = pd.concat(frames,axis =1)

In [143]:
import numpy as np
print(result.head(10))
result.to_csv('../Prediction_Files/predictions_SVR_RBF_Server_1_Tactic_1.csv', sep=',', index=0)

      hours   minutes   seconds   latency      cost  reliability  \
0  0.242140  0.282497  0.928204  0.017155  0.160156            1   
1  0.174004  0.232006  0.957024  0.014948  0.167969            1   
2  0.130158  0.173544  0.976187  0.015292  0.164062            1   
3  0.134670  0.224450  0.965135  0.016283  0.169922            1   
4  0.131812  0.219687  0.966625  0.015571  0.173828            1   
5  0.158666  0.343776  0.925550  0.015474  0.171875            1   
6  0.354787  0.886969  0.295656  0.016747  0.175781            1   
7  0.099231  0.264616  0.959235  0.015070  0.162109            1   
8  0.205316  0.581728  0.787044  0.014855  0.164062            1   
9  0.235521  0.706562  0.667308  0.016296  0.167969            1   

   predicted_Latency  predicted_Cost  predicted_Reliability  
0           0.015661        0.175000                    1.0  
1           0.014950        0.171875                    1.0  
2           0.015437        0.170703                    1.0  
3  