In [1]:
import numpy as np
import pandas as pd
import os
import itertools
from sklearn.model_selection import train_test_split

In [27]:
# Build the rules-evaluation ('test') dataset from the raw export.
# !!! This cell writes only the TEST .tsv files (no train / validation files).

from sklearn.preprocessing import StandardScaler, MinMaxScaler

# !!! CHANGE HERE (for new dataset): folder name and file prefix of the dataset
dataset_name = 'AHU_prin_winter_2023_stanscaler_RULES'
# all files of this dataset are written below the EBC datasets root
dataset_dir = os.path.join('./SimTSC/datasets/EBC', dataset_name)

# !!! CHANGE HERE (for new dataset): raw export to read; the windowing code
# further down expects a 'time' column and treats every other column as a datapoint
data = pd.read_csv(
    r'/home/vaibhavs/Master_Thesis/ma-vaibhav/Data/data_prin_winter_15m_Nov23_Feb24.csv'
)

# Cut the series of every datapoint into fixed-length windows.
#   all_points[k]  : list of windows (each a list of minute_multiples values) of datapoint k
#   all_labels[k]  : the datapoint name, repeated once per window
#   all_windows[k] : 1-based running number of each window
all_points, all_labels, all_windows = [], [], []

datapoint_names = data.columns.tolist()
datapoint_names.remove('time')  # every remaining column is treated as a datapoint
minute_multiples = 480  # 480=5day, 1920=20day, 1440=1day(60s sampling)

for datapoint in datapoint_names:
    dp_timeseries = data[['time', datapoint]]  # isolate time-series of a single datapoint
    rows = dp_timeseries.shape[0]  # number of timestamps

    # NOTE(review): the stride is minute_multiples + 1 although a window holds
    # minute_multiples rows, so one row between consecutive windows is never
    # used - confirm that this gap is intended.
    window_starts = [
        start
        for start in range(0, rows, minute_multiples + 1)
        if start <= rows - minute_multiples  # keep full-length windows only
    ]
    # column 1 of dp_timeseries is the datapoint itself (column 0 is 'time')
    data_list = [
        dp_timeseries.iloc[start:start + minute_multiples, 1].to_numpy().tolist()
        for start in window_starts
    ]

    all_points.append(data_list)
    all_labels.append([datapoint] * len(data_list))
    all_windows.append(list(range(1, len(data_list) + 1)))

# Select a subset of windows to be in the test set: starting at position 1,
# take two consecutive windows and skip the third (1, 2, 4, 5, 7, 8, ...).
# The values are used as 0-based positions into the per-datapoint window lists.
total_windows = len(all_points[0])  # assumes every datapoint produced the same number of windows
pair_starts = np.arange(1, total_windows - 1, 3)
# Build both members of every pair with a single concatenation instead of
# growing the array with np.append while looping over it.
wins = sorted(np.concatenate((pair_starts, pair_starts + 1)))
print("Windows: ", wins)

# Gather the selected windows datapoint by datapoint; samples, window numbers
# and labels stay aligned by position.
datapoint_indices = range(len(all_points))
test_stubs = [all_points[ind][window] for ind in datapoint_indices for window in wins]
window_stubs = [all_windows[ind][window] for ind in datapoint_indices for window in wins]
label_stubs = [all_labels[ind][window] for ind in datapoint_indices for window in wins]

all_pointstubs = test_stubs
all_labelstubs = label_stubs
all_windowstubs = window_stubs
# sorted() keeps this overview identical from run to run; the iteration order
# of a set of strings is not stable across interpreter sessions
print(sorted(set(all_labelstubs)))
numpy_pointstubs = np.asarray(all_pointstubs)
numpy_labelstubs = np.asarray(all_labelstubs)
numpy_windowstubs = np.asarray(all_windowstubs)
# stack labels and their corresponding window numbers side-by-side
numpy_label_windows = np.column_stack((numpy_labelstubs, numpy_windowstubs))

print(numpy_pointstubs.shape)
print(numpy_label_windows.shape)

# Unscaled test file: label and window number first, then the samples of the window
Xseries_septest = pd.DataFrame(data=numpy_pointstubs, columns=range(1, minute_multiples + 1))
Ylabels_septest = pd.DataFrame(data=numpy_label_windows)
septest_merged = pd.concat([Ylabels_septest, Xseries_septest], axis=1)

# exist_ok avoids the check-then-create race of a separate os.path.exists test
os.makedirs(dataset_dir, exist_ok=True)
septest_path = os.path.join(dataset_dir, dataset_name + '_SEPTEST.tsv')
septest_merged.to_csv(septest_path, sep='\t', index=False, header=None)

# Standard Scaler
# NOTE(review): StandardScaler standardises each column on its own, i.e. each
# time step across all selected windows rather than each window separately -
# confirm that this is the intended normalisation.
scaler = StandardScaler()
scaled_numpy_data = scaler.fit_transform(numpy_pointstubs)

# # Min-Max Scaler
# scaler = MinMaxScaler()
# normalized_data = scaler.fit_transform(numpy_pointstubs)


def _coarse_label(name):
    """Map a raw datapoint name to its coarse class.

    The keywords are tried in a fixed order and the first one contained in
    ``name`` decides the class; a name without any keyword is returned unchanged.
    """
    keyword_to_class = (
        ('Temp', 'Temperature'),
        ('Valve', 'Valve'),
        ('Speed', 'Speed'),
        ('Operating', 'Operating'),
    )
    for keyword, coarse_class in keyword_to_class:
        if keyword in name:
            return coarse_class
    return name


ltest = numpy_labelstubs.tolist()
test_oper_labels = [_coarse_label(x) for x in ltest]

# Scaled test file: coarse class label first, then the scaled samples of the window
Xseries_scaledtest = pd.DataFrame(data=scaled_numpy_data, columns=range(1, minute_multiples + 1))
Ylabels_scaledtest = pd.DataFrame(data=test_oper_labels)
scaledtest_merged = pd.concat([Ylabels_scaledtest, Xseries_scaledtest], axis=1)
scaledtest_path = os.path.join(dataset_dir, dataset_name + '_TEST_ed.tsv')
scaledtest_merged.to_csv(scaledtest_path, sep='\t', index=False, header=None)


Windows:  [1, 2, 4, 5, 7, 8, 10, 11, 13, 14, 16, 17, 19, 20]
['ADS.fAHURHValveActADSInternalValuesMirror', 'ADS.fAHUTempODAADSInternalValuesMirror', 'ADS.fAHUFanSUPSpeedActADSInternalValuesMirror', 'ADS.fAHURHValveSetADSInternalValuesMirror', 'ADS.bAHUPHPumpOperatingADSInternalValuesMirror', 'ADS.fAHUPHValveActADSInternalValuesMirror', 'ADS.fAHUPHValveSetADSInternalValuesMirror', 'ADS.bAHURHPumpOperatingADSInternalValuesMirror', 'ADS.fAHUTempSUPADSInternalValuesMirror', 'ADS.fAHUFanSUPSpeedSetADSInternalValuesMirror', 'ADS.fAHUTempETAADSInternalValuesMirror', 'ADS.fAHUTempSUPSetADSInternalValuesMirror']
(168, 480)
(168, 2)


In [28]:
# 'create_dataset' step for the RULES dataset: only a test set exists,
# the train and validation parts are dummies.
from sklearn.preprocessing import LabelEncoder

dataset_name = "AHU_prin_winter_2023_stanscaler_RULES" # !!! CHANGE HERE (for new dataset)
output_dir = './SimTSC/tmp'
dataset_dir = os.path.join('./SimTSC/datasets/EBC', dataset_name)

# column 0 holds the class label, the remaining columns hold the samples of a window
test_path = os.path.join(dataset_dir, dataset_name + '_TEST_ed.tsv')
df_test = pd.read_csv(test_path, sep='\t', header=None)

# encode the class names as integers
y_test = df_test.values[:, 0].astype(str)
le = LabelEncoder()
y = le.fit(y_test).transform(y_test)

# samples as a float32 matrix, one row per window
X_test = df_test.drop(columns=[0]).astype(np.float32).values
X = X_test
idx = np.arange(len(X))  # running index of every sample

# np.random.shuffle(idx)
# train_idx = idx[:int(len(idx)*0.15)] 
# val_idx = idx[int(len(idx)*0.15):int(len(idx)*0.7)]
# test_idx = idx[int(len(idx)*0.7):]

# Every sample goes to the test split; train and validation are placeholders.
point_1 = len(X_test)

test_idx = idx[:point_1]
train_idx = 0
val_idx = 0

# add a dimension to make it multivariate with one dimension 
X = X.reshape((X.shape[0], 1, X.shape[1]))

data = {
        'X': X,
        'y': y,
        'train_idx': train_idx,
        'test_idx': test_idx,
        'val_idx': val_idx
}
output_dir = os.path.join(output_dir, 'ebc_'+'1_shot')
# exist_ok avoids the check-then-create race of a separate os.path.exists test
os.makedirs(output_dir, exist_ok=True)
# np.save appends '.npy' and stores the dict as a pickled object array, so it
# has to be read back with np.load(..., allow_pickle=True)
np.save(os.path.join(output_dir, dataset_name), data)

In [3]:
# Distances for only the TEST data; (basically the 'create_dtw' script to save dtw/euclidean values)

import sys

# make the pydtw folder importable (presumably the home of the `dtw` module imported below)
sys.path.insert(0, '/home/vaibhavs/Master_Thesis/ma-vaibhav/Code/SimTSC/pydtw')
import dtw

# Read dataset
# NOTE(review): this dataset name differs from the
# 'AHU_prin_winter_2023_stanscaler_RULES' dataset used by the other cells -
# confirm which dataset the distances should be computed for.
dataset_dir = './SimTSC/datasets/EBC'
dataset_name = 'StadtAachen_Winter_2023_stanscaler'
output_dir = './SimTSC/tmp'

# one output folder per distance measure
euc_dir = os.path.join(output_dir, 'ebc_euclidean')
cid_dir = os.path.join(output_dir, 'ebc_cid')
dtw_dir = os.path.join(output_dir, 'ebc_dtw')
for distance_dir in (euc_dir, cid_dir, dtw_dir):
    # exist_ok avoids the check-then-create race of a separate os.path.exists test
    os.makedirs(distance_dir, exist_ok=True)

dataset_dir = os.path.join(dataset_dir, dataset_name)
# df_train = pd.read_csv(os.path.join(dataset_dir, dataset_name+'_TRAIN.tsv'), sep='\t', header=None)
df_test = pd.read_csv(os.path.join(dataset_dir, dataset_name+'_TEST_ed.tsv'), sep='\t', header=None)
# df_val = pd.read_csv(os.path.join(dataset_dir, dataset_name+'_VAL.tsv'), sep='\t', header=None)

# Only the test split is processed here (train / validation are not loaded).
# Column 0 of df_test is dropped; the remaining columns become a float32 matrix
# with one series per row.
X_test = df_test.drop(columns=[0]).astype(np.float32).values
X = X_test

# missing samples become 0 (written in place, so X_test sees the change as well)
X[np.isnan(X)] = 0

# z-normalise every series (row) on its own
row_std = X.std(axis=1, keepdims=True)
row_std[row_std == 0] = 1.0  # constant series: avoid a division by zero
row_mean = X.mean(axis=1, keepdims=True)
X = (X - row_mean) / row_std

# C-contiguous float64 copy
X = np.ascontiguousarray(X, dtype=np.float64)

print(X.shape)

# # calculating DTW distances
# distances = np.zeros((X.shape[0], X.shape[0]), dtype=np.float64)
# for i in range(len(X)):
#     for j in range(len(X)):
#         data = X[i]
#         query = X[j]
#         distances[i][j] = dtw.query(data, query, r=min(len(data)-1, len(query)-1, 100))['value']

# calculating Euclidean distances
# The matrix is symmetric with a zero diagonal, so every pair is computed once
# and mirrored (norm(a - b) and norm(b - a) are the same value). The loop no
# longer binds the name `data`, which other cells use for the loaded dataset.
# The diagonal is left at 0, which matches norm(x - x) for finite input.
n_series = X.shape[0]
distances = np.zeros((n_series, n_series), dtype=np.float64)
for i in range(n_series):
    for j in range(i + 1, n_series):
        pair_distance = np.linalg.norm(X[i] - X[j])
        distances[i, j] = pair_distance
        distances[j, i] = pair_distance

# # calculating CID distances
# distances = np.zeros((X.shape[0], X.shape[0]), dtype=np.float64)
# for i in range(len(X)):
#     for j in range(len(X)):
#         data = X[i]
#         query = X[j]
#         CE_Q = np.sqrt(np.sum(np.diff(data, axis=0)**2))
#         CE_C = np.sqrt(np.sum(np.diff(query, axis=0)**2))
#         distances[i][j] = np.sqrt(np.sum((data - query)**2)) * max(CE_Q, CE_C) / min(CE_Q, CE_C)

print(distances.shape)
# np.save appends '.npy' to the given path
np.save(os.path.join(euc_dir, dataset_name), distances)
       

(69, 480)
(69, 69)
