In [1]:
import numpy as np
import pandas as pd
import os
import itertools
from sklearn.model_selection import train_test_split

If the data is spread across multiple .csv files, they must first be combined into a single DataFrame. To do this, run the code in the cell below.

If the data is in a single .CSV file, skip the cell below.

In [5]:
# Concatenate multiple csv files (with same datapoint columns) to a single dataframe

data_part1 = pd.read_csv(r'/home/vaibhavs/Master_Thesis/ma-vaibhav/Data/data_survey_15m_Nov22_Mar23.csv')
data_part2 = pd.read_csv(r'/home/vaibhavs/Master_Thesis/ma-vaibhav/Data/data_survey_15m_Nov23_Feb24.csv')
print(data_part1.shape)
print(data_part2.shape)

# pd.concat aligns the parts on column *names*, so comparing only the number of
# columns is not enough: two files with equally many but differently named
# columns would be merged into a wider frame padded with NaN.
mismatched_columns = set(data_part1.columns) ^ set(data_part2.columns)
assert not mismatched_columns, (
    f"columns do not match between the two files: {sorted(map(str, mismatched_columns))}"
)

# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# repeating the row labels of each part.
data = pd.concat([data_part1, data_part2], axis=0, ignore_index=True)
print(data.shape)

(14401, 9)
(11233, 9)
(25634, 9)


In [6]:
# Load AHU data from .csv file (generated by Aedifion) and convert into (n_samples X n_timestamps) format

from sklearn.preprocessing import StandardScaler, MinMaxScaler

data = pd.read_csv(r'/home/vaibhavs/Master_Thesis/ma-vaibhav/Data/data_prin_summer_15m_Jun23_Aug23.csv')

all_points = []
all_labels = []
all_windows = []
datapoint_names = data.columns.tolist()
datapoint_names.remove('time')  # every remaining column is treated as one datapoint
# Number of consecutive rows per window (used as the slice length below).
minute_multiples = 480  # 480=5day, 1920=20day, 96=1day, 1440=1day(60s sampling)

# Start offsets of consecutive, non-overlapping, full-length windows.
# The stride equals the window length; a stride of minute_multiples+1 would
# silently drop one row between every pair of neighbouring windows.
# The stop value keeps only starts for which a complete window fits.
n_rows = data.shape[0]
window_starts = range(0, n_rows - minute_multiples + 1, minute_multiples)

for datapoint in datapoint_names:
    series = data[datapoint]  # isolate time-series of a single datapoint
    # one list of minute_multiples values per window
    data_list = [series.iloc[start:start + minute_multiples].tolist() for start in window_starts]
    window_list = list(range(1, len(data_list) + 1))  # 1-based window counter
    label_list = [datapoint] * len(data_list)  # every window is labelled with its datapoint name

    all_labels.append(label_list)
    all_points.append(data_list)
    all_windows.append(window_list)

# flatten the per-datapoint lists into one list of windows / one list of labels
all_points = list(itertools.chain.from_iterable(all_points))
all_labels = list(itertools.chain.from_iterable(all_labels))

orig_labels = all_labels.copy()

# Optional: collapse datapoint names into coarser class labels before splitting
# uniform_HR_labels = ['HR' if 'HR' in x else x for x in all_labels]
# uniform_EHA_labels = ['EHA' if 'EHA' in x else x for x in uniform_HR_labels]
# uniform_PH_labels = ['ODA' if 'PH' in x else x for x in uniform_EHA_labels]
# uniform_ODA_labels = ['ODA' if 'ODA' in x else x for x in uniform_PH_labels]

# all_labels = uniform_ODA_labels
print(list(set(all_labels)))

# check if number of labels (y) and number of time series (x) is equal
assert len(all_points) == len(all_labels), f"length of timeseries values list ({len(all_points)}) is not equal to length of datapoint labels list ({len(all_labels)}). Please make sure timeseries and their datapoint labels are of equal length"

numpy_data = np.asarray(all_points)
numpy_labels = np.asarray(all_labels)

print(numpy_data.shape)

# Split the *raw* windows once (70/30, then 70/30 of the remainder for validation).
# The windows are ordered by datapoint, so the split must be shuffled and
# stratified; an unshuffled split would put whole datapoints into a single
# partition. Deriving the scaled partitions from these raw partitions also
# guarantees that the *_RAW arrays hold exactly the same samples, in the same
# order, as their scaled counterparts.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(numpy_data, numpy_labels, test_size=0.30, random_state=42, shuffle=True, stratify=numpy_labels)
X_train_raw, X_val_raw, y_train_raw, y_val_raw = train_test_split(X_train_raw, y_train_raw, test_size=0.30, random_state=42, shuffle=True, stratify=y_train_raw)

# Standardize data with mean=0 and variance=1.
# The scaler statistics are learned from the training partition only and then
# applied to validation/test, so no information from held-out samples leaks
# into preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)
X_test = scaler.transform(X_test_raw)
y_train, y_val, y_test = y_train_raw.copy(), y_val_raw.copy(), y_test_raw.copy()

# Full data set transformed with the train-fitted scaler (name kept for later use)
scaled_numpy_data = scaler.transform(numpy_data)

# # Min-Max Scaler
# scaler = MinMaxScaler()
# normalized_data = scaler.fit_transform(numpy_data)

['ADS.bAHUPHPumpOperatingADSInternalValuesMirror', 'ADS.fAHUTempSUPSetADSInternalValuesMirror', 'ADS.fAHUPHValveSetADSInternalValuesMirror', 'ADS.fAHUTempODAADSInternalValuesMirror', 'ADS.fAHUPHValveActADSInternalValuesMirror', 'ADS.bAHURHPumpOperatingADSInternalValuesMirror', 'ADS.fAHUFanSUPSpeedActADSInternalValuesMirror', 'ADS.fAHURHValveSetADSInternalValuesMirror', 'ADS.fAHUFanSUPSpeedSetADSInternalValuesMirror', 'ADS.fAHUTempETAADSInternalValuesMirror', 'ADS.fAHUTempSUPADSInternalValuesMirror', 'ADS.fAHURHValveActADSInternalValuesMirror']
(216, 480)


Set the `dataset_name` variable to the name of the folder in which the dataset files should be saved.

In [7]:
# Save train, test and validation datasets (scaled and raw) as .tsv files

dataset_dir = './SimTSC/datasets/EBC'
dataset_name = 'AHU_principal_SUMMER_2023_stanscaler'

dataset_dir = os.path.join(dataset_dir, dataset_name)
# exist_ok avoids the check-then-create race of a separate os.path.exists test
os.makedirs(dataset_dir, exist_ok=True)


def _write_split_tsv(values, labels, n_columns, path):
    """Write one data split as a header-less, tab-separated file.

    Args:
        values: 2-D array, one row per sample.
        labels: 1-D array with one label per row of `values`.
        n_columns: expected number of columns of `values`; pandas raises a
            ValueError if `values` has a different width.
        path: destination file path.

    Returns:
        The written DataFrame: the label in the first column, followed by
        the `n_columns` value columns.
    """
    series_frame = pd.DataFrame(data=values, columns=range(1, n_columns + 1))
    label_frame = pd.DataFrame(data=labels)
    merged = pd.concat([label_frame, series_frame], axis=1)
    merged.to_csv(path, sep='\t', index=False, header=None)
    return merged


# Save train dataset (scaled, then raw)
train_merged = _write_split_tsv(X_train, y_train, minute_multiples, os.path.join(dataset_dir, dataset_name+'_TRAIN.tsv'))
train_raw_merged = _write_split_tsv(X_train_raw, y_train_raw, minute_multiples, os.path.join(dataset_dir, dataset_name+'_TRAIN_RAW.tsv'))

# Save test dataset (scaled, then raw)
test_merged = _write_split_tsv(X_test, y_test, minute_multiples, os.path.join(dataset_dir, dataset_name+'_TEST.tsv'))
test_raw_merged = _write_split_tsv(X_test_raw, y_test_raw, minute_multiples, os.path.join(dataset_dir, dataset_name+'_TEST_RAW.tsv'))

# Save validation dataset (scaled, then raw)
val_merged = _write_split_tsv(X_val, y_val, minute_multiples, os.path.join(dataset_dir, dataset_name+'_VAL.tsv'))
val_raw_merged = _write_split_tsv(X_val_raw, y_val_raw, minute_multiples, os.path.join(dataset_dir, dataset_name+'_VAL_RAW.tsv'))

The cell below reads the saved dataset files and assigns a class label to each datapoint based on the datapoint name. For example, the datapoint name 'fAHUTempEHAADS' is assigned the class label 'Temperature'.

The class labels currently defined are 'Temperature', 'Valve', 'Speed', and 'Operating'. Change them as per the class labels for your classification problem. 

In [8]:
# Transform datapoint names to class labels

dataset_dir = './SimTSC/datasets/EBC'
dataset_name = 'AHU_principal_SUMMER_2023_stanscaler'

dataset_dir = os.path.join(dataset_dir, dataset_name)

# (keyword, class label) pairs, checked in this order; the first keyword found
# in a datapoint name decides its class. Edit this list to change the classes.
CLASS_KEYWORDS = [
    ('Temp', 'Temperature'),
    ('Valve', 'Valve'),
    ('Speed', 'Speed'),
    ('Operating', 'Operating'),
]


def to_class_label(datapoint_name):
    """Map a datapoint name to its class label.

    Returns the label of the first entry in CLASS_KEYWORDS whose keyword is a
    substring of `datapoint_name`; names matching no keyword are returned
    unchanged.
    """
    for keyword, class_label in CLASS_KEYWORDS:
        if keyword in datapoint_name:
            return class_label
    return datapoint_name


df_train = pd.read_csv(os.path.join(dataset_dir, dataset_name+'_TRAIN.tsv'), sep='\t', header=None)
df_test = pd.read_csv(os.path.join(dataset_dir, dataset_name+'_TEST.tsv'), sep='\t', header=None)
df_val = pd.read_csv(os.path.join(dataset_dir, dataset_name+'_VAL.tsv'), sep='\t', header=None)

# convert datapoint names (first column of each file) to class labels
train_oper_labels = [to_class_label(name) for name in df_train.iloc[:,0].tolist()]
test_oper_labels = [to_class_label(name) for name in df_test.iloc[:,0].tolist()]
val_oper_labels = [to_class_label(name) for name in df_val.iloc[:,0].tolist()]

print(list(set(train_oper_labels)))

df_train.iloc[:,0] = train_oper_labels
df_test.iloc[:,0] = test_oper_labels
df_val.iloc[:,0] = val_oper_labels

# class distribution per split
print(np.unique(train_oper_labels, return_counts=True))
print(np.unique(test_oper_labels, return_counts=True))
print(np.unique(val_oper_labels, return_counts=True))

df_train.to_csv(os.path.join(dataset_dir, dataset_name+'_TRAIN_ed.tsv'), sep='\t', index=False, header=None)
df_test.to_csv(os.path.join(dataset_dir, dataset_name+'_TEST_ed.tsv'), sep='\t', index=False, header=None)
df_val.to_csv(os.path.join(dataset_dir, dataset_name+'_VAL_ed.tsv'), sep='\t', index=False, header=None)

['Operating', 'Temperature', 'Valve', 'Speed']
(array(['Operating', 'Speed', 'Temperature', 'Valve'], dtype='<U11'), array([18, 17, 36, 34]))
(array(['Operating', 'Speed', 'Temperature', 'Valve'], dtype='<U11'), array([10, 11, 21, 23]))
(array(['Operating', 'Speed', 'Temperature', 'Valve'], dtype='<U11'), array([ 8,  8, 15, 15]))


### Other miscellaneous code

If you are using statistical functions to compute features (e.g., mean change, sample entropy, standard deviation) on the time series data, use the code cell below to: 

- Load .CSV data into a dataframe
- Compute features on the time series data
- Standardize the data and split it into train, test, validation data sets

In [61]:
# Compute feature set from time-series samples

from sklearn.preprocessing import StandardScaler, MinMaxScaler
import tsfresh.feature_extraction.feature_calculators as tscalc

# Number of consecutive rows that form one window
minute_multiples = 480  # 480=5day, 1920=20day, 1440=1day(60s sampling)

data = pd.read_csv(r'/home/vaibhavs/Master_Thesis/ma-vaibhav/Data/data_principal_15m_Jan23_Feb24.csv')

# Every column except the timestamp column is one datapoint
datapoint_names = list(data.columns)
datapoint_names.remove('time')

# Accumulators filled by the windowing loop further down
all_points, all_labels, all_windows = [], [], []

def get_features(sample):
    """Compute the statistical feature vector of one time-series window.

    Args:
        sample: the values of a single window, passed unchanged to each
            tsfresh feature calculator.

    Returns:
        A list of six values, in this order: absolute sum of changes,
        cid_ce, mean change, percentage of reoccurring values to all
        values, sample entropy, standard deviation.
    """
    return [
        tscalc.absolute_sum_of_changes(sample),
        # second argument presumably tsfresh's `normalize` flag — TODO confirm
        tscalc.cid_ce(sample, True),
        tscalc.mean_change(sample),
        tscalc.percentage_of_reoccurring_values_to_all_values(sample),
        tscalc.sample_entropy(sample),
        tscalc.standard_deviation(sample),
    ]


# Start offsets of consecutive, non-overlapping, full-length windows.
# The stride equals the window length; a stride of minute_multiples+1 would
# silently drop one row between every pair of neighbouring windows.
# The stop value keeps only starts for which a complete window fits.
n_rows = data.shape[0]
window_starts = range(0, n_rows - minute_multiples + 1, minute_multiples)

for datapoint in datapoint_names:
    series = data[datapoint]  # isolate time-series of a single datapoint
    # one feature vector per window
    data_list = [
        get_features(series.iloc[start:start + minute_multiples].tolist())
        for start in window_starts
    ]
    window_list = list(range(1, len(data_list) + 1))  # 1-based window counter
    label_list = [datapoint] * len(data_list)  # every window is labelled with its datapoint name

    all_labels.append(label_list)
    all_points.append(data_list)
    all_windows.append(window_list)

# flatten the per-datapoint lists into one list of feature vectors / one list of labels
all_points = list(itertools.chain.from_iterable(all_points))
all_labels = list(itertools.chain.from_iterable(all_labels))

print(list(set(all_labels)))

# check if number of labels (y) and number of time series (x) is equal
assert len(all_points) == len(all_labels), f"length of timeseries values list ({len(all_points)}) is not equal to length of datapoint labels list ({len(all_labels)}). Please make sure timeseries and their datapoint labels are of equal length"

numpy_data = np.asarray(all_points)
numpy_labels = np.asarray(all_labels)

print(numpy_data.shape)
col_len = numpy_data.shape[1]  # number of features per sample

# Split the *raw* features once (70/30, then 70/30 of the remainder for
# validation), shuffled and stratified by datapoint name. The scaled
# partitions are derived from these, so scaled and *_RAW arrays hold the
# same samples in the same order.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(numpy_data, numpy_labels, test_size=0.30, random_state=42, shuffle=True, stratify=numpy_labels)
X_train_raw, X_val_raw, y_train_raw, y_val_raw = train_test_split(X_train_raw, y_train_raw, test_size=0.30, random_state=42, shuffle=True, stratify=y_train_raw)

# Standardize data with mean=0 and variance=1.
# The scaler statistics are learned from the training partition only and then
# applied to validation/test, so no information from held-out samples leaks
# into preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)
X_test = scaler.transform(X_test_raw)
y_train, y_val, y_test = y_train_raw.copy(), y_val_raw.copy(), y_test_raw.copy()

# Full data set transformed with the train-fitted scaler (name kept for later use)
scaled_numpy_data = scaler.transform(numpy_data)

# # Min-Max Scaler
# scaler = MinMaxScaler()
# normalized_data = scaler.fit_transform(numpy_data)


['ADS.fAHUCOValveSetADSInternalValuesMirror', 'ADS.fAHUPHValveActADSInternalValuesMirror', 'ADS.fAHUPHValveSetADSInternalValuesMirror', 'ADS.fAHURHValveSetADSInternalValuesMirror', 'ADS.fAHUCOValveActADSInternalValuesMirror', 'ADS.fAHURHValveActADSInternalValuesMirror']
(432, 6)


In [63]:
# !!! Run ONLY for feature-computed dataset
# Save train, test and validation datasets (scaled and raw) as .tsv files

dataset_dir = './SimTSC/datasets/EBC'
dataset_name = 'AHU_valve_actset2023_features_stanscaler'

dataset_dir = os.path.join(dataset_dir, dataset_name)
# exist_ok avoids the check-then-create race of a separate os.path.exists test
os.makedirs(dataset_dir, exist_ok=True)


def _write_split_tsv(values, labels, n_columns, path):
    """Write one data split as a header-less, tab-separated file.

    Args:
        values: 2-D array, one row per sample.
        labels: 1-D array with one label per row of `values`.
        n_columns: expected number of columns of `values`; pandas raises a
            ValueError if `values` has a different width.
        path: destination file path.

    Returns:
        The written DataFrame: the label in the first column, followed by
        the `n_columns` value columns.
    """
    series_frame = pd.DataFrame(data=values, columns=range(1, n_columns + 1))
    label_frame = pd.DataFrame(data=labels)
    merged = pd.concat([label_frame, series_frame], axis=1)
    merged.to_csv(path, sep='\t', index=False, header=None)
    return merged


# Save train dataset (scaled, then raw)
train_merged = _write_split_tsv(X_train, y_train, col_len, os.path.join(dataset_dir, dataset_name+'_TRAIN.tsv'))
train_raw_merged = _write_split_tsv(X_train_raw, y_train_raw, col_len, os.path.join(dataset_dir, dataset_name+'_TRAIN_RAW.tsv'))

# Save test dataset (scaled, then raw)
test_merged = _write_split_tsv(X_test, y_test, col_len, os.path.join(dataset_dir, dataset_name+'_TEST.tsv'))
test_raw_merged = _write_split_tsv(X_test_raw, y_test_raw, col_len, os.path.join(dataset_dir, dataset_name+'_TEST_RAW.tsv'))

# Save validation dataset (scaled, then raw)
val_merged = _write_split_tsv(X_val, y_val, col_len, os.path.join(dataset_dir, dataset_name+'_VAL.tsv'))
val_raw_merged = _write_split_tsv(X_val_raw, y_val_raw, col_len, os.path.join(dataset_dir, dataset_name+'_VAL_RAW.tsv'))

In [7]:
# Read the pickled prediction labels of one run back into memory
import pickle

seed = 0
dataset = "AHU_Minimal_2023"
log_dir = './SimTSC/logs'

# File location: <log_dir>/TEST/<dataset>_<seed>_preds
test_out_dir = os.path.join(log_dir, 'TEST')
preds_out_path = os.path.join(test_out_dir, f"{dataset}_{seed}_preds")

# NOTE(review): unpickling can execute arbitrary code; only load files
# that were written by this project.
with open(preds_out_path, 'rb') as preds_file:
    labellist = pickle.load(preds_file)

In [59]:
log_dir = './SimTSC/logs'
tmp_dir = 'tmp'
dataset = "data_principal_plus1_15m_Jan23_Dec23"
dataset_dir = '../Data'

# dataset_dir = os.path.join(dataset_dir, dataset)
data = pd.read_csv(os.path.join(dataset_dir, dataset+'.csv'))


def _column_positions(frame, keyword):
    """Return the positions of all columns of `frame` whose name contains `keyword`."""
    return [idx for idx, name in enumerate(frame.columns) if keyword in name]


def _percent(part, whole):
    """Return part/whole as a percentage, or NaN when `whole` is zero."""
    return (part / whole) * 100 if whole else float('nan')


ODAtemp_idx = _column_positions(data, "TempODA")
SUPtemp_idx = _column_positions(data, "TempSUPADS")
SUPset_idx = _column_positions(data, "TempSUPSet")
RHvalve_idx = _column_positions(data, "RHValve")
RHpump_idx = _column_positions(data, "RHPump")
COtemp_idx = _column_positions(data, "COTempOut")
PHvalve_idx = _column_positions(data, "PHValveAct")
PHpump_idx = _column_positions(data, "PHPump")

# If SUP_set - ODA_temp > 6°C --> RH_valve > 20%
# np.abs(data.iloc[idx,SUPset_idx][0] - data.iloc[idx,ODAtemp_idx][0]) > 6.0 and data.iloc[idx,RHvalve_idx][0] > 20.0

# If SUP_temp - CO_temp_out > 1°K --> RH_Pump = 1.0
# data.iloc[idx,SUPtemp_idx][0] - data.iloc[idx,COtemp_idx][0] > 1.0 and data.iloc[idx,RHpump_idx][0] == 1.0

# ODATemp < 3°C implies PH ValveAct > 0%

if not ODAtemp_idx:
    raise KeyError("no column containing 'TempODA' found in the dataset")

nsamples = data.shape[0]

# Select the first matching column purely by position. Indexing a
# string-labelled Series with [0] relies on a positional fallback that pandas
# has deprecated, and evaluating the rule row by row is needlessly slow.
oda_temp = data.iloc[:, ODAtemp_idx[0]]
in_range = oda_temp.between(-10.0, 30.0)  # inclusive on both ends; NaN counts as not in range
precedent = int(in_range.sum())

# The consequent check is currently disabled, so the count stays at zero.
# To enable it for the PH pump, use for example:
# consequent = int((in_range & (data.iloc[:, PHpump_idx[0]] == 0.0)).sum())
consequent = 0

# Guarded divisions: an empty frame or a precedent that never holds yields NaN
# instead of a ZeroDivisionError.
precision = _percent(consequent, nsamples)
cons_acc = _percent(consequent, precedent)
prec_acc = _percent(precedent, nsamples)
print("ODATemp rule holds overall --> Accuracy: {:.2f}".format(precision))
print("Precedent holds for {:.2f} percent of all samples".format(prec_acc))
print("Consequent holds {:.2f} percent when precedent holds".format(cons_acc))


ODATemp rule holds overall --> Accuracy: 0.00
Precedent holds for 99.54 percent of all samples
Consequent holds 0.00 percent when precedent holds
