In [1]:
import pandas as pd

In [2]:
def convert_to_minutes(s):
    parts = s.split('hh')
    hours = int(parts[0])
    minutes = int(parts[1][:-2])  # remove 'mm' and convert to int
    return hours * 60 + minutes

ori_data = pd.read_csv('Boiler.csv')

ori_data['TIME'] = ori_data['TIME'].apply(convert_to_minutes)
print(ori_data.head())

   boiler_no  TIME  Steam Pressure_Main Header  outdoor temperature  \
0          3   645                    0.670455             0.859671   
1          3   660                    0.681818             0.872124   
2          3   675                    0.693182             0.875664   
3          3   690                    0.681818             0.870479   
4          3   705                    0.727273             0.875117   

   Temperature_Concentrated Water  Operating Time_Feed water  \
0                        0.433385                        0.5   
1                        0.445957                        0.5   
2                        0.449465                        0.5   
3                        0.443662                        0.5   
4                        0.449465                        0.5   

   Temperature_Exhaust Gas  Volume_Feed water_delta  Temperature_Feed water  \
0                 0.237288                 0.854282                0.355072   
1                 0.188862    

### TSG BENCH Preprocessing ###

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import acf
from scipy.signal import argrelextrema
from tslearn.datasets import UCR_UEA_datasets
import pickle
# import mgzip


class MinMaxScaler():
    def fit_transform(self, data): 
        self.fit(data)
        scaled_data = self.transform(data)
        return scaled_data

    def fit(self, data):    
        self.mini = np.min(data, 0)
        self.range = np.max(data, 0) - self.mini
        return self
        
    def transform(self, data):
        numerator = data - self.mini
        scaled_data = numerator / (self.range + 1e-7)
        return scaled_data

    def inverse_transform(self, data):
        data *= self.range
        data += self.mini
        return data

# adapt from https://github.com/TheDatumOrg/VUS
def find_length(data):
    if len(data.shape)>1:
        return 0
    data = data[:min(20000, len(data))]
    base = 3
    nobs = len(data)
    nlags = int(min(10 * np.log10(nobs), nobs - 1))
    auto_corr = acf(data, nlags=nlags, fft=True)[base:]
    local_max = argrelextrema(auto_corr, np.greater)[0]
    # try:
    #     max_local_max = np.argmax([auto_corr[lcm] for lcm in local_max])
    #     # print(local_max[max_local_max]+base)
    #     if local_max[max_local_max]<3 or local_max[max_local_max]>300:
    #         return 125
    #     return local_max[max_local_max]+base
    # except:
    #     return 125
    return 125


window_all = []
for i in range(ori_data.shape[1]):
    window_all.append(find_length(ori_data.iloc[:,i]))


seq_len = int(np.mean(np.array(window_all)))
print(seq_len)


# Preprocess the dataset
temp_data = []    
# Cut data by sequence length
for i in range(0, len(ori_data) - seq_len):
    _x = ori_data[i:i + seq_len]
    temp_data.append(_x)
    
# Mix the datasets (to make it similar to i.i.d)
idx = np.random.permutation(len(temp_data))    
data = []
for i in range(len(temp_data)):
    data.append(temp_data[idx[i]])


full_train_data = np.array(data)
N, T, D = full_train_data.shape   
print('data shape:', N, T, D) 

valid_perc = 0.1

# further split the training data into train and validation set - same thing done in forecasting task
N_train = int(N * (1 - valid_perc))
N_valid = N - N_train

# Shuffle data
np.random.shuffle(full_train_data)

train_data = full_train_data[:N_train]
valid_data = full_train_data[N_train:]   
print("train/valid shapes: ", train_data.shape, valid_data.shape)    


scaler = MinMaxScaler()        
scaled_train_data = scaler.fit_transform(train_data)
scaled_valid_data = scaler.transform(valid_data)


  acf = avf[: nlags + 1] / avf[0]


125
data shape: 268024 125 23
train/valid shapes:  (241221, 125, 23) (26803, 125, 23)


In [4]:
# boiler_train = scaled_train_data.to_csv('boiler_train.csv', index=False)

#convert to npz
np.savez('boiler_train.npz', scaled_train_data)

In [5]:
# # try to load an npz file
data = np.load('data/boiler_train.npz')
data['arr_0']

array([[[0.        , 0.68421053, 0.6136363 , ..., 0.95894625,
         0.49361908, 0.        ],
        [0.        , 0.69473684, 0.6136363 , ..., 0.04105356,
         0.99281567, 0.        ],
        [0.        , 0.70526316, 0.71590902, ..., 0.49999998,
         0.99281675, 0.        ],
        ...,
        [0.        , 0.66315789, 0.68181811, ..., 0.49999995,
         0.99281567, 0.        ],
        [0.        , 0.67368421, 0.71590902, ..., 0.49999995,
         0.99281567, 0.        ],
        [0.        , 0.68421053, 0.74999993, ..., 0.49999995,
         0.99281567, 0.        ]],

       [[0.99999999, 0.36842105, 0.76136356, ..., 0.95894625,
         0.49361908, 0.        ],
        [0.99999999, 0.37894737, 0.78409083, ..., 0.04105356,
         0.99281567, 0.        ],
        [0.99999999, 0.38947368, 0.78409083, ..., 0.49999995,
         0.99281567, 0.        ],
        ...,
        [0.99999999, 0.50526316, 0.74999993, ..., 0.49999995,
         0.99281567, 0.        ],
        [0.9

In [6]:
import numpy as np

def check_npz_keys(file_path):
    with np.load(file_path) as data:
        return data.files

print(check_npz_keys('data/boiler_train.npz'))

['arr_0']
