In [1]:
import os

import numpy as np
import pandas as pd

# Print NumPy floats with 6 digits of precision and in fixed-point notation
# (suppress=True turns off scientific notation for small values).
np.set_printoptions(precision=6, suppress=True)

In [2]:
# Load the train / validation / test splits in one pass; in every file the
# first CSV column is used as the DataFrame index.
df_train, df_valid, df_test = [
    pd.read_csv(split_path, index_col=0)
    for split_path in (
        './data/second-order/Centar/train',
        './data/second-order/Centar/validation',
        './data/second-order/Centar/test',
    )
]

In [3]:
def build_seq2seq_datasets(dataset, history = 24, target_size = 12,
                           target_column = 'PM10',
                           target_missing_column = 'PM10_missing',
                           decoder_excluded_columns = ('PM25', 'PM10_missing', 'PM25_missing')):
    """Slice a DataFrame into encoder/decoder arrays for a seq2seq model.

    For every row position ``i`` that has ``history`` rows before it and
    ``target_size`` rows after it, one sample is built:

    * encoder input  -- every column of rows ``[i - history, i)``
    * decoder input  -- rows ``[i, i + target_size)`` with
      ``decoder_excluded_columns`` removed
    * decoder target -- ``target_column`` of rows ``[i + 1, i + 1 + target_size)``,
      i.e. the decoder-input window shifted one row ahead

    A sample is skipped when any row of its target window has
    ``target_missing_column == 1`` (the target value was imputed).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Source frame; rows are consumed in their existing order.
    history : int, default 24
        Number of rows in each encoder window. Must be >= 1.
    target_size : int, default 12
        Number of rows in each decoder input / target window. Must be >= 1.
    target_column : str, default 'PM10'
        Column the decoder has to predict.
    target_missing_column : str, default 'PM10_missing'
        Indicator column; a value of 1 marks a missing (imputed) target.
    decoder_excluded_columns : str or iterable of str
        Columns removed from the decoder input
        (default ``('PM25', 'PM10_missing', 'PM25_missing')``).

    Returns
    -------
    tuple of numpy.ndarray
        ``(encoder_input_data, decoder_input_data, decoder_target_data)`` with
        shapes ``(n_samples, history, n_columns)``,
        ``(n_samples, target_size, n_decoder_columns)`` and
        ``(n_samples, target_size, 1)``. ``n_samples`` may be 0.

    Raises
    ------
    ValueError
        If ``history`` or ``target_size`` is smaller than 1.
    KeyError
        Raised by pandas when one of the named columns is not in ``dataset``.
    """
    if history < 1 or target_size < 1:
        raise ValueError('history and target_size must be positive, got '
                         'history={} and target_size={}'.format(history, target_size))

    # A bare string would otherwise be split into single characters by list().
    if isinstance(decoder_excluded_columns, str):
        decoder_excluded_columns = [decoder_excluded_columns]

    # Selecting the appropriate columns from the dataset. No explicit copies
    # are needed: np.array() below copies the collected windows anyway.
    encoder_input_dataset = dataset.values
    decoder_input_dataset = dataset.drop(list(decoder_excluded_columns), axis=1).values
    decoder_target_dataset = dataset[[target_column]].values
    decoder_missing_dataset = dataset[[target_missing_column]].values

    # These lists will hold the final (third-order) datasets
    encoder_input_data = []
    decoder_input_data = []
    decoder_target_data = []

    # The bounds guarantee that every window below has its full length.
    for i in range(history, len(dataset) - target_size):
        target_window = slice(i + 1, i + 1 + target_size)

        # If any of the target values has been imputed (i.e. was missing), skip the sample
        if np.any(decoder_missing_dataset[target_window] == 1):
            continue

        encoder_input_data.append(encoder_input_dataset[i - history:i])
        decoder_input_data.append(decoder_input_dataset[i:i + target_size])
        decoder_target_data.append(decoder_target_dataset[target_window])

    # The explicit reshape keeps the result 3-D even when no sample survived
    # (np.array([]) alone would be 1-D).
    encoder_input_data = np.array(encoder_input_data).reshape(
        -1, history, encoder_input_dataset.shape[1])
    decoder_input_data = np.array(decoder_input_data).reshape(
        -1, target_size, decoder_input_dataset.shape[1])
    decoder_target_data = np.array(decoder_target_data).reshape(
        -1, target_size, decoder_target_dataset.shape[1])

    return encoder_input_data, decoder_input_data, decoder_target_data

In [4]:
# Build the encoder input / decoder input / decoder target arrays for each
# split, using the default history and target window sizes.
(train_encoder_input_data,
 train_decoder_input_data,
 train_decoder_target_data) = build_seq2seq_datasets(df_train)

(valid_encoder_input_data,
 valid_decoder_input_data,
 valid_decoder_target_data) = build_seq2seq_datasets(df_valid)

(test_encoder_input_data,
 test_decoder_input_data,
 test_decoder_target_data) = build_seq2seq_datasets(df_test)

In [5]:
# Persist every third-order array as <THIRD_ORDER_DIR>/<name>.npy.
THIRD_ORDER_DIR = './data/third-order/Centar'

# np.save does not create missing directories, so make sure the target exists.
os.makedirs(THIRD_ORDER_DIR, exist_ok=True)

third_order_arrays = {
    'train_encoder_input_data': train_encoder_input_data,
    'train_decoder_input_data': train_decoder_input_data,
    'train_decoder_target_data': train_decoder_target_data,
    'valid_encoder_input_data': valid_encoder_input_data,
    'valid_decoder_input_data': valid_decoder_input_data,
    'valid_decoder_target_data': valid_decoder_target_data,
    'test_encoder_input_data': test_encoder_input_data,
    'test_decoder_input_data': test_decoder_input_data,
    'test_decoder_target_data': test_decoder_target_data,
}

for array_name, array in third_order_arrays.items():
    np.save('{}/{}.npy'.format(THIRD_ORDER_DIR, array_name), array)