In [2]:
import logging, sys
import os
from doctest import Example

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
from tqdm import tqdm

In [27]:
class UMASS_Dataset_V2(Dataset):

    """
    UMASS PyTorch Dataset Class
    Unnormalised at preprocessing. Normalise here.

    Each example is a window of `no_days` consecutive days for a single house,
    with `READINGS_PER_DAY` (96) readings per day. Examples are min-max
    normalised individually (per example) when the dataset is constructed.

    Attributes
    ----------
    umass_data : Tensor
        processed, min-max normalised umass data.
        Shape (no_examples, 1, no_days * reshape_factor, 96 // reshape_factor) in 'cnn' mode,
        (no_examples, no_days * 96) in 'fc' mode.
    umass_data_noisy : Tensor
        noise-added umass data (same shape as `umass_data`), normalised independently
    mode : str
        method for which the umass dataset has been processed ('fc' or 'cnn')
    eps : float
        constant added to the min-max denominator for numerical stability

    Methods
    -------
    preprocess_umass_fc()
        Initialization process of dataset for 'fc' fully-connected mode
    preprocess_umass_cnn()
        Initialization process of dataset for 'cnn' convolutional neural network mode
    """

    # Number of readings that make up one day (one row) in the source CSV.
    READINGS_PER_DAY = 96

    def __init__(self, folder_path: str ='../../dataset/interim/', mode: str = 'cnn', # Required arguments
                 train: bool = True, no_days: int = 12, reshape_factor: int = 2, # CNN arguments
                 N: int = None, seed: int = 0, eps: float = 1e-12,  # Other arguments
                 noise_type: str = 'none', noise_pct: float = 0.9): # Noise arguments

        """
        Parameters
        ----------
        folder_path : string
                Directory with all the files (processed). A trailing separator is optional.
        mode: string
                Either 'cnn' for convolutional usage or 'fc' for basic AE usage.
        train : bool
                Determines whether training or test dataset to be used (already preprocessed to save time)
        no_days : int (optional)
                Number of days per example (i.e., rows in the matrix example)
        reshape_factor : int (optional)
                Used by the original authors to achieve a square tensor ('cnn' mode only).
                Must divide 96, otherwise the reshape fails.
        N : int (optional)
                Select subset of examples, AFTER reshaping. Currently the FIRST N examples are kept.
        seed : int (optional)
                Seed passed to torch.manual_seed (global RNG); drives the noise draw.
        eps : float (optional)
                For numerical stability in min-max normalization.
        noise_type: string (optional) ('gauss', 'speckle', 'none' or None)
                If, and what type, of noise to be added to dataset. None is treated as 'none'.
        noise_pct: float (optional)
                Parameter controlling how much noise to add (standard deviation of the Gaussian noise)

        Raises
        ------
        ValueError
                If `mode` is not 'cnn' or 'fc', or if `N` exceeds the number of examples.
        NotImplementedError
                If `noise_type` is 'speckle' or any other unsupported value.
        """

        # Validate cheap options first so a typo does not cost a full CSV load.
        if mode not in ('cnn', 'fc'):
            raise ValueError("Mode must be 'cnn' or 'fc'.")
        if noise_type is None:
            noise_type = 'none' # the documented "no noise" spelling
        if noise_type == 'speckle':
            raise NotImplementedError('Speckle noise not yet implemented')
        if noise_type not in ('gauss', 'none'):
            raise NotImplementedError('Noise selection has not been implemented.')

        # Set seed (note: this seeds torch's GLOBAL generator)
        torch.manual_seed(seed)

        # Activate tqdm for pandas and remember object variables
        tqdm.pandas()
        self.eps = eps
        self.mode = mode

        # Note that both files have been directly preprocessed to reduce time spent reprocessing.
        file_name = 'umass_train_unnormal.csv' if train else 'umass_test_unnormal.csv'
        df = pd.read_csv(os.path.join(folder_path, file_name))

        if mode == 'cnn':

            # First reshape into Tensor of shape (no_examples, no_days, 96)
            data = self.preprocess_umass_cnn(df, no_days)

            # Second reshape into Tensor of shape (no_examples, no_days * reshape_factor, 96 // reshape_factor)
            data = data.reshape(data.size(0), data.size(1) * reshape_factor,
                                data.size(2) // reshape_factor)

            # Unsqueeze channel 1 back out (1 filter)
            data = data.unsqueeze(1)

            # Normalise each example over its two spatial dimensions
            norm_dims = (-1, -2)

        else:

            # Tensor of shape (no_examples, no_days * 96); normalise each row
            data = self.preprocess_umass_fc(df, no_days)
            norm_dims = (-1,)

        # If N is selected, keep a subset of the examples
        if N is not None:
            if N > data.size(0):
                raise ValueError("Cannot exceed dataset size of {}".format(data.size(0)))
            # TODO: random permutation is currently disabled; the first N examples are kept (debug behaviour).
            data = data[:N]

        # Add noise to dataset
        if noise_type == 'gauss':
            # NOTE(review): noise is added and clamped to [0, 1] BEFORE normalisation, i.e. on raw
            # (unnormalised) values. The clamp looks like it assumes normalised data -- confirm intent.
            noise = torch.randn(data.size()) * noise_pct
            noisy = torch.clamp(data + noise, min = 0, max = 1)
        else:
            noisy = data

        # Min max normalise clean and noisy data independently
        self.umass_data = self._min_max_normalise(data, norm_dims, eps)
        self.umass_data_noisy = self._min_max_normalise(noisy, norm_dims, eps)


    @staticmethod
    def _min_max_normalise(data, dims, eps):

        """
        Min-max scale `data` over `dims` (kept for broadcasting), one scale per example.
        `eps` is added to the denominator so a constant example maps to 0 instead of NaN.
        """

        lowest = torch.amin(data, dim = dims, keepdim=True)
        highest = torch.amax(data, dim = dims, keepdim=True)
        return (data - lowest) / (highest - lowest + eps)


    def __len__(self):
        """Number of examples in the dataset."""
        return self.umass_data.size(0)

    def _collect_examples(self, df, days_per_example):

        """
        Build a Tensor of shape (no_examples, days_per_example, 96) from `df`.

        `df` is grouped by its 'house' column; each house's 'kwh' values are cut into days of
        96 readings, and the days into windows of `days_per_example`. Trailing days that do not
        fill a whole window are dropped. Windows never span two houses.
        """

        per_house = []

        # Plain iteration (instead of a side-effecting groupby-apply) visits each house exactly once.
        for _, house_df in tqdm(df.groupby('house')):

            daily = torch.from_numpy(house_df.kwh.to_numpy()).float() # convert to Tensor
            daily = daily.reshape(-1, self.READINGS_PER_DAY) # one row per day

            n_windows = daily.shape[0] // days_per_example
            assert n_windows != 0, "not enough data for required shape"

            usable = daily[:n_windows * days_per_example] # drop extra rows (cannot be used)
            per_house.append(usable.reshape(-1, days_per_example, self.READINGS_PER_DAY))

        if not per_house:
            # No houses at all: return an empty tensor with the right trailing shape.
            return torch.zeros(0, days_per_example, self.READINGS_PER_DAY)

        return torch.cat(per_house) # single concatenation instead of one per house

    def preprocess_umass_fc(self, df, fc_days):

        """
        Flattened examples for 'fc' mode: Tensor of shape (no_examples, fc_days * 96).
        """

        examples = self._collect_examples(df, fc_days)
        return examples.reshape(-1, fc_days * self.READINGS_PER_DAY)

    def preprocess_umass_cnn(self, df, no_days):

        """
        Matrix examples for 'cnn' mode: Tensor of shape (no_examples, no_days, 96).
        """

        return self._collect_examples(df, no_days)


    def __getitem__(self, idx):

        """Return the (clean, noisy) pair of normalised examples at `idx`."""

        example = self.umass_data[idx]
        noisy_example = self.umass_data_noisy[idx]

        return example, noisy_example

In [28]:
# Build the held-out split first, then the training split, with identical settings.
# reshape_factor=1 leaves every example in its (no_days, 96) day-by-reading layout.
shared_kwargs = dict(reshape_factor=1)
test_set = UMASS_Dataset_V2(train=False, **shared_kwargs)
train_set = UMASS_Dataset_V2(train=True, **shared_kwargs)

100%|█████████████████████████████████████████| 10/10 [00:00<00:00, 1975.56it/s]
100%|█████████████████████████████████████████| 90/90 [00:00<00:00, 2585.90it/s]


In [32]:
# Sanity check: the clean half of training example 2 should peak at 1 after min-max scaling.
torch.max(train_set[2][0])

tensor(1.)

In [29]:
# Sanity check: count NaNs in the normalised clean training tensor (expected 0).
torch.isnan(train_set.umass_data).sum()

tensor(0)

In [30]:
# Sanity check: count NaNs in the normalised clean test tensor (expected 0).
torch.isnan(test_set.umass_data).sum()

tensor(0)