In [1]:
from doctest import Example
import torch
from torch.utils.data import Dataset
import pandas as pd
import numpy as np
import logging, sys
from tqdm import tqdm

RuntimeError: KeyboardInterrupt: 

In [None]:
class CERN_Dataset_V2(Dataset):

    """
    CERN PyTorch Dataset Class

    Reads an already pre-processed CSV (columns ``metre_id`` and ``kwh``), cuts every
    meter's readings into fixed-size examples and optionally adds noise.
    Indexing the dataset returns a ``(clean_example, noisy_example)`` pair.

    Attributes
    ----------
    cern_data : Tensor
        processed CERN data
    cern_data_noisy : Tensor
        noise-added CERN data (the very same tensor object as ``cern_data`` when no noise is requested)
    mode : str
        method for which the CERN dataset has been processed ('fc' or 'cnn')
    eps : float
        value passed to the constructor; stored but not used by this class itself

    Methods
    -------
    preprocess_cern_fc()
        Initialization process of dataset for 'fc' fully-connected mode
    preprocess_cern_cnn()
        Initialization process of dataset for 'cnn' convolutional neural network mode
    """

    # Number of readings that make up one day (one row of a CNN example).
    READINGS_PER_DAY = 48

    def __init__(self, folder_path: str ='../../dataset/interim/', mode: str = 'cnn', # Required arguments
                 train: bool = True, no_days: int = 12, reshape_factor: int = 2, # CNN arguments
                 fc_days: int = 1, # FC arguments
                 N: int = None, seed: int = 0, eps: float = 1e-12,  # Other arguments
                 noise_type: str = 'none', noise_pct: float = 0.9): # Noise arguments

        """
        Parameters
        ----------
        folder_path : string
                Directory with all the files (processed). A trailing separator is optional.
        mode: string
                Either 'cnn' for convolutional usage or 'fc' for basic AE usage.
        train : bool
                Determines whether training or test dataset to be used (already preprocessed to save time)
        no_days : int (optional)
                'cnn' mode only. Number of days (i.e., rows in the matrix example)
        reshape_factor : int (optional)
                'cnn' mode only. Used by the original authors to achieve a square tensor:
                every day-row is split into `reshape_factor` rows. Must be a positive divisor of 48.
        fc_days : int (optional)
                'fc' mode only. Number of days flattened into a single example vector.
        N : int (optional)
                Keep only the first N examples, AFTER reshaping (deterministic, not a random sample).
        seed : int (optional)
                Seed for torch's global RNG (drives the Gaussian noise).
        eps : float (optional)
                For numerical stability in min-max normalization (only stored on the instance here).
        noise_type: str or None (optional) ('gauss', 'speckle', 'none', None)
                If, and what type, of noise to be added to dataset. None is treated like 'none'.
        noise_pct: float (optional)
                Parameter controlling how much noise to add (multiplies standard-normal noise)

        Raises
        ------
        ValueError
                Unknown mode, invalid reshape_factor, negative N, or N larger than the dataset.
        NotImplementedError
                'speckle' noise or any unknown noise type.
        """

        import os  # local import: keeps this class self-contained within the notebook

        # ---- Validate the cheap things first, before any file I/O or preprocessing ----
        if mode not in ('cnn', 'fc'):
            raise ValueError("Mode must be 'cnn' or 'fc'.")

        if noise_type is None:
            # The documented "no noise" value; normalise to the string form
            noise_type = 'none'
        if noise_type == 'speckle':
            raise NotImplementedError('Speckle noise not yet implemented')
        if noise_type not in ('gauss', 'none'):
            raise NotImplementedError('Noise selection has not been implemented.')

        if mode == 'cnn' and (reshape_factor < 1 or self.READINGS_PER_DAY % reshape_factor != 0):
            raise ValueError("reshape_factor must be a positive divisor of {}, got {}".format(
                self.READINGS_PER_DAY, reshape_factor))

        if N is not None and N < 0:
            raise ValueError("N must be non-negative, got {}".format(N))

        # Set seed (global torch RNG)
        torch.manual_seed(seed)

        # Kept so that `progress_apply` stays registered on pandas objects for any
        # other notebook code that relied on this constructor doing so.
        tqdm.pandas()
        self.eps = eps
        self.mode = mode

        # Note that both files have been directly preprocessed to reduce time spent reprocessing.
        file_name = 'cern_train_v2.csv' if train else 'cern_test_v2.csv'
        df = pd.read_csv(os.path.join(folder_path, file_name))

        if mode == 'cnn':

            # First reshape into Tensor of shape (no_examples, no_days, 48)
            examples = self.preprocess_cern_cnn(df, no_days)

            # Second reshape into Tensor of shape (no_examples, no_days * reshape_factor, 48 / reshape_factor)
            examples = examples.reshape(examples.size(0),
                                        examples.size(1) * reshape_factor,
                                        examples.size(2) // reshape_factor)

            # Unsqueeze channel 1 back out (1 filter)
            self.cern_data = examples.unsqueeze(1)

        else:

            # 'fc' mode: Tensor of shape (no_examples, fc_days * 48)
            self.cern_data = self.preprocess_cern_fc(df, fc_days)

        # If N is selected, keep a subset
        if N is not None:
            if N > self.cern_data.shape[0]:
                raise ValueError("Cannot exceed dataset size of {}".format(self.cern_data.size(0)))
            # NOTE: takes the FIRST N examples (kept from the original debug behaviour);
            # switch to a torch.randperm-based selection if a random subset is wanted.
            self.cern_data = self.cern_data[:N]

        # Add noise to dataset
        if noise_type == 'gauss':
            # Additive Gaussian noise, scaled by noise_pct
            noise = torch.randn(self.cern_data.size()) * noise_pct
            # Clamp between 0 and 1 (assumes the clean data lives in [0, 1])
            self.cern_data_noisy = torch.clamp(self.cern_data + noise, min = 0, max = 1)
        else:
            # No noise: the "noisy" view is simply the clean tensor
            self.cern_data_noisy = self.cern_data


    def __len__(self):
        """Number of examples in the dataset."""
        return self.cern_data.size(0)

    def _stack_meter_examples(self, df, example_shape):

        """
        Reshape every meter's `kwh` readings into examples and stack them.

        Parameters
        ----------
        df : DataFrame
                Must contain the columns 'metre_id' and 'kwh'.
        example_shape : tuple of int
                Shape of ONE example, e.g. (no_days, 48) or (fc_days * 48,).

        Returns
        -------
        Tensor (float32) of shape (no_examples, *example_shape); meters are visited in
        sorted 'metre_id' order. An empty frame gives a tensor with 0 examples.

        Raises
        ------
        RuntimeError
                (from torch) if a meter's number of readings is not a multiple of the example size.
        """

        chunks = []

        # Iterate explicitly instead of groupby().apply(): the per-group work is a pure
        # side effect, and collecting chunks lets us concatenate once instead of
        # re-allocating the whole result for every meter.
        for _, meter_df in tqdm(df.groupby('metre_id'), desc='Reshaping meters'):
            readings = torch.from_numpy(meter_df['kwh'].to_numpy()).float() # convert to Tensor
            chunks.append(readings.reshape(-1, *example_shape))

        if not chunks:
            return torch.zeros(0, *example_shape)

        return torch.cat(chunks)

    def preprocess_cern_fc(self, df, fc_days):

        """
        Build the 'fc' dataset: Tensor of shape (no_examples, fc_days * 48),
        one flattened vector of `fc_days` days per example.
        """

        return self._stack_meter_examples(df, (fc_days * self.READINGS_PER_DAY,))

    def preprocess_cern_cnn(self, df, no_days):

        """
        Build the 'cnn' dataset: Tensor of shape (no_examples, no_days, 48),
        one row per day and `no_days` rows per example.
        """

        return self._stack_meter_examples(df, (no_days, self.READINGS_PER_DAY))

    def __getitem__(self, idx):

        """Return the (clean, noisy) pair of examples stored at position `idx`."""

        example = self.cern_data[idx]
        noisy_example = self.cern_data_noisy[idx]

        return example, noisy_example

In [None]:
# Build the held-out and training datasets with the default 'cnn' mode.
# reshape_factor=1 leaves each example at (no_days, 48) instead of splitting the day rows.
test_set = CERN_Dataset_V2(reshape_factor=1, train=False)
train_set = CERN_Dataset_V2(reshape_factor=1, train=True)

In [None]:
# Display the processed training tensor; with the construction above (default 'cnn' mode,
# no_days=12, reshape_factor=1) its shape is (no_examples, 1, 12, 48).
train_set.cern_data