In [2]:
import sys 
if '..' not in sys.path:
    sys.path.append('../')
from utils import *

In [100]:
from collections import defaultdict
from functools import partial
from sklearn.preprocessing import StandardScaler
import h5py
import numpy as np
import os

import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

def get_all_datasets(hdf_file):
    """
    Function to return all datasets from an HDF5 file.

    Args:
    hdf_file : h5py.File object

    Returns:
    datasets : dict
        Dictionary with dataset names as keys and numpy arrays as values.
    """
    datasets = {}

    def collect_datasets(name, obj):
        if isinstance(obj, h5py.Dataset):
            datasets[name] = obj[:]

    hdf_file.visititems(collect_datasets)
    return datasets

def nice_dict_contents(data_dict, print_keys=False):
    """
    Function to print the contents of the dictionary in a hierarchical manner.
    
    Args:
    data_dict : dict
        Dictionary containing the data.
    print_keys : bool, optional
        Flag indicating whether to print the keys. Defaults to False.
    """
    outer_keys = defaultdict(lambda: defaultdict(list))
    
    for key in data_dict.keys():
        split_key = key.split('/')
        
        if len(split_key) == 2:
            outer, inner = split_key
            outer_keys[outer][inner]
        else:
            outer, middle, inner = split_key
            outer_keys[outer][middle].append(inner)
            
    if print_keys:
        print('CONTENTS OF HDF5 FILE:')
        for outer_key, outer_value in outer_keys.items():
            print(outer_key)
            for inner_key, inner_values in outer_value.items():
                print('\t', inner_key)
                print('\t\t', ', '.join(inner_values))

def load_data(file_path, verbose=False):
    """
    Function to load the data from a given file path.

    Args:
    file_path : str
        Path to the data file.
    verbose : bool, optional
        If True, print the contents of the file.

    Returns:
    data_dict : dict
        Dictionary with dataset names as keys and numpy arrays as values.
    """
    with h5py.File(file_path, 'r') as hdf_file:
        data_dict = get_all_datasets(hdf_file)
    
    if verbose:
        nice_dict_contents(data_dict, print_keys=True)

    return data_dict

def create_sequences(X, y, seq_length):
    """
    Function to create sequences from the input data.

    Args:
    X : numpy array
        Input data.
    y : numpy array
        Target data.
    seq_length : int
        Sequence length.

    Returns:
    X_seq, y_seq : numpy arrays
        Sequenced input and target data.
    """
    X_seq = [X[i:i+seq_length] for i in range(0, len(X) - seq_length + 1, seq_length)]
    y_seq = [y[i:i+seq_length] for i in range(0, len(y) - seq_length + 1, seq_length)]
    return np.array(X_seq), np.array(y_seq)

def load_much_data(N_train, N_test, folder_path, columns_X, columns_y, seq_length=1, verbose=False, num_datasets=1):
    """
    Function to load data from multiple HDF5 files.

    Args:
    Ntrain : int
        Number of training instances.
    Nval : int
        Number of validation instances.
    folder_path : str
        Path to the data directory.
    columns : list
        List of column names.

    Returns:
    data_dict : dict
        Dictionary with dataset names as keys and numpy arrays as values.
    """
    data = {
        'X-train': {key: None for key in columns_X},
        'y-train': {key: None for key in columns_y},
        'X-test': {key: None for key in columns_X},
        'y-test': {key: None for key in columns_y},
    }
    Nloaded_points = 0
    # get control of directories
    dirs = os.listdir(folder_path)
    if '.DS_Store' in dirs: dirs.remove('.DS_Store') # remove .DS_Store if present
    test_dir = dirs[:1]
    train_dirs = dirs[1:num_datasets+1]
    print(f'using {test_dir} for testing and the remaining ({len(train_dirs)}) for training')
    

    N_points_per_dir = max(int(N_train/len(train_dirs)), seq_length)
    N_points_per_dir = N_points_per_dir - N_points_per_dir % seq_length
    n_dirs_to_use = int(N_train/N_points_per_dir)
    train_dirs = train_dirs[:n_dirs_to_use+1]
    N_points = N_points_per_dir * len(train_dirs)
    
    print(f'Loading a total of {N_train}, with {N_points_per_dir} points from each of {len(train_dirs)} directories')
    for dir in dirs:
        file_path = os.path.join(folder_path, dir, 'data.hdf5')
        if verbose: print('Loading file:', file_path)
        with h5py.File(file_path, "r") as hdf_file:
            new_data_dict = load_data(file_path)
            Nloaded_points += N_points_per_dir
        if dir in test_dir:

            for key in columns_X:
                if data['X-test'][key] is None:
                    data['X-test'][key] = new_data_dict[key][:N_test]
                else:
                    data['X-test'][key] = np.vstack([data['X-test'][key], new_data_dict[key][:N_test]])
            for key in columns_y:
                if data['y-test'][key] is None:
                    data['y-test'][key] = new_data_dict[key][:N_test]
                else:
                    data['y-test'][key] = np.vstack([data['y-test'][key], new_data_dict[key][:N_test]])
        elif dir in train_dirs:
            for key in columns_X:
                if data['X-train'][key] is None:
                    data['X-train'][key] = new_data_dict[key][:N_points_per_dir]
                else:
                    data['X-train'][key] = np.vstack([data['X-train'][key], new_data_dict[key][:N_points_per_dir]])
            for key in columns_y:
                if data['y-train'][key] is None:
                    data['y-train'][key] = new_data_dict[key][:N_points_per_dir]
                else:
                    data['y-train'][key] = np.vstack([data['y-train'][key], new_data_dict[key][:N_points_per_dir]])
        if Nloaded_points >= N_points:
            break

    return data


def normalize_features(X, scaler=None):
    """
    Function to normalize the features.

    Args:
    X : numpy array
        Input data.

    Returns:
    X_normalized : numpy array
        Normalized input data.
    scaler : sklearn.preprocessing.StandardScaler
        The scaler used for normalization. Useful for inverse transformation.
    """
    if scaler is None:
        scaler = StandardScaler()
    X_normalized = scaler.fit_transform(X)
    return X_normalized, scaler

def load_split_data(folder_path='C:\\Users\\Simon Andersen\\Documents\\Uni\\KS6\\AppliedML\\Project 2\\train_dataset_1', **kwargs):
    """
    Function to load, split, and preprocess the data.

    Args:
    folder_path : str
        Path to the data directory.
    **kwargs : other parameters to control the data loading and processing.
            - N_points: number of training instances
            - seq_len: sequence length, must be >= 1
            - input: list of input features, e.g. ['pose/tango_ori', 'pose/tango_pos', 'synced/gyro']
            - output: list of output features, e.g. ['pose/tango_ori']
            - normalize: boolean, whether to normalize the data or not.
            - shuffle: boolean, whether to shuffle the data or not.
            - verbose: boolean, whether to print information about the data or not.

    Returns:
    X_reshaped : numpy array
        The processed and reshaped input data.
    y_reshaped : numpy array
        The processed and reshaped target data.
    """
    params = {'N_train': 1000,'N_test': 100, 'seq_len': 10, 'input': [], 'output': [], 'normalize': False, 'shuffle': True, 'verbose': True, 'num_datasets':1}
    params.update(kwargs)

    allowed_columns = [
        'pose/ekf_ori', 'pose/tango_ori', 'pose/tango_pos',
        'synced/acce', 'synced/game_rv', 'synced/grav', 'synced/gyro', 'synced/gyro_uncalib', 'synced/linacce', 'synced/magnet', 'synced/rv',
        'raw/imu/acce', 'raw/imu/game_rv', 'raw/imu/gps', 'raw/imu/gravity', 'raw/imu/gyro', 'raw/imu/gyro_uncalib', 'raw/imu/linacce', 'raw/imu/magnet', 'raw/imu/magnetic_rv', 'raw/imu/pressure', 'raw/imu/rv', 'raw/imu/step', 'raw/imu/wifi_address', 'raw/imu/wifi_values',
        'raw/tango/acce', 'raw/tango/game_rv', 'raw/tango/gps', 'raw/tango/gravity', 'raw/tango/gyro', 'raw/tango/gyro_uncalib', 'raw/tango/linacce', 'raw/tango/magnet', 'raw/tango/magnetic_rv', 'raw/tango/pressure', 'raw/tango/rv', 'raw/tango/step', 'raw/tango/tango_adf_pose', 'raw/tango/tango_pose', 'raw/tango/wifi_address', 'raw/tango/wifi_values',
    ]

    


    # make sure columns are in allowed_columns
    columns = params['output'] + params['input']
    for column in columns:
        if column not in allowed_columns:
            raise NameError(f'ERROR: Column "{column}" not in allowed columns: {allowed_columns}')

    data = load_much_data(folder_path=folder_path, 
                            columns_X=params['input'], columns_y=params['output'], 
                            N_train=params['N_train'], N_test=params['N_test'],
                            verbose=params['verbose'], 
                            seq_length=params['seq_len'], 
                            num_datasets=params['num_datasets'])
    
    

    X_train = np.hstack([data['X-train'][key] for key in params['input']])
    y_train = np.hstack([data['y-train'][key] for key in params['output']])
    X_test = np.hstack([data['X-test'][key] for key in params['input']])
    y_test = np.hstack([data['y-test'][key] for key in params['output']])
    if params['normalize']:
        X_train, scaler = normalize_features(X_train)
        # use the same scaler for test data
        X_test, _ = normalize_features(X_test, scaler=scaler)

        
    X_train_reshaped, y_train_reshaped = create_sequences(X_train, y_train, seq_length=params['seq_len'])
    X_test_reshaped, y_test_reshaped = create_sequences(X_test, y_test, seq_length=params['seq_len'])

    return X_train_reshaped, y_train_reshaped, X_test_reshaped, y_test_reshaped


In [103]:
folder_path = '/Users/antongolles/Documents/uni/masters/myMasters/applied_machine_learning/inertial_navigation_transformer/data/data_from_RoNIN/train_dataset_1/'

params = {'N_train': 2000, 'N_test':500, 'seq_len': 15, 
          'input': ['pose/tango_ori', 'pose/tango_pos', 'synced/gyro'], 
          'output': ['pose/tango_ori'], 
          'normalize': False, 'verbose': False, 'num_datasets':6}
X_train, y_train, X_test, y_test = load_split_data(folder_path=folder_path, **params)


using ['a017_1'] for testing and the remaining (6) for training
Loading a total of 2000, with 330 points from each of 6 directories


In [102]:
print('X train shape: ', X_train.shape, 'y train shape: ', y_train.shape, 'X test shape: ', X_test.shape, 'y test shape: ', y_test.shape)

X train shape:  (36, 11, 10) y train shape:  (36, 11, 4) X test shape:  (9, 11, 10) y test shape:  (9, 11, 4)


In [131]:
def create_sequences(X, y, seq_length, overlap=1):
    """
    Function to create sequences from the input data.

    Args:
    X : numpy array
        Input data.
    y : numpy array
        Target data.
    seq_length : int
        Sequence length.
    overlap : int, optional
        Overlap between sequences. Defaults to 1.

    Returns:
    X_seq, y_seq : numpy arrays
        Sequenced input and target data.
    """
    X_seq = []
    y_seq = []
    for i in range(0, len(X) - seq_length + 1, seq_length-overlap):
        X_seq.append(X[i:i+seq_length])
        y_seq.append(y[i:i+seq_length])
    return np.array(X_seq), np.array(y_seq)

In [137]:
X = np.arange(27).reshape(-1,3)
y = np.arange(27).reshape(-1,1)

X_seq, y_seq = create_sequences(X, y, seq_length=4, overlap=1)
print('X_seq: ', X_seq)

X_seq:  [[[ 0  1  2]
  [ 3  4  5]
  [ 6  7  8]
  [ 9 10 11]]

 [[ 9 10 11]
  [12 13 14]
  [15 16 17]
  [18 19 20]]]
