# Setup

## Library

In [1]:
import os
from pandas import read_csv
from numpy import dstack
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader
from swiss_library import dataset_loader, custom_normalize, custom_norm_transform, SWISS_dataset, Downstream_dataset

## Hyperparameters

In [2]:
# --- Paths ---
data_dir = "./UCI HAR Dataset/"   # root of the extracted UCI HAR dataset (holds train/ and test/)
save_dir = "./swiss/uci_har/"     # presumably the output directory; not referenced in the cells visible here

# --- Split / reproducibility ---
val_size = 0.2   # fraction of the training samples held out as the validation set
seed = 42        # used for the train/validation split and for the global RNGs seeded below

# --- Batch sizes ---
pre_batch_size = 512   # batch size of the pre-training DataLoader
batch_size = 512       # batch size of the train / validation / test DataLoaders

# Seed the global PyTorch and NumPy generators so that DataLoader shuffling
# (and any torch/NumPy-based randomness inside the dataset helpers) is
# repeatable after "Restart Kernel -> Run All". Without this, `seed` only
# controlled the train/validation split.
torch.manual_seed(seed)
np.random.seed(seed)

# Load Data

In [3]:
# NOTE(review): `sub` is not used in any cell visible here; it looks like an
# editor auto-import. Remove it once confirmed that no later cell needs it.
from re import sub


# File-name stems of the nine inertial-signal channels, in the order in which
# they are unpacked (and later stacked) below.
_SIGNAL_NAMES = (
    'body_acc_x', 'body_acc_y', 'body_acc_z',
    'body_gyro_x', 'body_gyro_y', 'body_gyro_z',
    'total_acc_x', 'total_acc_y', 'total_acc_z',
)


def _load_split(split):
    """Load the raw inertial signals and subject ids for one dataset split.

    Parameters
    ----------
    split : str
        Name of the split directory under ``data_dir`` ('train' or 'test');
        it is also the suffix of every file name inside that directory.

    Returns
    -------
    signals : list of numpy.ndarray
        One array per entry of ``_SIGNAL_NAMES``, in that order, as returned
        by ``np.loadtxt``.
    subjects : numpy.ndarray
        Contents of ``subject_<split>.txt`` as returned by ``np.loadtxt``.
    """
    # os.path.join avoids the doubled separator that plain string
    # concatenation produces when data_dir already ends with '/'.
    signal_dir = os.path.join(data_dir, split, 'Inertial Signals')
    signals = [
        np.loadtxt(os.path.join(signal_dir, f'{name}_{split}.txt'))
        for name in _SIGNAL_NAMES
    ]
    subjects = np.loadtxt(os.path.join(data_dir, split, f'subject_{split}.txt'))
    return signals, subjects


(train_acc_x, train_acc_y, train_acc_z,
 train_gyro_x, train_gyro_y, train_gyro_z,
 train_tot_acc_x, train_tot_acc_y, train_tot_acc_z), subject_train = _load_split('train')

(test_acc_x, test_acc_y, test_acc_z,
 test_gyro_x, test_gyro_y, test_gyro_z,
 test_tot_acc_x, test_tot_acc_y, test_tot_acc_z), subject_test = _load_split('test')

In [4]:
# Stack the nine signal channels along a new axis 1, so each split becomes a
# single array laid out as (n_samples, 9, n_timesteps).
train_data = np.stack((train_acc_x, train_acc_y, train_acc_z,
                       train_gyro_x, train_gyro_y, train_gyro_z,
                       train_tot_acc_x, train_tot_acc_y, train_tot_acc_z), axis=1)
X_test = np.stack((test_acc_x, test_acc_y, test_acc_z,
                   test_gyro_x, test_gyro_y, test_gyro_z,
                   test_tot_acc_x, test_tot_acc_y, test_tot_acc_z), axis=1)

# Labels: shift the class ids so that the smallest training class becomes 0.
# The offset is derived from the training labels only and applied unchanged to
# the test labels, so both splits always share one encoding (shifting each
# split by its own minimum would silently misalign them if a split happened
# to lack the lowest class).
train_labels = np.loadtxt(f'{data_dir}/train/y_train.txt')
y_test = np.loadtxt(f'{data_dir}/test/y_test.txt')
label_offset = np.min(train_labels)
train_labels -= label_offset
y_test -= label_offset

## Train & Validation Set

In [5]:
# Randomly split samples, labels and subject ids into train / validation
# parts; the three arrays are split with the same row selection.
# NOTE(review): `subject_train` is rebound to its own training subset here, so
# this cell cannot be re-run without first re-running the loading cell.
# NOTE(review): the split is per sample, not per subject — the same volunteers
# end up in both the training and the validation set.
X_train, X_val, y_train, y_val, subject_train, subject_val = train_test_split(
    train_data,
    train_labels,
    subject_train,
    test_size=val_size,
    random_state=seed,
)

# Show the shape of every resulting array.
tuple(
    part.shape
    for part in (X_train, X_val, y_train, y_val, subject_train, subject_val)
)

((5881, 9, 128), (1471, 9, 128), (5881,), (1471,), (5881,), (1471,))

In [6]:
# Shapes of the held-out test arrays (samples, labels, subject ids).
tuple(part.shape for part in (X_test, y_test, subject_test))

((2947, 9, 128), (2947,), (2947,))

## Make Data Loader

In [7]:
# Wrap each split with the project helper `dataset_loader`. Each call prints
# the number of observations, labels and volunteers for its split and returns
# three values: a container that is indexed by 'x' in the following cells,
# the observation count, and the list of volunteer ids in the split.
# NOTE(review): `train_data` is rebound here from the stacked NumPy array to
# the helper's return value, so the split cell above cannot be re-run after
# this cell without first re-running the stacking cell.
train_data, tr_obs, tr_v = dataset_loader(X_train, y_train, subject_train, 'Train')
val_data, val_obs, val_v = dataset_loader(X_val, y_val, subject_val, 'Validation')
test_data, te_obs, te_v = dataset_loader(X_test, y_test, subject_test, 'Test')

[Train Loader] Train observations : 5881
[Train Loader] Train labels : 5881
[Train Loader] Train volunteers : 21
[Validation Loader] Validation observations : 1471
[Validation Loader] Validation labels : 1471
[Validation Loader] Validation volunteers : 21
[Test Loader] Test observations : 2947
[Test Loader] Test labels : 2947
[Test Loader] Test volunteers : 9


## Normalization

In [8]:
# Fit the normalisation on the training split only, then reuse the returned
# statistics (`means`, `stds`) for the validation and test splits.
# NOTE(review): the 'x' entries are overwritten in place, so running this cell
# a second time would normalise already-normalised data.
normalized_train, means, stds = custom_normalize(train_data['x'])
train_data['x'] = normalized_train
for split_data in (val_data, test_data):
    split_data['x'] = custom_norm_transform(split_data['x'], means, stds)

In [9]:
# Sanity check: shapes of the normalised 'x' entries of all three splits.
tuple(split_data['x'].shape for split_data in (train_data, val_data, test_data))

(torch.Size([5881, 9, 128]),
 torch.Size([1471, 9, 128]),
 torch.Size([2947, 9, 128]))

In [10]:
# Overall number of observations and how they are distributed over the splits.
total_obs = sum((tr_obs, val_obs, te_obs))
print(f"Total obs : {total_obs} / Train : Valid : Test = {tr_obs} : {val_obs} : {te_obs}")

# Volunteer (subject) ids present in each split.
print(f'train volunteers : {tr_v}')
print(f'valid volunteers : {val_v}')
print(f'test volunteers : {te_v}')

Total obs : 10299 / Train : Valid : Test = 5881 : 1471 : 2947
train volunteers : [1, 3, 5, 6, 7, 8, 11, 14, 15, 16, 17, 19, 21, 22, 23, 25, 26, 27, 28, 29, 30]
valid volunteers : [1, 3, 5, 6, 7, 8, 11, 14, 15, 16, 17, 19, 21, 22, 23, 25, 26, 27, 28, 29, 30]
test volunteers : [2, 4, 9, 10, 12, 13, 18, 20, 24]


# Define Dataset

In [11]:
# Datasets. The pre-training set and the downstream training set are both
# built from the training split; validation and test get their own
# downstream datasets.
# NOTE(review): mask_type / masking_rate are passed straight to SWISS_dataset;
# their exact meaning is defined in swiss_library — consider moving them to
# the hyperparameter cell.
pretrain_set = SWISS_dataset(train_data, mask_type='random', masking_rate=0.1)
train_set = Downstream_dataset(train_data)
valid_set = Downstream_dataset(val_data)
test_set = Downstream_dataset(test_data)

# Loaders. Only the loaders used for optimisation are shuffled; the
# validation and test loaders iterate in a fixed order so that evaluation is
# deterministic and predictions line up with the dataset order.
pretrain_loader = DataLoader(pretrain_set, batch_size=pre_batch_size, shuffle=True)
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_set, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False)

# Save the DataLoader

In [2]:
# Configuration dictionary for the SWISS pre-trainer.
# 'gru_hid_dim', 'head_dim' and 'transformer_mlp_dim' only hold placeholder
# values in the literal; they are overwritten with derived values right below.
pretrain_dict = {
    'gru_hid_dim': 32,
    'gru_input_size': 11,
    'gru_layers': 1,
    'gru_dropout': 0.2,
    'bidirectional': True,
    'num_signals': 9, # different for each dataset
    'emb_dim': 32,
    'emb_dropout': 0.0,
    'depth': 2,
    'heads': 2,
    'head_dim': 0,
    'transformer_mlp_dim': 0,
    'dropout': 0.2,
    'signal_emb': True,
    'proj_hiddim': 512,
    'proj_dim': 256,
}

# Entries derived from the embedding size: the per-head width is emb_dim split
# evenly across the heads, the transformer MLP is four times as wide as the
# embedding, and both GRU sizes are tied to the embedding size.
pretrain_dict.update({
    'head_dim': pretrain_dict['emb_dim'] // pretrain_dict['heads'],
    'transformer_mlp_dim': 4 * pretrain_dict['emb_dim'],
    'gru_hid_dim': pretrain_dict['emb_dim'],
    'gru_emb_dim': pretrain_dict['emb_dim'],
})

# A single-layer GRU gets no dropout.
# NOTE(review): presumably because dropout between stacked recurrent layers
# has nothing to act on with one layer — confirm against the model code.
if pretrain_dict['gru_layers'] == 1:
    pretrain_dict['gru_dropout'] = 0.0