# Split data into train and test sets

Total data: 20*50 = 1000
    
Test data: 20*10 = 200

Train (including validation) data: 20*40 = 800

Validation: 20*10 = 200

## Step 1
Get test set

In [1]:
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset

# Seed NumPy's and PyTorch's global random number generators with the same fixed value.
seed = 1
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7f314c02c130>

In [2]:
# Directory holding the full, unsplit data files (ICUB_all.pt, Bio_all.pt, all_labels.npy).
all_data_dir = '../../data_folder_ruihan/'
# Directory that receives the train/test/validation split files written below.
data_save_dir = '../../data_folder_split/'

In [3]:
# Locations of the full iCub data, BioTac data and label files.
icub_path = all_data_dir + 'ICUB_all.pt'
bio_path = all_data_dir + 'Bio_all.pt'
labels_path = all_data_dir + 'all_labels.npy'

# The two data files are stored as torch tensors and converted to NumPy arrays.
X_icub = torch.load(icub_path).numpy()
X_bio = torch.load(bio_path).numpy()
Y = np.load(labels_path)

# Row positions of the samples; split together with the data so the same
# rows can later be selected from X_bio.
indices = np.arange(len(X_icub))

FileNotFoundError: [Errno 2] No such file or directory: '../../data_folder_ruihan//Bio_all.pt'

In [None]:
# Display the shapes of the two loaded arrays.
X_icub.shape, X_bio.shape

In [None]:
# Stratified split that holds out 20% of the samples as the test set.
# The row indices are split alongside the iCub data and the labels.
split_parts = train_test_split(
    X_icub,
    Y,
    indices,
    test_size=0.20,
    random_state=42,
    stratify=Y,
)
X_train_icub, X_test_icub, y_train, y_test, ind_train, ind_test = split_parts

In [None]:
# Select the same rows from the BioTac data so both sensors share one split.
X_train_bio, X_test_bio = X_bio[ind_train], X_bio[ind_test]

In [None]:
# Display the shapes of the train and test arrays for both sensors.
X_train_bio.shape, X_test_bio.shape, X_train_icub.shape, X_test_icub.shape

In [None]:
# Save the test split and the remaining "trainAll" split for both sensors and the labels.
# np.save appends the '.npy' extension to each file name.
for file_name, array in (
    ('icub_trainAll', X_train_icub),
    ('icub_test', X_test_icub),
    ('bio_trainAll', X_train_bio),
    ('bio_test', X_test_bio),
    ('labels_trainAll', y_train),
    ('labels_test', y_test),
):
    np.save(data_save_dir + file_name, array)

## Step 2 

Split the training data into 4 stratified folds; each fold provides one train/validation pair.

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
# Shuffled, stratified 4-fold splitter with a fixed random state.
skf = StratifiedKFold(
    n_splits=4,
    shuffle=True,
    random_state=42,
)

In [None]:
# Write one train/validation file set per fold; `count` is the fold number
# that ends each file name.
count = 0
for train_index, test_index in skf.split(X_train_bio, y_train):
    fold_tag = str(count)

    # BioTac first, then iCub: both are cut with the same fold indices.
    for train_prefix, val_prefix, sensor_data in (
        ('bio_train_', 'bio_val_', X_train_bio),
        ('icub_train_', 'icub_val_', X_train_icub),
    ):
        train_ = sensor_data[train_index]
        val_ = sensor_data[test_index]
        np.save(data_save_dir + train_prefix + fold_tag, train_)
        np.save(data_save_dir + val_prefix + fold_tag, val_)

    # Labels belonging to the same fold.
    np.save(data_save_dir + 'labels_train_' + fold_tag, y_train[train_index])
    np.save(data_save_dir + 'labels_val_' + fold_tag, y_train[test_index])

    count += 1

## Step 3

Prepare utility functions

In [None]:
# Utility functions for loading the split data are defined below.
# Directory containing the split .npy files.
data_save_dir = '../../data_folder_split/'

def get_ohe(_Y, num_class=20):
    """Return a one-hot encoding of the labels in ``_Y``.

    Args:
        _Y: Labels exposing ``.shape``; each entry is converted with
            ``int()`` and used as a column index.
        num_class: Number of columns in the encoding.

    Returns:
        NumPy array of shape ``(_Y.shape[0], num_class)`` that is zero
        everywhere except for a 1 at each row's label column.
    """
    one_hot = np.zeros([_Y.shape[0], num_class])
    for row, label in enumerate(_Y):
        one_hot[row, int(label)] = 1
    return one_hot

def get_trainValData(path, k=0, spike_ready=True):
    """Load the train/validation arrays of fold ``k`` as float tensors.

    Args:
        path: Directory containing the ``*_train_<k>.npy`` and
            ``*_val_<k>.npy`` files; a trailing separator is optional.
        k: Fold number that ends each file name.
        spike_ready: When false, return the tensors as loaded. When true,
            also reshape the inputs to 5-D and build one-hot targets.

    Returns:
        If ``spike_ready`` is false:
            ``(X_train_icub, X_val_icub, X_train_bio, X_val_bio, y_train, y_val)``.
        Otherwise:
            ``(X_train_icub, X_val_icub, X_train_bio, X_val_bio,
            target_class_train, target_class_val, y_train, y_val)``, where
            the iCub inputs are reshaped to ``(N, 60, 1, 1, T)``, the BioTac
            inputs to ``(N, C, 1, 1, T)`` (``C`` and ``T`` being their second
            and last dimensions) and the targets to ``(N, 20, 1, 1, 1)``.
    """
    import os  # local import: only needed to join the directory and file name

    num_class = 20

    def _load(stem):
        # Read from the directory the caller passed in, not from a module global.
        file_name = stem + str(k) + '.npy'
        return torch.FloatTensor(np.load(os.path.join(path, file_name)))

    X_train_icub = _load('icub_train_')
    X_val_icub = _load('icub_val_')
    X_train_bio = _load('bio_train_')
    X_val_bio = _load('bio_val_')
    y_train = _load('labels_train_')
    y_val = _load('labels_val_')

    if not spike_ready:
        return X_train_icub, X_val_icub, X_train_bio, X_val_bio, y_train, y_val

    target_class_train = torch.FloatTensor(get_ohe(y_train, num_class).reshape(-1, num_class, 1, 1, 1))
    target_class_val = torch.FloatTensor(get_ohe(y_val, num_class).reshape(-1, num_class, 1, 1, 1))

    # iCub: everything between the sample and last dimensions is flattened to 60.
    X_train_icub = X_train_icub.reshape(X_train_icub.shape[0], 60, 1, 1, X_train_icub.shape[-1])
    X_val_icub = X_val_icub.reshape(X_val_icub.shape[0], 60, 1, 1, X_val_icub.shape[-1])

    # BioTac: keep the second dimension and insert two singleton dimensions.
    X_train_bio = X_train_bio.reshape(X_train_bio.shape[0], X_train_bio.shape[1], 1, 1, X_train_bio.shape[-1])
    X_val_bio = X_val_bio.reshape(X_val_bio.shape[0], X_val_bio.shape[1], 1, 1, X_val_bio.shape[-1])

    return X_train_icub, X_val_icub, X_train_bio, X_val_bio, target_class_train, target_class_val, y_train, y_val

In [None]:
def get_testData(path, spike_ready=True):
    """Load the held-out test arrays as float tensors.

    Reads ``icub_test.npy``, ``bio_test.npy`` and ``labels_test.npy``; the
    test split is saved once under these names and has no fold number.

    Args:
        path: Directory containing the three test files; a trailing
            separator is optional.
        spike_ready: When false, return the tensors as loaded. When true,
            also reshape the inputs to 5-D and build one-hot targets.

    Returns:
        If ``spike_ready`` is false: ``(X_test_icub, X_test_bio, y_test)``.
        Otherwise: ``(X_test_icub, X_test_bio, target_class_test, y_test)``,
        where the iCub input is reshaped to ``(N, 60, 1, 1, T)``, the BioTac
        input to ``(N, C, 1, 1, T)`` and the targets to ``(N, 20, 1, 1, 1)``.
    """
    import os  # local import: only needed to join the directory and file name

    num_class = 20

    def _load(file_name):
        # All three files come from the caller-supplied directory.
        return torch.FloatTensor(np.load(os.path.join(path, file_name)))

    X_test_icub = _load('icub_test.npy')
    X_test_bio = _load('bio_test.npy')
    y_test = _load('labels_test.npy')

    if not spike_ready:
        return X_test_icub, X_test_bio, y_test

    X_test_icub = X_test_icub.reshape(X_test_icub.shape[0], 60, 1, 1, X_test_icub.shape[-1])
    X_test_bio = X_test_bio.reshape(X_test_bio.shape[0], X_test_bio.shape[1], 1, 1, X_test_bio.shape[-1])
    target_class_test = torch.FloatTensor(get_ohe(y_test, num_class).reshape(-1, num_class, 1, 1, 1))

    return X_test_icub, X_test_bio, target_class_test, y_test

In [None]:
def get_trainValLoader(path, k=0):
    """Build data loaders for the train and validation data of fold ``k``.

    Args:
        path: Directory passed through to ``get_trainValData``.
        k: Fold number passed through to ``get_trainValData``.

    Returns:
        ``(train_loader, val_loader, train_dataset, val_dataset)``. Each
        dataset yields ``(icub, bio, one_hot_target, label)`` items; both
        loaders shuffle and use a batch size of 8.
    """
    (
        X_train_icub,
        X_val_icub,
        X_train_bio,
        X_val_bio,
        target_class_train,
        target_class_val,
        y_train,
        y_val,
    ) = get_trainValData(path, k)

    train_dataset = torch.utils.data.TensorDataset(X_train_icub, X_train_bio, target_class_train, y_train)
    train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=8)

    # Validation samples are paired with the validation targets; the training
    # targets have a different number of rows, which TensorDataset rejects.
    val_dataset = torch.utils.data.TensorDataset(X_val_icub, X_val_bio, target_class_val, y_val)
    val_loader = torch.utils.data.DataLoader(val_dataset, shuffle=True, batch_size=8)

    # The test split is not loaded here: it is not part of the return value.
    return train_loader, val_loader, train_dataset, val_dataset

def get_testLoader(path):
    """Build a data loader for the test data.

    Args:
        path: Directory passed through to ``get_testData``.

    Returns:
        ``(test_loader, test_dataset)``. The dataset yields
        ``(icub, bio, one_hot_target, label)`` items; the loader shuffles
        and uses a batch size of 8.
    """
    loaded = get_testData(path)
    X_test_icub, X_test_bio, target_class_test, y_test = loaded

    test_dataset = torch.utils.data.TensorDataset(
        X_test_icub,
        X_test_bio,
        target_class_test,
        y_test,
    )
    test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=8,
        shuffle=True,
    )
    return test_loader, test_dataset