In [7]:
import numpy as np
import torch

# add the path to my packages to system paths so they can be imported
import sys
# sys.path.append('/home/yasamanparhizkar/Documents/yorku/01_thesis/code/my_packages')
sys.path.append('F:\MAScThesis\code\my_packages')
# sys.path.append('/home/yasamanparhizkar/Documents/thesis/code/my_packages')

import data_handler_01 as dh

# Load spike data

Spike data shape:  (297, 1141, 113) $\implies$ (movie repeats, frames/time, neurons)
<br>
Labels are 1 (= spike) or -1 (= no spike).

In [2]:
# load all spike data from file
spikes_dp = '../../data/original_files/spikes.csv'
binned_data = np.loadtxt(spikes_dp, delimiter=',')
binned_data = binned_data.reshape(binned_data.shape[0], 1141, 113)
binned_data = binned_data * 2 - 1     # turn labels from 0,1 to -1,1

I_order_10 = [54, 35, 10, 60, 74, 9, 61, 56, 91, 104]

## Group all 113 neurons

This will create a more balanced dataset which is presumabley easier to solve.
<br>
Grouped data shape:  (297, 1141, 1) $\implies$ (movie repeats, frames/time, group)

In [3]:
# group all neurons together
grouped_data = np.zeros((297, 1141, 1))
for trial in range(297):
    for frame in range(1141):
        grouped_data[trial, frame, :] = 2 * int((binned_data[trial, frame, :] == 1).any()) - 1

In [4]:
# print some statistics
print('grouped_data.shape = ', grouped_data.shape)

avg_spike_perc = 0
print('trial #    | percentage belonging to class 1')
print('---------------------------------------------')
for trial in range(10):
    pers = dh.class_percentages(grouped_data[trial, :, :].reshape(-1), [-1, 1])
    avg_spike_perc += pers[1]
    print('trial #{:3} | {:.2f} %'.format(trial, pers[1]))

avg_spike_perc /= 10
print('---------------------------------------------')
print('AVERAGE     | {:.2f} %'.format(avg_spike_perc))

total_perc = np.sum(grouped_data == 1) *100 /(grouped_data.shape[0] * grouped_data.shape[1])
print('---------------------------------------------')
print('{:.2f} % of the whole data belongs to class 1.'.format(total_perc))

grouped_data.shape =  (297, 1141, 1)
trial #    | percentage belonging to class 1
---------------------------------------------
trial #  0 | 66.26 %
trial #  1 | 69.06 %
trial #  2 | 67.92 %
trial #  3 | 71.08 %
trial #  4 | 68.97 %
trial #  5 | 68.27 %
trial #  6 | 66.87 %
trial #  7 | 65.82 %
trial #  8 | 67.66 %
trial #  9 | 68.19 %
---------------------------------------------
AVERAGE     | 68.01 %
---------------------------------------------
68.47 % of the whole data belongs to class 1.


# Load single datapoints

In [5]:
# utility function to use when creating train and val datasets
def datapoint(index, features_dp, spike_data, group_id, transform=None):
    """
    Return a single datapoint consisting of (feature vector, label) 
    based on the extended index system of the whole dataset (297 repeats of a 1141-frame movie); 
    for example, the 6th frame of the 7th repeat is indexed 7*1141+5. 
    In this system, indices only move forward after repeats, so they represent time in a sense.
    Acceptable index range is batch_sz-1 to 1141*297-1.
      
    Inputs: index, features_dp, spike_data, group_id
    index - chosen datapoint's index
    features_dp - path to where feature vectors are stored.
    spike_data  - ndarray containing spike data.
                  the array's shape is (297 x 1141 x m) where m is the number of subgroups of neurons.
                  subgroups can be a single neuron or as large as all 113 neurons.
    group_id    - index of the chosen subgroup of neurons which is being considered
    transform   - func. applied to the original feature vector (defult: None, no transform is applied)
            
    
    Output: fv, lbl
    fv  - torch tensor representing the selected time bin's feature vector
    lbl - the selected time bin's label
    """
    
    trial = index//1141
    frame = index%1141
    fv = torch.load(features_dp+'fv_'+str(frame)+'.pt')
    if transform is not None:
        fv = transform(fv)
    lbl = spike_data[trial, frame, group_id]
    
    return fv, lbl

## last layer features & single-neuron spikes

In [8]:
# chosen neuron is conveyed via 'group_id' in data_params
def transform(fv):
    """
    Transform to be applied on feature vectors.
    
    Input: fv
    fv - 1xDf torch tensor representing a feature vector
    
    Output: fvv
    fvv - 1xDf' torch tensor representing the transformed feature vector
    """
    
    # for faster run and less memory usage
    fvv = fv[:, ::10]
    
    # for numerical stability during GD
    # fvv = fvv * 10
    
    return fvv

data_params = {'func': datapoint, \
               'features_dp': '../../data/slowfast_4608/', \
               'spike_data': binned_data, \
               'group_id': I_order_10[0], \
               'transform': transform}

datapoint_func = data_params['func']
features_dp = data_params['features_dp']
spike_data = data_params['spike_data']
group_id = data_params['group_id']
transform = data_params['transform']
index = 1*1141+5

fv, lbl = datapoint_func(index, features_dp, spike_data, group_id, transform)
print('datapoint #{}:'.format(index))
print('label = ', lbl)
print('feature vector shape = ', fv.shape)
# print('feature vector = ', fv)

datapoint #1146:
label =  -1.0
feature vector shape =  torch.Size([1, 461])


## first layer features & grouped-neurons spikes

In [9]:
data_params = {'func': datapoint, \
               'features_dp': '../../data/slowfast_4732/', \
               'spike_data': grouped_data, \
               'group_id': 0, \
               'transform': transform}

datapoint_func = data_params['func']
features_dp = data_params['features_dp']
spike_data = data_params['spike_data']
group_id = data_params['group_id']
transform = data_params['transform']
index = 1*1141+5

fv, lbl = datapoint_func(index, features_dp, spike_data, group_id, transform)
print('datapoint #{}:'.format(index))
print('label = ', lbl)
print('feature vector shape = ', fv.shape)
# print('feature vector = ', fv)

datapoint #1146:
label =  1.0
feature vector shape =  torch.Size([1, 474])


# Create training and validation datasets

In [10]:
def update_set(samples, data_params):
    """
    Update a set (either training or validation) based on a new list of datapoints' indices.
    
    Input: samples, data_params
    samples - list of datapoints' indices (original indices representing time)
    data_params   -
        func - funtion which returns a datapoint (fv, lbl) based on its index
        features_dp - path to where feature vectors are stored
        spike_data - (297 x 1141 x m)-shaped array where m is the number of subgroups of neurons.
        group_id - index of the chosen subgroup of neurons which is being considered
        transform - func. applied to the original feature vector (defult: None, no transform is applied)
        
    Output: dess, lbls
    dess - NxD matrix of feature vectors of N datapoints
    lbls - Nx1 vector of corresponding labels of said datapoints
    """
    
    datapoint = data_params['func']
    features_dp = data_params['features_dp']
    spike_data = data_params['spike_data']
    group_id = data_params['group_id']
    transform = data_params['transform'] if 'transform' in data_params else None
    
    dess = []
    lbls = []
    for index in samples:
        fv, lbl = datapoint(index, features_dp, spike_data, group_id, transform)
        dess.append(fv)
        lbls.append(lbl)
    dess = torch.cat(dess)
    dess = dess.detach().numpy()
    lbls = np.array(lbls)
    
    return dess, lbls

In [11]:
def random_train_val(train_num, val_num, ind_min, ind_max, data_params, seed=None):
    """
    Choose random datapoints to form training and validation datasets. The two sets do not overlap.
    Note: since datapoints are selected randomly, their new indices do NOT represent time anymore.
    
    Input: train_num, val_num, ind_min, ind_max, data_params, seed
    train_num     - size of the training datase
    val_num       - size of the validation dataset
    ind_min       - minimum possible datapoint index (acceptable >= batch_sz-1)
    ind_max       - maximum possible datapoint index (acceptable < 297*1141)
    data_params   -
        func - funtion which returns a datapoint (fv, lbl) based on its index
        features_dp - path to where feature vectors are stored
        spike_data - (297 x 1141 x m)-shaped array where m is the number of subgroups of neurons.
        group_id - index of the chosen subgroup of neurons which is being considered
        transform - func. applied to the original feature vector (defult: None, no transform is applied)
    seed - for random selection of datapoints (default: None, machine chosen seed is used)
    
    Output: train_num, val_num, train_data, val_data
    train_num  - number of training datapoints
    val_num    - number of validation datapoints
    train_data - 
        des   - NxD numpy array of feature vectors
        lbls  - Nx1 numpy array of corresponding labels
        smpls - list of indices of chosen datapoints, original indices which represent time
    val_data  - 
        des   - NxD numpy array of feature vectors
        lbls  - Nx1 numpy array of corresponding labels
        smpls - list of indices of chosen datapoints, original indices which represent time
    """
    
    data_num = ind_max - ind_min + 1
    train_num = min(train_num, data_num)
    val_num   = min(val_num, data_num-train_num)
    
    # select indices of datapoints randomly
    rng = np.random.default_rng(seed)
    samples = rng.choice(np.arange(ind_min, ind_max+1), size=(train_num+val_num), replace=False)
    train_smpls = samples[:train_num]
    val_smpls   = samples[train_num:]
    
    # get feature vectors and labels corresponding to chosen indices
    train_dess, train_lbls = update_set(train_smpls, data_params)
    val_dess, val_lbls = update_set(val_smpls, data_params)
    
    train_data = {'des': train_dess, 'lbls': train_lbls, 'smpls': train_smpls}
    val_data   = {'des': val_dess, 'lbls': val_lbls, 'smpls': val_smpls}
    
    return train_num, val_num, train_data, val_data   

In [12]:
# only consider the second trial
ind_min = 1*1141+0
ind_max = 2*1141-1
# train_num = int(data_num*0.8)
# val_num = data_num - train_num
train_num = 10
val_num = 10

def transform(fv):
    """
    Transform to be applied on feature vectors.
    
    Input: fv
    fv - 1xDf torch tensor representing a feature vector
    
    Output: fvv
    fvv - 1xDf' torch tensor representing the transformed feature vector
    """
    
    # for faster run and less memory usage
    fvv = fv[:, ::10]
    
    # for numerical stability during GD
    # fvv = fvv * 10
    
    return fvv

data_params = {'func': datapoint, 'features_dp': '../../data/slowfast_4732/', \
               'spike_data': grouped_data, 'group_id': 0, 'transform': transform}

train_num, val_num, train_data, val_data = \
random_train_val(train_num, val_num, ind_min, ind_max, data_params, seed=1342)

# show statistics
print('train_num = ', train_num, ', val_num = ', val_num)
print('training data contains {} points ({:.2f}%) of label 1.'
      .format(np.sum(train_data['lbls'] == 1), np.sum(train_data['lbls'] == 1)*100/train_num))
print('validation data contains {} points ({:.2f}%) of label 1.'
      .format(np.sum(val_data['lbls'] == 1), np.sum(val_data['lbls'] == 1)*100/val_num))

# print('train_smpls = ', train_data['smpls'], '\nval_smpls = ', val_data['smpls'])
# print('train_lbls = ', train_data['lbls'], '\nval_lbls = ', val_data['lbls'])
# print('train_des = ', train_data['des'], '\nval_des = ', val_data['des'])

train_num =  10 , val_num =  10
training data contains 7 points (70.00%) of label 1.
validation data contains 6 points (60.00%) of label 1.


## snippet to update a set (training or validation)

In [13]:
def update_indices(num, ind_min, ind_max, minus_set, seed=None):
    """
    Update the choice of datapoints for a specific set (either training or validation).
    Do not consider indices in the minus_set as options; 
    this prevents overlap between the previous and the new sets, or between the training and validation sets.
    
    Input: num, ind_min, ind_max, minus_set, seed=None
    num - number of datapoints in the final set
    ind_min - minimum possible datapoint index (acceptable >= batch_sz-1)
    ind_max - maximum possible datapoint index (acceptable < 297*1141)
    minus_set - set of discarded indices (AKA overlapping indices)
    seed - for random selection of datapoints (default: None, machine chosen seed is used)
    
    Output: num, samples
    num - number of chosen datapoints
    samples - list of chosen datapoints' indices
    """
    
    # remove overlapping indices from options
    options = np.arange(ind_min, ind_max+1)
    keeplist = [True] * len(options)
    for index in minus_set:
        keeplist = np.logical_and(keeplist, (options != index))
    options = options[keeplist]
    
    # user error: more datapoints are requested that available options
    num = min(num, len(options))
    
    # select randomly
    rng = np.random.default_rng(seed)
    samples = rng.choice(options, size=num, replace=False)
    
    return num, samples

In [14]:
# update the validation set, so that it doesn't overlap with the training or the previous validation set
val_num = 15
minus_set = np.append(val_data['smpls'], train_data['smpls'])
val_num, val_smpls = update_indices(val_num, ind_min, ind_max, minus_set, seed=None)
val_dess, val_lbls = update_set(val_smpls, data_params)
val_data   = {'des': val_dess, 'lbls': val_lbls, 'smpls': val_smpls}

# show statistics
print('new val_num = ', val_num)
print('validation data contains {} points ({:.2f}%) of label 1.'
      .format(np.sum(val_data['lbls'] == 1), np.sum(val_data['lbls'] == 1)*100/val_num))
# print('val_smpls = ', val_data['smpls'])
# print('val_lbls = ', val_data['lbls'])
# print('val_des = ', val_data['des'])

new val_num =  15
validation data contains 9 points (60.00%) of label 1.


In [15]:
val_data['des'].shape

(15, 474)

## Normalize feature vectors

Taken from 'code/03_mnist/sift_on_mnist/sift_on_mnist_06.ipynb' with small changes.

In [16]:
def normalize(dess, feature_nrm=1, node_nrm=1):
    """
    Normalize feature vectors.
    Inputs: dess, feature_nrm, node_nrm
    dess - NxD array of features for all datapoints.
    feature_nrm - final norm by feature (columns of dess)
    node_nrm - final norm by datapoint/node (rows of dess)
    
    Outputs: dess_nrm
    dess_nrm - NxD array of normalized features.
    """
    # method 2: double normalization
    # step 1 - feature-wise: subtract mean and divide by standard deviation of each feature.
    dess_mean = np.mean(dess, axis=1, keepdims=True)
    dess_std = np.std(dess, axis=1, keepdims=True)

    dess_nrm = dess - dess_mean
    dess_nrm = dess_nrm * feature_nrm / (dess_std + 0.01)


    # step 2 - smaple-wise: normalize l2-norm of each vector to a certain value.
    ideal_norm = 30
    dess_norm = np.linalg.norm(dess_nrm, axis=0, keepdims=True)
    dess_nrm = dess_nrm * node_nrm / (dess_norm + 0.01)
    
    return dess_nrm

In [17]:
train_dess_nrm = normalize(train_data['des'], feature_nrm=1, node_nrm=30)  
val_dess_nrm = normalize(val_data['des'], feature_nrm=1, node_nrm=30)

print('train. set features:')
print('before: ', train_data['des'])
print('after: ', train_dess_nrm)

# print('\nval. set features:')
# print('before: ', val_data['des'])
# print('after: ', val_dess_nrm)

train. set features:
before:  [[2.8389516 1.8708148 1.8910706 ... 2.7468455 5.0521436 1.7201645]
 [2.7723725 1.8953145 1.9795811 ... 2.6227593 5.0829363 2.3729057]
 [2.8612802 1.8065518 1.9427454 ... 1.9574664 4.9966254 1.6729536]
 ...
 [2.7531817 1.9815539 1.9276134 ... 2.605453  5.0031734 1.795469 ]
 [2.804281  1.9247894 1.9292516 ... 2.5805027 5.018913  1.7007477]
 [2.7910361 1.8826758 2.0089457 ... 2.576486  5.151503  1.7518011]]
after:  [[ -9.484925   -9.570793  -10.352366  ...  -8.033644    8.824194
   -9.853354 ]
 [ -9.984946   -9.380919   -9.772409  ...  -8.916292    9.328838
   -6.5149717]
 [ -8.834356   -9.741596   -9.870943  ... -14.435342    8.7642355
   -9.933851 ]
 ...
 [ -9.780237   -8.767047   -9.905189  ...  -8.756527    9.202212
   -9.285356 ]
 [ -9.202595   -9.112561   -9.940922  ...  -8.988755    9.43847
   -9.817366 ]
 [ -9.705453   -9.573602   -9.715495  ...  -9.318397   10.336171
   -9.786439 ]]
