In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch

# add the path to my packages to system paths so they can be imported
import sys
sys.path.append('/home/yasamanparhizkar/Documents/yorku/01_thesis/code/my_packages')
# sys.path.append('F:\MAScThesis\code\my_packages')
# sys.path.append('/home/yasamanparhizkar/Documents/thesis/code/my_packages')

import my_simgraph_04 as sg
import compare_with_benchmark as bn
import data_handler_01 as dh

  from .autonotebook import tqdm as notebook_tqdm


# Load spike data

Spike data shape:  (297, 1141, 113) $\implies$ (movie repeats, frames/time, neurons)
<br>
Labels are 1 (= spike) or -1 (= no spike).

In [2]:
# Read the binned spike trains from disk (one flattened row per movie repeat).
spikes_dp = '../../data/original_files/spikes.csv'
binned_data = np.loadtxt(spikes_dp, delimiter=',')
# Unflatten each row into (frames, neurons) = (1141, 113) and, in the same step,
# map the labels from {0, 1} to {-1, 1}.
binned_data = 2 * binned_data.reshape(len(binned_data), 1141, 113) - 1

# NOTE(review): presumably the indices of ten selected neurons; this list is not
# used anywhere in the visible part of the notebook - confirm before removing.
I_order_10 = [54, 35, 10, 60, 74, 9, 61, 56, 91, 104]

## Group all 113 neurons

This will create a more balanced dataset, which is presumably easier to solve.
<br>
Grouped data shape:  (297, 1141, 1) $\implies$ (movie repeats, frames/time, group)

In [3]:
# Group all neurons together: a (trial, frame) bin is labelled 1 when at least one
# neuron has label 1 in it and -1 otherwise. The reduction runs over the neuron axis
# and keeps it as a trailing axis of length 1 (the single group), so the result
# follows binned_data's actual number of trials and frames instead of hard-coded sizes.
grouped_data = np.where((binned_data == 1).any(axis=2, keepdims=True), 1.0, -1.0)

In [4]:
# Report how much of the grouped data carries the spike label (class 1).
print('grouped_data.shape = ', grouped_data.shape)

print('trial #    | percentage belonging to class 1')
print('---------------------------------------------')
# per-trial percentages for the first 10 trials, accumulated for the average below
avg_spike_perc = 0
for trial in range(10):
    trial_lbls = grouped_data[trial, :, :].reshape(-1)
    pers = dh.class_percentages(trial_lbls, [-1, 1])
    print('trial #{:3} | {:.2f} %'.format(trial, pers[1]))
    avg_spike_perc += pers[1]
avg_spike_perc /= 10

print('---------------------------------------------')
print('AVERAGE     | {:.2f} %'.format(avg_spike_perc))

# share of class 1 over every (trial, frame) bin of the whole dataset
total_perc = 100 * np.sum(grouped_data == 1) / (grouped_data.shape[0] * grouped_data.shape[1])
print('---------------------------------------------')
print('{:.2f} % of the whole data belongs to class 1.'.format(total_perc))

grouped_data.shape =  (297, 1141, 1)
trial #    | percentage belonging to class 1
---------------------------------------------
trial #  0 | 66.26 %
trial #  1 | 69.06 %
trial #  2 | 67.92 %
trial #  3 | 71.08 %
trial #  4 | 68.97 %
trial #  5 | 68.27 %
trial #  6 | 66.87 %
trial #  7 | 65.82 %
trial #  8 | 67.66 %
trial #  9 | 68.19 %
---------------------------------------------
AVERAGE     | 68.01 %
---------------------------------------------
68.47 % of the whole data belongs to class 1.


# Assess the model's performance with random tests

## define functions

In [10]:
def get_valset(train_data, val_num, ind_min, ind_max, data_params, seed=None):
    """
    Draw a validation set that does not overlap with the given training set.

    Input: train_data, val_num, ind_min, ind_max, data_params, seed
    train_data - dict; only train_data['smpls'] is read, as the set of indices to exclude
    val_num - requested validation set size
    ind_min, ind_max - index bounds forwarded to dh.update_indices_balanced
    data_params - dict with at least 'spike_data' and 'group_id'; forwarded whole to dh.update_set
    seed - forwarded to dh.update_indices_balanced; the default None reproduces the
           value that used to be hard-coded in that call

    Output: val_num, val_data
    val_num - size returned by dh.update_indices_balanced
              (NOTE(review): may differ from the request - confirm in data_handler_01)
    val_data - dict with keys 'des', 'lbls' and 'smpls'
    """
    # prevent overlap with training set
    minus_set = train_data['smpls']

    # flatten the selected group's labels into one vector over all datapoints,
    # which the balanced sampler receives alongside the index bounds
    spike_data = data_params['spike_data']
    group_id = data_params['group_id']
    lbls = spike_data[:, :, group_id].reshape(-1)

    val_num, val_smpls = dh.update_indices_balanced(val_num, ind_min, ind_max, minus_set, lbls, seed=seed)
    val_dess, val_lbls = dh.update_set(val_smpls, data_params)
    val_data = {'des': val_dess, 'lbls': val_lbls, 'smpls': val_smpls}

    return val_num, val_data

def visualize_M(B, thresh, xloc, yloc, train_comb, train_num, val_num, res_path):
    """
    Display M = B^T B, mark its largest entries and save the figure to disk.

    Input: B, thresh, xloc, yloc, train_comb, train_num, val_num, res_path
    B - matrix from which M = B.T @ B is formed
    thresh - percentage of max(M) above which an entry is marked
    xloc, yloc - arrays indexed with the same boolean mask as M, giving marker coordinates
    train_comb, train_num, val_num - only used to build the output file name
    res_path - prefix (directory) of the saved figure
    """
    M = B.T @ B
    sg.display_matrix(M, None)

    # entries strictly above thresh% of the maximum are the prominent ones
    prominent = M > (thresh/100) * np.max(M)
    plt.plot(xloc[prominent], yloc[prominent], marker='o', markersize=3, color='r', linestyle='')
    plt.title('M - marked above {}%'.format(thresh))

    # file name: finalM_<val_num>_<train_num>_<train_comb>.png
    name_parts = [str(val_num), str(train_num), str(train_comb)]
    plt.savefig(res_path + 'finalM_' + '_'.join(name_parts) + '.png')
    plt.close()
    
def assessment_quantities(val_data, val_num, y_est, val_acc):
    """
    Compute summary quantities for one validation run.

    Input: val_data, val_num, y_est, val_acc
    val_data - dict; only val_data['lbls'] is read (labels 1 = spike, -1 = no spike)
    val_num - number of validation datapoints, used as the denominator of the class share
    y_est - estimated labels, one per datapoint; negative counts as "no spike" and
            positive as "spike" (an estimate of exactly 0 counts as neither)
    val_acc - validation accuracy, passed through unchanged

    Output: assess_qs
    assess_qs - dict with keys
        'min_acc'     - share of the larger class, i.e. the accuracy of always
                        predicting the majority label (NaN when val_num is 0)
        'val_acc'     - the val_acc argument
        'missed'      - fraction of true spikes estimated as negative (0 if there are no spikes)
        'false_alarm' - fraction of true non-spikes estimated as positive (0 if there are none)
    """
    # flatten both vectors so that an (N, 1) / (N,) mix cannot silently broadcast to (N, N)
    lbls = np.asarray(val_data['lbls']).reshape(-1)
    y_est = np.asarray(y_est).reshape(-1)

    # build each class mask once and count with numpy instead of the builtin sum,
    # which would iterate over the array element by element
    is_spk = lbls == 1
    is_nospk = lbls == -1
    num_spk = np.count_nonzero(is_spk)
    num_nospk = np.count_nonzero(is_nospk)

    if val_num == 0:
        # no datapoints: the majority-class baseline is undefined
        min_acc = float('nan')
    else:
        nospk_per = num_nospk / val_num
        min_acc = max(nospk_per, 1 - nospk_per)

    if num_spk == 0:
        missed = 0
    else:
        missed = np.count_nonzero(is_spk & (y_est < 0)) / num_spk

    if num_nospk == 0:
        false_alarm = 0
    else:
        false_alarm = np.count_nonzero(is_nospk & (y_est > 0)) / num_nospk

    assess_qs = {'min_acc': min_acc, 'val_acc': val_acc, 'missed': missed, 'false_alarm': false_alarm}

    return assess_qs

def make_line(head, train_num, val_num, res_dict, index):
    """
    Format one row of the results log.

    Input: head, train_num, val_num, res_dict, index
    head - row label shown in the first column
    train_num, val_num - set sizes shown in the second and third columns
    res_dict - dict with indexable entries under 'min_acc', 'val_acc', 'missed', 'false_alarm'
    index - position read from each of those entries

    Output: line
    line - the formatted row, ending in a newline; the four quantities are
           multiplied by 100 and printed with two decimals
    """
    template = '{:^10} | {:^10} | {:^10} | {:^10.2f} | {:^10.2f} | {:^17.2f} | {:^17.2f} \n'
    column_keys = ('min_acc', 'val_acc', 'missed', 'false_alarm')
    percents = [res_dict[key][index]*100 for key in column_keys]
    return template.format(head, train_num, val_num, *percents)

def take_train_step(train_num, val_num, ind_min, ind_max, data_params, nn_opt_params, nn_arch_params, seed=None):
    """
    Sample a training set and fit the benchmark network on it.

    Input: train_num, val_num, ind_min, ind_max, data_params, nn_opt_params, nn_arch_params, seed
    train_num, val_num, ind_min, ind_max, data_params, seed - forwarded unchanged to
        dh.random_train_val_balanced; the validation part of its result is discarded here
    nn_opt_params, nn_arch_params - forwarded unchanged to bn.fit

    Output: train_num, train_data, Theta, nn_stats
    train_num - training set size as returned by the sampler
    train_data - dict; its 'des' and 'lbls' entries are the inputs of the fit
    Theta, nn_stats - the two values returned by bn.fit
    """
    # create training set (the sampler also returns a validation set, which is not used)
    actual_train_num, _unused_val_num, train_data, _unused_val_data = \
        dh.random_train_val_balanced(train_num, val_num, ind_min, ind_max, data_params, seed)

    # the fit receives labels as 1 (spike) / 0 (anything else) instead of 1 / -1
    spike_targets = (train_data['lbls'] == 1).astype(int)
    Theta, nn_stats = bn.fit(train_data['des'], spike_targets, nn_opt_params, nn_arch_params, show_nrmdE=False)

    return actual_train_num, train_data, Theta, nn_stats

def take_val_step(train_data, val_num, ind_min, ind_max, data_params, nn_arch_params, Theta, seed=None):
    """
    Sample a validation set disjoint from the training set and score the model on it.

    Input: train_data, val_num, ind_min, ind_max, data_params, nn_arch_params, Theta, seed
    train_data, val_num, ind_min, ind_max, data_params - forwarded to get_valset
    nn_arch_params, Theta - forwarded to bn.get_acc
    seed - accepted but not used by this function

    Output: val_num, val_data, assess_qs, y_est
    val_num - validation set size as returned by get_valset
    val_data - dict returned by get_valset
    assess_qs - dict returned by assessment_quantities
    y_est - estimates from bn.get_acc after the 2*y - 1 remap
    """
    # create validation set, NO overlap with the training set
    actual_val_num, val_data = get_valset(train_data, val_num, ind_min, ind_max, data_params)

    # the network is scored against labels given as 1 (spike) / 0 (anything else)
    spike_targets = (val_data['lbls'] == 1).astype(int)
    val_acc, raw_est = bn.get_acc(val_data['des'], spike_targets, nn_arch_params, Theta)

    # remap the estimates with y -> 2*y - 1 (0/1 becomes -1/1) before comparing with the labels
    y_est = raw_est * 2 - 1
    assess_qs = assessment_quantities(val_data, actual_val_num, y_est, val_acc)

    return actual_val_num, val_data, assess_qs, y_est

def avg_and_log(next_dict, prev_dict, index, head, train_num, val_num, func, path):
    """
    Reduce every quantity of prev_dict to one number, store it and append a log row.

    Input: next_dict, prev_dict, index, head, train_num, val_num, func, path
    next_dict - dict of indexable containers; next_dict[q][index] is overwritten for
                every key q of prev_dict (modified in place)
    prev_dict - dict of values to reduce, one entry per quantity
    index - position written in each next_dict entry
    head, train_num, val_num - forwarded to make_line for the first three log columns
    func - 'mean' or 'std', selecting np.mean or np.std as the reduction
    path - prefix (directory) of 'log.txt', which is opened in append mode

    Output: next_dict
    next_dict - the same dict object that was passed in

    Raises: ValueError if func is neither 'mean' nor 'std'. This is checked before
    anything is modified or written; the earlier `assert False` disappears under
    `python -O` and only fired once the loop had started.
    """
    reducers = {'mean': np.mean, 'std': np.std}
    if func not in reducers:
        raise ValueError("func must be 'mean' or 'std', got {!r}".format(func))
    reduce_func = reducers[func]

    # compute averages over random combinations of validation sets
    for quantity in prev_dict:
        next_dict[quantity][index] = reduce_func(prev_dict[quantity])

    # save on file
    with open(path+'log.txt', 'a') as file:
        file.write(make_line(head, train_num, val_num, next_dict, index))

    return next_dict

In [11]:
def assess_nn_model(train_sizes, val_sizes, train_combs, val_combs, res_path, data_params, nn_opt_params, nn_arch_params, ind_min, ind_max, seed=None):
    """
    Train and validate the benchmark network over a grid of set sizes and random draws.

    For every validation size and every training size, `train_combs` training sets
    are drawn and fitted, and each fitted model is scored on `val_combs` validation
    sets. Results are averaged level by level and every step is appended to
    res_path + 'log.txt'.

    Input: train_sizes, val_sizes, train_combs, val_combs, res_path, data_params,
           nn_opt_params, nn_arch_params, ind_min, ind_max, seed
    train_sizes, val_sizes - sequences of requested training / validation set sizes
    train_combs, val_combs - number of random draws per training size / per fitted model
    res_path - prefix (directory) for 'log.txt' and the 'curves_<i>.txt' files
    data_params, nn_opt_params, nn_arch_params, ind_min, ind_max, seed - forwarded to
        take_train_step / take_val_step

    Output: val_num_res, val_num_err
    val_num_res - dict of arrays (one entry per validation size) with the mean of each
                  quantity over all training sizes
    val_num_err - same layout, holding the standard deviation over training sizes

    Files written: 'log.txt' (truncated first, then appended to by avg_and_log) and,
    for the i-th validation size, 'curves_<i>.txt', which holds for every quantity the
    mean curve over training sizes followed by its standard-deviation curve.
    """
    # prepare results file
    with open(res_path+'log.txt', 'w') as file:
        header = '{:^10} | {:^10} | {:^10} | {:^10} | {:^10} | {:^17} | {:^17} \n'\
                 .format('i', 'train_num', 'val_num', 'min_acc(%)', 'val_acc(%)',
                         'missed spks(%)', 'false alarms(%)')
        file.writelines((header, '-'*101+'\n'))

    # Fixed list of tracked quantities; the same keys (in the same order) are
    # returned by assessment_quantities and therefore by take_val_step.
    quantities = ('min_acc', 'val_acc', 'missed', 'false_alarm')

    # one results container per averaging level
    val_comb_res = {q: np.zeros(val_combs) for q in quantities}
    train_comb_res = {q: np.zeros(train_combs) for q in quantities}
    train_num_res = {q: np.zeros(len(train_sizes)) for q in quantities}
    train_num_err = {q: np.zeros(len(train_sizes)) for q in quantities}
    val_num_res = {q: np.zeros(len(val_sizes)) for q in quantities}
    val_num_err = {q: np.zeros(len(val_sizes)) for q in quantities}

    for i, val_size in enumerate(val_sizes):
        # train_num / val_num hold the sizes reported in the log. They start at the
        # requested values and are replaced by the sizes the samplers return, but
        # they are never fed back as requests: every draw asks for the original
        # train_size / val_size, so one adjusted draw cannot shrink the later ones.
        val_num = val_size
        train_num = '-'     # placeholder for the log if train_sizes is empty
        for j, train_size in enumerate(train_sizes):
            train_num = train_size
            for train_comb in range(train_combs):
                # train
                train_num, train_data, Theta, _ = \
                    take_train_step(train_size, val_size, ind_min, ind_max, data_params, nn_opt_params, nn_arch_params, seed)

                for val_comb in range(val_combs):
                    # validate
                    val_num, _, assess_qs, _ = \
                        take_val_step(train_data, val_size, ind_min, ind_max, data_params, nn_arch_params, Theta, seed)
                    # log results
                    val_comb_res = avg_and_log(val_comb_res, assess_qs, val_comb, str(val_comb), train_num, val_num, 'mean', res_path)

                # average over various validation set combinations and log
                train_comb_res = avg_and_log(train_comb_res, val_comb_res, train_comb, '>'+str(train_comb), train_num, val_num, 'mean', res_path)
            # average over various training and validation set combinations and log
            train_num_res = avg_and_log(train_num_res, train_comb_res, j, '*t*', train_num, val_num, 'mean', res_path)
            train_num_err = avg_and_log(train_num_err, train_comb_res, j, '*te*', train_num, val_num, 'std', res_path)
        # average over various training set sizes and training and validation set combinations, and log
        val_num_res = avg_and_log(val_num_res, train_num_res, i, '**v**', train_num, val_num, 'mean', res_path)
        val_num_err = avg_and_log(val_num_err, train_num_res, i, '**ve**', train_num, val_num, 'std', res_path)

        # save train_num_res curves for this specific val_num
        with open(res_path+'curves_'+str(i)+'.txt', 'w') as file:
            for quantity in quantities:
                np.savetxt(file, train_num_res[quantity])
                np.savetxt(file, train_num_err[quantity])
                file.write('\n')

    return val_num_res, val_num_err
    

In [15]:
def plot_curves(train_sizes, val_sizes, val_num_res, val_num_err, res_path, nn_arch_params, train_combs=None, val_combs=None):
    """
    Plot the saved learning curves and the per-validation-size summary.

    Input: train_sizes, val_sizes, val_num_res, val_num_err, res_path, nn_arch_params,
           train_combs, val_combs
    train_sizes - x values of the training curves
    val_sizes - validation sizes; 'curves_<i>.txt' is read for every index i
    val_num_res, val_num_err - dicts of arrays (one value per validation size); their
        keys also define the order in which quantities are read from the curve files
    res_path - prefix (directory) of the curve files and of the saved figures
    nn_arch_params - dict; 'num_hidden_layers' and 'num_hidden_units' appear in the titles
    train_combs, val_combs - repeat counts shown in the axis labels. When left as None
        they are looked up among the globals of the defining module, which is where
        this function previously read them from implicitly; '?' is shown if absent.

    Files written: res_path + 'train_curves.png' and res_path + 'val_curves.png'.
    """
    # make the dependency on notebook-level state explicit and overridable
    if train_combs is None:
        train_combs = globals().get('train_combs', '?')
    if val_combs is None:
        val_combs = globals().get('val_combs', '?')

    quantities = list(val_num_res)

    # Each curve file stores, per quantity, a mean row followed by an error row,
    # hence 2 * (number of quantities) rows with one column per training size.
    curve_rows = {quantity: [] for quantity in quantities}
    error_rows = {quantity: [] for quantity in quantities}
    for i in range(len(val_sizes)):
        curves_i = np.loadtxt(res_path+'curves_'+str(i)+'.txt')
        curves_i = curves_i.reshape(2*len(quantities), -1)
        for j, quantity in enumerate(quantities):
            curve_rows[quantity].append(curves_i[2*j])
            error_rows[quantity].append(curves_i[2*j+1])
    curves = {quantity: np.array(rows) for quantity, rows in curve_rows.items()}
    errors = {quantity: np.array(rows) for quantity, rows in error_rows.items()}

    # one subplot per validation size: every quantity against the training set size
    plt.figure()
    plt.subplots_adjust(left=0.1,bottom=0.1,right=0.9,top=0.9,wspace=0.8,hspace=0.8)
    for i in range(len(val_sizes)):
        plt.subplot(len(val_sizes), 1, i+1)
        for quantity in curves:
            plt.errorbar(train_sizes, curves[quantity][i], errors[quantity][i])
        plt.legend(curves.keys())
        plt.xlabel('training set size')
        plt.ylabel('{} val repeats x {} train repeats'.format(val_combs, train_combs))
        plt.title('val. set size = {}, #layers = {}, #units = {}'.format(val_sizes[i], nn_arch_params['num_hidden_layers'], nn_arch_params['num_hidden_units']))
    plt.savefig(res_path+'train_curves.png')
    plt.close()

    # summary figure: every quantity against the validation set size
    plt.figure()
    for quantity in val_num_res:
        plt.errorbar(val_sizes, val_num_res[quantity], val_num_err[quantity])
    plt.legend(val_num_res.keys())
    plt.xlabel('validation set size')
    plt.ylabel('{} val repeats x {} train repeats x {} train set sizes'.format(val_combs, train_combs, len(train_sizes)))
    plt.title('#layers = {}, #units = {}'.format(nn_arch_params['num_hidden_layers'], nn_arch_params['num_hidden_units']))
    plt.savefig(res_path+'val_curves.png')
    plt.close()

## Assess with various $\mu$ values

In [45]:
# Only consider the second trial (trial index 1). With 1141 frames per trial, its
# datapoints occupy indices 1141 .. 2281 of the flattened (trial, frame) axis.
# NOTE(review): whether ind_max is treated as inclusive is decided inside dh - confirm.
ind_min = 1141
ind_max = 2 * 1141 - 1

def transform(fv, stride=10):
    """
    Transform to be applied on feature vectors.

    Input: fv, stride
    fv - 1xDf torch tensor representing a feature vector
    stride - keep every stride-th feature along the second axis, starting with the
             first one; the default 10 is the subsampling factor used so far

    Output: fvv
    fvv - 1xDf' torch tensor representing the transformed feature vector,
          with Df' = ceil(Df / stride)

    Raises: ValueError if stride is smaller than 1.
    """
    if stride < 1:
        raise ValueError('stride must be a positive integer, got {!r}'.format(stride))

    # for faster run and less memory usage
    fvv = fv[:, ::stride]

    # for numerical stability during GD
    # fvv = fvv * 10

    return fvv

# Data access settings handed to the data_handler_01 helpers: how datapoints and
# labels are produced, where the features live, and which spike labels to use.
data_params = {
    'func': dh.datapoint_torch,
    'lbl_func': dh.get_labels,
    'features_dp': '../../data/features/slowfast/slowfast_4732/',
    'spike_data': grouped_data,
    'group_id': 0,
    'transform': transform,
}

# Optimizer settings forwarded to bn.fit.
nn_opt_params = {
    'epsilon0': 1,
    'epsilon_decay': 0.5,
    'epsilon_jump': 2,
    'num_its': 16,
    'check_freq': 1,
    'print_checks': False,
    'Theta0': None,
    'force_all_its': True,
    'threshold': 0.01,
}

# Network architecture forwarded to bn.fit and bn.get_acc.
num_classes = 2
nn_arch_params = {
    'num_hidden_layers': 5,
    'num_hidden_units': 20,
    'num_outs': num_classes,
    'act_func': bn.relu,
    'out_func': bn.softmax,
    'loss_func': bn.crossent,
}

# try various training and validation set sizes
train_sizes = [10, 20, 30, 40, 50, 100, 150, 200, 250, 300]
val_sizes = [10, 20]

# for each set size, try a number of random combinations of datapoints
train_combs = 10
val_combs = 10

# directory receiving log.txt, the curves_<i>.txt files and the figures
res_path = '../../data/experiments/slowfast/temp/'

In [46]:
# Run the whole size/draw grid (this writes log.txt and the curves_<i>.txt files
# under res_path), then turn the saved curves into the two summary figures.
val_num_res, val_num_err = assess_nn_model(
    train_sizes=train_sizes,
    val_sizes=val_sizes,
    train_combs=train_combs,
    val_combs=val_combs,
    res_path=res_path,
    data_params=data_params,
    nn_opt_params=nn_opt_params,
    nn_arch_params=nn_arch_params,
    ind_min=ind_min,
    ind_max=ind_max,
    seed=None,
)
plot_curves(
    train_sizes=train_sizes,
    val_sizes=val_sizes,
    val_num_res=val_num_res,
    val_num_err=val_num_err,
    res_path=res_path,
    nn_arch_params=nn_arch_params,
)

In [37]:
# Development helper: re-import data_handler_01 (bound to `dh` in the first cell)
# so that edits made to the module on disk are picked up without restarting the kernel.
import importlib
importlib.reload(dh)

<module 'data_handler_01' from '/home/yasamanparhizkar/Documents/yorku/01_thesis/code/my_packages/data_handler_01.py'>