In [1]:
import xgboost as xgb
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

# add the path to my packages to system paths so they can be imported
import sys
sys.path.append('/home/yasamanparhizkar/Documents/yorku/01_thesis/code/my_packages')
# sys.path.append('F:\MAScThesis\code\my_packages')
# sys.path.append('/home/yasamanparhizkar/Documents/thesis/code/my_packages')

import data_handler_01 as dh
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


# Load data

In [2]:
# load all spike data from file
spikes_dp = '../../data/original_files/spikes.csv'
binned_data = np.loadtxt(spikes_dp, delimiter=',')
# one CSV row per trial, unflattened to (trials, 1141 frames, 113 neurons)
binned_data = binned_data.reshape(binned_data.shape[0], 1141, 113)
binned_data = binned_data * 2 - 1     # turn labels from 0,1 to -1,1

I_order_10 = [54, 35, 10, 60, 74, 9, 61, 56, 91, 104]

# group all neurons together: a (trial, frame) entry is +1 if ANY neuron has
# label 1 in that frame and -1 otherwise.
# The reduction is vectorized over the neuron axis and sized from the loaded
# data, so it no longer assumes exactly 297 trials; keepdims preserves the
# trailing singleton axis, giving shape (trials, 1141, 1) with float values.
grouped_data = np.where((binned_data == 1).any(axis=2, keepdims=True), 1.0, -1.0)

print('grouped_data.shape = ', grouped_data.shape)

grouped_data.shape =  (297, 1141, 1)


# Assess the model's performance with random tests

## define functions

In [3]:
def get_valset(train_data, val_num, ind_min, ind_max, data_params):
    """
    Draw a validation set that does not reuse the training samples.

    Input: train_data, val_num, ind_min, ind_max, data_params
    train_data - dict; its 'smpls' entry is handed to the sampler as the set to exclude
    val_num - requested number of validation datapoints
    ind_min, ind_max - index bounds forwarded to dh.update_indices_balanced
    data_params - dict; 'lbl_func' is called with data_params itself to get the label vector

    Output: val_num, val_data
    val_num - the count returned by dh.update_indices_balanced
              (presumably it can differ from the request - confirm in data_handler_01)
    val_data - dict with keys 'des', 'lbls' and 'smpls'
    """
    # samples already used for training must not be drawn again
    excluded = train_data['smpls']

    # label vector of the datapoints, needed by the balanced sampler
    all_lbls = data_params['lbl_func'](data_params)

    # seed is fixed to None here, so the draw is not controlled by any caller-provided seed
    val_num, picked = dh.update_indices_balanced(val_num, ind_min, ind_max, excluded, all_lbls, seed=None)
    descriptors, targets = dh.update_set(picked, data_params)

    return val_num, {'des': descriptors, 'lbls': targets, 'smpls': picked}

def visualize_M(B, thresh, xloc, yloc, train_comb, train_num, val_num, res_path):
    """
    Display M = B^T B, mark its largest entries and save the figure.

    Input: B, thresh, xloc, yloc, train_comb, train_num, val_num, res_path
    B - matrix from which M = B.T @ B is formed
    thresh - percentage of max(M); entries strictly above it are marked in red
    xloc, yloc - arrays indexed with the boolean mask of M, giving marker coordinates
    train_comb, train_num, val_num - only used to build the output file name
    res_path - path prefix (string) of the saved PNG
    """
    M = B.T @ B

    # NOTE(review): `sg` is not imported in the visible part of this notebook;
    # calling this function fails with NameError unless it is imported elsewhere.
    sg.display_matrix(M, None)

    # mark prominent elements
    lim = (thresh/100) * np.max(M) # marker threshold
    prominent = M > lim
    plt.plot(xloc[prominent], yloc[prominent], marker='o', markersize=3, color='r', linestyle='')
    plt.title('M - marked above {}%'.format(thresh))

    # save figure
    out_file = ''.join([res_path, 'finalM_', str(val_num), '_', str(train_num), '_', str(train_comb), '.png'])
    plt.savefig(out_file)
    plt.close()
    
def assessment_quantities(val_data, val_num, preds_th, val_acc):
    """
    Summarize how a set of thresholded predictions compares to the labels.

    Input: val_data, val_num, preds_th, val_acc
    val_data - dict whose 'lbls' entry holds the ground-truth labels (1 marks a spike)
    val_num - number of datapoints used to normalize the no-spike fraction
    preds_th - predictions, compared against 0.5 to decide spike / no spike
    val_acc - accuracy computed by the caller, passed through unchanged

    Output: assess_qs
    assess_qs - dict with
        'min_acc'     : the larger of the two class fractions
        'val_acc'     : val_acc as given
        'missed'      : fraction of spike labels predicted below 0.5 (0 if there are no spikes)
        'false_alarm' : fraction of non-spike labels predicted above 0.5 (0 if there are none)
    """
    lbls = val_data['lbls']
    is_spk = lbls == 1
    is_nospk = lbls != 1

    nospk_per = np.sum(is_nospk)/val_num
    min_acc = max(nospk_per, 1-nospk_per)

    # each rate falls back to 0 when its class is absent, avoiding a division by zero
    n_spk = sum(is_spk)
    missed = 0 if n_spk == 0 else sum(np.logical_and(is_spk, preds_th < 0.5))/n_spk

    n_nospk = sum(is_nospk)
    false_alarm = 0 if n_nospk == 0 else sum(np.logical_and(is_nospk, preds_th > 0.5))/n_nospk

    return {'min_acc': min_acc, 'val_acc': val_acc, 'missed': missed, 'false_alarm': false_alarm}

def make_line(head, train_num, val_num, res_dict, index):
    """
    Format one row of the results log.

    Input: head, train_num, val_num, res_dict, index
    head - row tag written in the first column
    train_num, val_num - set sizes written in the second and third columns
    res_dict - dict of indexable results with keys 'min_acc', 'val_acc', 'missed', 'false_alarm'
    index - position read from each entry of res_dict

    Output: line
    line - '|'-separated, centered columns ending in a newline; the four
           quantities are multiplied by 100 and printed with two decimals
    """
    template = '{:^10} | {:^10} | {:^10} | {:^10.2f} | {:^10.2f} | {:^17.2f} | {:^17.2f} \n'
    columns = ('min_acc', 'val_acc', 'missed', 'false_alarm')
    percentages = [res_dict[name][index]*100 for name in columns]
    return template.format(head, train_num, val_num, *percentages)

def take_train_step(train_num, val_num, ind_min, ind_max, data_params, xgb_params, num_boost_round, evals, veval, estop, res_path, seed=None):
    """
    Draw a training/validation split and train one XGBoost model on it.

    Input:
    train_num, val_num - requested set sizes
    ind_min, ind_max, data_params, seed - forwarded to dh.random_train_val_balanced
    xgb_params, num_boost_round - forwarded to xgb.train
    evals - if truthy, the train and validation sets are watched during training
    veval - forwarded to xgb.train as verbose_eval
    estop - forwarded to xgb.train as early_stopping_rounds
    res_path - not used by this function

    Output: train_num, train_data, model
    train_num - the training set size returned by dh.random_train_val_balanced
    train_data - the training set dict (its 'des' and 'lbls' entries are read here)
    model - the trained booster

    The validation set drawn here is only used as a watch list; it is not returned.
    """
    # create training set
    train_num, val_num, train_data, val_data = dh.random_train_val_balanced(
        train_num, val_num, ind_min, ind_max, data_params, seed
    )

    def to_dmatrix(split):
        # labels are stored as -1/1; (lbls+1)//2 maps them to 0/1 for xgboost
        return xgb.DMatrix(split['des'], label=(split['lbls']+1)//2)

    # create xgb Dmatrices
    dtrain = to_dmatrix(train_data)
    dval = to_dmatrix(val_data)

    # train the model
    evals = [(dtrain, "train"), (dval, "validation")] if evals else None
    model = xgb.train(
        params=xgb_params,
        dtrain=dtrain,
        num_boost_round=num_boost_round,
        evals=evals,
        verbose_eval=veval,
        early_stopping_rounds=estop,
    )

    return train_num, train_data, model

def take_val_step(train_data, val_num, ind_min, ind_max, data_params, model, estop, seed=None):
    """
    Draw a fresh validation set and score a trained model on it.

    Input:
    train_data - training set dict, used to keep the validation draw disjoint from it
    val_num, ind_min, ind_max, data_params - forwarded to get_valset
    model - trained booster exposing predict (and best_iteration when estop is set)
    estop - if not None, prediction is limited to iterations 0..model.best_iteration
    seed - accepted but not used by this function

    Output: val_num, val_data, assess_qs, preds_th
    val_num - validation set size returned by get_valset
    val_data - the validation set dict
    assess_qs - dict produced by assessment_quantities
    preds_th - predictions thresholded at 0.5, as integers
    """
    # create validation set, NO overlap with the training set
    val_num, val_data = get_valset(train_data, val_num, ind_min, ind_max, data_params)

    # validate the model; labels are mapped from -1/1 to 0/1 first
    gt_lbls = (val_data['lbls']+1)//2
    dval = xgb.DMatrix(val_data['des'], label=gt_lbls)

    predict_options = {}
    if estop is not None:
        predict_options['iteration_range'] = (0, model.best_iteration+1)
    preds = model.predict(dval, **predict_options)

    # compute validation accuracy
    preds_th = (preds > 0.5).astype(int)
    n_correct = sum(preds_th == gt_lbls)
    val_acc = n_correct/len(gt_lbls)

    # compute several assessment quantities
    assess_qs = assessment_quantities(val_data, val_num, preds_th, val_acc)

    return val_num, val_data, assess_qs, preds_th

def avg_and_log(next_dict, prev_dict, index, head, train_num, val_num, func, path):
    """
    Reduce every quantity of prev_dict to one number, store it and log a row.

    Input:
    next_dict - dict of arrays; entry [quantity][index] is overwritten in place
    prev_dict - dict of values (scalars or arrays) to reduce, one per quantity
    index - position written in next_dict and reported in the log row
    head, train_num, val_num - passed to make_line for the log row
    func - 'mean' or 'std'; any other value trips an assertion
    path - path prefix; the row is appended to path + 'log.txt'

    Output: next_dict
    next_dict - the same dict object that was passed in, after the update
    """
    reducers = (('mean', np.mean), ('std', np.std))

    # compute averages over random combinations of validation sets
    for quantity, samples in prev_dict.items():
        for name, reduce in reducers:
            if func == name:
                next_dict[quantity][index] = reduce(samples)
                break
        else:
            # func matched none of the supported reductions
            assert False

    # save on file
    with open(path+'log.txt', 'a') as file:
        file.write(make_line(head, train_num, val_num, next_dict, index))

    return next_dict

In [4]:
def assess_sg_model(train_sizes, val_sizes, train_combs, val_combs, res_path, data_params, xgb_params, num_boost_round, evals, veval, estop, ind_min, ind_max, seed=None):
    """
    Repeatedly train and validate the model over a grid of set sizes and log the results.

    For every validation size and every training size, train_combs models are
    trained on freshly drawn training sets, and each model is scored on
    val_combs freshly drawn validation sets. Results are averaged level by
    level (validation draws -> training draws -> training sizes) and every
    level is appended to res_path + 'log.txt'. For the i-th validation size the
    per-training-size means and standard deviations are written to
    res_path + 'curves_<i>.txt'.

    Input:
    train_sizes, val_sizes - iterables of set sizes to try
    train_combs, val_combs - number of random draws per size
    res_path - path prefix (string) for the log and curve files
    data_params, xgb_params, num_boost_round, evals, veval, estop, ind_min, ind_max
        - forwarded to take_train_step / take_val_step
    seed - NOTE(review): accepted but not forwarded; both steps are called with seed=None

    Output: val_num_res, val_num_err
    val_num_res - dict of arrays (one entry per validation size): mean over training sizes
    val_num_err - dict of arrays (one entry per validation size): std over training sizes
    """
    # prepare results file
    header = '{:^10} | {:^10} | {:^10} | {:^10} | {:^10} | {:^17} | {:^17} \n'.format(
        'i', 'train_num', 'val_num', 'min_acc(%)', 'val_acc(%)', 'missed spks(%)', 'false alarms(%)'
    )
    rule = '-'*101+'\n'
    with open(res_path+'log.txt', 'w') as file:
        file.writelines((header, rule))

    # create dictionaries to keep interesting variables
    assess_qs = {'min_acc': 0, 'val_acc': 0, 'missed': 0, 'false_alarm': 0}
    quantities = tuple(assess_qs)

    def blank(length):
        # one zero-filled array per tracked quantity
        return {quantity: np.zeros(length) for quantity in quantities}

    val_comb_res = blank(val_combs)
    train_comb_res = blank(train_combs)
    train_num_res = blank(len(train_sizes))
    train_num_err = blank(len(train_sizes))
    val_num_res = blank(len(val_sizes))
    val_num_err = blank(len(val_sizes))

    # NOTE: train_num and val_num are rebound to the sizes returned by the
    # train/validation steps, so later repeats use those returned values.
    for i, val_num in enumerate(val_sizes):
        for j, train_num in enumerate(train_sizes):
            for train_comb in range(train_combs):
                # train
                train_num, train_data, model = take_train_step(
                    train_num, val_num, ind_min, ind_max, data_params, xgb_params,
                    num_boost_round, evals, veval, estop, res_path, seed=None
                )

                for val_comb in range(val_combs):
                    # validate
                    val_num, val_data, assess_qs, preds_th = take_val_step(
                        train_data, val_num, ind_min, ind_max, data_params, model, estop, seed=None
                    )
                    # log results
                    val_comb_res = avg_and_log(val_comb_res, assess_qs, val_comb, str(val_comb), train_num, val_num, 'mean', res_path)

                # average over various validation set combinations and log
                train_comb_res = avg_and_log(train_comb_res, val_comb_res, train_comb, '>'+str(train_comb), train_num, val_num, 'mean', res_path)

            # average over various training and validation set combinations and log
            train_num_res = avg_and_log(train_num_res, train_comb_res, j, '*t*', train_num, val_num, 'mean', res_path)
            train_num_err = avg_and_log(train_num_err, train_comb_res, j, '*te*', train_num, val_num, 'std', res_path)

        # average over various training set sizes and training and validation set combinations, and log
        # (train_num here is whatever the last training-size iteration left behind)
        val_num_res = avg_and_log(val_num_res, train_num_res, i, '**v**', train_num, val_num, 'mean', res_path)
        val_num_err = avg_and_log(val_num_err, train_num_res, i, '**ve**', train_num, val_num, 'std', res_path)

        # save train_num_res curves for this specific val_num
        with open(res_path+'curves_'+str(i)+'.txt', 'w') as file:
            for quantity in assess_qs:
                np.savetxt(file, train_num_res[quantity])
                np.savetxt(file, train_num_err[quantity])
                file.write('\n')

    return val_num_res, val_num_err
    

In [5]:
def plot_curves(train_sizes, val_sizes, val_num_res, val_num_err, res_path):
    """
    Plot the saved learning curves and the per-validation-size summary.

    Input:
    train_sizes - x values of the training curves
    val_sizes - validation sizes; one 'curves_<i>.txt' file is read per entry
    val_num_res, val_num_err - dicts of arrays (means / error bars per validation size);
                               the key order of val_num_res also decides which rows of
                               each curve file belong to which quantity
    res_path - path prefix (string) of the curve files and of the saved figures

    Writes res_path + 'train_curves.png' and res_path + 'val_curves.png'.

    NOTE(review): the axis labels read `val_combs` and `train_combs` from the
    notebook's global namespace; they are not parameters of this function.
    """
    # each curve file holds 8 rows: (mean, error) for each of 4 quantities
    curve_rows = {}
    error_rows = {}
    for i in range(len(val_sizes)):
        table = np.loadtxt(res_path+'curves_'+str(i)+'.txt').reshape(8, -1)
        for j, quantity in enumerate(val_num_res):
            curve_rows.setdefault(quantity, []).append(table[2*j])
            error_rows.setdefault(quantity, []).append(table[2*j+1])

    # stack the rows: one row per validation size, one column per training size
    curves = {quantity: np.array(rows) for quantity, rows in curve_rows.items()}
    errors = {quantity: np.array(rows) for quantity, rows in error_rows.items()}

    # figure 1: one panel of training curves per validation size
    plt.figure()
    plt.subplots_adjust(left=0.1,bottom=0.1,right=0.9,top=0.9,wspace=0.8,hspace=0.8)
    n_panels = len(val_sizes)
    for i in range(n_panels):
        plt.subplot(n_panels, 1, i+1)
        for quantity in curves:
            plt.errorbar(train_sizes, curves[quantity][i], errors[quantity][i])
        plt.legend(curves.keys())
        plt.xlabel('training set size')
        plt.ylabel('{} val repeats x {} train repeats'.format(val_combs, train_combs))
        plt.title('val. set size = {}'.format(val_sizes[i]))
    plt.savefig(res_path+'train_curves.png')
    plt.close()

    # figure 2: quantities averaged over training sizes, against validation size
    plt.figure()
    for quantity in val_num_res:
        plt.errorbar(val_sizes, val_num_res[quantity], val_num_err[quantity])
    plt.legend(val_num_res.keys())
    plt.xlabel('validation set size')
    plt.ylabel('{} val repeats x {} train repeats x {} train set sizes'.format(val_combs, train_combs, len(train_sizes)))
    plt.title('{}'.format('Default hyperparameters'))
    plt.savefig(res_path+'val_curves.png')
    plt.close()

In [6]:
# only consider the second trial
# inclusive index bounds: offset 0 and offset 1140 of block 1, with 1141 entries per block
ind_min, ind_max = 1*1141, 2*1141 - 1

def transform(fv):
    """
    Subsample a feature vector along its second axis.

    Input: fv
    fv - 1xDf torch tensor representing a feature vector

    Output: fvv
    fvv - 1xDf' torch tensor holding every 10th column of fv, starting at column 0
    """
    # for faster run and less memory usage
    step = 10
    fvv = fv[:, ::step]

    # an optional rescaling (fvv * 10, for numerical stability during GD) is left disabled

    return fvv

# everything the data handler needs to build datapoints and labels
data_params = {
    'func': dh.datapoint_torch,
    'lbl_func': dh.get_labels,
    'features_dp': '../../data/features/slowfast/slowfast_4732/',
    'spike_data': grouped_data,
    'group_id': 0,
    'transform': transform,
}

# Define hyperparameters
xgb_params = dict(objective="binary:logistic", tree_method="hist")
num_boost_round = 100
veval = 101     # forwarded to xgb.train as verbose_eval
estop = 10      # forwarded to xgb.train as early_stopping_rounds
evals = True    # watch the train and validation sets during training

# try various training and validation set sizes
train_sizes = list(range(50, 301, 50))
val_sizes = [10, 20]

# for each set size, try a number of random combinations of datapoints
train_combs = val_combs = 10

res_path = '../../data/experiments/slowfast_xgb/temp/'

In [7]:
# run the full train/validate sweep; the log and curve files are written under res_path
val_num_res, val_num_err = assess_sg_model(
    train_sizes=train_sizes,
    val_sizes=val_sizes,
    train_combs=train_combs,
    val_combs=val_combs,
    res_path=res_path,
    data_params=data_params,
    xgb_params=xgb_params,
    num_boost_round=num_boost_round,
    evals=evals,
    veval=veval,
    estop=estop,
    ind_min=ind_min,
    ind_max=ind_max,
    seed=None,
)

# turn the saved curves into figures (saved next to the log)
plot_curves(train_sizes, val_sizes, val_num_res, val_num_err, res_path)

[0]	train-logloss:0.51628	validation-logloss:0.71509
[10]	train-logloss:0.10006	validation-logloss:1.02754
[0]	train-logloss:0.51676	validation-logloss:0.69409
[14]	train-logloss:0.08010	validation-logloss:0.72512
[0]	train-logloss:0.53530	validation-logloss:0.63353
[30]	train-logloss:0.04780	validation-logloss:0.41150
[0]	train-logloss:0.51706	validation-logloss:0.64108
[13]	train-logloss:0.07484	validation-logloss:0.72197
[0]	train-logloss:0.52147	validation-logloss:0.65194
[20]	train-logloss:0.05887	validation-logloss:0.56603
[0]	train-logloss:0.51812	validation-logloss:0.64625
[10]	train-logloss:0.10328	validation-logloss:0.70392
[0]	train-logloss:0.51494	validation-logloss:0.71067
[28]	train-logloss:0.04320	validation-logloss:0.61417
[0]	train-logloss:0.53264	validation-logloss:0.72687
[37]	train-logloss:0.04360	validation-logloss:0.46400
[0]	train-logloss:0.52637	validation-logloss:0.73231
[14]	train-logloss:0.07792	validation-logloss:0.70646
[0]	train-logloss:0.51541	validation-