### Average arrays by trial type
This notebook demonstrates how to average single-trial data (from Widefield_Preproc_Spark_Swift, Widefield_Correlation_Spark) by trial type. The output is stored as movies per trial type on Swift.

### Imports

In [None]:
import os, sys
import numpy as np
import matplotlib.animation as animation
from matplotlib import pylab as plt
from __future__ import print_function
import getpass
import tempfile
import shutil
import h5py
import re
import csv
import time
%matplotlib inline

# identifier of the notebook backend; passed to initSpark when the SparkContext is created
nbBackend = 'openstack'

# add folder 'utils' to the Python path
# this folder contains custom written code that is required for data import and analysis
# (the folder is looked up relative to the current working directory of the notebook)
utils_dir = os.path.join(os.getcwd(), 'utils')
sys.path.append(utils_dir)
import SwiftStorageUtils
from SwiftStorageUtils import saveAsMat, saveAsH5
import WidefieldDataUtils as wf

In [None]:
# ---- Swift file system parameters ----
# name of the container in Swift (do not use _ etc.)
swift_container = 'dayra'
# provider name; in general, this should not change
swift_provider = 'SparkTest'
# base URL of the container: swift://<container>.<provider>/
swift_basename = "swift://%s.%s/" % (swift_container, swift_provider)

# ---- Input selection ----
# storage location of data relative to swift_basename
data_folder = 'dff_out'
# array name in the HDF5 file (either mov, dff or corr)
h5_array_id = 'dff'
# start of name for matching files (selects which files will be processed)
filename_start = '20170214_'

# ---- Output file formats ----
# For each enabled output format, one average per trial type will be saved.
# Mat-file output should only be used for saving movie averages, to be used with OCIA.

# HDF5 output file and the folder it is saved in
save_h5 = True
h5_folder = 'h5_avg'

# mat-file output (only for saving average mov files for use with OCIA)
# and the folder in which to save the mat files
save_mat = True
mat_folder = 'mat_out'

# movie animation (mp4) in the folder animations
# only for presentation (highly compressed)
save_movie = False

In [None]:
# OpenStack credentials for accessing Swift storage
# (user name, tenant / project name and URL of the authentication service)
os_username = 'hluetc'
os_tenant_name = 'helmchen.hifo.uzh'
os_auth_url = 'https://cloud.s3it.uzh.ch:5000/v2.0'
# provide OS password
# the password is requested interactively so that it is not stored in the notebook
os_password = getpass.getpass()

In [None]:
# collect the Swift location and the OpenStack credentials in a single dict
# so that they can be handed to the helper functions as one argument
file_params = {
    'swift_container': swift_container,
    'swift_provider': swift_provider,
    'swift_basename': swift_basename,
    'os_username': os_username,
    'os_tenant_name': os_tenant_name,
    'os_auth_url': os_auth_url,
    'os_password': os_password,
}

In [None]:
from setupSpark import initSpark
# Initialize Spark
# The resources requested from the cluster are set here (cores and memory of the workers).
# Each worker VM has 8 cores and 32 GB of memory.
# The status of the cluster (ie. how many cores are available) can be checked in the Spark UI:
# http://SparkMasterIP:8080/

# the number of workers to be used
spark_instances = 1
# the number of cores to be used on each worker
executor_cores = 8
# the amount of memory to be used on each worker
executor_memory = '28G'
# the max. number of cores Spark is allowed to use overall
max_cores = executor_cores * spark_instances

# initSpark returns the SparkContext object 'sc' which tells Spark how to access the cluster
sc = initSpark(
    nbBackend,
    spark_instances=spark_instances,
    executor_cores=executor_cores,
    max_cores=max_cores,
    executor_memory=executor_memory
)

In [None]:
# provide OS credentials to the Hadoop configuration
# The service name inside the property keys has to be the provider part of swift_basename,
# so it is derived from swift_provider instead of being hard-coded. With the default
# provider 'SparkTest' the resulting keys are 'fs.swift.service.SparkTest.username' etc.
swift_service_prefix = 'fs.swift.service.%s.' % swift_provider
hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set(swift_service_prefix + 'username', os_username)
hadoop_conf.set(swift_service_prefix + 'tenant', os_tenant_name)
hadoop_conf.set(swift_service_prefix + 'password', os_password)

In [None]:
# register every Python file of the 'utils' folder with the SparkContext
# this is required so that all files are available on all the cluster workers
for filename in os.listdir(utils_dir):
    if not filename.endswith('.py'):
        # anything that is not a Python source file is ignored
        continue
    sc.addPyFile(os.path.join(utils_dir, filename))

Get relevant files from container and create RDD from list of objects to download.

In [None]:
from SwiftStorageUtils import listItems
# list the objects in the container and keep those whose name starts with data_folder
# and contains filename_start
# NOTE(review): the test is 'filename_start in n', i.e. the string may occur anywhere in the
# object name and not only at the start of the file name - confirm that this is intended
object_list = listItems(swift_container, file_params)
objects_to_download = [n for n in object_list if n.startswith(data_folder) and filename_start in n]
# display the selection for a visual check
objects_to_download

In [None]:
# distribute the list of object names; each RDD element is one object name (a string)
file_rdd = sc.parallelize(objects_to_download)

### Import trial IDs and types

In [None]:
# Read trials_ind.mat file (deprecated)
# File with trial indices
# trials_index_file = 'trials_ind.mat'

# # local storage directory --> remember to delete afterwards
# temp_dir = tempfile.mkdtemp()

# # download options
# down_opts = {
#     'skip_identical': True,
#     'out_directory': temp_dir,
# }

# from SwiftStorageUtils import downloadItems
# downloadItems(swift_container, [trials_index_file], file_params, down_opts)

# trial_ind = wf.importTrialIndices('%s%s%s' % (temp_dir, os.path.sep, trials_index_file))

# # delete temp dir
# shutil.rmtree(temp_dir)

In [None]:
# Read TrialList.txt (created during pre-processing)
# File with trial information
trial_list_file = 'TrialList.txt'

# local storage directory; it is removed again in the 'finally' clause below
temp_dir = tempfile.mkdtemp()

# download options
down_opts = {
    'skip_identical': True,
    'out_directory': temp_dir,
}

from SwiftStorageUtils import downloadItems
try:
    downloadItems(swift_container, [trial_list_file], file_params, down_opts)

    # parse the tab-delimited file into a list of rows
    # the first two columns of each row are converted to int, the others stay strings
    trial_list = []
    with open(os.path.join(temp_dir, trial_list_file)) as fid:
        for line in csv.reader(fid, delimiter='\t'):
            if not line:
                # blank lines are returned as empty rows by csv.reader; skip them
                continue
            line[0] = int(line[0])
            line[1] = int(line[1])
            trial_list.append(line)
finally:
    # delete temp dir, also when the download or the parsing failed
    shutil.rmtree(temp_dir)

In [None]:
# def getTrialType_legacy(selected_file, trial_ind):
#     """
#     Return trial type of input file from trial_ind.
#     """
#     # parse file name to get trial_no
#     selected_file = selected_file.replace(data_folder + '/', '').replace('.h5', '')
#     trial_no = int(selected_file[selected_file.rfind('_')+1:])
# #     p = re.compile('\d{1,8}')
# #     file_info = p.findall(selected_file)
# #     trial_no = int(file_info[2])
#     # search trial_ind for trial_type
#     trial_type = [i for i in trial_ind if trial_no in trial_ind[i]]
#     if not len(trial_type):
#         return 'void'
#     else:
#         return trial_type[0]

In [None]:
def getTrialType(selected_file, trial_list, data_folder):
    """
    Return trial type of input file using trial list lookup

    Parameters
    ----------
    selected_file : str
        Object name; the trial number is the integer after the last '_'
        once the data_folder prefix and the '.h5' extension are removed.
    trial_list : list
        Rows of the trial list. row[0] is compared with the trial number and
        row[3] is the trial description from which the type is taken.
    data_folder : str
        Folder name that is stripped (together with '/') from selected_file.

    Returns
    -------
    str
        The part of the trial description after its last space
        (the whole description if it contains no space).

    Raises
    ------
    ValueError
        If the text after the last '_' is not an integer.
    IndexError
        If the trial number does not occur in trial_list. The message names
        the trial number and file to make failures inside Spark tasks traceable.
    """
    # parse file name for trial no.
    selected_file = selected_file.replace(data_folder + '/', '').replace('.h5', '')
    trial_no = int(selected_file[selected_file.rfind('_')+1:])
    # search trial list to find trial type; stop at the first matching row
    matching_row = next((row for row in trial_list if row[0] == trial_no), None)
    if matching_row is None:
        raise IndexError('Trial no. %d (file %s) not found in trial list'
                         % (trial_no, selected_file))
    trial_type = matching_row[3]
    # TODO: filter correct / incorrect trials
    # keep only the last word of the description
    return trial_type[trial_type.rfind(' ')+1:]

Set the key of each RDD element as the trial type. Check the first few elements to see if results make sense.

In [None]:
# show the first element of the RDD (an object name) as a sanity check
file_rdd.first()

In [None]:
# turn every object name into a (trial_type, object_name) pair
# keyBy builds the pair from the key function, the element itself becomes the value
file_rdd_keyed = file_rdd.keyBy(lambda obj_name: getTrialType(obj_name, trial_list, data_folder))

In [None]:
# number of partitions the keyed RDD is split into
file_rdd_keyed.getNumPartitions()

In [None]:
# show the first five (trial_type, object_name) pairs
file_rdd_keyed.take(5)

Count the trials per trial type. The result is a dictionary that maps each unique trial type to its number of trials.

In [None]:
# count the elements per key, i.e. the number of files per trial type
# the keys of trial_count are used further below to loop over the trial types
trial_count = file_rdd_keyed.countByKey()
trial_count

### Import data
Next, we import the data from HDF5.

In [None]:
def getArrayFromH5(h5file, dataset_name):
    """
    Return array data stored in HDF5 file

    Parameters
    ----------
    h5file : str
        Path of the HDF5 file; it is opened read-only.
    dataset_name : str
        Name of the dataset to read.

    Returns
    -------
    numpy.ndarray
        The content of the dataset, copied into memory.

    Raises
    ------
    KeyError
        If the file has no dataset called dataset_name. Without this check a
        missing dataset would be returned as an empty 0-d object array and the
        error would only surface later.
    """
    with h5py.File(h5file,'r') as hf:
        print('List of arrays in HDF5 file: ', list(hf.keys()))
        if dataset_name not in hf:
            raise KeyError('Dataset %s not found in HDF5 file %s' % (dataset_name, h5file))
        # np.array copies the data into memory, so it stays valid after the file is closed
        data = np.array(hf[dataset_name])
        print('Shape of the array %s: ' % (dataset_name), data.shape)
        return data

In [None]:
from SwiftStorageUtils import downloadItems
def convert2rdd(obj, file_params, h5_array_id):
    """
    Download one object from Swift and return the array stored in it.

    Despite its name, this function does not create an RDD; it is applied
    to the elements of an RDD and returns the array for a single object.

    Parameters
    ----------
    obj : str
        Name of the object in the Swift container.
    file_params : dict
        Swift / OpenStack parameters; 'swift_container' is read here and the
        whole dict is handed to downloadItems.
    h5_array_id : str
        Name of the dataset in the HDF5 file.

    Returns
    -------
    The array returned by getArrayFromH5 for the downloaded file.
    """
    temp_dir = tempfile.mkdtemp()
    try:
        # download options
        down_opts = {
            'skip_identical': True,
            'out_directory': temp_dir,
        }
        downloadItems(file_params['swift_container'], [obj], file_params, down_opts)

        local_file = '%s%s%s' % (temp_dir, os.path.sep, obj)
        print('Local file: ', local_file)

        return getArrayFromH5(local_file, h5_array_id)
    finally:
        # delete temp dir, also when the download or the import failed,
        # so that failed tasks do not fill up the local disk
        shutil.rmtree(temp_dir)

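Set up the RDD as key-value pairs with key = trial_type and value = (data_array, 1). The 1 is a per-trial counter that is summed up later to give the number of trials per trial type.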

In [None]:
# replace each object name by (data_array, 1); the key (trial type) is left untouched
# mapValues with a single-argument lambda is used because lambdas with tuple parameters
# (lambda (k,v): ...) are a syntax error in Python 3
trial_arr_rdd = file_rdd_keyed.mapValues(lambda obj_name: (convert2rdd(obj_name, file_params, h5_array_id), 1))

Get the first data array to determine the number of timepoints and calculate the time vector.

In [None]:
# first() returns one (trial_type, (data_array, 1)) pair; [1][0] selects the data array
arr1 = trial_arr_rdd.first()[1][0]
# assumes that time is the third axis of the array - TODO confirm
timepoints = arr1.shape[2]

In [None]:
# time vector: one value per timepoint
sample_rate = 20.0 # Hz
# NOTE(review): the offset of 3.0 presumably places time zero 3 s after the first frame - confirm
t = np.arange(timepoints) / sample_rate - 3.0

### Calculate the average array over trials
This is done in two steps: first add up the arrays for each trial type, keeping track of the number of trials. Second, divide the summed array by the number of trials.

In [None]:
# reduceByKey will add up arrays of a specific trial_type (key)
# and also keep track of the number of trials per trial type
# each value is an (array, count) pair; the pairs are indexed instead of unpacked in the
# lambda signature because tuple parameters are a syntax error in Python 3
# NOTE(review): the sum keeps the dtype of the arrays - check for overflow if they are integer-typed
avg_rdd = trial_arr_rdd.reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))

In [None]:
# divide the final summed array per trial_type by the number of trials to get the average
# v is the (summed_array, count) pair; float() forces a true division, otherwise
# integer-typed arrays would be floor-divided under Python 2
# NOTE: avg_rdd is overwritten here, so run this cell only once after the reduceByKey cell
avg_rdd = avg_rdd.mapValues(lambda v: v[0] / float(v[1]))

### Save average as HDF5 / mat / movie
For each trial type, compute the average and then save it back to Swift in each of the enabled output formats (HDF5 file, mat-file, movie).

In [None]:
# time needed to compute the average of each trial type (in seconds)
duration_list = []
for i_trial in trial_count:
    print("Computing average for trial type %s" % (i_trial))
    t1 = time.time()
    # select the averaged array of this trial type; first() is what triggers the Spark job
    # (single-argument lambda, because tuple parameters are a syntax error in Python 3)
    trial_avg = avg_rdd.filter(lambda kv: kv[0] == i_trial).first()[1]
    dur = (time.time()-t1)
    duration_list.append(dur)
    print("Duration: %1.2fs" % dur)
    # save output
    if save_h5:
        print("Saving HDF5 file for trial type %s" % (i_trial))
        h5file_name = '%s_%s_AVG.h5' % (data_folder, i_trial)
        dataset_name = '%s_AVG' % (h5_array_id)
        saveAsH5(trial_avg, h5file_name, dataset_name, h5_folder, file_params)
    if save_mat:
        print("Saving mat-file for trial type %s" % (i_trial))
        # file and dataset are named after the trial type with every 'P' removed
        cond_name = 'cond_%s_ave' % (i_trial.replace('P',''))
        matfile_name = cond_name + '.mat'
        dataset_name = cond_name
        saveAsMat(trial_avg, matfile_name, dataset_name, mat_folder, file_params)
    if save_movie:
        print("Saving movie for trial type %s" % (i_trial))
        wf.saveMovie(trial_avg, i_trial, h5_array_id, sample_rate, t, file_params)

# Summarize timing stats
# the total number of trials is the sum of the per-type counts, so no extra Spark job is needed
n_trials = sum(trial_count.values())
print("Average by trial type - %1.0f cores" % (max_cores))
if n_trials:
    print("Time per trial: %1.3fs" % (sum(duration_list)/n_trials))

In [None]:
# shut down the SparkContext once all outputs have been saved
sc.stop()