### Sliding-window correlation analysis
This notebook shows how to run a sliding-window correlation analysis between an ROI seed signal and the time series of each pixel. The output is stored as HDF5 files (one per trial) on Swift. A movie of the correlation signal for selected trials can also be generated and stored. Sliding-window correlation is a time-consuming computation; it is therefore parallelized across trials using Spark.

### Imports

In [None]:
# Import Python modules
from __future__ import print_function

import getpass
import os
import re
import shutil
import sys
import tempfile

import h5py
import numpy as np
from matplotlib import pylab as plt

%matplotlib inline

# Backend identifier; it is passed to initSpark() when the SparkContext is created
nbBackend = 'openstack'

# Add the folder 'utils' (located in the current working directory) to the Python path.
# This folder contains custom-written code that is required for data import and analysis,
# so it has to be on the path before the import below is executed.
utils_dir = os.path.join(os.getcwd(), 'utils')
sys.path.append(utils_dir)
import SwiftStorageUtils

In [None]:
# Import custom-written modules
import WidefieldDataUtils as wf
import PickleUtils as pick
import CalciumAnalysisUtils as calciumTools

### Data and Analysis Parameters

In [None]:
# --- Swift object storage ---
# name of the container in Swift (do not use '_' etc.)
swift_container = 'ariel'
# name of the Swift provider; in general, this should not change
swift_provider = 'SparkTest'
# base URL of the container: swift://<container>.<provider>/
swift_basename = 'swift://%s.%s/' % (swift_container, swift_provider)

# pseudo-folder holding the dFF data (written by the Widefield_Preproc_Spark_Swift notebook)
output_folder_dff = 'dff_out'

# name pattern of the files to process; it is matched against the object names
# when the list of objects to download is built
filename_start = '20152310_'

In [None]:
# OpenStack credentials for accessing Swift storage
# NOTE(review): user name, tenant and auth URL are hard-coded here; adapt them to your own account
os_username = 'hluetc'
os_tenant_name = 'helmchen.hifo.uzh'
os_auth_url = 'https://cloud.s3it.uzh.ch:5000/v2.0'
# Prompt for the OpenStack password interactively, so that it does not have to be
# written into the notebook
os_password = getpass.getpass()

In [None]:
# Bundle the Swift / OpenStack settings in one dict, so that they can be handed
# to the storage helper functions as a single argument
file_params = {
    'swift_container': swift_container,
    'swift_provider': swift_provider,
    'swift_basename': swift_basename,
    'os_username': os_username,
    'os_tenant_name': os_tenant_name,
    'os_auth_url': os_auth_url,
    'os_password': os_password,
}

### Start SparkContext

In [None]:
from setupSpark import initSpark

# Resources requested from the Spark cluster.
# Each worker VM has 8 cores and 32 GB of memory. The current status of the cluster
# (i.e. how many cores are available) can be checked in the Spark UI:
# http://SparkMasterIP:8080/
spark_instances = 2      # number of workers to be used
executor_cores = 8       # number of cores to be used on each worker
executor_memory = '28G'  # amount of memory to be used on each worker
max_cores = 16           # max. number of cores Spark is allowed to use overall

# Initialize Spark: initSpark returns the SparkContext object 'sc', which tells
# Spark how to access the cluster
sc = initSpark(
    nbBackend,
    spark_instances=spark_instances,
    executor_cores=executor_cores,
    max_cores=max_cores,
    executor_memory=executor_memory
)

# NOTE(review): imported after initSpark(); presumably pyspark is only importable
# once Spark has been set up - confirm before moving this import
from pyspark import StorageLevel

In [None]:
# Provide the OpenStack credentials to the Hadoop configuration.
# The property names contain the Swift provider name; they are built from
# swift_provider so that they always match the provider used in swift_basename.
hadoop_conf = sc._jsc.hadoopConfiguration()
swift_service_prefix = 'fs.swift.service.%s.' % swift_provider
hadoop_conf.set(swift_service_prefix + 'username', os_username)
hadoop_conf.set(swift_service_prefix + 'tenant', os_tenant_name)
hadoop_conf.set(swift_service_prefix + 'password', os_password)

In [None]:
# Register every Python file of the 'utils' folder with the SparkContext.
# This is required so that all files are available on all the cluster workers.
for filename in os.listdir(utils_dir):
    if not filename.endswith('.py'):
        # skip anything that is not a Python source file
        continue
    sc.addPyFile(os.path.join(utils_dir, filename))

### Get HDF5 DFF files from Swift

First, check if container exists and return items.

In [None]:
from SwiftStorageUtils import listItems
# Query the items of the Swift container; the connection settings are taken from file_params.
# The result is used below as a list of object names.
object_list = listItems(swift_container, file_params)

Narrow down the list to objects in pseudo-folder output_folder_dff containing filename_start.

In [None]:
# Keep only the objects whose name starts with output_folder_dff and contains
# filename_start (substring match anywhere in the object name)
objects_to_download = [n for n in object_list if n.startswith(output_folder_dff) and filename_start in n]
# display the selection
objects_to_download

Next, create Spark RDD from list of objects to download and set the file stem as key.

In [None]:
# Distribute the object names and turn each one into a (file stem, object name) pair.
# The file stem is the object name without the pseudo-folder prefix and without '.h5'.
file_rdd = sc.parallelize(objects_to_download).map(
    lambda obj: (obj.replace(output_folder_dff + '/', '').replace('.h5', ''), obj))

Now we can define the functions for importing the data. As before, we first create a temporary folder on the local machine. Then we download the files into this temporary folder and read them with standard Python tools.

In [None]:
def getArrayFromH5(h5file, dataset_name):
    """
    Read one dataset from an HDF5 file and return it as a NumPy array.

    h5file: path of the HDF5 file (opened read-only)
    dataset_name: name of the dataset to read

    Prints the names stored in the file and the shape of the array that was read.

    Raises KeyError if the file does not contain dataset_name.
    """
    with h5py.File(h5file, 'r') as hf:
        # materialise the keys so that the names (and not a view object) are printed
        available = list(hf.keys())
        print('List of arrays in HDF5 file: ', available)
        # fail early: hf.get() would return None for a missing name, and
        # np.array(None) would silently produce a useless 0-d object array
        if dataset_name not in hf:
            raise KeyError('Dataset %r not found in %s (available: %s)'
                           % (dataset_name, h5file, available))
        # copy the data into memory before the file is closed
        data = np.array(hf[dataset_name])
        print('Shape of the array %s: ' % (dataset_name), data.shape)
        return data

In [None]:
from SwiftStorageUtils import downloadItems
def convert2rdd(obj, file_params):
    """
    Download one HDF5 object from Swift storage and return its 'dff' dataset.

    obj: name of the object in the Swift container
    file_params: dict with the Swift / OpenStack settings; the container name is
                 read from file_params['swift_container']

    The object is downloaded into a fresh temporary folder, which is removed again
    before the function returns - also if the download or the import fails.

    Returns the array read from the dataset 'dff' of the downloaded file.
    """
    temp_dir = tempfile.mkdtemp()
    try:
        # download options
        down_opts = {
            'skip_identical': True,
            'out_directory': temp_dir,
        }
        downloadItems(file_params['swift_container'], [obj], file_params, down_opts)

        # the object is expected below the temporary folder, under its object name
        local_file = '%s%s%s' % (temp_dir, os.path.sep, obj)
        print('Local file: ', local_file)

        return getArrayFromH5(local_file, 'dff')
    finally:
        # delete temp dir, no matter whether the import succeeded
        shutil.rmtree(temp_dir)

Now we register this transformation to create dff_rdd from file_rdd. Here we only change the values and not the keys, so we can use mapValues instead of map.

In [None]:
# Import data from DFF files (HDF5 format)
# mapValues keeps the file-stem keys and replaces each object name by the array
# returned by convert2rdd.
# NOTE(review): file_params (which includes os_password) is referenced inside the
# lambda and is therefore presumably shipped to the workers - confirm this is acceptable
dff_rdd = file_rdd.mapValues(lambda v: convert2rdd(v, file_params))

If we cache the new RDD (and there is sufficient space), subsequent steps might run a bit faster.

In [None]:
# dff_rdd.persist(StorageLevel.DISK_ONLY)

Load the first element and determine image dimensions / number of frames. Then, setup time vector.

In [None]:
# Evaluate the first (key, array) pair of the RDD and read the layout of its array:
# the first two axes give the image dimensions, the third axis the number of frames
dff1 = dff_rdd.first()
dims_analysis = tuple(dff1[1].shape[:2])
timepoints = dff1[1].shape[2]

In [None]:
# Time vector: one entry per frame, shifted so that the first frame is at -3.0
sample_rate = 20.0  # Hz
t = np.arange(timepoints) / sample_rate - 3.0

# Trial events (same time axis as t), in chronological order
t_base = -2       # baseline end (for F0 calculation)
t_stim = -1.9     # stimulus cue (auditory)
t_textIn = 0      # texture in (i.e. stimulus onset)
t_textOut = 2     # texture starting to move out (stimulus offset)
t_response = 4.9  # response cue for licking (auditory)

### Import ROIs and trial indices
The approach for importing mat-files from Swift storage is the same as for HDF5 files: first download files from Swift storage to a temporary folder. Then, we use custom-written code to read the mat-files. Finally, the temporary folder is deleted.

In [None]:
# File name of ROI mat-file (name of the object in the Swift container)
roi_file = 'rois_OCIA.mat'
# image dimensions on which coordinates in roi_file are based
dims_roi = (256,256)
# ROI names that should be extracted
roi_dict = {'roi_S1BC': [], 'roi_A1': [], 'roi_EC': [], 'roi_M2': []}

# File with trial indices
trials_index_file = 'trials_ind.mat'

# download and import Roi and trial index files
objects_to_download = [
    roi_file,
    trials_index_file
]

from SwiftStorageUtils import downloadItems

# local storage directory; it is removed in the finally clause below, so that it
# does not stay behind if the download or one of the imports fails
temp_dir = tempfile.mkdtemp()
try:
    # download options
    down_opts = {
        'skip_identical': True,
        'out_directory': temp_dir,
    }
    downloadItems(swift_container, objects_to_download, file_params, down_opts)

    trial_ind = wf.importTrialIndices(os.path.join(temp_dir, trials_index_file))

    # Specify ROIs to pull out.
    # The local path gets its own name, so that roi_file keeps the object name
    # instead of pointing into the (deleted) temporary folder afterwards.
    roi_path = os.path.join(temp_dir, roi_file)
    roi_dict = wf.importMatlabRois(roi_path, roi_dict, dims_roi, dims_analysis)
finally:
    # delete temp dir
    shutil.rmtree(temp_dir)

### Sliding-window cross-correlation
Now we have everything in place for running the correlation analysis. First, we define a function (`slidingWindowCorr`) that computes the sliding-window correlation between two vectors with a given window size. The correlations for all windows are computed in a single step instead of calling the correlation function once per window. Next, we define a function (`addNansToArray`) that pads the result with NaNs at the beginning and end. Finally, we define a wrapper function (`doSlidingWindowCorr`) that runs `slidingWindowCorr` for each pixel time series in a loop.

In [None]:
def slidingWindowCorr(data1, data2, winsz):
    """
    Calculate the Pearson correlation between data1 and data2 with a moving window
    of winsz datapoints.

    Window k covers the samples [k, k + winsz) of both inputs. For inputs with N
    samples, N - winsz windows are evaluated (k = 0 ... N - winsz - 1).

    data1, data2: 1-D sequences of equal length N
    winsz: window size in datapoints (integer, 1 <= winsz <= N)

    Returns a 1-D float64 array with N - winsz correlation coefficients, limited
    to the range [-1, 1]. A window in which one of the inputs is constant has no
    defined correlation and yields NaN.

    Raises ValueError if the inputs are not 1-D, differ in length, or if winsz is
    outside the range 1 ... N.
    """
    data1 = np.asarray(data1, dtype=np.float64)
    data2 = np.asarray(data2, dtype=np.float64)
    winsz = int(winsz)
    if data1.ndim != 1 or data1.shape != data2.shape:
        raise ValueError('data1 and data2 must be 1-D and of equal length')
    N = data1.shape[0]
    if winsz < 1 or winsz > N:
        raise ValueError('winsz must be in the range 1 ... %d, got %d' % (N, winsz))
    n_windows = N - winsz

    # index matrix: row k holds the sample indices k ... k + winsz - 1
    ix = np.arange(n_windows)[:, np.newaxis] + np.arange(winsz)[np.newaxis, :]
    # fancy indexing copies the data, so the in-place centering below is safe
    win1 = data1[ix]
    win2 = data2[ix]
    win1 -= win1.mean(axis=1)[:, np.newaxis]
    win2 -= win2.mean(axis=1)[:, np.newaxis]

    # Pearson correlation per window (row): only the N - winsz required
    # coefficients are computed, not a full window-by-window correlation matrix
    cov = (win1 * win2).sum(axis=1)
    norm = np.sqrt((win1 ** 2).sum(axis=1) * (win2 ** 2).sum(axis=1))
    corr = cov / norm

    # guard against rounding errors that push |r| marginally above 1
    return np.clip(corr, -1.0, 1.0)

In [None]:
def addNansToArray(A, winsz):
    """
    Add NaNs at the beginning and end of the last axis of a 3-D array.

    In total winsz NaN entries are added: winsz // 2 in front and the remainder
    behind, i.e. the same number on both sides for an even winsz and one more
    at the end for an odd winsz.

    A: 3-D array
    winsz: total number of entries to add along the last axis

    Returns a new array of shape (A.shape[0], A.shape[1], A.shape[2] + winsz).
    """
    winsz = int(winsz)
    # integer arithmetic gives the same split on Python 2 and Python 3
    n_front = winsz // 2
    n_back = winsz - n_front
    front = np.full((A.shape[0], A.shape[1], n_front), np.nan)
    back = np.full((A.shape[0], A.shape[1], n_back), np.nan)
    return np.concatenate((front, A, back), axis=2)

In [None]:
def doSlidingWindowCorr(data, seed_pixel, winsz):
    """
    Correlate every pixel time series in data with the seed time series, using a
    sliding window of winsz datapoints.

    data: 3-D array; the first two axes index the pixel, the last axis is time
    seed_pixel: indices of the seed pixels; seed_pixel[0] and seed_pixel[1] index
                the first and second axis of data (presumably arrays of pixel
                coordinates - TODO confirm). The selected time series are
                averaged along axis 0, ignoring NaNs, to obtain the seed signal.
    winsz: window size in datapoints

    Returns the correlation values after NaN padding by addNansToArray.
    """
    n_x, n_y, n_t = data.shape[0], data.shape[1], data.shape[2]
    corr = np.zeros((n_x, n_y, n_t - winsz))
    seed = np.nanmean(data[seed_pixel[0], seed_pixel[1], :], axis=0)
    # visit all pixels in row-major order
    for x, y in np.ndindex(n_x, n_y):
        corr[x, y, :] = slidingWindowCorr(data[x, y, :], seed, winsz)
    return addNansToArray(corr, winsz)

Now we can specify the window size (in frames), the seed ROI and then schedule the correlation analysis as RDD transformation.

In [None]:
# window size for the sliding-window correlation (in frames)
winsz = 20
# name of the ROI (key of roi_dict) that provides the seed signal
seed_roi = 'roi_S1BC'

# register the correlation analysis as a transformation of the values of dff_rdd;
# the keys are left unchanged
corr_rdd = dff_rdd.mapValues(lambda v: doSlidingWindowCorr(v, roi_dict[seed_roi], winsz))

As a sanity check, we can look at the shape of the first element. It should match the dimensions of the input dFF arrays. This will run the analysis for the first element and take a few minutes.

In [None]:
# evaluate the first (key, array) pair of corr_rdd and display the shape of its array
corr_rdd1 = corr_rdd.first()
corr_rdd1[1].shape

### Save correlation results as HDF5 files
Now we can save the data back to the Swift storage. This will finally kick-off the whole processing pipeline that has been defined so far.

In [None]:
# Name of the output folder for the correlation results; it includes the seed ROI
# name, so that each seed ROI gets its own folder
output_folder_corr = 'corr_%s' % (seed_roi)

Check if the output folder exists already. If it exists, its contents are displayed and you are asked for confirmation before it is deleted.

In [None]:
from SwiftStorageUtils import deleteExistingFolder
# check the Swift container for an existing output folder of this seed ROI
# (see the description above this cell for the interactive behaviour)
deleteExistingFolder(swift_container, output_folder_corr, file_params)

Save the correlation data as HDF5 on Swift storage. This will run all the transformations that have been registered for corr_rdd. Depending on the number of trials and image resolution, this may take a while.

In [None]:
from SwiftStorageUtils import saveAsH5
# Save every (key, array) pair of corr_rdd with saveAsH5.
# The pair is indexed explicitly (kv[0] = key, kv[1] = array), because tuple
# parameters in a lambda are valid in Python 2 only. foreach discards the return
# value of the function, so nothing is returned here.
corr_rdd.foreach(lambda kv: saveAsH5(kv[1], kv[0], 'corr', output_folder_corr, file_params))

### Correlation movie
The final part of the notebook demonstrates how to make a movie out of the correlation arrays.

In [None]:
# select file to be displayed: give the file stem, i.e. the file name without
# output folder and without the '.h5' extension (both are added for the download below)
selected_file = '20152310_092225_4'

Import trial indices and figure out the trial type of the selected file.

In [None]:
# download and import trial index files
objects_to_download = [
    trials_index_file
]

from SwiftStorageUtils import downloadItems

# local storage directory; it is removed in the finally clause below, so that it
# does not stay behind if the download or the import fails
temp_dir = tempfile.mkdtemp()
try:
    # download options
    down_opts = {
        'skip_identical': True,
        'out_directory': temp_dir,
    }
    downloadItems(swift_container, objects_to_download, file_params, down_opts)

    # read the file that was actually downloaded (trials_index_file) instead of a
    # hard-coded file name
    trial_ind = wf.importTrialIndices(os.path.join(temp_dir, trials_index_file))
finally:
    # delete temp dir
    shutil.rmtree(temp_dir)

In [None]:
def getTrialType(selected_file, trial_ind):
    """
    Return the trial type of the input file according to trial_ind.

    The trial number is taken from the third group of digits in selected_file
    (e.g. 4 for '20152310_092225_4').

    selected_file: file name to parse
    trial_ind: mapping from trial type to the collection of trial numbers of that type

    Returns the first trial type whose collection contains the trial number, or
    'void' if no trial type contains it.

    Raises IndexError if selected_file contains fewer than three groups of digits.
    """
    # parse file name to get trial_no
    # (raw string: '\d' is an invalid escape sequence in a normal string literal)
    file_info = re.findall(r'\d{1,8}', selected_file)
    trial_no = int(file_info[2])
    # search trial_ind for trial_type; stop at the first match
    for trial_type in trial_ind:
        if trial_no in trial_ind[trial_type]:
            return trial_type
    return 'void'

In [None]:
# look up the trial type of the selected file ('void' if it is not listed in trial_ind)
trial_type = getTrialType(selected_file, trial_ind)
print('%s trial type: %s' % (selected_file, trial_type))

### Import correlation data from HDF5

In [None]:
def getArrayFromH5(h5file, dataset_name):
    """
    Read one dataset from an HDF5 file and return it as a NumPy array.

    h5file: path of the HDF5 file (opened read-only)
    dataset_name: name of the dataset to read

    Prints the names stored in the file and the shape of the array that was read.

    Raises KeyError if the file does not contain dataset_name.
    """
    with h5py.File(h5file, 'r') as hf:
        # materialise the keys so that the names (and not a view object) are printed
        available = list(hf.keys())
        print('List of arrays in HDF5 file: ', available)
        # fail early: hf.get() would return None for a missing name, and
        # np.array(None) would silently produce a useless 0-d object array
        if dataset_name not in hf:
            raise KeyError('Dataset %r not found in %s (available: %s)'
                           % (dataset_name, h5file, available))
        # copy the data into memory before the file is closed
        data = np.array(hf[dataset_name])
        print('Shape of the array %s: ' % (dataset_name), data.shape)
        return data

In [None]:
# file to download from Swift
objects_to_download = [
    '%s/%s.h5' % (output_folder_corr, selected_file)
]

from SwiftStorageUtils import downloadItems

# local storage directory; it is removed in the finally clause below, so that it
# does not stay behind if the download or the import fails
temp_dir = tempfile.mkdtemp()
try:
    # download options
    down_opts = {
        'skip_identical': True,
        'out_directory': temp_dir,
    }

    # download file to local directory
    downloadItems(swift_container, objects_to_download, file_params, down_opts)

    # read file from local directory (the object is expected below temp_dir,
    # inside a sub-folder named like the output folder)
    corr_file = os.path.join(temp_dir, output_folder_corr, '%s.h5' % (selected_file))

    corr_data = getArrayFromH5(corr_file, 'corr')
finally:
    # delete temp dir
    shutil.rmtree(temp_dir)

### Build and save the movie

In [None]:
# Build and save the movie of the correlation data of the selected file.
# The third argument combines file name and seed ROI name (presumably used to name
# the movie - confirm in WidefieldDataUtils); sample_rate and t describe the time axis.
wf.saveMovie(corr_data, trial_type, '%s_corr_%s' % (selected_file, seed_roi), sample_rate, t, file_params)