In [None]:
# Import Python modules
import os, sys
import numpy as np
from matplotlib import pylab as plt
from matplotlib import font_manager
from __future__ import print_function
import getpass
import tempfile
import shutil
import h5py
import re

from sklearn import linear_model

%matplotlib inline

In [None]:
# add folder 'utils' to the Python path
# this folder contains custom written code that is required for data import and analysis
utils_dir = os.path.join(os.getcwd(), 'utils')
sys.path.append(utils_dir)

import SwiftStorageUtils
import WidefieldDataUtils as wf
import CalciumAnalysisUtils as calciumTools

### File access parameters

In [None]:
# --- Swift object storage ---
# name of the container in Swift (do not use _ etc.)
swift_container = 'ariel'
# in general, this should not change
swift_provider = 'SparkTest'
# base URL of the container: swift://<container>.<provider>/
swift_basename = 'swift://{0}.{1}/'.format(swift_container, swift_provider)

# folder holding the dFF data (written by the Widefield_Preproc_Spark_Swift notebook)
output_folder_dff = 'dff_out'

# only objects whose name contains this string are processed
filename_start = '20152310_'

In [None]:
# OpenStack credentials for accessing Swift storage
os_username = 'hluetc'
os_tenant_name = 'helmchen.hifo.uzh'
os_auth_url = 'https://cloud.s3it.uzh.ch:5000/v2.0'
# provide OS password
# getpass prompts interactively, so the password is not written into the notebook source
os_password = getpass.getpass()

In [None]:
# collect the storage and credential settings in one dict for later access
file_params = {
    'swift_container': swift_container,
    'swift_provider': swift_provider,
    'swift_basename': swift_basename,
    'os_username': os_username,
    'os_tenant_name': os_tenant_name,
    'os_auth_url': os_auth_url,
    'os_password': os_password,
}

### Import trial index information

In [None]:
# download file with trial indices
trials_index_file = 'trials_ind.mat'

# download and import trial index files
objects_to_download = [
    trials_index_file
]
# local storage directory --> removed again in the finally clause below
temp_dir = tempfile.mkdtemp()

# download options
down_opts = {
    'skip_identical': True,
    'out_directory': temp_dir,
}

from SwiftStorageUtils import downloadItems
try:
    status = downloadItems(swift_container, objects_to_download, file_params, down_opts)

    if not status:
        # stop here: without the file, the import below would only fail with a less helpful error
        raise RuntimeError('Could not download file. Is the password correct?')

    trial_ind = wf.importTrialIndices(os.path.join(temp_dir, trials_index_file))
finally:
    # delete temp dir exactly once, whether or not download / import succeeded
    shutil.rmtree(temp_dir)

### Get information about the data

In [None]:
from SwiftStorageUtils import listItems
# query the Swift container; the result is filtered by name in the next cell
object_list = listItems(swift_container, file_params)

In [None]:
# keep only objects stored under the dFF output folder whose name contains filename_start
# (this replaces the trial-index download list of the same name)
objects_to_download = [n for n in object_list if n.startswith(output_folder_dff) and filename_start in n]

In [None]:
def getArrayFromH5(h5file, dataset_name):
    """
    Read one dataset from an HDF5 file into memory.

    h5file: path of the HDF5 file (opened read-only)
    dataset_name: name of the dataset to read

    Return the dataset contents as a numpy array.
    Raise KeyError if the file holds no dataset with that name.
    """
    with h5py.File(h5file, 'r') as hf:
        data = hf.get(dataset_name)
        if data is None:
            # hf.get() returns None for a missing name; np.array(None) would
            # silently hand back a 0-d object array instead of the data
            raise KeyError('Dataset %r not found in %s' % (dataset_name, h5file))
        # copy into memory while the file is still open
        return np.array(data)

In [None]:
from SwiftStorageUtils import downloadItems
def downloadFromSwift(obj, file_params):
    """
    Download one object from Swift and return its 'dff' dataset as a numpy array.

    obj: name of the object inside the container
    file_params: dict with storage / credential settings; 'swift_container' is read here
                 and the whole dict is passed on to downloadItems

    The object is stored in a fresh temporary directory which is removed again
    before returning, also when the download or the HDF5 read raises.
    """
    temp_dir = tempfile.mkdtemp()
    try:
        # download options
        down_opts = {
            'skip_identical': True,
            'out_directory': temp_dir,
        }
        downloadItems(file_params['swift_container'], [obj], file_params, down_opts)

        local_file = '%s%s%s' % (temp_dir, os.path.sep, obj)

        return getArrayFromH5(local_file, 'dff')
    finally:
        # delete temp dir (previously leaked whenever an exception was raised above)
        shutil.rmtree(temp_dir)

In [None]:
# download first file to get dimensions / timepoints
# NOTE(review): the values derived from it are reused for every file - assumes all movies share one shape, confirm
dff_data = downloadFromSwift(objects_to_download[0], file_params)

In [None]:
# image size (first two axes) and number of frames (third axis) of the movie
dimensions = dff_data.shape[:2]
timepoints = dff_data.shape[2]

# time vector in seconds: frame index divided by the frame rate, shifted by 3 s
sample_rate = 20.0 # Hz
t = np.arange(timepoints) / sample_rate - 3.0

# event times on the time axis defined by t
t_stim = -1.9 # stimulus cue (auditory)
t_textIn = 0 # texture in (i.e. stimulus onset)
t_textOut = 2 # texture starting to move out (stimulus offset)
t_response = 4.9 # response cue for licking (auditory)

In [None]:
def getTrialType(selected_file, trial_ind):
    """
    Return the trial type of an input file, looked up in trial_ind.

    selected_file: file name; the third group of digits in it is taken as the trial number
    trial_ind: mapping of trial type -> collection of trial numbers

    Return the first trial type (in iteration order of trial_ind) whose collection
    contains the trial number, or 'void' if no trial type lists it.
    Raise IndexError if the name holds fewer than three digit groups.
    """
    # raw string, so that \d reaches the regex engine instead of being read as a string escape
    digit_groups = re.findall(r'\d{1,8}', selected_file)
    trial_no = int(digit_groups[2])
    # search trial_ind for trial_type
    for trial_type in trial_ind:
        if trial_no in trial_ind[trial_type]:
            return trial_type
    return 'void'

### Setup design matrix

In [None]:
def buildDesignMatrix(trial_type):
    """
    Set up the design matrix for one trial.

    Columns: cue_stim, tr_100_In, tr_1200_In, tr_100_Out, tr_1200_Out, response_100, response_1200
    Rows: baseline, stimulus cue, texture in, texture out, response

    'Dummy variables' model the different trial conditions and phases. The baseline
    is not modeled explicitly, but implicitly (absence of all other conditions),
    hence the all-zero first row.

    trial_type: 'tr_100' or 'tr_1200'

    Return a 5 x 7 integer array.
    Raise ValueError for any other trial type (this used to surface as an
    UnboundLocalError on the return statement).
    """
    if trial_type == 'tr_100':
        design_matrix = np.array([
                (0, 0, 0, 0, 0, 0, 0),
                (1, 0, 0, 0, 0, 0, 0),
                (0, 1, 0, 0, 0, 0, 0),
                (0, 0, 0, 1, 0, 0, 0),
                (0, 0, 0, 0, 0, 1, 0)
            ])
    elif trial_type == 'tr_1200':
        design_matrix = np.array([
                (0, 0, 0, 0, 0, 0, 0),
                (1, 0, 0, 0, 0, 0, 0),
                (0, 0, 1, 0, 0, 0, 0),
                (0, 0, 0, 0, 1, 0, 0),
                (0, 0, 0, 0, 0, 0, 1)
            ])
    else:
        raise ValueError("No design matrix defined for trial type %r "
                         "(expected 'tr_100' or 'tr_1200')" % (trial_type,))
    return design_matrix

In [None]:
# list of model parameter IDs (for labeling)
param_ids = [
    'cue_stim', 
    'tr_100_In', 
    'tr_1200_In', 
    'tr_100_Out', 
    'tr_1200_Out', 
    'response_100', 
    'response_1200' 
]
# setup design matrix X: one block of rows per non-void trial, stacked in file order
# the blocks are collected first and joined once, instead of re-copying X on every np.append
trial_blocks = []
for i_file in objects_to_download:
    trial_type = getTrialType(i_file, trial_ind)
    if trial_type == 'void':
        continue
    X_trial = buildDesignMatrix(trial_type)
    trial_blocks.append(X_trial)
if trial_blocks:
    X = np.concatenate(trial_blocks, axis=0)
else:
    # no usable trial: keep the empty (0 x n_params) matrix the loop used to start from
    X = np.empty((0,len(param_ids)), bool)

### Start Spark context

In [None]:
from setupSpark import initSpark
# Initialize Spark
# specify the number of cores and the memory of the workers
# each worker VM has 8 cores and 32 GB of memory
# the status of the cluster (ie. how many cores are available) can be checked in the Spark UI:
# http://SparkMasterIP:8080/

# the number of workers to be used
spark_instances = 1
# the number of cores to be used on each worker
executor_cores = 2
# the amount of memory to be used on each worker
executor_memory = '2G'
# the max. number of cores Spark is allowed to use overall
max_cores = 2

# returns the SparkContext object 'sc' which tells Spark how to access the cluster
sc = initSpark(
    'local',
    spark_instances=spark_instances,
    executor_cores=executor_cores,
    max_cores=max_cores,
    executor_memory=executor_memory,
)

In [None]:
# add Python files in 'utils' folder to the SparkContext 
# this is required so that all files are available on all the cluster workers
# only .py files directly inside utils_dir are added; os.listdir does not descend into sub-folders
for filename in os.listdir(utils_dir):
    if filename.endswith('.py'):
        sc.addPyFile(os.path.join(utils_dir, filename))

### Import data into Spark

In [None]:
def stimDataFromMov(mov, t, t_stim, t_textIn, t_textOut, t_response):
    '''
    Reduce a whole movie to one 2D image per trial phase, using boolean indexing along the third axis.

    mov: 3D array, indexed along its third axis by frame
    t: 1D array of frame times, compared against the event times below
    t_stim, t_textIn, t_textOut, t_response: event times on the same axis as t

    Return tuple (baseline, stimulus cue, texture in, texture out, response).
    The baseline is the NaN-ignoring mean over frames 9-11; every other entry is the
    NaN-ignoring maximum over a time window next to the respective event.
    '''
    def _peak(t_start, t_end):
        # maximum over all frames with t_start <= t < t_end
        in_window = (t >= t_start) & (t < t_end)
        return np.nanmax(mov[:, :, in_window], axis=2)

    baseline = np.nanmean(mov[:, :, 9:12], axis=2)
    cue_peak = _peak(t_stim, t_stim+0.2)
    text_in_peak = _peak(t_textIn, t_textIn+1)
    # the texture-out window ends at t_textOut, i.e. it covers the second before it
    text_out_peak = _peak(t_textOut-1, t_textOut)
    response_peak = _peak(t_response, t_response+1)
    return (baseline, cue_peak, text_in_peak, text_out_peak, response_peak)

In [None]:
def getStimDataFromFile(file_name, file_params, trial_ind, t, t_stim, t_textIn, t_textOut, t_response, dimensions):
    """
    Download one movie from Swift and reduce it to one value per pixel and trial phase.

    file_name: name of the object in Swift
    file_params: storage / credential settings, passed on to downloadFromSwift
    trial_ind: trial index information, passed on to getTrialType
    t, t_stim, t_textIn, t_textOut, t_response: passed on to stimDataFromMov
    dimensions: (rows, columns) of one image

    Return None if the trial type of the file is 'void'. Otherwise return a 2D array
    with one row per image returned by stimDataFromMov and one column per pixel
    (images flattened in C order).
    """
    # look up the trial type first, so that void trials are skipped without downloading the movie
    if getTrialType(file_name, trial_ind) == 'void':
        return None
    dff_data = downloadFromSwift(file_name, file_params)
    stim_data = stimDataFromMov(dff_data, t, t_stim, t_textIn, t_textOut, t_response)
    n_pixels = dimensions[0]*dimensions[1]
    stim_data_out = np.zeros((len(stim_data), n_pixels))
    for ix, i_stim in enumerate(stim_data):
        stim_data_out[ix, :] = np.reshape(i_stim, n_pixels, order='C')
    return stim_data_out

In [None]:
# distribute the list of object names: one RDD element per file
file_rdd = sc.parallelize(objects_to_download)

In [None]:
# download and reduce every file inside the map; getStimDataFromFile returns None for
# 'void' trials and these elements are dropped by the filter
# the result is marked for caching, as it is the starting point of the steps below
stim_data_rdd = file_rdd.map(lambda x: 
                             getStimDataFromFile(x, file_params, trial_ind, t, t_stim, t_textIn, 
                                                 t_textOut, t_response, dimensions)).filter(lambda x: 
                                                                                            x is not None).cache()

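Use `flatMap` to split up `stim_data_rdd` by pixel. This gives one element per pixel and trial, each holding 5 data points (baseline, stimulus cue, texture in, texture out, response). The pixel index is then assigned as the key, and `reduceByKey` is used to join all elements belonging to the same pixel.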

In [None]:
# one element per column of each trial array, i.e. per pixel
# (plain lambda parameter: the parenthesised form is Python 2 only syntax)
stim_data_flat = stim_data_rdd.flatMap(lambda arr: [x for x in arr.T])

In [None]:
# number of pixels per image
n_pixels = int(np.prod(dimensions))
# how often the pixel sequence repeats in stim_data_flat (once per trial)
# floor division keeps reps an integer on Python 3 as well
reps = stim_data_flat.count() // n_pixels
# materialise the range, so that the list can be repeated on Python 3, too
ix = list(range(n_pixels))
key_list = ix * reps

In [None]:
# pair every element with its position in the RDD: (element, index)
# note: this cell rebinds stim_data_flat, so it must be run exactly once after the flatMap cell
stim_data_flat = stim_data_flat.zipWithIndex()

In [None]:
def assignKeyFromList(rdd_el, key_list):
    """
    Turn an (element, index) pair into a (key, element) pair.

    rdd_el: pair whose first entry is the content and whose second entry is its index
    key_list: sequence of keys; the index of rdd_el selects the key

    Return tuple (key_list[index], content).
    """
    content, position = rdd_el[0], rdd_el[1]
    return (key_list[position], content)

In [None]:
# replace the running index by the pixel key: (element, index) -> (key_list[index], element)
stim_data_flat = stim_data_flat.map(lambda x: assignKeyFromList(x, key_list))

In [None]:
# stack all elements that share a pixel key into one array per pixel
# NOTE(review): np.vstack is not commutative, while reduceByKey expects a commutative and
# associative function - the row order may not follow the trial order used for X; confirm
response_by_pixel_rdd = stim_data_flat.reduceByKey(lambda a,b: np.vstack((a,b)))

In [None]:
def replaceNanAndReshape(arr):
    """
    Flatten the response array of one pixel into a column vector.

    arr: numpy array; only its second entry along the first axis (arr[1]) is tested for NaN

    Return np.nan if arr[1] consists of NaN only, otherwise arr reshaped to (arr.size, 1).
    """
    if np.all(np.isnan(arr[1])):
        return np.nan
    return np.reshape(arr, (arr.size, 1))

In [None]:
# per pixel: either a column vector of all responses, or np.nan (see replaceNanAndReshape)
response_by_pixel_rdd = response_by_pixel_rdd.mapValues(lambda x: replaceNanAndReshape(x))

### Linear regression analysis

In [None]:
def runLinearRegression(y, X, regr):
    '''
    Fit the regression object (regr) to the model (X) and the response vector (y).

    y: response vector; if it contains any NaN, no fit is attempted
    X: design matrix, one column per model parameter
    regr: object offering fit(X, y), score(X, y), coef_ and intercept_

    Return tuple: (goodness-of-fit (r^2), coefficients (betas), intercept).
    All three are NaN (betas: one NaN per column of X) if y contains NaN.
    '''
    n_params = X.shape[1]
    if np.any(np.isnan(y)):
        # nothing to fit: report NaN for every output
        return (np.nan, np.full(n_params, np.nan), np.nan)
    # fit the model
    regr.fit(X, y)
    # coefficient of determination of prediction (R^2)
    r_squared = regr.score(X, y)
    # copy the coefficients into a flat vector with one entry per parameter
    coefficients = np.zeros(n_params)
    coefficients[:] = regr.coef_
    return (r_squared, coefficients, regr.intercept_)

In [None]:
# Create linear regression object
regr = linear_model.LinearRegression()
# fit the same design matrix X to the response vector of every pixel (mass-univariate approach)
# each value becomes a tuple (r^2, betas, intercept); the result is marked for caching
results_rdd = response_by_pixel_rdd.mapValues(lambda y_pixel: runLinearRegression(y_pixel, X, regr)).cache()

### Plot results

In [None]:
# pixel keys in the order in which the results are collected
keys = results_rdd.keys().collect()
# index that puts collected results into ascending key (= pixel) order
sort_ix = np.argsort(keys)

In [None]:
# plot a histogram of R^2 (goodness-of-fit) values over all pixels
# the result tuple is indexed (0 = r^2) instead of being unpacked in the lambda signature,
# as tuple parameters are Python 2 only syntax
rsq_all = np.array(results_rdd.values().map(lambda res: res[0]).collect())
# pixels without a fit carry NaN and are left out of the histogram
plt.hist(rsq_all[~np.isnan(rsq_all)])
plt.xlabel('R^2')
plt.ylabel('number of pixels')

In [None]:
# plot an image of R^2 values
# sort_ix brings the values into ascending key order before they are reshaped to the image size
rsq_img = np.reshape(rsq_all[sort_ix], dimensions, order='C')
im = plt.imshow(rsq_img)
plt.colorbar(im)

In [None]:
# plot coefficient maps (for inspection)
# the result tuple is indexed (1 = betas) instead of being unpacked in the lambda signature,
# as tuple parameters are Python 2 only syntax
beta_all = np.array(results_rdd.values().map(lambda res: res[1]).collect())
# common color scale for all maps
vmin = 0
vmax = 10
# one panel per model parameter (columns of beta_all)
fig, axes = plt.subplots(1, beta_all.shape[1], figsize=(30,10))
for ix, row in enumerate(beta_all.T):
    beta_img = np.reshape(row[sort_ix], dimensions, order='C')
    im = axes[ix].imshow(beta_img, vmin=vmin, vmax=vmax)
    axes[ix].set_title(param_ids[ix])
# make room on the right for one shared colorbar
fig.subplots_adjust(right=0.8)
cbar_ax = fig.add_axes([0.81, 0.38, 0.03, 0.25])
text = cbar_ax.yaxis.label
font = font_manager.FontProperties(family='times new roman', style='italic', size=16)
text.set_font_properties(font)
fig.colorbar(im, cax=cbar_ax)

In [None]:
# plot coefficient maps (for print, i.e. larger)
# the result tuple is indexed (1 = betas) instead of being unpacked in the lambda signature,
# as tuple parameters are Python 2 only syntax
beta_all = np.array(results_rdd.values().map(lambda res: res[1]).collect())
vmin = 0
vmax = 10
font = font_manager.FontProperties(family='times new roman', style='italic', size=16)
# one figure per model parameter
for ix, row in enumerate(beta_all.T):
    fig = plt.figure(figsize=(10,10))
    beta_img = np.reshape(row[sort_ix], dimensions, order='C')
    im = plt.imshow(beta_img, vmin=vmin, vmax=vmax)
    # the colorbar is added inside the loop, so that every figure gets one
    # (placed after the loop, it was only drawn on the last figure)
    cbar_ax = fig.add_axes([0.81, 0.38, 0.03, 0.25])
    text = cbar_ax.yaxis.label
    text.set_font_properties(font)
    fig.colorbar(im, cax=cbar_ax)

### Old: Derive response_rdd from response matrix
Not using Spark

In [None]:
%%time
# get response data for all pixels
# create a 2D response_matrix with responses in rows and pixels in columns
# TODO: use Spark to make this more efficient (distribute files)
response_matrix = np.zeros((X.shape[0], dimensions[0]*dimensions[1]))
row = 0
for i_file in objects_to_download:
    # check the trial type first, so that void trials are skipped without downloading the movie
    trial_type = getTrialType(i_file, trial_ind)
    if trial_type == 'void':
        continue
    dff_data = downloadFromSwift(i_file, file_params)
    stim_data = stimDataFromMov(dff_data, t, t_stim, t_textIn, t_textOut, t_response)
    # one row per trial phase, image flattened in C order
    for ix, i_stim in enumerate(stim_data):
        response_matrix[row+ix, :] = np.reshape(i_stim, (1, dimensions[0]*dimensions[1]), order='C')
    row += len(stim_data)

In [None]:
# store the matrix in the working directory; np.save appends the '.npy' extension
np.save('response_matrix', response_matrix)

In [None]:
# reload the matrix written by the previous cell (avoids repeating the downloads)
response_matrix = np.load('response_matrix.npy')

In [None]:
def getResponseData(ix, response_matrix):
    """
    Return the response vector of one pixel.

    ix: column index into response_matrix
    response_matrix: 2D array, one column per pixel

    Return column ix of response_matrix, or np.nan if that column contains any NaN.
    """
    pixel_column = response_matrix[:,ix]
    return np.nan if np.any(np.isnan(pixel_column)) else pixel_column

In [None]:
# key-value RDD with one element per pixel; the value starts out as the pixel index itself ...
response_rdd = sc.parallelize(range(response_matrix.shape[1])).map(lambda x: (x, x))
# ... and is then replaced by the response vector of that pixel (np.nan if it contains NaN)
# NOTE(review): the lambda closes over the complete response_matrix - presumably it is shipped
# to the workers as part of the closure; confirm that this is acceptable for the matrix size
response_rdd = response_rdd.mapValues(lambda ix: getResponseData(ix, response_matrix))

In [None]:
# create dummy data to test RDD transform
arr = np.array(([1,2,3,4,5,6], [10,20,30,40,50,60], [100,200,300,400,500,600]))
lst = [arr, arr, arr, arr]

# convert to RDD
lst_rdd = sc.parallelize(lst)

lst_rdd.count()

lst_rdd.first().shape

# one element per column of each array
lst_rdd_flat = lst_rdd.flatMap(lambda arr: [x for x in arr.T])

# number of columns per array and number of times the column sequence repeats
n_columns = lst_rdd.first().shape[1]
# floor division and an explicit list keep this working on Python 3 as well
reps = lst_rdd_flat.count() // n_columns
ix = list(range(n_columns))
key_list = ix * reps

lst_rdd_flat = lst_rdd_flat.zipWithIndex()

lst_rdd_flat = lst_rdd_flat.map(lambda x: assignKeyFromList(x, key_list))

lst_rdd_flat.collect()

# stack the columns that share a key
lst_rdd_flat.reduceByKey(lambda a,b: np.vstack((a,b))).collect()

### Local implementation (non-Spark)

In [None]:
# fit the model (X) to the response (y) for each pixel (mass-univariate approach)
# TODO: use Spark to make this more efficient (distribute pixels)
from sklearn import linear_model

# Create linear regression object
regr = linear_model.LinearRegression()

# results: one R^2 value and one column of coefficients per pixel
r_sq_all = np.zeros((response_matrix.shape[1]))
beta_all = np.zeros((X.shape[1], response_matrix.shape[1]))

# fit for all pixels (columns of response_matrix)
for ix, y_pixel in enumerate(response_matrix.T):
    if not np.any(np.isnan(y_pixel)):
        # fit the model
        regr.fit(X, y_pixel)
        # coefficient of determination of prediction (R^2)
        r_sq_all[ix] = regr.score(X, y_pixel)
        # coefficients
        beta_all[:, ix] = regr.coef_
        # the intercept (regr.intercept_) is not stored; it should be v. close to 0
        # as we selected the baseline frames above
    else:
        # pixel contains NaN: no fit, mark all of its results as NaN
        r_sq_all[ix] = np.nan
        beta_all[:, ix] = np.nan

In [None]:
from matplotlib import font_manager
# Beta images
# common color scale for all maps
vmin = 0
vmax = 10
# one panel per model parameter (rows of beta_all)
fig, axes = plt.subplots(1, beta_all.shape[0], figsize=(30,10))
for ix, row in enumerate(beta_all):
    # columns of beta_all are already in pixel order, so no re-sorting is needed here
    beta_img = np.reshape(row, dimensions, order='C')
    im = axes[ix].imshow(beta_img, vmin=vmin, vmax=vmax)
    axes[ix].set_title(param_ids[ix])
# make room on the right for one shared colorbar
fig.subplots_adjust(right=0.8)
cbar_ax = fig.add_axes([0.81, 0.38, 0.03, 0.25])
text = cbar_ax.yaxis.label
font = font_manager.FontProperties(family='times new roman', style='italic', size=16)
text.set_font_properties(font)
fig.colorbar(im, cax=cbar_ax)

In [None]:
# DFF image (as sanity check for orientation)
# NaN-ignoring mean over the third axis of dff_data, i.e. of the movie that was assigned last
plt.imshow(np.nanmean(dff_data, axis=2))