## Regression analysis of calcium imaging data with the Spark Thunder library

Mass univariate regression is the process of independently regressing multiple response variables against a single set of explanatory features. It is common in any domain in which a large number of response variables are measured, and fitting large collections of such models can benefit significantly from parallelization. The following tutorial notebook describes how to perform a simple linear regression analysis with calcium imaging data to determine how different neurons' activity is modulated by sensory stimulation. The example data set is the same as the one used in [Tutorial_Basics](Tutorial_Basics.ipynb).

### Initial setup & data import

This section is largely identical to the first part of [Tutorial_Basics](Tutorial_Basics.ipynb). To run it all in one go, you can select the next section heading ([Regression analysis](#regress)) and choose Cell --> Run All Above.

In [None]:
# Import required modules
import numpy as np
import pylab as plt
import h5py
import os, sys
import seaborn as sns

# Set figure style options for Seaborn:
# 'darkgrid' selects the plot style and 'notebook' the plotting context
sns.set_style('darkgrid')
sns.set_context('notebook')

# show figure in notebook
%matplotlib inline

In [None]:
# Add the folder 'utils' to the Python module search path.
# This folder contains custom-written code that is required for data import and analysis.
# The path is resolved relative to the current working directory, so the notebook
# has to be started from the directory that contains 'utils'.
utils_dir = os.path.join(os.getcwd(), 'utils')
# guard against duplicate sys.path entries when this cell is run more than once
if utils_dir not in sys.path:
    sys.path.append(utils_dir)

In [None]:
# How Spark is started depends on where the notebook is running
# (local computer or OpenStack cluster).
# Choose 'local' or 'openstack'.
nbBackend = 'openstack'
# the parenthesised single-argument form prints the same text under Python 2 and Python 3
print("Running notebook on " + nbBackend + " backend")

In [None]:
# Initialize Spark using the helper initSpark (module setupSpark, presumably located in 'utils').
# It is called with the backend name chosen above and returns the SparkContext object 'sc',
# which tells Spark how to access the cluster.
from setupSpark import initSpark
sc = initSpark(nbBackend)

In [None]:
# Register every Python file from the 'utils' folder with the SparkContext.
# This is required so that all files are available on all the cluster workers.
for filename in os.listdir(utils_dir):
    if not filename.endswith('.py'):
        continue  # skip anything that is not a Python source file
    sc.addPyFile(os.path.join(utils_dir, filename))

In [None]:
# Directory containing the HDF5 files (full path)
directory = '/home/ubuntu/example_data/LEC_Data'

# Name of the HDF5 file to analyse. The following files are available:
#   Monyer_Leitner_F296_spot01.h5
#   Monyer_Leitner_F397_spot01.h5
#   Monyer_Leitner_F400_spot02.h5
#   Monyer_Leitner_F400_spot04.h5
h5file = 'Monyer_Leitner_F296_spot01.h5'
# from here on 'h5file' holds the full path to the selected file
h5file = os.path.join(directory, h5file)

In [None]:
# Obtain further information about the dataset with the project helper getFileInfo:
# dsetSz ... dataset size, sampF ... sampling rate, nTrials ... number of trials
# (exact layout and units are defined in NeuroH5Utils, which is not shown here)
from NeuroH5Utils import getFileInfo
dsetSz, sampF, nTrials = getFileInfo(h5file)

In [None]:
# Build the Spark RDD from the HDF5 file and wrap it as a Thunder series
import thunder as td
from NeuroH5Utils import convert2RDD
numPartitions = 10  # number of partitions requested for the RDD
rdd = convert2RDD(sc, h5file, numPartitions=numPartitions)
# wrap the key of every (key, value) record in a 1-tuple before handing the RDD to Thunder
keyed_rdd = rdd.map(lambda kv: ((kv[0],), kv[1]))
series = td.series.fromrdd(keyed_rdd)

In [None]:
# compute number of neurons, time points and time axis
nNeurons = series.count()  # number of records in the series (one per neuron / ROI)
nTimepoints = len(series.index)  # length of the series index
# time axis: sample numbers 1..nTimepoints divided by the sampling rate,
# i.e. the first entry is 1/sampF rather than 0 (plotted as 'Time [s]' below)
t = (np.linspace(1, nTimepoints, nTimepoints)) / sampF

In [None]:
# Import stimulus data with the project helper getStimData.
# stimData is used below as a vector with one value per time point, where a value > 0
# marks a stimulus onset and gives its ID.
# stimNames presumably holds the stimulus labels (not used in this section) — TODO confirm
from NeuroH5Utils import getStimData
stimData, stimNames = getStimData(h5file)

<a name="regress"></a>
### Regression analysis

The following section demonstrates how to perform a mass univariate linear regression analysis of the time series data of each ROI against the stimulus vector using Thunder. Regression is part of the Thunder analysis package thunder-regression.

In [None]:
# First, we need to build a design matrix from the stimulus vector.
# The stimulus vector contains one value per time point; a value > 0 marks the start of a
# stimulus and gives its ID.
# The design matrix has one row per stimulus ID and one column per time point, with a 1
# at every onset of the respective stimulus.
stimId = stimData[stimData>0].astype(np.int64) # integer stimulus ID of every onset
stimIx = np.where(stimData>0)[0] # time index of every onset
# Map each onset's ID to a row index (position of the ID among the sorted unique IDs).
# For IDs 1..nStims this is identical to ID-1, but unlike indexing with ID-1 directly
# it also works when the IDs present in the data are not contiguous or do not start at 1.
stimIdUnique, stimRow = np.unique(stimId, return_inverse=True)
nStims = len(stimIdUnique)
X = np.zeros((nStims, nTimepoints), int)
X[stimRow, stimIx] = 1 # set all onsets at once

In [None]:
# Show the raw stimulus vector over time
fig = plt.figure(figsize=(20, 5))
plt.plot(t, stimData)
plt.xlabel('Time [s]')
plt.xlim(0, t.max())
plt.show()

In [None]:
# Show the design matrix over time (one trace per row of X, i.e. per stimulus)
fig = plt.figure(figsize=(20, 5))
plt.plot(t, X.T)
plt.xlabel('Time [s]')
plt.xlim(0, t.max())
plt.show()

In [None]:
# we approximate the shape of the typical calcium response by convolving the design matrix with a 
# canonical (double-exponential) calcium transient
# to do this, we define a function using the key word 'def'
# this function can later be evaluated with different inputs

def convolveStimWithCalciumResponse(stim, t, tauOn, tauOff):
    """Convolve a stimulus vector with a canonical calcium transient.

    The transient is the double-exponential kernel
    (1 - exp(-t / tauOn)) * exp(-t / tauOff), evaluated at every value of t.

    stim ... stimulus vector (e.g. one row of the design matrix)
    t ... array of time values at which the kernel is evaluated
    tauOn ... onset time in s
    tauOff ... offset time in s

    Returns the first len(stim) samples of the full convolution, so the
    result has the same length as stim.
    """
    # canonical calcium indicator response: exponential rise times exponential decay
    rise = 1 - np.exp(-t / tauOn)
    decay = np.exp(-(t / tauOff))
    kernel = rise * decay

    # the full convolution is len(stim) + len(kernel) - 1 samples long;
    # keep only the leading part that lines up with the stimulus vector
    return np.convolve(stim, kernel)[:len(stim)]

In [None]:
# Convolve every row of the design matrix with the canonical calcium response
tauOn = 0.5   # onset time in s
tauOff = 5    # offset time in s
Xconv = np.zeros((nStims, nTimepoints), float)
for iStim, stimOnsets in enumerate(X):
    Xconv[iStim] = convolveStimWithCalciumResponse(stimOnsets, t, tauOn, tauOff)

# Show the convolved design matrix (one trace per stimulus)
fig = plt.figure(figsize=(20, 5))
plt.plot(t, Xconv.T)
plt.xlabel('Time [s]')
plt.xlim(0, t.max())
plt.show()

In [None]:
# Now we are ready to create and fit the regression model.
# The regressors are passed as Xconv.T, i.e. one row per time point and one column per stimulus;
# the responses are the Thunder series (one record per neuron).
# fit_and_score returns the fitted model together with its score (plotted below as R^2).
from regression import LinearRegression
algorithm = LinearRegression(fit_intercept=True, normalize=True)
model,score = algorithm.fit_and_score(Xconv.T, series)

In [None]:
# The regression analysis yields regression coefficients (betas) for each stimulus
# and the overall goodness-of-fit of the model.

# extract the betas as an array
betas = model.betas.toarray()

# show the beta matrix as a grayscale image, colour range clipped to [0, 100]
plt.figure(figsize=(20, 6))
fig = plt.imshow(
    betas,
    cmap='gray',
    interpolation='nearest',
    aspect=0.25,
    origin='lower',
    vmin=0,
    vmax=100
)
plt.ylabel('Neuron ID')
plt.xlabel('StimID')
plt.xticks([])
plt.colorbar()
plt.show()
# plt.savefig('GLM_betaImage.tiff', interpolation='nearest')

In [None]:
# Goodness of model fit (R^2), one bar per ROI
plt.figure(figsize=(20, 5))
plt.bar(range(nNeurons), score.toarray())
plt.xlim((0, nNeurons))
plt.xlabel('Neuron ID')
# the trailing semicolon keeps the notebook from echoing the last return value
plt.ylabel('Model fit / R^2');

In addition to the provided LinearRegression algorithm, we can also specify customized regression models. This approach allows us to use many of the algorithms available in the [scikit-learn](http://scikit-learn.org/stable/modules/classes.html#module-sklearn.linear_model) library.

In [None]:
# Fit a Lasso linear model with iterative fitting (scikit-learn's LassoCV),
# wrapped in CustomRegression so it is applied to every record of the series.
# NOTE(review): selection='random' without a fixed random_state may give slightly different
# results between runs, and newer scikit-learn releases no longer accept `normalize` — confirm
# against the installed version.
from regression import CustomRegression
from sklearn.linear_model import LassoCV
lasso = LassoCV(normalize=True, fit_intercept=True, selection='random')
algorithm_lcv = CustomRegression(lasso)
model_lcv, score_lcv = algorithm_lcv.fit_and_score(Xconv.T, series)

In [None]:
# extract the betas of the Lasso model as an array
betas_lcv = model_lcv.betas.toarray()

# show the beta matrix as a grayscale image, colour range clipped to [0, 100]
plt.figure(figsize=(20, 6))
fig = plt.imshow(
    betas_lcv,
    cmap='gray',
    interpolation='nearest',
    aspect=0.25,
    origin='lower',
    vmin=0,
    vmax=100
)
plt.ylabel('Neuron ID')
plt.xlabel('StimID')
plt.xticks([])
plt.colorbar()
plt.show()
# plt.savefig('GLM_betaImage.tiff', interpolation='nearest')

In [None]:
# Goodness of fit (R^2) of the Lasso model, one bar per ROI
plt.figure(figsize=(20, 5))
plt.bar(range(nNeurons), score_lcv.toarray())
plt.xlim((0, nNeurons))
plt.xlabel('Neuron ID')
# the trailing semicolon keeps the notebook from echoing the last return value
plt.ylabel('Model fit / R^2');