## Basic calcium imaging analysis using Jupyter, Spark and Thunder

In [None]:
import numpy as np
import pylab as plt
import h5py
import os, sys
import seaborn as sns
# global seaborn plot settings used by this notebook: 'darkgrid' style, 'notebook' context
sns.set_style('darkgrid')
sns.set_context('notebook')

%matplotlib inline

In [None]:
# add folder utils (located in the current working directory) to the Python path
utils_dir = os.path.join(os.getcwd(), 'utils')
# notebook cells are frequently re-run; only append the folder when it is not
# on the path yet, so that repeated runs do not pile up duplicate entries
if utils_dir not in sys.path:
    sys.path.append(utils_dir)

In [None]:
# starting Spark depends on where the notebook is running
# choose 'local' or 'openstack'
nbBackend = 'local'
# print(...) with a single argument parses on Python 2 and Python 3 alike
# and emits exactly the same text as the former print statement
print("Running notebook on " + nbBackend + " backend")

In [None]:
# initSpark is a helper from the setupSpark module (not shown here; presumably one of
# the files in the utils folder added to the Python path above -- TODO confirm).
# Its return value is kept as `sc` and used as the SparkContext by the following cells.
from setupSpark import initSpark
sc = initSpark(nbBackend)

In [None]:
# register every Python module of the utils folder with the SparkContext,
# so that the helper code is available on all workers as well
py_modules = [entry for entry in os.listdir(utils_dir) if entry.endswith('.py')]
for py_module in py_modules:
    sc.addPyFile(os.path.join(utils_dir, py_module))

In [None]:
# folder that holds the HDF5 data files
directory = '/Users/Henry/polybox/Data_temp/NeuroPipeline/LEC_Data'
# full path to the selected H5 file inside that folder
h5file = os.path.join(directory, 'Monyer_Leitner_F296_spot01.h5')

In [None]:
# example: read one neuron timeseries (all trials)
ix = 0 # indexing is zero-based
# readPixel_map is a helper from NeuroH5Utils (not shown here); it returns a pair
# that is unpacked into x and result.
# dim=1 presumably means "ix indexes neurons" -- TODO confirm against the helper
from NeuroH5Utils import readPixel_map
x, result = readPixel_map(ix, h5file, dim=1, debug=True)

In [None]:
# example: read a single timepoint for all neurons (all trials)
ix = 100
# same helper as in the neuron example, but with dim=2;
# presumably this makes ix index timepoints -- TODO confirm against the helper
from NeuroH5Utils import readPixel_map
x, result = readPixel_map(ix, h5file, dim=2)

In [None]:
# obtain further information about the dataset (size, sampling rate, number of trials)
from NeuroH5Utils import getFileInfo
# dsetSz: dataset size (dsetSz[0] is the number of RDD elements, see the count cell)
# sampF:  sampling rate, used below to convert frame numbers into seconds
# nTrials: number of trials, used below to build the per-timepoint trial index
dsetSz, sampF, nTrials = getFileInfo(h5file)

In [None]:
# create the RDD
# the following cells treat it as key-value pairs: rdd.lookup(roi) fetches the
# time series stored under an integer ROI key
from NeuroH5Utils import convert2RDD
numPartitions = 10 # how many partitions?
rdd = convert2RDD(sc, h5file, numPartitions=numPartitions)

In [None]:
# use count to force loading of the data, i.e. access every element once
# count returns number of elements in the RDD (i.e. dsetSz[0])
# the result is kept as nNeurons and later drives the loop over all neurons
nNeurons = rdd.count()

In [None]:
# fetch the trace of the first roi; its length defines the number of timepoints
s = np.asarray(rdd.lookup(0))
nTimepoints = len(s[0])
# time array t: frame numbers 1..nTimepoints (as floats) divided by the sampling rate
t = np.arange(1, nTimepoints + 1, dtype=float) / sampF

In [None]:
# pull a single Roi out of the RDD and plot its trace against time
roi = 0
s = np.asarray(rdd.lookup(roi)) # lookup returns a list; wrapped into an np array in one go
plt.plot(t, s[0]);
plt.xlabel('Time / s');
plt.xlim((0, np.max(t)));

In [None]:
# turn the Spark RDD into a Thunder Series object, which gives access to the
# functions that the Thunder library offers for Series objects
import thunder as td
# thunder fromrdd expects key-value pairs whose key is a tuple representing the index;
# the keys here are plain ints, so each one is wrapped in a single-element tuple first
keyed_rdd = rdd.map(lambda kv: ((kv[0],), kv[1]))
series = td.series.fromrdd(keyed_rdd)

In [None]:
# the index of a series allows us to store some important information for each timepoint, for example the trial number
import numpy.matlib # no longer needed by this cell; kept so that np.matlib stays importable for other cells
# number of frames per trial; explicit floor division gives the same integer on Python 2 and 3
# (assumes that all trials have the same length, i.e. len(t) is a multiple of nTrials)
framesPerTrial = len(t) // nTrials
# one trial number per timepoint: 0,...,0,1,...,1,...,nTrials-1
# np.repeat always yields a flat 1-D array, also when there is only a single trial
index = np.repeat(np.arange(nTrials), framesPerTrial)
series.index = index

In [None]:
# Example: keep the ROIs whose mean intensity exceeds a threshold, draw a
# sample of 5 of them and plot their traces
bright_rois = series.filter(lambda x: x.mean() > 10)
examples = bright_rois.sample(5).toarray()
plt.plot(t, examples.T);
plt.xlabel('Time / s');
plt.xlim((0, np.max(t)));

In [None]:
# Example: Compute mean / standard deviation of each Roi
# each lambda reduces the values of one record to a single number; the results
# are used directly as x / y values by the scatter plot in the next cell
# (presumably toarray() returns a local NumPy array -- TODO confirm against Thunder)
series_mean = series.map(lambda x: x.mean()).flatten().toarray()
series_sd = series.map(lambda x: x.std()).flatten().toarray()

In [None]:
# Scatter plot of Roi mean vs. SD
# one point per Roi, using the series_mean / series_sd arrays computed above
plt.scatter(series_mean, series_sd)
plt.xlabel('Roi Mean');
plt.ylabel('Roi SD');

In [None]:
# import stimulus data
from NeuroH5Utils import getStimData
# stimData is indexed per timepoint below (a truthy entry marks a stimulus at that timepoint);
# stimNames supplies the subplot titles of the peri-stimulus plots
stimData, stimNames = getStimData(h5file)

In [None]:
# Plot timeseries for all neurons stacked and with odor stimuli indicated
# Extract data directly from Spark RDD because it's faster than going via Thunder
fig = plt.figure(figsize=(8,4)) # increase figsize to (20,10) to improve visibility
# fetch all traces with one Spark job; calling rdd.lookup() inside the loop
# would start a separate job for every neuron
# (assumes one record per neuron key 0..nNeurons-1, as the lookup-based cells do)
tracesByNeuron = rdd.collectAsMap()
offset = 0
for iNeuron in range(nNeurons):
    plotTrace = np.asarray(tracesByNeuron[iNeuron])
    # stack the traces: shift each one so that its minimum sits on the
    # maximum of the previously plotted trace
    plotTrace = plotTrace - plotTrace.min() + offset
    offset = plotTrace.max()
    plt.plot(t, plotTrace)
# dashed vertical line over the full stack at every timepoint flagged in stimData
for iTimepoint, isStim in enumerate(stimData):
    if isStim:
        tStim = t[iTimepoint]
        plt.plot((tStim, tStim), (0, offset), 'k--')
plt.xlabel('Time [s]', fontsize=18)
plt.ylim((0, offset))
plt.xlim((0, np.max(t)))
ax = fig.gca()
plt.setp(ax.get_xticklabels(), fontsize=16)
plt.setp(ax.get_yticklabels(), fontsize=16)
plt.show()
# save the figure if required
# (NOTE(review): with the inline backend the figure is presumably closed by plt.show();
#  if saving is needed, call savefig before plt.show() -- TODO confirm)
# plt.savefig('Timeseries_AllStim.png')

In [None]:
# peri-stimulus plots - average traces for each stimulus per neuron
# TODO: remove for-loop?
from CalciumAnalysisUtils import psAnalysis

# select time interval to plot (in frames)
baseFrames = 10
evokedFrames = 100

# select Rois to plot
# TODO: bug when selecting only 1 Roi
# (a single Roi has to be written as a one-element tuple, e.g. (16,);
#  a bare (16) is just the int 16 and cannot be iterated by the loop below)
roisToPlot = (0, 16, 25, 26)
# Or select all neurons
# roisToPlot = range(nNeurons)

# compute peri-stimulus data for all neurons from the Spark RDD
# this creates a new RDD called psData
# the lambda receives each (key, value) record as one argument; tuple parameters
# such as "lambda (k, v):" only exist in Python 2, indexing works on 2 and 3
psData = rdd.map(lambda kv: (kv[0], psAnalysis(kv[1], stimData, (baseFrames, evokedFrames))))
psData = psData.partitionBy(numPartitions).cache()

# now create the plot: one row per Roi, one column per stimulus
fig = plt.figure(figsize=(20,20)) # (20, 200) for full dataset, otherwise fewer rows
splotCounter = 1
for ix, iRoi in enumerate(roisToPlot):
    # lookup returns a list with the value(s) stored under the key; use the first entry as is.
    # It is deliberately not wrapped in np.asarray: if the stimuli have different numbers of
    # trials the per-stimulus arrays are ragged and cannot be stacked into one array
    psDataByStim = psData.lookup(iRoi)[0]
    # same y range for all stims
    minY = min(np.min(stimTraces) for stimTraces in psDataByStim)
    maxY = max(np.max(stimTraces) for stimTraces in psDataByStim)
    # plot for each stimulus
    for ix2, iStim in enumerate(psDataByStim):
        # average over axis 0 (presumably the trials -- TODO confirm against psAnalysis)
        meanData = np.mean(iStim, axis=0)
        # standard error of the mean: SD along axis 0 divided by sqrt(number of rows)
        semData = np.std(iStim, axis=0) / np.sqrt(np.shape(iStim)[0])
        # time axis in seconds relative to stimulus onset
        # NOTE(review): the axis spans -baseFrames .. evokedFrames-baseFrames frames; if psAnalysis
        # returns baseFrames+evokedFrames samples this compresses the axis -- confirm against psAnalysis
        tPs = (np.linspace(0, evokedFrames, meanData.size)-baseFrames)/sampF
        plt.subplot(len(roisToPlot), len(psDataByStim), splotCounter)
        splotCounter += 1
        plt.fill_between(tPs, meanData-semData, meanData+semData, alpha=0.2)
        plt.plot(tPs, meanData)
        # dashed vertical line at stimulus onset (t = 0)
        plt.plot((0,0), (minY, maxY), 'k--')
        plt.xlim((min(tPs), max(tPs)))
        plt.ylim((minY, maxY))
        # stimulus names only on the first row, Roi labels only in the first column
        if ix == 0:
            plt.title(stimNames[ix2+1])
        if ix2 == 0:
            plt.ylabel('%DFF Roi {0}'.format(iRoi+1))
plt.show()
# plt.savefig('PsPlot_AllStims.eps')

In [None]:
# get the reference image for a single trial and display it
from NeuroH5Utils import getReferenceImage
from showit import image
trial = 0 # specify trial (0 based indexing)
refImage = getReferenceImage(h5file, trial=trial)
# clim=(0,80) presumably fixes the displayed intensity range -- TODO confirm against showit.image
image(refImage, clim=(0,80))

In [None]:
# setup another RDD, this time parallelizing across time points
# thus, one RDD element contains the activity of all neurons for a particular time point
from NeuroH5Utils import convert2RDD
# NOTE: this rebinds the notebook-wide numPartitions (10 for the first RDD); the
# peri-stimulus cell reads the same name in partitionBy(), so re-running that
# cell after this one uses 100 partitions
numPartitions = 100 # how many partitions?
rdd_time = convert2RDD(sc, h5file, numPartitions=numPartitions, dim=2)
# TODO: split by trials
# use count to force loading of the data, i.e. access every element once
rdd_time.count()