In [141]:
# Our numerical workhorse
import numpy as np
import scipy.special
import pandas as pd
from scipy.stats import pearsonr
from bebi103 import bokeh_matplot

# Import pyplot for plotting
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d.axes3d import Axes3D

# Import Bokeh modules for interactive plotting
import bokeh.charts
import bokeh.charts.utils
import bokeh.io
import bokeh.models
import bokeh.palettes
import bokeh.plotting

# Beeswarm plots
import beeswarm as bs

# Seaborn, useful for graphics
import seaborn as sns

# Magic function to make matplotlib inline; other style specs must come AFTER
%matplotlib inline

# This enables high res graphics inline (only use with static plots (non-Bokeh))
# SVG is preferred, but there is a bug in Jupyter with vertical lines
%config InlineBackend.figure_formats = {'png', 'retina'}

# JB's favorite Seaborn settings for notebooks
# Shared rc overrides handed to both seaborn calls below:
# thicker lines, larger axis labels/titles, and a light grey axes background.
rc = {
    'lines.linewidth': 2,
    'axes.labelsize': 18,
    'axes.titlesize': 18,
    'axes.facecolor': 'DFDFE5',
}
sns.set_context('notebook', rc=rc)
sns.set_style('darkgrid', rc=rc)

In [142]:
# Load the two lookup tables. Neither file has a header row, and '#' starts a
# comment in both.
# region_df: a single column of region IDs, one per line of the file.
region_df = pd.read_csv('data/SUB01_region.csv', comment='#', header=None)
region_df.columns = ['region_ID']
# key_df: pairs each region ID with its 'region' label.
key_df = pd.read_csv('data/regionkey.csv', comment='#', header=None)
key_df.columns = ['region_ID', 'region']

# Positional index of region_df -> region ID (do_all looks data rows up here).
region_dict = dict(zip(region_df.index, region_df['region_ID']))
# Region ID -> region label.
key_dict = dict(zip(key_df['region_ID'], key_df['region']))

def do_all(df, output_filename='test_matplot.html'):
    """
    Plot the region-by-region Pearson correlation matrix of `df` with Bokeh.

    The wide table is melted into tidy form, each row is labelled with its
    region through the module-level ``region_dict`` (row index -> region ID)
    and ``key_dict`` (region ID -> region), the values are averaged per
    region and time point, and the Pearson r between every ordered pair of
    regional mean traces is drawn with ``bokeh_matplot``.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table whose row index values are keys of ``region_dict``. Every
        column is treated as one time point; the column labels end up in the
        'time (s)' column (presumably seconds -- TODO confirm the sampling
        interval). `df` itself is left unmodified.
    output_filename : str, optional
        HTML file handed to ``bokeh.plotting.output_file``. Defaults to
        'test_matplot.html'; pass distinct names to keep the plots of
        several data sets side by side.

    Returns
    -------
    None
        The figure is shown via ``bokeh.io.show``.

    Raises
    ------
    KeyError
        If a row index is missing from ``region_dict``, a region ID is
        missing from ``key_dict``, or a region listed in ``key_dict`` has no
        rows in `df`.
    """
    # `assign` works on a copy, so the caller's frame does not pick up a stray
    # 'row' column that would leak into later notebook cells.
    wide = df.assign(row=df.index)
    # after melting, each record holds the original row, the time, and the value
    melted_df = pd.melt(wide, id_vars='row')
    melted_df.columns = ['row', 'time (s)', 'value']

    # use the dicts to add the region_IDs and then the regions
    melted_df['region_ID'] = [region_dict[row] for row in melted_df['row']]
    melted_df['region'] = [key_dict[ID] for ID in melted_df['region_ID']]

    # mean value per (region, time point); indexed by region, then time
    grouped_means = melted_df.groupby(['region', 'time (s)'])['value'].mean()

    # Matrix axes follow the order of key_dict. Each region's mean trace is
    # looked up once here rather than repeatedly inside the pairwise loop.
    regions = list(key_dict.values())
    traces = [grouped_means[region] for region in regions]

    # pearsonr returns (r, p-value); only r is plotted
    correls = [[i, j, pearsonr(trace_i, trace_j)[0]]
               for i, trace_i in enumerate(traces)
               for j, trace_j in enumerate(traces)]
    correls = pd.DataFrame(correls, columns=['i', 'j', 'value'])

    # plot
    bokeh.plotting.output_file(output_filename)
    p = bokeh_matplot(correls, 'i', 'j', 'value', n_colors=21,
                      colormap='RdBu_r', plot_width=500,
                      plot_height=500)
    bokeh.io.show(p)

In [60]:
# SUB01 data table: no header row; '#' starts a comment.
df = pd.read_csv('data/SUB01_data.csv', comment='#', header=None)

In [143]:
# Draw the region-by-region correlation matrix for the SUB01 data held in `df`.
do_all(df)

With the regions in no particular order, it's difficult to discern any sort of pattern. There seem to be more positive correlations than negative ones. 

In [144]:
# Uncleaned SUB01 data table, read the same way: no header row; '#' starts a comment.
messy_df = pd.read_csv('data/SUB01_uncleaned_data.csv', comment='#', header=None)

In [145]:
# Same correlation-matrix plot for the uncleaned SUB01 data.
# NOTE: by default do_all targets 'test_matplot.html', the same file as the previous call.
do_all(messy_df)

This matrix seems to have many more strong positive correlations than the previous one. There are also a few outlier regions that seem to be negatively correlated with almost all other regions. 

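With time-series data, there can be underlying processes that cause all of the data to be correlated. The regions could all be linked by some underlying, time-dependent process. A shared trend of this kind may be why the uncleaned data show so many more strong positive correlations than the cleaned data.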