In [None]:
# Load the python package
import os
from dynetan.toolkit import *
from dynetan.viz import *
from dynetan.proctraj import *
from dynetan.gencor import *
from dynetan.contact import *
from dynetan.datastorage import *

from MDAnalysis.analysis import distances as MDAdistances
#from numpy.linalg import norm
#from itertools import islice
from itertools import combinations
from scipy import stats
from collections import OrderedDict

import networkx as nx
import numpy as np
import scipy as sp

import pandas as pd

In [None]:
# For visualization
from bokeh.io import output_file, output_notebook, push_notebook, show
from bokeh import models as bokehModels
from bokeh import transform as bokehTransform
from bokeh import layouts as bokehLayouts
from bokeh import plotting as bokehPlotting
from bokeh import palettes as bokehPalettes
from bokeh import events as bokehEvents
# For pre-calculating Cartesian distances based on 2D embedding
from sklearn.manifold import MDS

In [None]:
# Lookup table from residue names to the short labels used when naming nodes (see name_node).
# The 20 standard amino acids map to their one-letter codes; HIS, HSD and HSE all map to 'H'.
# MG, ATP, POPC and SOL map to short mixed-case labels instead.
mapResidueNames={'ALA':'A','CYS':'C','ASP':'D','GLU':'E','PHE':'F',
                 'GLY':'G','HIS':'H','HSD':'H','HSE':'H','ILE':'I','LYS':'K','LEU':'L',
                 'MET':'M','ASN':'N','PRO':'P','GLN':'Q','ARG':'R',
                 'SER':'S','THR':'T','VAL':'V','TRP':'W','TYR':'Y',
                 'MG':'Mg','ATP':'Atp','POPC':'Popc','SOL':'h2o'}

def name_node(dnad, node):
    """
    Build a short label for one node: the mapped residue name followed by the residue ID.

    dnad -- object exposing nodesAtmSel, which is indexed by node and whose entries
            carry the attributes resname and resid.
    node -- index into dnad.nodesAtmSel.
    Raises KeyError when the residue name has no entry in mapResidueNames.
    """
    atom = dnad.nodesAtmSel[node]
    resname, resid = atom.resname, atom.resid
    shortName = mapResidueNames[resname]
    return "%s%s" % (shortName, resid)

def clarify_duplicate_nodes(dictNames, dictSuffix):
    """
    Disambiguate duplicated names by appending a per-key suffix.

    Given two dicts that share the same keys, every key whose value in dictNames is
    also held by at least one other key gets "_" + dictSuffix[key] appended to its name.
    Names that are already unique are left untouched.

    dictNames  -- dict mapping key -> name (str). Modified in place.
    dictSuffix -- dict mapping key -> suffix (str). Only read for keys with a duplicated name.
    Returns the same dictNames object that was passed in.
    Raises KeyError if a key with a duplicated name is missing from dictSuffix.
    """
    # Invert the mapping: name -> set of keys that carry this name.
    dictRev = {}
    for k, v in dictNames.items():
        dictRev.setdefault(v, set()).add(k)

    # Scan for duplicates once, after the inversion is complete. This also makes an
    # empty dictNames a no-op instead of hitting an undefined variable.
    for setKeys in dictRev.values():
        if len(setKeys) > 1:
            for k in setKeys:
                dictNames[k] = dictNames[k]+"_"+dictSuffix[k]
    return dictNames

In [None]:
def load_all_graphs(workingFolder):
    """
    Placeholder that is not implemented: the body holds only this docstring, so calling it
    does nothing and returns None. The argument workingFolder is currently unused.
    NOTE(review): presumably meant to load all results found under workingFolder -- confirm intent.
    """

In [None]:
def simplify_graph(G, attr='segid', listIgnoredNodes=[]):
    """
    Coarse-grain a graph by merging all nodes that share the same value of a node attribute.

    Each distinct value of attr becomes one node of the output graph, with a 'count'
    attribute holding how many original nodes were merged into it. Each output edge
    carries the summed 'weight' of all original edges between the two groups; edges
    inside one group accumulate on a self-edge of that group's node.

    G                -- input graph with attr set on its nodes and 'weight' set on its edges.
    attr             -- name of the node attribute to group by.
    listIgnoredNodes -- attribute values whose merged node is removed from the output.
                        The list is only read, never modified.
    Returns a new nx.Graph; G is left unchanged.
    """
    outG = nx.Graph()

    # Nodes: one per attribute value, counting its members.
    for _, group in G.nodes(data=attr):
        if not outG.has_node(group):
            outG.add_node(group, count=0)
        outG.nodes[group]['count'] += 1

    # Edges: accumulate the weights of all original edges between each pair of groups.
    for u, v, w in G.edges(data='weight'):
        groupU, groupV = G.nodes[u][attr], G.nodes[v][attr]
        if not outG.has_edge(groupU, groupV):
            outG.add_edge(groupU, groupV, weight=0.0)
        outG.edges[groupU, groupV]['weight'] += w

    # Removing unwanted groups afterwards is simpler than filtering inside both loops.
    for group in listIgnoredNodes:
        if outG.has_node(group):
            outG.remove_node(group)

    return outG

In [None]:
# Switch for the notebook-only cells: the cells guarded by "if not bPythonExport" run only while this is False.
bPythonExport = False

# Initialise variables
We will create a pandas dataframe _dfAnnotations_ to handle the various categorisation possibilities.
This will be used to group the data from individual windows.

In [None]:
if not bPythonExport:
    # IPython line magic: move the working directory up one level, so that the relative
    # paths used by later cells resolve from there.
    # Not idempotent: every re-run of this cell moves up one more level.
    %cd ..

In [None]:
# Notebook-only configuration. None of the names below is defined when bPythonExport is True.
if not bPythonExport:
    # Define mutant file IO locations. wt, P67L, E56K, R75Q, S945L, dF508
    temperature="310K"
    # Space-delimited annotations table; its first row provides the column names (header=0).
    fileAnnotations = './list_allele_annotations.txt'
    dfAnnotations = pd.read_csv(fileAnnotations, delimiter=' ', skipinitialspace=True, header=0)

    #outputFileName = "./results/networkView_%s_%s.html" % (allele, temperature)
    outputFileName = "./networkCompare.html"
    
    # File with reference 2D node positions; a later cell reads it when this is not None.
    fileImportPos  = './CFTRGraphReferencePositions.txt'
    fileExportPos  = './temp.txt'
    #fileClusterDefinitions = None
    fileClusterDefinitions = './Stable_Solvent_Clustering.cluster_definitions_d3.5_r0.50.txt'

## Read graphs from DNA analysis files and extend annotation with simulation data
The dataframe from above will be augmented below with an extra column denoting the window.
The expected outcome is that the index of the dataframe strictly corresponds to the index of the graph list, so that every window and graph can be linked to its respective external annotations.

In [None]:
# = = Utilise the annotations file as a means of determining input paths.
# Outputs of this cell:
#   listG      -- one coarse-grained graph per window, over all loaded datasets, in loading order.
#   listNWinds -- number of windows of each loaded dataset.
#   listLoaded -- dfAnnotations index labels of the datasets that were actually loaded.
#   nWarn      -- number of annotation entries skipped because their files were not found.
print("= = = Loading input graph data from all input paths...")
listG=[] ; listNWinds=[] ; listLoaded = [] ; nWarn=0
for i in dfAnnotations.index:
    fullPathRoot = dfAnnotations['DyNetAn_Path'][i]
    # The existence of "<root>.hf" is used as the marker that a dataset is present.
    if not os.path.exists(fullPathRoot+'.hf'):
        print("= = WARNING: ...path %s does not contain DyNetAn files? Skipping." % fullPathRoot)
        nWarn+=1
        continue
    print("    ...loading from path %s:" % fullPathRoot)
    
    dnad = DNAdata()
    # = = = loadFromFile will automatically output debug lines.
    dnad.loadFromFile(fullPathRoot)
    pdbVizFile = fullPathRoot + "_reducedTraj.pdb"
    # NOTE(review): 'mda' is not imported explicitly in the visible cells; presumably it arrives
    # through one of the dynetan star imports -- confirm.
    mdU = mda.Universe(pdbVizFile)
    dnad.nodesAtmSel = mdU.atoms[ dnad.nodesIxArray ]
    # = = = Make a simpler representation via segid, and exclude crystallographic waters as they don't comprise a cohesive node.
    for w in range(dnad.numWinds):      
        G = simplify_graph(dnad.nxGraphs[w], 'segid', ['CRY'])
        listG.append(G)
        #listNames.append('%s_%i' % (allele, w))

    listNWinds.append(dnad.numWinds)
    listLoaded.append(i)
    print("= = NOTE: ...path %s DyNetAn files loaded." % fullPathRoot)
# After the loop, dnad still refers to the last dataset that was loaded; a later cell reads it.

In [None]:
# Summarise every annotation entry that was skipped while loading.
if nWarn > 0:
    print("= = WARNING: A total of %i entries in the Annotations file are missing their DyNetAn datasets:" % nWarn)
    for i in dfAnnotations.index:
        if i in listLoaded:
            continue
        print("  ...",dfAnnotations['DyNetAn_Path'][i])

In [None]:
# = = = = Create a new analysis dataframe that combines the annotations with the simulation data.
# The idea is to pass only the list of graphs and this dataframe to subsequent parts of the pipeline.
# Row order matches listG: one row per window, in the order in which the datasets were loaded.
listFrames = []
for i,j in enumerate(listLoaded):
    # Repeat the annotation row of dataset j once per window of that dataset.
    # NOTE(review): j is an index label used positionally on .values; this assumes dfAnnotations
    # keeps the default 0..N-1 index -- confirm.
    dfTemp = pd.DataFrame(np.tile(dfAnnotations.values[j],(listNWinds[i],1)), columns = dfAnnotations.columns)
    dfTemp['Window'] = np.arange(listNWinds[i])
    dfTemp['UniqueID'] = [ "%s_%i" % (dfTemp['Allele'][k], dfTemp['Window'][k]) for k in dfTemp.index ]
    listFrames.append(dfTemp)

# Concatenate once at the end: DataFrame.append no longer exists in pandas 2.x, and a single
# concat avoids re-copying the growing frame on every iteration.
if listFrames:
    dfAnalysis = pd.concat(listFrames, ignore_index=True)
else:
    # Nothing was loaded: still define dfAnalysis (empty, with the expected columns).
    dfAnalysis = pd.DataFrame(columns=list(dfAnnotations.columns)+['Window','UniqueID'])

#dfAnalysis

## Functions for graph plotting

In [None]:
# Derive one 2D position per segid by averaging the reference positions of its member nodes.
# Uses the first window of dnad, i.e. of the last dataset loaded above.
if fileImportPos is not None:
    # = = = Cheat with resid by eliminating the first letter.
    # Reference file: three whitespace-separated fields per line (label, x, y); other lines are skipped.
    refPosList = {}
    with open(fileImportPos, 'r') as fp:
        for line in fp:
            fields = line.split()
            if len(fields) == 3:
                label, xRef, yRef = fields
                refPosList[label[1:]] = [float(xRef), float(yRef)]

    # Gather the reference positions of all nodes of each segid, matched by name without its first letter.
    graphRef = dnad.nxGraphs[0]
    dictPosBySegid = {}
    for a, name in graphRef.nodes(data='name'):
        s = graphRef.nodes[a]['segid']
        key = name[1:]
        if key in refPosList:
            dictPosBySegid.setdefault(s, []).append(refPosList[key])

    # Collapse each group to its mean position.
    posNodes = {s: np.mean(listPos, axis=0) for s, listPos in dictPosBySegid.items()}
    bPosSet = True

In [None]:
def arrange_self_edges(G, CDS, pos):
    """
    Fill a ColumnDataSource with the data needed to draw the self-edges of a graph.

    For every self-edge of G, one entry is appended to each of the columns
    'x', 'y', 'a1', 'a2' and 'weight' of CDS.data. The wedge described by a1..a2 is
    centred on the direction pointing away from the mean position of the node's
    neighbours, to lessen the overlap with the rays of the existing edges.

    G   -- graph providing edges(data='weight') and neighbors(node).
    CDS -- object whose .data maps the five column names above to lists.
    pos -- mapping node -> 2D position.
    """
    halfWidth = np.pi/4
    columns = CDS.data
    for u, v, weight in G.edges(data='weight'):
        if u != v:
            continue
        centre = pos[u]
        columns['x'].append(centre[0])
        columns['y'].append(centre[1])
        # Direction from the neighbours' mean position towards this node.
        meanNeighbours = np.mean([pos[w] for w in G.neighbors(u)], axis=0)
        away = centre - meanNeighbours
        angle = np.arctan2(away[1], away[0])
        columns['a1'].append(angle - halfWidth)
        columns['a2'].append(angle + halfWidth)
        columns['weight'].append(weight)

In [None]:
def return_copy_without_self_edges(G):
    """
    Return a copy of G from which the self-edge of every node has been removed.
    G itself is not modified.
    """
    outG = G.copy()
    listSelfLooped = [n for n in outG.nodes() if (n, n) in outG.edges()]
    for n in listSelfLooped:
        outG.remove_edge(n, n)
    return outG

In [None]:
def get_node_data_range(G, nodeAttr):
    """
    Return the (minimum, maximum) of the node attribute nodeAttr over all nodes of G.
    Raises KeyError if a node lacks the attribute, and ValueError if G has no nodes.
    """
    nodeView = G.nodes
    vals = np.asarray([nodeView[n][nodeAttr] for n in nodeView()])
    return vals.min(), vals.max()

def get_node_color_label_map(G):
    """
    Return the distinct node labels of G as an array, ordered by first appearance.
    The position of a label in this array is the index used to pick its colour.
    """
    labels = list(G.nodes())
    # np.unique sorts its output; recover the original order from the first-occurrence indices.
    firstSeen = np.unique(labels, return_index=True)[1]
    firstSeen.sort()
    return np.array([labels[j] for j in firstSeen])

def format_graph_nodes_by_palette(G, palette, nullColour='#FFFFFF'):
    """
    Store a colour on every node of G as the node attribute "node_color".

    The n-th node label (in the order given by get_node_color_label_map) receives the
    n-th entry of palette. Labels beyond the end of the palette receive nullColour.
    G is modified in place; nothing is returned.
    """
    # Open question kept from the original author: should the palette wrap around when
    # there are more labels than colours?
    labelOrder = get_node_color_label_map(G)
    nColours = len(palette)

    nodeColors = {}
    for n in G.nodes():
        rank = np.where(labelOrder == n)[0][0]
        nodeColors[n] = palette[rank] if rank < nColours else nullColour
    nx.set_node_attributes(G, nodeColors, "node_color")

In [None]:
# Categorical palette: red first, then the 8-colour Colorblind palette, then grey (10 colours in total).
colourPaletteCat = ['#FF0000'] + list( bokehPalettes.Colorblind[8] ) + ['#666666']
# Write a "node_color" attribute onto the nodes of every coarse-grained graph (in place).
for gg in listG:
    format_graph_nodes_by_palette(gg, colourPaletteCat)
#format_graph_edges_by_palette(G, colourPaletteLin)

In [None]:
def rescale_linear(dMin=0, dMax=1):
    """
    Build a Bokeh CustomJSTransform that linearly rescales a data column in the browser.

    The smallest value of the column maps to dMin and the largest to dMax. When all
    values are equal (including a single-valued column) every entry maps to the
    midpoint of [dMin, dMax], instead of the NaN that a division by zero would give.

    dMin -- output value assigned to the column minimum.
    dMax -- output value assigned to the column maximum.
    Returns a bokehModels.CustomJSTransform for use with bokehTransform.transform().
    """
    r = dict(min=dMin,max=dMax)
    vfunc = """
        const norm = new Float64Array(xs.length)
        const min = Math.min(...xs)
        const max = Math.max(...xs)
        const span = max - min
        for (let i = 0; i < xs.length; i++) {
            // Degenerate column (all values equal): avoid 0/0 = NaN and use the midpoint.
            norm[i] = span === 0 ? 0.5*(r.min + r.max) : r.min + (xs[i]-min)*(r.max-r.min)/span
        }
        return norm
    """
    return bokehModels.CustomJSTransform(args=dict(r=r), v_func=vfunc)

In [None]:
def create_JS_update_visibility(sourceTable, dictRenderer, dictGlyph, targetPlot):
    """
    Build the JavaScript callback that shows or hides graph elements according to a selection.

    When triggered, the callback reads the selected row indices from the triggering object
    (cb_obj.indices), looks up the matching entries of the 'items' column of sourceTable,
    and then sets .visible to true for every entry of dictRenderer and dictGlyph whose key
    is among those items, and to false for all others. Finally it emits a change event
    on targetPlot.

    sourceTable  -- data source with an 'items' column holding the selectable keys.
    dictRenderer -- dict key -> object with a .visible property.
    dictGlyph    -- dict key -> object with a .visible property.
    targetPlot   -- plot on which change.emit() is called after the update.
    Returns a bokehModels.CustomJS instance.
    """
    return bokehModels.CustomJS(args=dict(s=sourceTable, dR=dictRenderer, dG=dictGlyph, pTo=targetPlot),
        code="""
        var inds = cb_obj.indices ;
        var listSelected = [] ;
        
        for (var i = 0; i < inds.length; i++) {
            listSelected.push( s.data['items'][inds[i]] )
        }
        
        // Note: dict uses "X in Y" notation, while arrays use X.includes(Y) notation.
        for (let k in dR) {
            if (listSelected.includes(k)) {
                dR[k].visible = true
            } else {
                dR[k].visible = false
            }
        }
        for (let k in dG) {
            if (listSelected.includes(k)) {
                dG[k].visible = true
            } else {
                dG[k].visible = false
            }
        }
        pTo.change.emit();
    """,
    )

## Graph-based plotting within Bokeh part A
Initial overview render to visually confirm that simulations have similar coarse-grained networks.

In [None]:
# SAVE
#output_file('Sample_Application.html',mode='inline',root_dir=None)
# Render Bokeh output inline in the notebook.
output_notebook()

# = = = = = = = = = = = = = = = = = = = = = = = = = = =
# = = General settings.
# = = = = = = = = = = = = = = = = = = = = = = = = = = =
plotWidth=800 ; plotHeight=600

# = = = = = = = = = = = = = = = = = = = = = = = = = = =
# = = Graph
# = = = = = = = = = = = = = = = = = = = = = = = = = = =

# Two reversed 9-step sequential palettes with linear mappers over [0, 1].
# NOTE(review): mapperA and mapperB are not used anywhere in the visible cells.
colorsA = np.flip(bokehPalettes.YlOrRd[9])
mapperA = bokehModels.LinearColorMapper(palette=colorsA, low=0, high=1)
colorsB = np.flip(bokehPalettes.Blues[9])
mapperB = bokehModels.LinearColorMapper(palette=colorsB, low=0, high=1)

# = = = Plot Prep
figA = bokehPlotting.figure(plot_width=plotWidth, plot_height=plotHeight,
              tools=["pan","wheel_zoom", "tap", "reset", "save"],
              title="Overview")
# Make the wheel-zoom tool the active scroll tool.
figA.toolbar.active_scroll = figA.select_one(bokehModels.WheelZoomTool)
# Replaces the "Overview" title passed to the constructor above.
figA.title.text = "Coarse-grained network overview"

# = = = Plot all graph elements, but hide them dynamically based on the table selection!
# Both dicts are keyed by the UniqueID of each window, so that the table callback can toggle them.
dictGraphMain = {} ; dictGraphSelfEdge = {}

# Relies on row i of dfAnalysis describing the graph listG[i].
for i in dfAnalysis.index:
    G   = listG[i]
    key = dfAnalysis['UniqueID'][i]
    
    # = = = Plot main part of the graph without self edges.
    Gplot = return_copy_without_self_edges(G)
    rendererGraph = bokehPlotting.from_networkx(Gplot, posNodes, scale=2, center=(0, 0))
    # Node size scales with the 'count' attribute (rescaled to 10..20); colour comes from 'node_color'.
    rendererGraph.node_renderer.glyph = bokehModels.Circle(size=bokehTransform.transform('count',rescale_linear(10,20)),
                                                           fill_color='node_color'
                                                          )
    # Edge width scales with the 'weight' attribute (rescaled to 1..5).
    rendererGraph.edge_renderer.glyph = bokehModels.MultiLine(line_color='black',
                                                              line_alpha=0.8,
                                                              line_width=bokehTransform.transform('weight',rescale_linear(1,5))
                                                             )
    rendererGraph.node_renderer.selection_glyph = bokehModels.Circle(size=20, fill_color='node_color')
    #sourceGraph = rendererGraph.node_renderer.data_source
    
    # = = = Plot self-edge part of the graph.
    # Self-edges are drawn separately as wedge outlines, from a data source filled by arrange_self_edges.
    sourceSelfEdge = bokehModels.ColumnDataSource(data=dict(x=[], y=[], a1=[], a2=[], weight=[]))
    #glyphSelfEdge = bokehModels.Circle(x="x", y="y", size=30, fill_alpha=0.0,
    #                                     line_color='grey', line_alpha=0.8, line_width=1)
    glyphSelfEdge = bokehModels.AnnularWedge(x="x", y="y", inner_radius=0, outer_radius=20, outer_radius_units='screen',
                                             start_angle="a1", end_angle="a2", fill_alpha=0.0,
                                             line_color='grey',
                                             line_alpha=0.8,
                                             line_width=bokehTransform.transform('weight',rescale_linear(1,5))
                                            )
    # The original graph G is passed here (not Gplot), since Gplot no longer has the self-edges.
    arrange_self_edges(G, sourceSelfEdge, rendererGraph.layout_provider.graph_layout)
    
    
    # = = = Add these renderers to the figure
    # Every graph starts out visible; the table selection callback later narrows this down.
    rendererSelfEdges = figA.add_glyph(sourceSelfEdge, glyphSelfEdge)
    rendererSelfEdges.visible=True
    figA.renderers.append(rendererGraph)
    rendererGraph.visible=True
    
    dictGraphMain[key]     = rendererGraph
    dictGraphSelfEdge[key] = rendererSelfEdges

# = = = = = = = = = = = = = = = = = = = = = = =
# = = = Additional bells and whistles
# = = = = = = = = = = = = = = = = = = = = = = =

# = = = Colour Bar
# NOTE(review): G is whatever graph the preceding loop handled last, so the legend is built
# from that single graph's node labels -- confirm that all graphs share the same labels.
colourList = get_node_color_label_map(G)
palette = colourPaletteCat
# Trim whichever of the two lists is longer, so that labels and colours pair up one-to-one.
if len(colourList) > len(palette):
    colourList = colourList[:len(palette)]
elif len(palette) > len(colourList):
    palette = colourPaletteCat[:len(colourList)]
colourMapper = bokehModels.mappers.CategoricalColorMapper(palette=palette, factors=colourList)
colourBar = bokehModels.ColorBar(name='SegID', color_mapper=colourMapper, label_standoff=12)
figA.add_layout(colourBar, 'right')    

# = = = Hover Tools
# One hover tool for all edges (graph edges plus self-edge wedges), showing the weight.
listTemp = [ dictGraphMain[k].edge_renderer for k in dictGraphMain.keys() ] + [ dictGraphSelfEdge[k] for k in dictGraphSelfEdge.keys() ]
edge_hover_tool = bokehModels.HoverTool(tooltips=[("weight", "@weight")], renderers=listTemp )
figA.add_tools(edge_hover_tool)

# A second hover tool for the nodes, showing the node index column.
listTemp = [ dictGraphMain[k].node_renderer for k in dictGraphMain.keys() ]
node_hover_tool = bokehModels.HoverTool(tooltips=[("index", "@index")], renderers=listTemp)
figA.add_tools(node_hover_tool)
#p.add_tools(bokehModels.HoverTool(tooltips=tooltips, renderers=[rendererA,rendererB]))
#p.add_tools(bokehModels.HoverTool(tooltips=tooltips, renderers=[rendererA,rendererB]))




In [None]:
# = = = Controls for allele comparison.
# A one-column table listing the UniqueID of every window; its row selection drives graph visibility.
dataTable = dict(items=[ dfAnalysis['UniqueID'][k] for k in dfAnalysis.index ])
sourceTable = bokehModels.ColumnDataSource(dataTable)
colsTable = [ bokehModels.TableColumn(field="items", title="Select Dataset") ]

tableData = bokehModels.DataTable(source=sourceTable, columns=colsTable, index_position=None,
                                  width=100, sizing_mode='stretch_height')

# = = = Hookup to update visibility
# Whenever the selected row indices change, show only the graphs and self-edges whose key is selected.
tableData.source.selected.js_on_change("indices",
                                       create_JS_update_visibility(sourceTable, dictGraphMain, dictGraphSelfEdge, figA)
                                      )
#graphRenderer.node_renderer.data_source.selected.js_on_change("indices", callbackBig)
#widgetSelect = bokehModels.Select(title="Option:", value="foo", options=["foo", "bar", "baz", "quux"],
#                            sizing_mode='stretch_height')

In [None]:
# Place the selection table to the left of the network figure and display both.
layout=bokehModels.Row(tableData, figA)

show(layout)

## Compare individual edges between all simulations.
The first half uses Bokeh for graph-based depictions, e.g. of differences.
The second half uses a Pandas data frame to utilise its box metric functionalities.

### Numerical and statistical plots

In [None]:
#= = = This is the first important part: which edge (pair of coarse-grained nodes) to compare across all graphs.
edgePair=('ND1','ND2')
sourceEdge = bokehModels.ColumnDataSource(data=dict(cat=[], val=[]))
#= = = This is the second important part: which annotation column to group the values by.
#sortingField = dfAnalysis['Allele'] 
sortingField = dfAnalysis['Functional_Category']
# One (category, weight) entry per window.
# NOTE(review): G.edges[edgePair] presumably raises KeyError when a graph lacks this edge -- confirm this is intended.
for i in dfAnalysis.index:
    G = listG[i] ; key = dfAnalysis['UniqueID'][i]
    sourceEdge.data['cat'].append( sortingField[i] )
    sourceEdge.data['val'].append( G.edges[edgePair]['weight'] )
# Distinct categories, as returned by unique().
listCategories=sortingField.unique()

In [None]:
# Copy the collected values into a dataframe so that the pandas group-by tools can be used.
df = pd.DataFrame(dict(cat=sourceEdge.data['cat'],
                       val=sourceEdge.data['val']))
# find the quartiles and IQR for each category
# sort=False: the groups are not sorted by key.
groups = df.groupby('cat', sort=False)
q0 = groups.quantile(q=0.00)
q1 = groups.quantile(q=0.25)
q2 = groups.quantile(q=0.50)
q3 = groups.quantile(q=0.75)
q4 = groups.quantile(q=1.0)
iqr = q3 - q1
# Whisker ends at 1.5 IQR beyond the first and third quartiles.
upper = q3 + 1.5*iqr
lower = q1 - 1.5*iqr

# assume no outliers, shrink lengths of stems to be no longer than the minimums or maximums
upper.val = [min([x,y]) for (x,y) in zip(list(q4.loc[:,'val']),upper.val)]
lower.val = [max([x,y]) for (x,y) in zip(list(q0.loc[:,'val']),lower.val)]

In [None]:
# = = = General numbers like percentiles, computed per category.
groups['val'].describe()

## Make box-plots as an overall visualisation

In [None]:
# Box and whisker widths, passed as the width argument of the vbar and rect glyphs.
widthBox=0.7
widthWhisker=0.2
# At least 500 px wide, growing by 60 px per category.
plotWidth=np.max( (500,60*listCategories.shape[0]) )
plotHeight=500

# The second-level x-axis labels report the sample size of each category.
# Both sequences are expected in first-appearance order (listCategories from unique(), groups built with sort=False).
listCounts = [ i for i in groups['val'].count() ]
labelsXAxis = [ (x,"N=%i"%i) for x,i in zip(listCategories,listCounts)]

figB = bokehPlotting.figure(plot_width=plotWidth, plot_height=plotHeight,
              tools=["pan","wheel_zoom", "tap", "reset", "save"],
              title="Overview",
              x_range=bokehModels.FactorRange(*labelsXAxis),
              y_axis_label="%s - %s" % edgePair)
# Activate the wheel-zoom tool that belongs to figB itself. The tool instance of figA is
# not part of this figure's toolbar and must not be assigned here.
figB.toolbar.active_scroll = figB.select_one(bokehModels.WheelZoomTool)
# Replaces the "Overview" title passed to the constructor above.
figB.title.text = "Cross graph comparisons"
figB.xgrid.grid_line_color = None
figB.xaxis.group_label_orientation = "vertical"
#figB.ygrid.grid_line_color = None
figB.ygrid.grid_line_dash = [3,3]
figB.ygrid.minor_grid_line_color = "lightgrey"
figB.ygrid.minor_grid_line_dash = [1,5]

# stems: from the third quartile up to the upper whisker, and from the first quartile down to the lower whisker
figB.segment(listCategories, upper.val, listCategories, q3.val, line_color="grey")
figB.segment(listCategories, lower.val, listCategories, q1.val, line_color="grey")

# boxes: two unfilled bars per category, meeting at the median (q2)
figB.vbar(listCategories, widthBox, q2.val, q3.val, fill_alpha=0.0, line_color="grey")
figB.vbar(listCategories, widthBox, q1.val, q2.val, fill_alpha=0.0, line_color="grey")

# whiskers (almost-0 height rects simpler than segments)
figB.rect(listCategories, lower.val, widthWhisker, 0.01, line_color="grey")
figB.rect(listCategories, upper.val, widthWhisker, 0.01, line_color="grey")

In [None]:
# Overlay the individual data points on the boxes, with a small normally-distributed
# horizontal jitter so that points of the same category do not sit on top of each other.
figB.scatter(source=sourceEdge,
             x=bokehTransform.jitter('cat', 0.1, mean=0, distribution='normal', range=figB.x_range),
             y='val',
             size=6, fill_color='#BBCCFF', line_color='black'
            )

In [None]:
# Display the box plot with the overlaid data points.
show(figB)

## Normality tests for statistical comparisons.

In [None]:
#groups['val'].aggregate(np.mean)
#groups['val'].aggregate([np.mean,'mean'])
#groups['val'].transform(lambda x: (x - x.mean())/x.std())
# Note: All statistical tests of normality have low power with small sample sizes,
# i.e. the false-negative rate is quite high: a small dataset needs to be quite abnormal to fail the test.
def test_normality_shapiro(x):
    """
    Suggested reading: https://stats.stackexchange.com/questions/13983/is-it-meaningful-to-test-for-normality-with-a-very-small-sample-size-e-g-n
    Says N=20~30 needed for Shapiro to reliably reject an exponential versus a normal distribution.
    """
    r = sp.stats.shapiro(x)
    return r[1]>0.05

def test_normality_dagostino(x):
    """
    Minimum 8 samples.
    """
    r = sp.stats.normaltest(x)
    return r[1]>0.05

def test_normality_anderson(x):
    """
    Minimum 4 samples? Otherwise auto-False. Also is probably Auto-True for N=4~6.
    """
    r = sp.stats.anderson(x)
    # Significance levels: 15, 10, 5, 2.5, 1
    # r.significance_level[i] 
    return [ r.statistic < r.critical_values[i] for i in range(len(r.critical_values)) ]

def test_similarity_ttest(x1, x2, bSameVariance=False):
    """
    Assumes normal distribution of data.
    True uses the independent 2-sample t-test (Student's)
    False uses Welch's T-test.
    """
    r = sp.stats.ttest_ind(x1, x2, equal_var=bSameVariance)
    return r[1]>0.05
    
def test_similiarity_mannwhitneyu(x1, x2):
    """
    Does not assumes normal distribution of data, or similarity of variances.
    Assumes rank-order ability.
    """
    r = sp.stats.mannwhitneyu(x1,x2)
    return r[1]>0.05

In [None]:
#groups['val'].aggregate([test_normality_shapiro,test_normality_dagostino,test_normality_anderson])
# Apply the normality tests to the values of each category; the active call leaves out the
# D'Agostino test that the commented-out variant above includes.
# Result: one row per category, one column per test function, in the order listed.
dfNormalityTest = groups['val'].aggregate([test_normality_shapiro,test_normality_anderson]) 

In [None]:
# Split the categories into those that pass every normality check and those that fail at least one.
sourceNormal    = bokehModels.ColumnDataSource(data=dict(cat=[]))
sourceNotNormal = bokehModels.ColumnDataSource(data=dict(cat=[]))
for i,k in enumerate(dfNormalityTest.index):
    # Column 1 holds the list of Anderson results and column 0 the single Shapiro result;
    # they are joined into one list, all of whose entries must be True.
    if np.all( dfNormalityTest.values[i][1]+[dfNormalityTest.values[i][0]] ):
        sourceNormal.data['cat'].append( k )
    else:
        sourceNotNormal.data['cat'].append( k )

dfNormalityTest

### Comparison tests to see if groups are significantly different from each other.

In [None]:
# Compare every pair of categories with both similarity tests. A pair counts as
# similar only when it passes the t-test and the Mann-Whitney U test.
dictIndices = groups['val'].indices
sourceSimilar = bokehModels.ColumnDataSource(data=dict(catA=[], catB=[]))
sourceDissimilar = bokehModels.ColumnDataSource(data=dict(catA=[], catB=[]))
for k1, k2 in combinations(dictIndices.keys(), 2):
    s1 = df['val'][dictIndices[k1]]
    s2 = df['val'][dictIndices[k2]]
    # Both tests are always evaluated, before the results are combined.
    bPassT = test_similarity_ttest(s1, s2)
    bPassU = test_similiarity_mannwhitneyu(s1, s2)
    sourceTarget = sourceSimilar if (bPassT and bPassU) else sourceDissimilar
    sourceTarget.data['catA'].append(k1)
    sourceTarget.data['catB'].append(k2)

# = = = Estimate error rate. Expect 5% to come up.
# Fraction of all tested pairs that failed at least one similarity test. Dividing by the
# total number of pairs (not by the number of similar pairs) yields a true fraction in [0, 1]
# and cannot divide by zero when no pair is similar.
nPairsSimilar    = len(sourceSimilar.data['catA'])
nPairsDissimilar = len(sourceDissimilar.data['catA'])
nPairsTotal = nPairsSimilar + nPairsDissimilar
ratioSim = nPairsDissimilar/nPairsTotal if nPairsTotal > 0 else 0.0

In [None]:
# = = = Plot similarity matrix.
# Square canvas: at least 500 px per side, growing by 50 px per category.
plotWidth=plotHeight=np.max( (500,50*listCategories.shape[0]) )
# Categorical axes on both sides; the y-axis is reversed so that the diagonal runs from top-left to bottom-right.
figC = bokehPlotting.figure(plot_width=plotWidth, plot_height=plotHeight,
              tools=["tap", "reset", "save"],
              title="Overview",
              x_range=listCategories, y_range=[l for l in reversed(listCategories)] )

# Look the scroll tool up on figC itself rather than on figA: a tool instance of another
# figure is not part of this toolbar. figC is created without a wheel-zoom tool, so the
# lookup yields None, i.e. no active scroll tool.
figC.toolbar.active_scroll = figC.select_one(bokehModels.WheelZoomTool)
# Replaces the "Overview" title passed to the constructor above.
figC.title.text = "Normality/pairwise-similarity tests (p>0.05 per test). %.1f%% of pairs are not similar." % (ratioSim*100)
figC.xgrid.grid_line_color = None
figC.ygrid.grid_line_color = None
figC.xaxis.major_label_orientation = "vertical"

# Marker properties shared by all four scatter layers.
propGlyphs=dict( size=12, line_alpha=1.0, line_color='black')
                #size_units='data')
# Off-diagonal markers: outcome of the pairwise similarity tests for each (catA, catB) pair.
figC.scatter(x='catA',y='catB',source=sourceSimilar, fill_color='blue', marker='plus', legend_label='Passes all similarity tests.', **propGlyphs)
figC.scatter(x='catA',y='catB',source=sourceDissimilar, fill_color='yellow', marker='triangle', legend_label='Fails 1+ similiarity test.', **propGlyphs)
# Diagonal markers: outcome of the normality tests for each single category.
figC.scatter(x='cat',y='cat',source=sourceNormal, fill_color='green', marker='plus',  legend_label='Passes all normality tests.', **propGlyphs)
figC.scatter(x='cat',y='cat',source=sourceNotNormal, fill_color='pink', marker='triangle', legend_label='Fails 1+ normality test.', **propGlyphs)

# NOTE(review): the tooltip fields catA/catB exist only in the two pairwise sources; the
# diagonal sources carry just 'cat' -- confirm what the tooltip shows there.
hoverTool = bokehModels.HoverTool(tooltips=[("a", "@catA"),("b", "@catB")], point_policy='snap_to_data')
figC.add_tools(hoverTool)


In [None]:
# Display the normality / pairwise-similarity matrix.
show(figC)