In [1]:
import numpy as np
import pandas as pd

import iqplot

import bokeh.io
import bokeh.models
import bokeh.plotting

# Configure Bokeh so that subsequent bokeh.io.show() calls render inline in the notebook
bokeh.io.output_notebook()

In [2]:
# Read the frog tongue adhesion measurements; lines beginning with '#'
# in the CSV are treated as comments and skipped.
df = pd.read_csv(
    "data/frog_tongue_adhesion.csv",
    comment="#",
)

# Preview the first few rows as a reminder of the column layout
df.head()

Unnamed: 0,date,ID,trial number,impact force (mN),impact time (ms),impact force / body weight,adhesive force (mN),time frog pulls on target (ms),adhesive force / body weight,adhesive impulse (N-s),total contact area (mm2),contact area without mucus (mm2),contact area with mucus / contact area without mucus,contact pressure (Pa),adhesive strength (Pa)
0,2013_02_26,I,3,1205,46,1.95,-785,884,1.27,-0.29,387,70,0.82,3117,-2030
1,2013_02_26,I,4,2527,44,4.08,-983,248,1.59,-0.181,101,94,0.07,24923,-9695
2,2013_03_01,I,1,1745,34,2.82,-850,211,1.37,-0.157,83,79,0.05,21020,-10239
3,2013_03_01,I,2,1556,41,2.51,-455,1025,0.74,-0.17,330,158,0.52,4718,-1381
4,2013_03_01,I,3,493,36,0.8,-974,499,1.57,-0.423,245,216,0.12,2012,-3975


# Plots with categorical variables

Different kinds of data we may encounter:
* Quantitative - continuously varying (therefore ordered) values
* Categorical - discrete, unordered values that a variable can take
* Ordinal - discrete, ordered values. Integers are a classic example
* Temporal - time

Ordinal can be quantitative or treated as categorical

Temporal can also be quantitative

# Bar graph

Don't make bar graphs!




In [4]:
# Mean impact force per frog, with ID kept as an ordinary column
# (as_index=False) rather than moved into the index.
df_mean = df.groupby("ID", as_index=False)["impact force (mN)"].mean()

# Display the summary table
df_mean

Unnamed: 0,ID,impact force (mN)
0,I,1530.2
1,II,707.35
2,III,550.1
3,IV,419.1


To set up a categorical axis, specify `x_range` (or `y_range` for a horizontal plot, as in the next cell) as a list of the categories.

In [5]:
# Figure with a categorical y-axis: the frog IDs are passed as y_range,
# in reversed order relative to how they appear in df_mean.
p = bokeh.plotting.figure(
    y_range=df_mean["ID"].unique()[::-1],
    x_axis_label="impact force (mN)",
    frame_width=400,
    frame_height=200,
    tools="pan,wheel_zoom,save,reset",
)

In [6]:
# Wrap the per-frog means so the glyph can refer to columns by name
source = bokeh.models.ColumnDataSource(df_mean)

# One horizontal bar per frog ID; the right edge is the mean impact force
p.hbar(
    y="ID",
    right="impact force (mN)",
    height=0.6,
    source=source,
)

# Quantitative axis starts at the origin
p.x_range.start = 0

# No grid lines along the categorical axis
p.ygrid.grid_line_color = None

bokeh.io.show(p)

Make vertical by specifying x_range and using p.vbar()

In [7]:
# Vertical version: the categories go on the x-axis via x_range
p = bokeh.plotting.figure(
    x_range=df_mean["ID"].unique(),
    y_axis_label="impact force (mN)",
    frame_width=250,
    frame_height=250,
)

# One vertical bar per frog ID; the top edge is the mean impact force
p.vbar(
    x="ID",
    top="impact force (mN)",
    width=0.6,
    source=source,
)

# Quantitative axis starts at the origin; no grid lines on the categorical axis
p.y_range.start = 0
p.xgrid.grid_line_color = None

bokeh.io.show(p)

# iqplot

iqplot generates plots from tidy data frames where one or more columns contain categorical data and the column of interest is quantitative

## Plots with a categorical axis
* Box plots
* Strip plots
* Strip-box plots
* Parallel coordinate plots
## Plots without a categorical axis
* histograms
* ECDFs

The first seven arguments are the same for all plots:

data: A tidy data frame or Numpy array.

q: The column of the data frame to be treated as the quantitative variable.

cats: A list of columns in the data frame that are to be considered as categorical variables in the plot. If None, a single box, strip, histogram, or ECDF is plotted.

q_axis: The axis, 'x' or 'y', along which the quantitative variable varies. The default is 'x'.

palette: A list of hex colors to use for coloring the markers for each category. By default, it uses the Glasbey Category 10 color palette from colorcet.

order: If specified, the ordering of the categories to use on the categorical axis and legend (if applicable). Otherwise, the order of the inputted data frame is used.

p: If specified, the bokeh.plotting.Figure object to use for the plot. If not specified, a new figure is created.



# Box plots with iqplot

In [8]:
# Box plot of impact force, one box per frog ID
p = iqplot.box(data=df, q="impact force (mN)", cats="ID")

bokeh.io.show(p)

# Plot all your data
# Strip plots

In [9]:
# Strip plot: every impact-force measurement is drawn, grouped by frog ID
p = iqplot.strip(data=df, q="impact force (mN)", cats="ID")

bokeh.io.show(p)

# Strip-box plots

In [10]:
# Strip-box plot of impact force, grouped by frog ID
p = iqplot.stripbox(data=df, q="impact force (mN)", cats="ID")

bokeh.io.show(p)

# Histograms


In [12]:
# Histogram of impact force, one histogram per frog ID
p = iqplot.histogram(data=df, q="impact force (mN)", cats="ID")

bokeh.io.show(p)

# ECDFs

Shows distributions better


In [13]:
# Draw 500 samples from a standard Normal distribution using a seeded
# generator, so the same numbers come out on every run.
rg = np.random.default_rng(seed=3252)
x = rg.normal(size=500)

# Histogram of the samples, with the rug turned off
p = iqplot.histogram(data=x, rug=False)

bokeh.io.show(p)

Histograms suffer from binning bias. If you can plot all your data, you should.

ECDF evaluated at x for a set of measurements is:

ECDF(x) = fraction of measurements <= x


The ECDF is the empirical counterpart of the cumulative distribution function (CDF).

In [14]:
# ECDF of the same Normal samples that were shown as a histogram
p = iqplot.ecdf(data=x)

bokeh.io.show(p)

In [15]:
# ECDFs of impact force, one per frog ID
p = iqplot.ecdf(data=df, q="impact force (mN)", cats="ID")

bokeh.io.show(p)

In [16]:
# The ECDF is defined for every x, not only at the measured values: it is
# piecewise constant and jumps up at each data point. Formally it is therefore
# a step function and should be drawn as a staircase.
p = iqplot.ecdf(
    data=df,
    q="impact force (mN)",
    cats="ID",
    style="staircase",
)

bokeh.io.show(p)

In [17]:
# Overlay the default-style ECDF on top of the staircase ECDF.
#
# The staircase figure is created inside this cell rather than reusing the
# `p` left over from an earlier cell. That keeps the cell idempotent:
# re-running it, or running cells out of order, cannot stack duplicate
# glyphs onto an already-populated (or unrelated) figure.
p = iqplot.ecdf(
    data=df,
    q="impact force (mN)",
    cats="ID",
    style="staircase",
)

# Passing p=p tells iqplot to draw onto the existing figure instead of
# creating a new one. show_legend=False presumably avoids adding a second
# legend for the overlaid glyphs -- confirm against the iqplot documentation.
p = iqplot.ecdf(
    data=df,
    q="impact force (mN)",
    cats="ID",
    p=p,
    show_legend=False,
)

bokeh.io.show(p)

# Don't make bar graphs

Instead, plot all your data when you can. If you can't, box plots are always better than bar graphs