In [1]:
from matplotlib import rcParams
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from asymmetric_kde import ImproperGammaEstimator, ProperGammaEstimator
from multiprocessing import Pool
import copy_reg
import types
import pandas as pd

# Seed passed to np.random.seed before the synthetic samples are drawn
seed = 123

In [2]:
def pmap(function, sequence, processes=None):
    """
    Parallel map.

    Apply `function` to every item of `sequence` using a pool of worker
    processes and return the results as a list, in input order.

    Parameters
    ----------
    function : callable
        Callable applied to each item. It is sent to the workers, so it has
        to be picklable.
    sequence : iterable
        Items to process.
    processes : int, optional
        Number of worker processes handed to `multiprocessing.Pool`; `None`
        lets the pool pick its default.

    Returns
    -------
    list
        `[function(item) for item in sequence]`.

    Raises
    ------
    Exception
        Whatever `function` raises in a worker is re-raised here after the
        pool has been shut down.
    """
    pool = Pool(processes)
    try:
        result = pool.map(function, sequence)
        # No more work will be submitted; let the workers exit normally.
        pool.close()
    except BaseException:
        # Do not leave orphaned workers behind when the map fails or is
        # interrupted.
        pool.terminate()
        raise
    finally:
        # Wait for the workers to exit so repeated calls do not accumulate
        # processes.
        pool.join()
    return result

def _pickle_method(method):
    """
    Reduce a bound method for pickling.

    Returns the reconstructor `_unpickle_method` together with the arguments
    it needs: the name of the underlying function, the instance the method is
    bound to and the class it was looked up on.
    """
    state = (method.im_func.__name__, method.im_self, method.im_class)
    return _unpickle_method, state

def _unpickle_method(func_name, obj, cls):
    for cls in cls.mro():
        try:
            func = cls.__dict__[func_name]
        except KeyError:
            pass
        else:
            break
    return func.__get__(obj, cls)

# Register the reducer for bound methods so that calls such as
# pmap(kde.evaluate_cv_score, bandwidths) can ship the method to the workers.
copy_reg.pickle(types.MethodType, _pickle_method, _unpickle_method)

In [3]:
# Set up the plotting parameters
# Figure width in points -- presumably the text width of the target document
# (TODO confirm).
width_pt = 345
# Convert points to inches for matplotlib's figsize. The float literal avoids
# integer division, which would truncate the width to 4 under Python 2.
width = width_pt / 72.
aspect = 3./4
height = width * aspect

rcParams['font.size'] = 9
rcParams['legend.fontsize'] = 'medium'
rcParams['figure.dpi'] = 160
# Same line width for axes spines and plotted lines
rcParams['axes.linewidth'] = rcParams['lines.linewidth'] = .75

In [4]:
# Generate samples
# Seed NumPy's global generator before drawing
np.random.seed(seed)
# Parameters of the lognormal on the log scale
log_mean = 1
log_std = 1
# scipy's lognorm takes the log-scale standard deviation as its shape
# parameter and exp(log-scale mean) as its scale
distribution = stats.lognorm(log_std, scale=np.exp(log_mean))
samples = distribution.rvs(size=300)

# Improper gamma estimator

The code below estimates the density using an improper gamma kernel estimator as defined by Chen (2000).

In [5]:
# Improper gamma KDE of the samples, bandwidth chosen by the plugin method
kde = ImproperGammaEstimator(samples, 'plugin')

fig, ax = plt.subplots(figsize=(width, height))

# Evaluation grid; starts slightly above zero
x = np.linspace(1e-4, 15, 500)

# Estimated density (solid) against the generating density (dotted)
ax.plot(x, kde(x), label='Improper gamma estimator', color='k')
ax.plot(x, distribution.pdf(x), label='Generating distribution', color='k',
        ls=':')

# Rug plot of the raw samples along the horizontal axis
ax.scatter(samples, np.zeros_like(samples), color='k', marker='|')

# Gaussian approximation of the estimator (dashed)
kde_sample_smoothing = kde.to_variable_gaussian()
ax.plot(x, kde_sample_smoothing.evaluate(x), label='Gaussian approximation',
        color='k', ls='--')

# Axes cosmetics
ax.set_xlim(-1, 15)
ax.set_xlabel('Random variable $X$')
ax.set_ylabel('Density')
ax.legend(loc='best', frameon=False)
fig.tight_layout()

# Export in both formats
for target in ('paper/improper-gamma.pdf', 'paper/improper-gamma.ps'):
    fig.savefig(target, bbox_inches='tight')
fig.show()

# Proper gamma estimator

The code below estimates the density using a proper gamma kernel estimator as defined by Jeon and Kim (2014).

In [6]:
# Proper gamma KDE of the samples, bandwidth chosen by the plugin method
kde = ProperGammaEstimator(samples, 'plugin')

fig, ax = plt.subplots(figsize=(width, height))

# Evaluation grid; starts slightly above zero
x = np.linspace(1e-4, 15, 500)

# Estimated density (solid) against the generating density (dotted)
ax.plot(x, kde(x), label='Proper gamma estimator', color='k')
ax.plot(x, distribution.pdf(x), label='Generating distribution', color='k',
        ls=':')

# Rug plot of the raw samples along the horizontal axis
ax.scatter(samples, np.zeros_like(samples), color='k', marker='|')

# Gaussian approximation of the estimator (dashed)
kde_sample_smoothing = kde.to_variable_gaussian()
ax.plot(x, kde_sample_smoothing.evaluate(x), label='Gaussian approximation',
        color='k', ls='--')

# Axes cosmetics
ax.set_xlim(-1, 15)
ax.set_xlabel('Random variable $X$')
ax.set_ylabel('Density')
ax.legend(loc='best', frameon=False)
fig.tight_layout()

# Export in both formats
for target in ('paper/proper-gamma.pdf', 'paper/proper-gamma.ps'):
    fig.savefig(target, bbox_inches='tight')
fig.show()

# Comparison of LOO and plugin method

The code below generates 1000 realisations of 300 samples drawn from a lognormal distribution with logarithmic mean and variance equal to unity. For each realisation, it computes the MISE using leave-one-out (LOO) cross-validation and the Gaussian approximation we have developed. The MISE curves are plotted with an arbitrary offset because we are only interested in the shape of the curves rather than their absolute values (which would, however, be useful for evaluating the performance of the estimator).

In [7]:
# Reference distribution the realisations are drawn from
distribution = stats.lognorm(log_std, scale=np.exp(log_mean))
# Candidate bandwidths: 50 logarithmically spaced values from 0.01 to 1
bandwidths = np.logspace(-2, 0, 50)

# One row of scores per realisation, one entry per candidate bandwidth
plugin_scores = []
cv_scores = []

# Number of realisations
runs = 1000
print("Evaluating cross-validation scores...")
for run in range(runs):
    # Report progress every 50 realisations
    if not (run + 1) % 50:
        print(run + 1)
    # Draw a fresh data set
    X = distribution.rvs(size=300)
    # Estimator without bandwidth selection; bandwidths are supplied below
    kde = ProperGammaEstimator(X, None)
    # Plugin (asymptotic) scores are evaluated serially ...
    asymptotic = [kde.evaluate_asymptotic_score(bw) for bw in bandwidths]
    plugin_scores.append(asymptotic)
    # ... while the cross-validation scores are evaluated in parallel
    cross_validated = pmap(kde.evaluate_cv_score, bandwidths)
    cv_scores.append(cross_validated)

print("Done.")

Evaluating cross-validation scores...
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
Done.


In [8]:
def plot_scores(scores, f=5, color='k', ls='-', offset=0, label=None, ax=None):
    """
    Plot the median score profile with a percentile band.

    The profiles are shifted vertically so that the minimum of the median
    curve sits at `offset`. The horizontal coordinates are taken from the
    module-level `bandwidths` array, which must have one entry per column of
    `scores`.

    Parameters
    ----------
    scores : array-like, shape (runs, len(bandwidths))
        One row of scores per realisation. The input is never modified.
    f : float
        Lower percentile of the band; the upper one is `100 - f`.
    color : matplotlib color
        Color of the median line and of the marker at its minimum.
    ls : str
        Line style of the median line.
    offset : float
        Vertical position of the minimum of the median curve.
    label : str, optional
        Legend label of the median line.
    ax : matplotlib axes, optional
        Axes to draw on; defaults to the current axes.
    """
    # Get default axes if none are given
    if ax is None:
        ax = plt.gca()

    # Work on a private floating point copy: shifting the caller's array in
    # place would corrupt it (and fails outright for integer input).
    scores = np.array(scores, dtype=float)
    median = np.median(scores, axis=0)

    # Shift everything so that the minimum of the median equals `offset`
    shift = offset - np.min(median)
    median = median + shift
    scores += shift

    # Percentile band across realisations
    lower = np.percentile(scores, f, axis=0)
    upper = np.percentile(scores, 100 - f, axis=0)

    # Plot the band, the median and a marker at the minimum of the median
    best = np.argmin(median)
    ax.fill_between(bandwidths, lower, upper, color='silver')
    ax.plot(bandwidths, median, color=color, ls=ls, label=label)
    ax.scatter(bandwidths[best], median[best], 10, color=color)

# Compare the plugin and LOO cross-validation score profiles in one figure
fig = plt.figure(figsize=(width, height))
ax = fig.add_subplot(111)

plot_scores(plugin_scores, label='Plugin', ax=ax)
# The cross-validation profile is lifted by an arbitrary offset so that the
# two curves do not overlap
plot_scores(cv_scores, ls='--', offset=0.07, label='LOO cross-validation', ax=ax)
ax.set_xscale('log')
# A logarithmic axis cannot start at zero; start at the smallest bandwidth
ax.set_xlim(np.min(bandwidths), 1)
# Raw string so that the backslash of the TeX macro is kept verbatim
ax.set_xlabel(r'Bandwidth $\sigma$')
ax.set_ylabel('MISE score (arbitrary offset)')
ax.legend(loc='best', frameon=False)

fig.tight_layout()
fig.savefig('paper/bandwidth-comparison.pdf', bbox_inches='tight')
fig.savefig('paper/bandwidth-comparison.ps', bbox_inches='tight')
fig.show()

# Mitochondrial nucleoids

The following code performs density estimation for the number of mitochondrial nucleoids observed in some cells. Data were collected by Juvid Aryaman and Hanne Hoitzing--thanks.

In [9]:
# Load the data
# `filename` is also used further down as the key into `dict_bandwidths`
filename = 'data/ddc_0.dat'
samples = np.loadtxt(filename)

In [10]:
# Figure with a single set of axes
fig, ax = plt.subplots(figsize=(width, height))

# Proper gamma KDE with plugin bandwidth selection
kde = ProperGammaEstimator(samples, 'plugin')
print("Plugin bandwidth: {}".format(kde.bandwidth))

# Evaluate the estimate from just above zero up to the largest observation
x = np.linspace(1e-4, np.max(samples), 500)
ax.plot(x, kde(x), label='Plugin', color='k')

# Rug plot of the observations
ax.scatter(samples, np.zeros_like(samples), color='k', marker='|')

# ax.legend(loc=0, frameon=False)

Plugin bandwidth: 1.01989555293


<matplotlib.collections.PathCollection at 0x11b32d8d0>

In [11]:
# Bandwidth grid for each data file (logarithmically spaced)
dict_bandwidths = {
    'data/ddc_0.dat': np.logspace(np.log10(.07), np.log10(3), num=50),
    'data/ddc_3.dat': np.logspace(np.log10(.1), 0, num=50),
    'data/ddc_10.dat': np.logspace(np.log10(.1), 0, num=20),
    'data/ddc_25.dat': np.logspace(np.log10(.1), 0, num=20),
}
# Grid belonging to the file that is currently loaded
bandwidths = dict_bandwidths[filename]

# Estimator without bandwidth selection; only used to evaluate the scores
dummy = ProperGammaEstimator(samples, None)

# MISE profile from the plugin (asymptotic) score, evaluated serially
mise_plugin = [dummy.evaluate_asymptotic_score(bw) for bw in bandwidths]
# MISE profile from LOO cross-validation, evaluated in parallel
mise_cv = pmap(dummy.evaluate_cv_score, bandwidths)

In [12]:
# Left panel: density estimate; right panel: MISE profiles
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(width, height))

# Proper gamma KDE with plugin bandwidth selection
x = np.linspace(1e-4, np.max(samples), 500)
kde = ProperGammaEstimator(samples, 'plugin')
print("Plugin bandwidth: {}".format(kde.bandwidth))
ax1.plot(x, kde(x), label='Proper gamma', color='k')

# Faint rug plot of the observations
ax1.scatter(samples, np.zeros_like(samples), color='k', marker='|', alpha=.1)

ax1.set_ylabel('Density')
ax1.set_xlabel('Nucleoid count')

# Both profiles are shifted so that their minimum is at zero; the plugin
# profile is lifted by a small offset on top of that
offset = 2e-4
plugin_profile = mise_plugin - np.min(mise_plugin) + offset
cv_profile = mise_cv - np.min(mise_cv)
ax2.plot(bandwidths, plugin_profile, label='Plugin', color='k')
ax2.plot(bandwidths, cv_profile, label='LOO', color='k', ls='--')
# Mark the bandwidth selected by the plugin method
ax2.scatter(kde.bandwidth, offset, 10, color='k')
ax2.set_xscale('log')
ax2.legend(loc=0, frameon=False)
# The vertical scale is arbitrary, so the ticks are hidden
ax2.set_yticks([])
ax2.set_xlabel(r'Bandwidth $\sigma$')
ax2.set_ylabel('MISE')
ax2.set_xlim(.05, 5)

fig.tight_layout()
fig.savefig('paper/nucleoid_0.pdf')

Plugin bandwidth: 1.01989555293


In [13]:
# 2x2 grid of panels with shared axes, one panel per ddC concentration.
# sharex/sharey are passed by keyword: current matplotlib releases no longer
# accept them positionally.
fig, axes = plt.subplots(2, 2, sharex=True, sharey=True,
                         figsize=(width, height))

filenames = ['data/ddc_0.dat', 'data/ddc_3.dat',
             'data/ddc_10.dat', 'data/ddc_25.dat']
# Concentration matching each entry of `filenames`
concentrations = [0, 3, 10, 25]

# The evaluation grid is the same for every panel
x = np.linspace(1e-4, 200, 500)

for i, (filename, ax) in enumerate(zip(filenames, axes.ravel())):
    samples = np.loadtxt(filename)
    # Proper gamma KDE with plugin bandwidth selection
    kde = ProperGammaEstimator(samples, 'plugin')
    ax.plot(x, kde(x), color='k')
    # Faint rug plot of the observations
    ax.scatter(samples, np.zeros_like(samples), marker='|', color='k',
               alpha=.1)
    # Only the left column gets a y label and only the bottom row an x label
    if i % 2 == 0:
        ax.set_ylabel('Density')
    if i > 1:
        ax.set_xlabel('Nucleoids')

    ax.set_xlim(-10, 200)

    # Annotate the panel in its top right corner (axes coordinates)
    ax.text(.95, .95, u'ddC concentration {}µM'.format(concentrations[i]),
            transform=ax.transAxes, ha='right', va='top')

fig.tight_layout()
fig.savefig('paper/nucleoids.pdf')

# Nano particle sizes

In [14]:
# Worksheets of the workbook to process, one figure per sheet
sheets = ['MBG15', 'MBG3', 'Manchester', 'MBG14', 'MBG70H', 'MBG61H', 'MBG61']
filename = 'data/particles.xlsx'

for sheet in sheets:
    # Read the sheet without a header row and flatten it to a 1-d array
    samples = pd.read_excel(filename, sheet, header=None).values.ravel()

    # One figure per sheet, titled with the sheet name
    fig, ax = plt.subplots(figsize=(width, height))
    ax.set_title(sheet)

    # Proper gamma KDE with plugin bandwidth selection
    kde = ProperGammaEstimator(samples, 'plugin')
    print("Plugin bandwidth: {}".format(kde.bandwidth))
    x = np.linspace(1e-4, np.max(samples), 500)
    ax.plot(x, kde(x), label='Plugin', color='k')

    # Rug plot of the observations
    ax.scatter(samples, np.zeros_like(samples), color='k', marker='|')

    # ax.legend(loc=0, frameon=False)

Plugin bandwidth: 0.0893410085052
Plugin bandwidth: 0.132003063291
Plugin bandwidth: 0.107830184547
Plugin bandwidth: 0.157367001078
Plugin bandwidth: 0.357098596814
Plugin bandwidth: 0.254730665917
Plugin bandwidth: 0.266774181792
