In [1]:
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import stats
from scipy.stats import qmc
from sklearn.neighbors import KernelDensity

In [2]:
def quantify_null_density_area(kde, num_features, threshold=1e-5, num_samples=10000, seed=None):
    """
    Quantify the proportion of feature space [0, 1]^p with density below a threshold
    using Latin Hypercube Sampling.
    
    Parameters:
    kde : sklearn.neighbors.KernelDensity object
        The trained KDE object (anything exposing ``score_samples`` that returns
        log-densities for an ``(n, p)`` array works)
    num_features : int
        The number of features (dimensionality of the space)
    threshold : float, optional
        The density threshold below which is considered "null" (default 1e-5)
    num_samples : int, optional
        The number of points to sample using LHS (default 10000)
    seed : None, int or numpy.random.Generator, optional
        Seed forwarded to the Latin Hypercube sampler. The default (None) keeps
        the original behaviour of drawing a fresh, non-reproducible sample on
        every call; pass a value to make the estimate repeatable.
    
    Returns:
    float
        The proportion of the feature space with density below the threshold
    
    Raises:
    ValueError
        If ``num_samples`` is smaller than 1 (the proportion would be undefined).
    """
    if num_samples < 1:
        raise ValueError("num_samples must be at least 1")
    
    # Create Latin Hypercube sampler
    sampler = qmc.LatinHypercube(d=num_features, seed=seed)
    
    # Generate samples in [0, 1]^p space
    lhs_samples = sampler.random(n=num_samples)
    
    # Evaluate the KDE at each sample point (score_samples returns log-densities)
    log_densities = kde.score_samples(lhs_samples)
    densities = np.exp(log_densities)
    
    # Calculate the proportion of low-density areas
    # n_null_density_samples / n_samples
    null_density_proportion = np.mean(densities < threshold)
    
    return null_density_proportion

In [9]:
def scott_bandwidth(data):
    """
    Compute a single Scott's-rule style bandwidth for a sample.

    The spread term is the (population, ddof=0) standard deviation of each
    point's Euclidean distance to the sample mean, not a per-feature standard
    deviation, so the result is one scalar regardless of dimensionality.

    Parameters:
    data : numpy.ndarray
        Samples of shape (n,) or (n, d); a 1-D input is treated as n points
        with a single feature.

    Returns:
    float
        n**(-1/(d+4)) multiplied by the spread term described above.
    """
    # Promote 1-D input to a column so rows are always points
    samples = np.atleast_2d(data.T).T
    n_points, n_dims = samples.shape

    # Distance of every point from the centroid
    centroid = samples.mean(axis=0)
    radii = np.sqrt(np.square(samples - centroid).sum(axis=1))

    shrink = n_points ** (-1.0 / (n_dims + 4))
    return shrink * np.std(radii)

def silverman_bandwidth(data):
    """
    Compute a single Silverman's-rule style bandwidth for a sample.

    The spread term is the smaller of (a) the sample standard deviation
    (ddof=1) and (b) the interquartile range divided by 1.34, both taken over
    each point's Euclidean distance to the sample mean.

    Parameters:
    data : numpy.ndarray
        Samples of shape (n,) or (n, d); a 1-D input is treated as n points
        with a single feature.

    Returns:
    float
        0.9 * spread * n**(-1/(d+4)).
    """
    # Promote 1-D input to a column so rows are always points
    samples = np.atleast_2d(data.T).T
    n_points, n_dims = samples.shape

    # Distance of every point from the centroid
    offsets = samples - samples.mean(axis=0)
    radii = np.sqrt(np.square(offsets).sum(axis=1))

    # Robust spread: whichever of std and normalized IQR is smaller
    spread = np.minimum(np.std(radii, ddof=1), stats.iqr(radii) / 1.34)

    # Silverman's rule
    return 0.9 * spread * n_points ** (-1.0 / (n_dims + 4))

In [15]:
# Interactive view of a 1-D Gaussian KDE: a slider switches between
# pre-computed curves for 40 bandwidths and reports, for each one, the share
# of [0, 1] whose estimated density falls below the null threshold.

# Generate sample data
np.random.seed(42)
# Alternative inputs kept for experimentation:
# data = np.concatenate([
#     np.random.normal(0.3, 0.05, 200),
#     np.random.normal(0.7, 0.05, 300)
# ])
# data = np.random.normal(0, 1, 500)
data = np.random.rand(500)

data = data[(data >= 0) & (data <= 1)]  # Ensure all data is between 0 and 1

# Calculate Scott's rule and Silverman's rule bandwidths
scott_bw = scott_bandwidth(data)
silverman_bw = silverman_bandwidth(data)

# Create the figure
fig = make_subplots(rows=1, cols=1)

# Add normalized histogram
fig.add_trace(go.Histogram(x=data, name='Histogram', opacity=0.7, nbinsx=30, histnorm='probability density'))

# Generate points for KDE plotting
x_plot = np.linspace(0, 1, 1000)

# Create KDE traces for different bandwidths
bandwidths = np.logspace(-4, 0, 40)  # Log scale from 0.0001 to 1
kde_traces = []
null_density_areas = []

for i, bw in enumerate(bandwidths):
    kde = KernelDensity(kernel='gaussian', bandwidth=bw).fit(data.reshape(-1, 1))
    y_plot = np.exp(kde.score_samples(x_plot.reshape(-1, 1)))
    # Only the first curve starts visible, matching the slider's initial position.
    # Visibility is set at construction time because the figure keeps its own
    # copy of each added trace; changing `trace` after add_trace would not
    # affect what is rendered.
    trace = go.Scatter(x=x_plot, y=y_plot, mode='lines', name='KDE', visible=(i == 0))
    kde_traces.append(trace)
    fig.add_trace(trace)
    
    # Calculate null density area
    null_density_area = quantify_null_density_area(kde, num_features=1)
    null_density_areas.append(null_density_area)

# Add small symbols to show data points
fig.add_trace(go.Scatter(x=data, y=np.zeros_like(data), mode='markers', marker=dict(size=3, color='black', symbol='line-ns-open'), name='Data points'))

# One annotation per bandwidth; reused by the slider steps and the initial layout
null_area_annotations = [
    {
        "text": f"Null density area: {area:.4f}",
        "x": 1.,
        "y": 1.1,
        "xref": "paper",
        "yref": "paper",
        "showarrow": False,
        "font": {"size": 14}
    }
    for area in null_density_areas
]

# Create slider
# Trace order in the figure: [histogram, KDE_0 .. KDE_{k-1}, data points]
steps = []
for i, bw in enumerate(bandwidths):
    visible = [True] + [False] * len(bandwidths) + [True]
    visible[i + 1] = True  # +1 because of histogram
    step = dict(
        method="update",
        args=[
            {"visible": visible},
            {"annotations": [null_area_annotations[i]]}
        ],
        label=f"{bw:.5f}"
    )
    steps.append(step)

sliders = [dict(
    active=0,
    currentvalue={"prefix": "Bandwidth: "},
    pad={"t": 50},
    steps=steps
)]

# Update layout
fig.update_layout(
    sliders=sliders,
    # Show the annotation for the initially selected bandwidth straight away,
    # not only after the slider has been moved once
    annotations=[null_area_annotations[0]],
    title=f'Kernel Density Estimation with Adjustable Bandwidth. Scott\'s rule: {scott_bw:.3f}, Silverman\'s rule: {silverman_bw:.3f}',
    xaxis_title='X',
    yaxis_title='Density',
    showlegend=True,
    legend=dict(
        y=1, yanchor="top",
        x=1, xanchor="left",
    ),
    xaxis=dict(
        rangeslider=dict(visible=False),
        tickson="boundaries",
        ticklen=20
    )
)

# Show the plot
fig.show()

In [11]:
def scott_bandwidth_for_plot(n, sigma, d=1):
    """
    Evaluate Scott's rule, sigma * n**(-1/(d+4)), for plotting purposes.

    Parameters:
    n : int, float or numpy.ndarray
        Number of data points (broadcasts against ``sigma``)
    sigma : float or numpy.ndarray
        Standard deviation of the data
    d : int, optional
        Dimensionality of the data (default 1, which gives the n**(-1/5)
        exponent this helper originally hard-coded)

    Returns:
    float or numpy.ndarray
        The bandwidth for each (n, sigma) pair
    """
    return sigma * (n ** (-1.0 / (d + 4)))

# Heatmap of Scott's-rule bandwidth over a log-spaced grid of sample sizes
# and standard deviations.

# Generate data
# Truncating the log-spaced values to int collapses the first two grid points
# onto 10; np.unique drops the duplicate so every heatmap column is distinct.
n_range = np.unique(np.logspace(1, 5, 100).astype(int))  # 10 to 100,000
sigma_range = np.logspace(-2, 2, 100)  # 0.01 to 100

# Create meshgrid (rows follow sigma_range, columns follow n_range)
N, SIGMA = np.meshgrid(n_range, sigma_range)

# Calculate bandwidth using Scott's rule
Z = scott_bandwidth_for_plot(N, SIGMA)

# Create heatmap
fig = go.Figure(data=go.Heatmap(
    z=np.log10(Z),  # Log scale for better visualization
    x=n_range,
    y=sigma_range,
    colorscale='Viridis',
    colorbar=dict(title='log10(Bandwidth)'),
))

# Update layout
fig.update_layout(
    title='Scott\'s Rule Bandwidth Heatmap',
    xaxis_title='Number of Data Points',
    yaxis_title='Standard Deviation (σ)',
    xaxis_type='log',
    yaxis_type='log',
)

# Show the plot
fig.show()

In [12]:
# How the scott_bandwidth estimate changes with sample size for Latin
# Hypercube samples of the unit cube in several dimensionalities.

# Generate data
n_range = np.logspace(2, 5, 20).astype(int)  # 100 to 100,000
dimensions = [1, 2, 3, 5, 10]
bandwidths = {d: [] for d in dimensions}

# A single seeded generator feeds every sampler so the curves are identical
# on each Restart & Run All.
lhs_rng = np.random.default_rng(42)

for n in n_range:
    for d in dimensions:
        sampler = qmc.LatinHypercube(d=d, seed=lhs_rng)
        data = sampler.random(n=n)
        # scott_bandwidth already returns one scalar per dataset, so there is
        # nothing to average across dimensions
        bw = scott_bandwidth(data)
        bandwidths[d].append(bw)

# Create plot
fig = go.Figure()

for d in dimensions:
    fig.add_trace(go.Scatter(
        x=n_range,
        y=bandwidths[d],
        mode='lines+markers',
        name=f'{d}D'
    ))

fig.update_layout(
    title='Scott\'s Rule Bandwidth vs. Number of Data Points (LHS)',
    xaxis_title='Number of Data Points',
    yaxis_title='Bandwidth',
    xaxis_type='log',
    yaxis_type='log',
    legend_title='Dimensions'
)

fig.show()