This notebook looks at the influence function of the logarithm of the score matching density function
in a kernel exponential family $\mathcal{Q}$ evaluated at a point $w$, which is defined as 

$$
\mathrm{IF} \big( y; \log q (w; F) \big) := \lim_{\varepsilon \to 0^+} \frac{1}{\varepsilon} \Big(\log q \big(w; (1 - \varepsilon) F + \varepsilon \delta_y \big) - \log q \big(w; F\big)\Big), \quad \text{ for all } w \in \mathcal{X}, \hspace{50pt} (*)
$$

where $\mathcal{X} \subseteq \mathbb{R}$ is the sample space, $F$ is a probability distribution over $\mathcal{X}$, $q (\cdot; F): \mathcal{X} \to [0, \infty)$ is the score matching density function in $\mathcal{Q}$ under $F$, $\varepsilon \in (0, 1]$, and $\delta_y$ is the unit point mass at $y \in \mathcal{X}$. 

We approximate $(*)$ by 

$$
\widehat{\mathrm{IF}} \big( y; \log q (w; F_n) \big) := \frac{1}{\varepsilon} \Big(\log q \big(w; (1 - \varepsilon) F_n + \varepsilon \delta_y\big) - \log q \big(w; F_n\big)\Big), \quad \text{ for all } w \in \mathcal{X}, \hspace{50pt} (**)
$$

with a small $\varepsilon$, where $F_n$ is the empirical distribution. 

Below, we use the `waiting` variable in the Old Faithful Geyser dataset and insert one additional observation, i.e., $y$ in $(**)$, at a time. These additional observations are $90$, $92$, $\ldots$, $398$, $400$. In addition, we choose the sample space $\mathcal{X} = (0, \infty)$, the kernel function to be the Gaussian kernel function, the bandwidth parameter to be $5.0$, $7.0$ and $9.0$, the penalty parameter to be $\exp({-12.0})$, $\exp({-10.0})$ and $\exp({-8.0})$, and $\varepsilon$ in $(**)$ to be `1e-8`. 

In [None]:
import os 
import copy 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import matplotlib.gridspec as gridspec
import seaborn as sns

from matplotlib.animation import FuncAnimation
# %matplotlib qt
%matplotlib inline 

In [None]:
# Work from the project directory so the relative `data/` and `plots/` paths
# used below resolve.
# NOTE(review): this absolute path is specific to one machine.
os.chdir('/Users/chenxizhou/Dropbox/code_package/IFdensity')

# Settings that are interpolated into the result-directory names below.
kernel_type = 'gaussian_poly2'
bw_list = [5.0] # bandwidths to plot; alternative set: [5.0, 7.0, 9.0]
log_pen_param_list = [-12.0] # log penalty parameters to plot; alternative set: [-12.0, -10.0, -8.0]
contam_weight = 1e-8 # presumably the epsilon in (**); here it only appears in the directory name

We look at the resulting influence function $(**)$ evaluated at $w \in \mathcal{X}$ as we move the additional observation. 

In [None]:
# Plot the saved influence-function values, one figure per
# (bandwidth, log penalty parameter, added observation) combination.

# Styling shared by every figure.
label_fontsize = 20
info_fontsize = 25
tick_fontsize = 15
linewidth = 3.0
x_label = 'waiting'
y_label = 'IF'

# Position of the single axes inside the figure, in figure coordinates.
left, bottom, width, height = 0.1, 0.1, 0.8, 0.8

# Added observations to plot, one per row; the wider grid used previously was
# np.arange(90., 402., 20).reshape(-1, 1).
contam_data_array = np.array([106.]).reshape(-1, 1)

for bw in bw_list:
    for log_pen_param in log_pen_param_list:
        print('=' * 100)
        print(f'bw={bw}, pen={log_pen_param}')

        # Directory holding the saved results for this parameter combination.
        save_dir = f'data/bw={bw}-kernel={kernel_type}-loglambda={log_pen_param}-contamweight={contam_weight}'

        # Points at which the influence function was evaluated (x-axis values).
        new_data = np.load(save_dir + '/new_data.npy').flatten()

        for contam_data in contam_data_array:
            # `contam_data` is a 1-element row, so it is formatted like `[106.]`
            # in the file name.
            result = np.load(save_dir + f'/contam_data={contam_data}-IF-logden-newdata.npy')

            fig = plt.figure(figsize=(20, 10))
            ax = fig.add_axes([left, bottom, width, height])

            ax.plot(new_data, result, color='tab:blue', linewidth=linewidth)
            # Horizontal reference line at zero.
            ax.axhline(0., 0, 1, ls='--', color='tab:purple', alpha=0.5)

            # Raw string: `\s` and `\l` are not valid escape sequences in a
            # regular string literal.
            ax.set_title(
                r'Influence Function of Score Matching Log-density with $\sigma$={bw} and $\lambda$=exp({pen})'.format(
                    bw=bw, pen=log_pen_param),
                fontsize=info_fontsize)
            ax.set_xlabel(x_label, fontsize=label_fontsize)
            ax.set_ylabel(y_label, fontsize=label_fontsize)
            ax.set_xlim((21., 410.))
            ax.tick_params(axis='both', labelsize=tick_fontsize)

            # Box in the top-right corner naming the added observation.
            ax.text(0.995, 0.985,
                    f'Add {contam_data[0]}',
                    fontsize=info_fontsize,
                    multialignment='left',
                    horizontalalignment='right',
                    verticalalignment='top',
                    transform=ax.transAxes,
                    bbox={'facecolor': 'none',
                          'boxstyle': 'Round, pad=0.2'})

            # NOTE(review): every iteration writes to the same path, so when
            # several parameter combinations or observations are plotted only
            # the last figure is kept on disk.
            fig.savefig('plots/IF-plot-L2norm=inf.pdf')
            plt.show()
            

In [None]:
# Work from the project directory so the relative `data/` paths resolve.
os.chdir('/Users/chenxizhou/Dropbox/code_package/IFdensity')

# Column 0 of the geyser data (presumably `waiting`), with entries equal to
# 108.0 filtered out.
true_data = np.load('data/geyser.npy').astype(np.float64)
df = copy.deepcopy(true_data[:, 0]).reshape(-1, 1)
data_waiting = df[df != 108.0]

# Grid of x-values against which the saved curves are drawn.
xlimit = (21., 410.)
plot_pts_cnt = 2000
newx = np.linspace(*xlimit, plot_pts_cnt)

# Added observations, one animation frame per row. `np.unique` already returns
# its values sorted in increasing order.
# Alternative grid:
# np.sort(np.concatenate([np.arange(90., 401., 4.), np.unique(data_waiting)])).reshape(-1, 1)
contam_data_array = np.unique(np.concatenate([
    np.arange(2., 45., 4),
    np.arange(43., 66., 2),
    np.arange(68., 100., 2),
    np.arange(90., 401., 4.),
])).reshape(-1, 1)

# Parameters identifying the directory of saved results.
kernel_type = 'gaussian_poly2'
bw = 9.0
log_pen_param = -12.0
contam_weight = 1e-8

# Axis range and label.
ylimit = (-3600., 4400.)
var_name = 'contaminated observation'

# Text sizes and line width.
fontsize_label = 15
fontsize_tick = 10
fontsize_info = 20
fontsize_title = 20
fontsize_suptitle = 22
linewidth = 2.0

# Single-axes figure; room is left at the top for the suptitle.
fig, ax = plt.subplots(1, 1, figsize=(20, 10), constrained_layout=False)
fig.subplots_adjust(top=0.9)

def update_IF_plots(contam_data): 
    """Redraw the influence-function plot for one added observation.

    Intended as the per-frame callback of ``FuncAnimation``. It clears the
    module-level ``ax`` and redraws the saved influence function of the
    log-density together with the saved limiting-case curve. Plot settings
    (``xlimit``, ``ylimit``, ``newx``, font sizes, ...), the observed data
    (``data_waiting``) and the parameters naming the result directory
    (``bw``, ``kernel_type``, ``log_pen_param``, ``contam_weight``) are read
    from module-level variables.

    Parameters
    ----------
    contam_data : 1-element array
        The added observation, i.e. one row of ``contam_data_array``.

    Returns
    -------
    The module-level ``ax`` that was redrawn.
    """
    # Extract the scalar once; calling int() directly on a 1-element 1-D array
    # is deprecated in recent NumPy releases.
    contam_value = contam_data[0]

    ax.clear()

    # ---------------------------------------------------------------------------------------
    # axis limits, labels and tick formatting (reset by ax.clear())
    ax.set_xlim(xlimit)
    ax.set_ylim(ylimit)
    ax.set_xlabel(var_name, fontsize = fontsize_label)
    ax.set_ylabel('IF', fontsize = fontsize_label)
    ax.tick_params(axis = 'both', labelsize = fontsize_tick)
    ax.ticklabel_format(axis = 'y')

    # rug plots: observed data in blue, the added observation in red;
    # the module-level data is reused rather than re-read from disk every frame
    sns.rugplot(pd.Series(data_waiting.flatten()), ax = ax, color = 'tab:blue')
    sns.rugplot(pd.Series(contam_data), ax = ax, color = 'red')

    # load the saved curves for this added observation
    save_dir = f'data/bw={bw}-kernel={kernel_type}-loglambda={log_pen_param}-contamweight={contam_weight}'
    result = np.load(save_dir + f'/contam_data=[{int(contam_value)}.]-IF-logden-newdata.npy')
    result_lim = np.load(
        save_dir + f'/contam_data={contam_data}-IF-natparam-limit-newdata.npy')

    ax.plot(newx.flatten(), result.flatten(), color = 'tab:blue', 
            linestyle = 'solid', linewidth = linewidth, label = 'IF of Log-density')
    ax.plot(newx.flatten(), result_lim.flatten(), color = 'tab:red', 
            linestyle = 'dashed', linewidth = linewidth, label = 'Limiting Case', alpha = 0.8)
    # attach the legend to this axes explicitly rather than to pyplot's current axes
    ax.legend(fontsize = fontsize_info, loc = 'upper right')

    # draw a vertical line at the added observation
    ax.axvline(contam_value, 0, 1, ls = '--', color = 'tab:purple', alpha = 0.5)

    # add plot information in the top-left corner
    info = f'Add {contam_value}'
    ax.text(0.007, 0.988,
            info,
            fontsize = fontsize_info,
            multialignment = 'left',
            horizontalalignment = 'left',
            verticalalignment = 'top',
            transform = ax.transAxes,
            bbox = {'facecolor': 'none',
                    'boxstyle': 'Round, pad=0.2'})

    return ax

# Build the animation: one frame per row of `contam_data_array`, 200 ms apart.
# NOTE(review): the notebook selects the inline backend; an interactive backend
# (e.g. the commented-out `%matplotlib qt`) is presumably needed to see the
# frames play — confirm.
ani = FuncAnimation(fig, func=update_IF_plots, frames=contam_data_array, interval=200)

# Figure-level title, unchanged across frames.
suptitle_text = r'Influence Function of Score Matching Log-density with $\sigma$={bw} and $\lambda$=exp({pen})'.format(
    bw=bw, pen=log_pen_param)
fig.suptitle(suptitle_text, fontsize=fontsize_suptitle, y=0.98)

# uncomment the following line to save the gif
# ani.save(f'gif/IF-logdensity-limit-waiting-kernel={kernel_type}-bw={bw}-pen=exp{log_pen_param}-contamweight={contam_weight}.gif', writer='imagemagick')

plt.show()

In [None]:
# Work from the project directory so the relative `data/` paths resolve.
os.chdir('/Users/chenxizhou/Dropbox/code_package/IFdensity')

# Column 0 of the geyser data (presumably `waiting`), with entries equal to
# 108.0 filtered out.
true_data = np.load('data/geyser.npy').astype(np.float64)
df = copy.deepcopy(true_data[:, 0]).reshape(-1, 1)
data_waiting = df[df != 108.0]

# Grid of x-values against which the saved curves are drawn.
xlimit = (1., 410.)  # previously (21., 410.)
plot_pts_cnt = 3000  # previously 2000
newx = np.linspace(*xlimit, plot_pts_cnt)

# Added observations, one animation frame per row. `np.unique` already returns
# its values sorted in increasing order.
# Alternative grid:
# np.sort(np.concatenate([np.arange(90., 401., 4.), np.unique(data_waiting)])).reshape(-1, 1)
contam_data_array = np.unique(np.concatenate([
    np.arange(2., 410., 4),
    np.arange(40., 100., 2),
])).reshape(-1, 1)

# Parameters identifying the directory of saved results.
kernel_type = 'gaussian_poly2'
bw = 9.0
log_pen_param = -8.0
contam_weight = 1e-8

# Axis range and label.
ylimit = (-500., 500.)
var_name = 'contaminated observation'

# Text sizes and line width.
fontsize_label = 15
fontsize_tick = 10
fontsize_info = 20
fontsize_title = 20
fontsize_suptitle = 22
linewidth = 2.0

# Single-axes figure; room is left at the top for the suptitle.
fig, ax = plt.subplots(1, 1, figsize=(20, 10), constrained_layout=False)
fig.subplots_adjust(top=0.9)

def update_IF_plots(contam_data): 
    """Redraw the influence-function plot for one added observation.

    Intended as the per-frame callback of ``FuncAnimation``. It clears the
    module-level ``ax`` and redraws the saved influence function of the
    log-density. Plot settings (``xlimit``, ``ylimit``, ``newx``,
    ``plot_pts_cnt``, font sizes, ...), the observed data (``data_waiting``)
    and the parameters naming the result directory (``bw``, ``kernel_type``,
    ``log_pen_param``, ``contam_weight``) are read from module-level
    variables.

    Parameters
    ----------
    contam_data : 1-element array
        The added observation, i.e. one row of ``contam_data_array``.

    Returns
    -------
    The module-level ``ax`` that was redrawn.
    """
    # Extract the scalar once; calling int() directly on a 1-element 1-D array
    # is deprecated in recent NumPy releases.
    contam_value = contam_data[0]

    ax.clear()

    # ---------------------------------------------------------------------------------------
    # axis limits, labels and tick formatting (reset by ax.clear())
    ax.set_xlim(xlimit)
    ax.set_ylim(ylimit)
    ax.set_xlabel(var_name, fontsize = fontsize_label)
    ax.set_ylabel('IF', fontsize = fontsize_label)
    ax.tick_params(axis = 'both', labelsize = fontsize_tick)
    ax.ticklabel_format(axis = 'y')

    # rug plots: observed data in blue, the added observation in red;
    # the module-level data is reused rather than re-read from disk every frame
    sns.rugplot(pd.Series(data_waiting.flatten()), ax = ax, color = 'tab:blue')
    sns.rugplot(pd.Series(contam_data), ax = ax, color = 'red')

    # load the saved curve for this added observation; the directory name also
    # encodes the plotting domain and the number of grid points
    save_dir = f'data/bw={bw}-kernel={kernel_type}-loglambda={log_pen_param}-contamweight={contam_weight}-plotdomain={xlimit}-plotcnts={plot_pts_cnt}'
    result = np.load(save_dir + f'/contam_data=[{int(contam_value)}.]-IF-logden-newdata.npy')
#     result_lim = np.load(
#         save_dir + f'/contam_data={contam_data}-IF-natparam-limit-newdata.npy')

    ax.plot(newx.flatten(), result.flatten(), color = 'tab:blue', 
            linestyle = 'solid', linewidth = linewidth)

    # draw a vertical line at the added observation
    ax.axvline(contam_value, 0, 1, ls = '--', color = 'tab:purple', alpha = 0.5)

    # add plot information in the top-left corner
    info = f'Add {contam_value}'
    ax.text(0.007, 0.988,
            info,
            fontsize = fontsize_info,
            multialignment = 'left',
            horizontalalignment = 'left',
            verticalalignment = 'top',
            transform = ax.transAxes,
            bbox = {'facecolor': 'none',
                    'boxstyle': 'Round, pad=0.2'})

    return ax

# Build the animation: one frame per row of `contam_data_array`, 200 ms apart.
ani = FuncAnimation(fig, func=update_IF_plots, frames=contam_data_array, interval=200)

# Figure-level title, unchanged across frames.
suptitle_text = r'Influence Function of Score Matching Log-density with $\sigma$={bw} and $\lambda$=exp({pen})'.format(
    bw=bw, pen=log_pen_param)
fig.suptitle(suptitle_text, fontsize=fontsize_suptitle, y=0.98)

# uncomment the following line to save the gif
# ani.save(f'gif/IF-logdensity-waiting-kernel={kernel_type}-bw={bw}-pen=exp{log_pen_param}-contamweight={contam_weight}.gif', writer='imagemagick')

plt.show()