This notebook investigates the stability of the learned test location. We consider the case where P is a mixture of two uniform distributions in 1D: the first component has a small height, and the second component has a much larger height. Q is a uniform distribution whose mass strongly overlaps with the second component of P. (Note: the `SSMix2Unif` class below labels the two distributions the other way round: its P is the single uniform distribution and its Q is the mixture.)

The idea is that if the sample size n is small, we will have few or no points from the first component of P, and the learned location will be at the second component. If n is large, the learned location will be at the first component, since the difference between P and Q is larger there.

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
#%config InlineBackend.figure_format = 'svg'
#%config InlineBackend.figure_format = 'pdf'
import freqopttest.util as util
import freqopttest.data as data
import freqopttest.ex.exglobal as exglo
import freqopttest.kernel as kernel
import freqopttest.tst as tst
import freqopttest.glo as glo
import freqopttest.plot as plot
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import sys

In [None]:
# Global matplotlib appearance for every figure in this notebook.
# Only the font size is overridden; 'family' and 'weight' are left unset.
font = {'size': 18}

plt.rc('font', **font)
plt.rc('lines', linewidth=2)

In [None]:
class SSMix2Unif(data.SampleSource):
    """
    Sample source for a 1-d two-sample problem built from uniform distributions.

    P: U(mp - wp/2, mp + wp/2), where wp is the width of the uniform
        distribution and mp is its mean.
    Q: w*U(mq - wq/2, mq + wq/2) + (1-w)*p(x), where p(x) is the density of P.
        That is, with probability w a point comes from a second uniform bump
        with mean mq and width wq; otherwise it comes from the same uniform
        distribution as P.
    """
    def __init__(self, w, mp, wp, mq, wq):
        """
        w: weight of the U(mq - wq/2, mq + wq/2) component of Q. Must be in [0, 1].
        mp, wp: mean and width of the uniform distribution P.
        mq, wq: mean and width of the extra uniform component of Q.

        Raise RuntimeError if w is outside [0, 1].
        """
        if not (0 <= w <= 1):
            raise RuntimeError('w must be in [0, 1]')

        self.w = w
        self.mp = mp
        self.wp = wp
        self.mq = mq
        self.wq = wq

    def dim(self):
        """Return the dimension of the data, which is always 1."""
        return 1

    def sample(self, n, seed):
        """
        Draw n points from P and n points from Q.

        The global NumPy random state is saved, reseeded with `seed` for the
        draws, and restored afterwards (also when a draw raises), so calling
        this method does not disturb the caller's random stream.

        Return data.TSTData(x, y, label='mix2unif'), where x (from P) and
        y (from Q) both have shape (n, 1). The rows of y are not shuffled:
        points from the U(mq - wq/2, mq + wq/2) component come first,
        followed by the points from the P-shaped component.
        """
        w = self.w
        mp = self.mp
        wp = self.wp
        mq = self.mq
        wq = self.wq

        rstate = np.random.get_state()
        try:
            np.random.seed(seed)

            # Component indicator for each point of Q:
            # 0 (probability w) -> bump at mq, 1 (probability 1-w) -> P-shaped part.
            disc_var = stats.rv_discrete(values=([0, 1], [w, 1-w]))
            ind = disc_var.rvs(size=n)
            n_bump = np.sum(ind == 0)
            n_main = np.sum(ind == 1)

            # draw from Q (the order of the draws is kept fixed so that a given
            # seed always yields the same sample)
            ys = stats.uniform.rvs(loc=mq-wq/2.0, scale=wq, size=n_bump)
            yb = stats.uniform.rvs(loc=mp-wp/2.0, scale=wp, size=n_main)
            y = np.hstack((ys, yb))[:, np.newaxis]

            # draw from P
            x = stats.uniform.rvs(loc=mp-wp/2.0, scale=wp, size=n)
            x = x[:, np.newaxis]
        finally:
            # Always hand the caller's random state back, even if a draw failed.
            np.random.set_state(rstate)
        return data.TSTData(x, y, label='mix2unif')

In [None]:
# Problem configuration and the sample source built from it.
n = 300        # sample size handed to ss.sample
alpha = 0.01   # presumably the significance level of the test -- confirm
w = 0.05       # weight of the small component (the 'w' argument of SSMix2Unif)
seed = 43      # base random seed for the draws
prob_params = {
    'w': w,
    'mp': 5,
    'wp': 4,
    'mq': 0,
    'wq': 2,
}
ss = SSMix2Unif(**prob_params)

In [None]:
# One sample of size n, plus separately drawn training and test samples of
# half that size.
tst_data = ss.sample(n, seed=seed)
# Floor division keeps the sample size an integer under Python 3 as well
# (plain n/2 would be a float there); the value is unchanged under Python 2.
tr = ss.sample(n // 2, seed=seed)
te = ss.sample(n // 2, seed=seed+1)
# Alternative: split the single sample instead of drawing two.
#tr, te = tst_data.split_tr_te(tr_proportion=0.5, seed=10)
nte = te.X.shape[0]

In [None]:
# Overlaid histograms of the two training samples on a shared set of bins.
xtr, ytr = tr.xy()
xytr = tr.stack_xy()
# 30 equally spaced bin edges spanning the range of the stacked sample.
bins = np.linspace(np.min(xytr), np.max(xytr), num=30)
plt.hist(xtr, bins=bins, alpha=0.5, label='X')
plt.hist(ytr, bins=bins, alpha=0.5, label='Y')
plt.legend(loc='best')

## Check stability of the two modes

Here we fix the test location at either the left or the right bump and compare the values of the objective function.

In [None]:
# Settings for the repeated-trials experiment.
alpha = 0.01  # handed to tst.MeanEmbeddingTest as its third argument
nte = 400     # size of the test sample drawn in each trial
rep = 300     # number of trials

In [None]:
# repeat many trials to compare the test statistic at two fixed test locations
def two_locations_test_results(nte):
    """
    Run the mean embedding test at two fixed locations on `rep` fresh samples.

    For each trial a sample of size nte is drawn from the sample source `ss`
    and tst.MeanEmbeddingTest is performed twice on it: once with the test
    location at prob_params['mq'] (the small bump) and once with the test
    location at prob_params['mp'] (the big bump).

    This function reads the notebook-level names ss, prob_params, tr, alpha
    and rep.

    nte: sample size passed to ss.sample in every trial.

    Return (Tp_results, Tq_results): two lists of length rep holding the
    values returned by perform_test for the location at mp and at mq,
    respectively.
    """
    # Nothing below depends on the trial, so it is computed once up front.
    # test loc at the small bump
    Tq = np.array([[prob_params['mq']]])
    # test loc at the big bump
    Tp = np.array([[prob_params['mp']]])
    # Width argument for the test: squared util.meddistance of the stacked
    # training sample `tr`.
    # NOTE(review): hoisting this assumes util.meddistance returns the same
    # value for the same input on every call -- confirm.
    gwidth0 = util.meddistance(tr.stack_xy(), subsample=1000)**2

    Tp_results = []
    Tq_results = []
    shift_seed = 1000
    for r in range(shift_seed, shift_seed+rep):
        # a fresh test sample per trial; the seed is offset by one from r
        te = ss.sample(nte, seed=r+1)

        # actual test
        q_met = tst.MeanEmbeddingTest(Tq, gwidth0, alpha)
        Tq_results.append(q_met.perform_test(te))
        p_met = tst.MeanEmbeddingTest(Tp, gwidth0, alpha)
        Tp_results.append(p_met.perform_test(te))
    return Tp_results, Tq_results


In [None]:
# Run the repeated two-location experiment with the current nte and collect
# the 'test_stat' entry of every result into one array per test location
# (tp_lambs: location at mp, tq_lambs: location at mq).
Tp_results, Tq_results = two_locations_test_results(nte)
tp_lambs = np.array([r['test_stat'] for r in Tp_results ])
tq_lambs = np.array([r['test_stat'] for r in Tq_results ])

In [None]:
# Distribution over trials of the test statistic at each fixed location.
plt.hist(tp_lambs, label='init v at big bump', alpha=0.6)
plt.hist(tq_lambs, label='init v at small bump', alpha=0.6)
# Raw string: \h and \l are not valid escape sequences in a normal literal.
plt.xlabel(r'$\hat{\lambda}_n$')
plt.ylabel('frequency')
plt.legend()

In [None]:
# Number of trials in which the location at the small bump (mq) gave a larger
# statistic than the location at the big bump (mp).
n_small_bump_high = np.sum(tq_lambs>tp_lambs)
# Parenthesised single-argument print runs under both Python 2 and Python 3.
print('#trials where small bump higher: %d/%d (%.2f%%)' % (n_small_bump_high, rep, 100*float(n_small_bump_high)/rep))
print('std of left lambs: %.3f' % (np.std(tq_lambs)))
print('std of right lambs: %.3f' % (np.std(tp_lambs)))

In [None]:
def prob_tq_better(nte):
    """
    Estimate how often the test location at mq beats the one at mp.

    Runs two_locations_test_results(nte) and compares the 'test_stat'
    values of the two locations trial by trial.

    Return (p, tp_lambs, tq_lambs): p is the number of trials in which the
    statistic at mq is strictly larger than the one at mp, divided by the
    notebook-level `rep`; tp_lambs and tq_lambs are the arrays of statistics
    for the locations at mp and mq.
    """
    results_at_p, results_at_q = two_locations_test_results(nte)

    def _test_stats(results):
        # one 'test_stat' value per trial
        return np.array([res['test_stat'] for res in results])

    tp_lambs = _test_stats(results_at_p)
    tq_lambs = _test_stats(results_at_q)
    n_q_wins = np.sum(tq_lambs > tp_lambs)
    return float(n_q_wins)/rep, tp_lambs, tq_lambs

# Alternative, evenly spaced grid of test sample sizes (disabled):
#ntes = np.linspace(50, 300, num=7, dtype=np.int64)
# Test sample sizes to sweep over.
ntes = np.array([ 25, 50, 75, 100, 150, 200, 250])

# One (probability, tp_lambs, tq_lambs) tuple from prob_tq_better per entry
# of ntes, in the same order.
# NOTE(review): under Python 2 the comprehension variable is also bound at
# notebook level, so `nte` holds the last entry of ntes after this line.
tup_ntes = [prob_tq_better(nte) for nte in ntes]

In [None]:
# First entry of each tuple: fraction of trials in which the location at the
# small bump gave the larger statistic, one value per test sample size.
ps_small_better = np.array([t[0] for t in tup_ntes])

plt.plot(ntes, ps_small_better, 'ob-')
plt.xlabel('nte')
# Raw string: \h and \l are not valid escape sequences in a normal literal.
plt.ylabel(r'p(loc at small bump gives high $\hat{\lambda}_n$)')

In [None]:
# Mean and standard deviation over trials of the statistic at each location,
# one value per test sample size (t[1]: location at mp, t[2]: location at mq).
tp_lamb_means = np.array([np.mean(l) for l in [t[1] for t in tup_ntes]])
tp_lamb_stds = np.array([np.std(l) for l in [t[1] for t in tup_ntes]])
tq_lamb_means = np.array([np.mean(l) for l in [t[2] for t in tup_ntes]])
tq_lamb_stds = np.array([np.std(l) for l in [t[2] for t in tup_ntes]])

print('mq is the mean of the small bump')
# Error bars show one standard deviation around the mean.
plt.errorbar(ntes, tp_lamb_means, tp_lamb_stds, 
             label=r'$\hat{\mathbb{E}}[\hat{\lambda}_n \mid \mathrm{at }\,\, m_p]$')
plt.errorbar(ntes, tq_lamb_means, tq_lamb_stds, 
             label=r'$\hat{\mathbb{E}}[\hat{\lambda}_n \mid \mathrm{at }\,\, m_q]$')
plt.xlabel('test sample size')
# Raw string, like the labels above: \h and \l are not valid escape sequences.
plt.ylabel(r'$\hat{\lambda}_n$')
plt.legend(loc='best')