In [None]:
import os
import copy
import numpy as np
import pandas as pd
import seaborn
import matplotlib.pyplot as plt 

from dekef.base_density import *

from IFlogdensity.contam_ml_de import *

from IPython.display import Markdown as md

from datetime import datetime as dt

In [None]:
# ---------------------------------------------------------------------------
# Configuration: working directory, data, evaluation grid and model settings.
# ---------------------------------------------------------------------------

# Project root. Set the IFLOGDENSITY_ROOT environment variable to run this
# notebook on another machine; the default is the original hard-coded location.
# All relative 'data/...' paths below are resolved against this directory.
os.chdir(os.environ.get(
    'IFLOGDENSITY_ROOT',
    '/Users/chenxizhou/Dropbox/code_package/IFlogdensity'))

# Load the geyser data as float64 and keep a copy of its first column as an (n, 1) array.
true_data = np.load('data/geyser.npy').astype(np.float64)
df = copy.deepcopy(true_data[:, 0]).reshape(-1, 1)
# Drop every entry equal to 108.0. Boolean-mask indexing flattens the result,
# so data_waiting is 1-D.
# NOTE(review): 108.0 is presumably treated as an outlier -- confirm.
data_waiting = df[df != 108.0]

# Evaluation grid on which densities are evaluated, plotted and saved.
xlimit = (1., 310.)
plot_pts_cnt = 3000
new_data = np.linspace(xlimit[0], xlimit[1], plot_pts_cnt)

# Weight given to the contamination point; also the divisor of the IF difference quotient.
contam_weight = 1e-3

# Kernel settings; kernel_type is also embedded in the output folder names.
kernel_type = 'gaussian_poly2'
bw = 7.0
# Log of the penalty parameter; the estimators receive np.exp(log_pen_param).
log_pen_param = -2.

# Descriptive labels (not referenced by the cells visible in this notebook section).
var_name = 'additional observation'
method = 'ML'

# Base density built from the cleaned sample.
base_density = BasedenGamma(data_waiting)
# Seed passed to np.random.seed before each fit.
seed_no = 1

# Locations of the single contaminating observation to try, one fit per entry.
contam_pt_list = [20., 120., 150., 200.]

### Uncontaminated Density 

In [None]:
# Baseline (uncontaminated) estimator: the contamination weight is 0.
# NOTE(review): with zero weight the contam_data point is presumably only a
# placeholder required by the constructor -- confirm against ContamMLDensityEstimate.
uncontam_ml = ContamMLDensityEstimate(
    data = data_waiting,
    contam_data = np.array([contam_pt_list[0]]).reshape(-1, 1),
    contam_weight = 0.,
    penalty_param = np.exp(log_pen_param),
    base_density = base_density,
    r1 = 1.0,
    r2 = 0.0,
    c = 0.,
    bw = bw,
    # Use the configured kernel_type (currently 'gaussian_poly2') rather than a
    # repeated literal, so the estimator always matches the kernel name that is
    # written into the output folder names.
    kernel_type = kernel_type)

In [None]:
# Grid of basis locations 1, 2, ..., 310 (one coefficient per grid point).
start_grid_points = np.arange(1., 311., 1)

# Optimiser settings: start from an all-zero coefficient column with one row
# per grid point.
gdalgo_params = negloglik_optalgoparams(
    start_pt=np.zeros((len(start_grid_points), 1)),
    step_size=0.1,
    max_iter=50,
    rel_tol=1e-5,
    abs_tol=0.05,
)

# Batch Monte Carlo settings (batch size and tolerance).
bmc_params = batch_montecarlo_params(mc_batch_size=5000, mc_tol=5e-3)

In [None]:
# Fix the global NumPy seed before fitting so the run is repeatable.
np.random.seed(seed_no)

# Fit the uncontaminated coefficients on the grid, printing wall-clock
# timestamps before and after the fit.
print('start time = ' + dt.now().strftime("%H:%M:%S"))
uncontam_coef = uncontam_ml.coef_grid_points(
    grid_points=start_grid_points,
    algo='gd',
    step_size_factor=1.0,
    optalgo_params=gdalgo_params,
    batchmc_params=bmc_params,
    print_error=True,
)
print('end time = ' + dt.now().strftime("%H:%M:%S"))

In [None]:
# Grid points as an (n, 1) column, the layout handed to the kernel object.
grid_points_col = start_grid_points.reshape(-1, 1)

# Kernel centred at the grid points, reusing the estimator's hyper-parameters.
kernel_function_grid = GaussianPoly2(
    data=grid_points_col,
    r1=uncontam_ml.r1,
    r2=uncontam_ml.r2,
    c=uncontam_ml.c,
    bw=uncontam_ml.bw,
)

# Gram matrix of the kernel evaluated on the grid itself.
gram_grid = kernel_function_grid.kernel_gram_matrix(grid_points_col)

# Norm of the fitted function: sqrt(coef^T G coef), reduced to a Python scalar.
coef_vec = uncontam_coef[0]
uncontam_norm = np.sqrt(coef_vec.T @ gram_grid @ coef_vec).item()

uncontam_norm

In [None]:
# Log-density of the uncontaminated fit on the evaluation grid.
uncontam_logden_vals = uncontam_ml.log_density(new_data, uncontam_coef, True)

# Overlay the fitted density (exp of the log-density) on a normalised histogram
# of the data.
fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(new_data, np.exp(uncontam_logden_vals), color='tab:blue', lw=3.0)
ax.hist(data_waiting, bins='fd', density=True, alpha=0.3, color='tab:green')
# ax.axvline(x=20., color='k', linestyle='--')
plt.show()

In [None]:
# Store the uncontaminated results once per contamination point, so that each
# output folder is self-contained (grid, coefficients and log-density values).
for contam_pt in contam_pt_list:
    full_save_folder = (f'data/PenML-GD-ContamData={contam_pt}-basisn={len(start_grid_points)}-bw={bw}-kernel={kernel_type}-' +
                        f'contamweight={contam_weight}-plotdomain={xlimit}-plotcnts={plot_pts_cnt}-' +
                        f'seed={seed_no}-new')
    # makedirs(exist_ok=True) replaces the isdir/mkdir pair: it also creates a
    # missing parent directory and does not fail if the folder already exists.
    os.makedirs(full_save_folder, exist_ok=True)

    # Evaluation grid on which the log-density values were computed.
    file_name_newdata = '/new_data.npy'
    np.save(full_save_folder + file_name_newdata, new_data)

    # Grid points the coefficients belong to.
    file_name_grid_points = '/grid_points.npy'
    np.save(full_save_folder + file_name_grid_points, start_grid_points)

    # First element of the fitted coefficient result.
    file_name_coef = f'/logpenparam={log_pen_param}-uncontam-coef.npy'
    np.save(full_save_folder + file_name_coef, uncontam_coef[0])

    # Uncontaminated log-density evaluated at new_data.
    file_name_diff = f'/logpenparam={log_pen_param}-uncontam-logden-newdata.npy'
    np.save(full_save_folder + file_name_diff, uncontam_logden_vals)

### Contaminated Density 

In [None]:
# For every contamination point: fit the contaminated estimator, plot the
# fitted density and the difference quotient, and save the results.
for contam_pt in contam_pt_list:
    # Resolve and create the output folder up front, so a long fit is never
    # lost because the destination directory is missing at save time.
    full_save_folder = (f'data/PenML-GD-ContamData={contam_pt}-basisn={len(start_grid_points)}-bw={bw}-kernel={kernel_type}-' +
                        f'contamweight={contam_weight}-plotdomain={xlimit}-plotcnts={plot_pts_cnt}-' +
                        f'seed={seed_no}-new')
    os.makedirs(full_save_folder, exist_ok=True)

    contam_ml = ContamMLDensityEstimate(
        data = data_waiting,
        contam_data = np.array(contam_pt).reshape(-1, 1),
        contam_weight = contam_weight,
        penalty_param = np.exp(log_pen_param),
        base_density = base_density,
        r1 = 1.0,
        r2 = 0.0,
        c = 0.,
        bw = bw,
        # Use the configured kernel_type (currently 'gaussian_poly2') so the
        # estimator always matches the kernel name used in the folder name.
        kernel_type = kernel_type)

    # Re-seed with the same seed for every fit.
    # NOTE(review): presumably so that all fits share the same random draws and
    # the difference below is not dominated by Monte Carlo noise -- confirm.
    np.random.seed(seed_no)
    print(f'start time = {dt.now().strftime("%H:%M:%S")}')
    contam_coef = contam_ml.coef_grid_points(
        optalgo_params = gdalgo_params,
        batchmc_params = bmc_params,
        algo = 'gd',
        step_size_factor = 1.0,
        grid_points = start_grid_points,
        print_error = True)
    print(f'end time = {dt.now().strftime("%H:%M:%S")}')

    # Norm of the contaminated fit: sqrt(coef^T G coef) with the grid Gram matrix.
    contam_norm = np.sqrt(contam_coef[0].T @ gram_grid @ contam_coef[0]).item()

    # Contaminated log-density on the evaluation grid.
    contam_logden_vals = contam_ml.log_density(new_data, contam_coef, True)

    # Fitted contaminated density over the data histogram; the dashed line
    # marks the contamination point.
    plt.figure(figsize = (10, 10))
    plt.plot(new_data, np.exp(contam_logden_vals), color = 'tab:blue', lw = 3.0)
    plt.hist(data_waiting, bins='fd', density = True, alpha = 0.3, color = 'tab:green')
    plt.axvline(x = contam_pt, color = 'k', linestyle='--')
    plt.show()

    # Difference quotient of the log-densities with respect to the contamination weight.
    IF_vals = (contam_logden_vals - uncontam_logden_vals) / contam_weight

    plt.figure(figsize = (10, 10))
    plt.plot(new_data, IF_vals, color = 'tab:blue', lw = 3.0)
    plt.axvline(x = contam_pt, color = 'k', linestyle='--')
    plt.show()

    # save coefficients
    file_name_coef = f'/logpenparam={log_pen_param}-contam-coef.npy'
    np.save(full_save_folder + file_name_coef, contam_coef[0])

    # save the contaminated log-density values on the evaluation grid
    file_name_diff = f'/logpenparam={log_pen_param}-contam-logden-newdata.npy'
    np.save(full_save_folder + file_name_diff, contam_logden_vals)

    # save the difference-quotient values
    file_name_ifvals = f'/logpenparam={log_pen_param}-IF-newdata.npy'
    np.save(full_save_folder + file_name_ifvals, IF_vals)

    # Summary: both norms and the largest absolute difference-quotient value.
    print(uncontam_norm, contam_norm, np.max(np.abs(IF_vals)))