In [1]:
import os
import numpy as np
import pandas as pd
from scipy.spatial import Delaunay
import plotly.graph_objects as go
import plotly.express as px

from sampler.pipelines.metrics.asvd import ASVD

# Project root: parent directory of the current working directory
ROOT = os.path.dirname(os.path.abspath(""))
# Kedro configuration environment passed to KedroSession.create
env_name = 'base'
# Seed NumPy's global RNG so that np.random.* draws are reproducible
np.random.seed(42)

In [2]:
from kedro.framework.startup import bootstrap_project
from kedro.framework.session import KedroSession
from kedro.framework.project import settings

# Bootstrap the Kedro project located at ROOT
metadata = bootstrap_project(ROOT)

# Create a KedroSession and keep a handle on the data catalog of its context.
# NOTE(review): `catalog` is used by later cells after this `with` block has
# exited — presumably catalog.load still works on a closed session; confirm.
# NOTE(review): package_name / project_path are passed positionally — confirm
# this matches the KedroSession.create signature of the installed Kedro version.
with KedroSession.create(metadata.package_name, metadata.project_path, env=env_name) as session:
    context = session.load_context()
    catalog = context.catalog
    

In [3]:
# Read the project parameters through the Kedro catalog
p = catalog.load('parameters')

# Unpack the entries used by this notebook
features = p["features"]
targets = p["targets"]
additional_values = p["additional_values"]
names = p["names"]
n_iters = p["irbs_opt_sampling_points"]

# Load the datasets / fitted objects registered in the catalog
data = catalog.load('treated_data')  # 'initial_data'
scaler = catalog.load('scaler')
treatment = catalog.load('treatment')

# Summary statistics of the loaded data (displayed as the cell output)
data.describe()

Unnamed: 0,r_ext_pAl,r_ext_pMeO,pAl_richness,Pg_f,Tg_Tmax,sim_time
count,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.500001,0.500135,0.500085,0.567292,0.880555,0.704504
std,0.290101,0.289639,0.290185,0.15998,0.071803,1.63857
min,0.005308,0.003748,0.00672,0.179282,0.562192,0.1298
25%,0.255631,0.252765,0.256099,0.437854,0.845714,0.222657
50%,0.498671,0.497823,0.498295,0.581713,0.897296,0.323523
75%,0.745648,0.748808,0.746538,0.694336,0.932637,0.39427
max,0.993851,0.993854,0.996655,0.842482,0.993504,10.497417


### Even arcs in 1D

In [4]:
# Qualitative palette shared by all figures: one colour per sampling strategy
COLORS = px.colors.qualitative.G10
linear_color = COLORS[0]  # points taken with a regular step in x
arc_color = COLORS[1]  # points taken with a regular step in arc length

# Test function: x * sin(n_pi * x^2), rescaled by 0.5 * (. + 1) so that its
# values stay within [0, 1] for x in [0, 1]
n_pi = 10 * np.pi

def custom_function(x):
    """Return 0.5 * (x * sin(n_pi * x^2) + 1); works on scalars and arrays."""
    phase = n_pi * x**2
    return 0.5 * (x * np.sin(phase) + 1)

def custom_function_derivative(x):
    """Return the analytical derivative of `custom_function` with respect to x."""
    phase = n_pi * x**2
    return 0.5 * (np.sin(phase) + 2 * n_pi * x**2 * np.cos(phase))

# Number of sample points used for both the linear-even and arc-even samplings
num_points = 10

# Layout shared by the 1D figures: same visible range on both axes and a 1:1
# aspect ratio, so that distances along the curve are not visually distorted.
# NOTE(review): each axis uses the other one as scaleanchor; one of the two
# constraints is presumably redundant — confirm against the Plotly docs.
layout_dict = dict(
    xaxis_title="x",
    yaxis_title="f(x)",
    showlegend=True,
    xaxis=dict(
        range=[-0.1, 1.1],
        scaleanchor="y",
        scaleratio=1,
        constrain='domain'
    ),
    yaxis=dict(
        range=[-0.1, 1.1],
        scaleanchor="x",
        scaleratio=1,
        constrain='domain'
    )
)


In [5]:
# Dense sampling of the curve, used only to draw f as a continuous line
x_values = np.linspace(0, 1, 300)
y_values = [custom_function(x) for x in x_values]

# Coarse sampling: num_points abscissas with a constant step in x
x_linear = np.linspace(0, 1, num_points)
y_linear = [custom_function(x) for x in x_linear]

# Assemble the figure: the curve first, then the regular-step points shown
# both as ticks on the x-axis and lifted onto the curve
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=x_values, y=y_values, mode='lines', name='Function Curve',
    line=dict(color='cornflowerblue'),
))
fig.add_trace(go.Scatter(
    x=x_linear, y=[0]*len(x_linear), mode='markers', name='X-axis Points',
    marker=dict(color=linear_color, size=8, symbol='line-ns-open', line_width=2),
))
fig.add_trace(go.Scatter(
    x=x_linear, y=y_linear, mode='markers', name='Function Curve Points',
    marker=dict(color=linear_color, size=8),
))

# Apply the shared 1:1 layout and render
fig.update_layout(title="Custom Function Plot", **layout_dict)
fig.show()


In [6]:
from scipy import integrate, interpolate

def compute_arc_length_from_f(f, a, b, dx_step=0.0001):
    """Approximate the arc length of the curve y = f(x) between a and b.

    The curve is sampled on a uniform grid and the lengths of the straight
    segments joining consecutive samples are summed (polyline approximation).

    Args:
        f: Vectorised callable; it is called once with a 1-D array of x values.
        a: Lower bound of the interval.
        b: Upper bound of the interval.
        dx_step: Maximum spacing between two consecutive samples (> 0).

    Returns:
        The polyline length as a float; 0.0 when b <= a.

    Raises:
        ValueError: If dx_step is not strictly positive.
    """
    if dx_step <= 0:
        raise ValueError("dx_step must be strictly positive")
    if b <= a:
        return 0.0

    # np.linspace hits both end points exactly, which np.arange with a float
    # step does not guarantee (its last sample may be only approximately b).
    n_segments = max(1, int(np.ceil((b - a) / dx_step)))
    x_values = np.linspace(a, b, n_segments + 1)
    y_values = f(x_values)

    # Length of each straight segment, then their sum
    arc_segments = np.hypot(np.diff(x_values), np.diff(y_values))
    return float(np.sum(arc_segments))

def arc_length_integrand(x, derivative):
    """Arc-length element sqrt(1 + f'(x)^2), where `derivative` evaluates f'."""
    slope = derivative(x)
    return np.sqrt(1 + slope**2)

def compute_arc_length_from_grad(derivative, a, b):
    """Integrate the arc-length element between a and b with scipy's quad.

    Only the integral estimate is returned; quad's error estimate is dropped.
    """
    quad_result = integrate.quad(arc_length_integrand, a, b, args=(derivative,))
    return quad_result[0]

In [7]:
def generate_evenly_arc_spaced_points(function, num_points, method='grad', derivative=None, dx_step=0.0001,
                                      x_min=0.0, x_max=1.0, grid_size=1000):
    """Sample `num_points` points that are evenly spaced along the curve y = function(x).

    The cumulative arc length is tabulated on a regular grid of x values, then
    inverted by linear interpolation to find the abscissas whose arc lengths
    are evenly spaced.

    Args:
        function: Callable evaluated at each selected abscissa (and used to
            measure the arc length when method == 'f').
        num_points: Number of points to return.
        method: 'grad' integrates sqrt(1 + f'^2) using `derivative`;
            'f' measures a polyline built from `function` samples.
        derivative: Callable returning f'(x); required when method == 'grad'.
        dx_step: Sampling step forwarded to the 'f' method.
        x_min: Lower bound of the sampled interval.
        x_max: Upper bound of the sampled interval.
        grid_size: Number of grid points used to tabulate the arc length (>= 2).

    Returns:
        Tuple (even_x_values, even_y_values): an array of abscissas and the
        list of the corresponding function values.

    Raises:
        ValueError: On an unknown method, a missing derivative, or grid_size < 2.
    """
    if method not in ['grad', 'f']:
        raise ValueError("Method must be either 'grad' or 'f'")

    if method == 'grad' and derivative is None:
        raise ValueError("Derivative function must be provided when using 'grad' method")

    if grid_size < 2:
        raise ValueError("grid_size must be at least 2")

    # Choose the appropriate arc length computation method
    if method == 'grad':
        def compute_arc_length(a, b):
            return compute_arc_length_from_grad(derivative, a, b)
    else:  # method == 'f'
        def compute_arc_length(a, b):
            return compute_arc_length_from_f(function, a, b, dx_step)

    # Tabulate the cumulative arc length. Each grid segment is measured once
    # and the lengths are accumulated, instead of re-measuring from x_min for
    # every grid point (quadratic work, and long oscillatory intervals are
    # harder to integrate accurately).
    x_values = np.linspace(x_min, x_max, grid_size)
    segment_lengths = [compute_arc_length(lo, hi) for lo, hi in zip(x_values[:-1], x_values[1:])]
    arc_lengths = np.concatenate(([0.0], np.cumsum(segment_lengths)))

    # Evenly spaced arc lengths; the last table entry is used as the total so
    # that every target lies inside the interpolation range.
    even_arc_lengths = np.linspace(0, arc_lengths[-1], num_points)

    # Invert the (strictly increasing) arc-length table by linear interpolation
    even_x_values = np.interp(even_arc_lengths, arc_lengths, x_values)
    even_y_values = [function(x) for x in even_x_values]

    return even_x_values, even_y_values

# Generate num_points points evenly spaced in arc length along the curve
x_arc, y_arc = generate_evenly_arc_spaced_points(custom_function, num_points, method='grad', derivative=custom_function_derivative)

In [8]:
# Dense sampling of the curve for the line trace
x_values = np.linspace(0, 1, 300)
y_values = [custom_function(x) for x in x_values]

# Overlay both samplings on the curve: regular step in x vs regular step in arc length
fig = go.Figure()
fig.add_trace(go.Scatter(x=x_values, y=y_values, mode='lines', name='Function Curve'))
fig.add_trace(go.Scatter(
    x=x_linear, y=y_linear, mode='markers', name='Linear even',
    marker=dict(color=linear_color, size=8),
))
fig.add_trace(go.Scatter(
    x=x_arc, y=y_arc, mode='markers', name='Arc even',
    marker=dict(color=arc_color, size=8, symbol='x-thin-open', line_width=2),
))

# Shared layout: equal scaling on both axes, ranges set by layout_dict
fig.update_layout(title="Evenly Spaced Points Along Custom Function Curve", **layout_dict)
fig.show()


In [9]:
# Marker styles: a different tick orientation per sampling so that
# overlapping abscissas remain distinguishable
linear_line_marker = dict(color=linear_color, size=12, symbol='line-nw-open', line_width=4)
arc_line_marker = dict(color=arc_color, size=12, symbol='line-ns-open', line_width=4)

# Project both samplings onto the x-axis (y = 0)
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=x_linear, y=[0]*len(x_linear), mode='markers', name='Linear even',
    marker=linear_line_marker,
))
fig.add_trace(go.Scatter(
    x=x_arc, y=[0]*len(x_arc), mode='markers', name='Arc even',
    marker=arc_line_marker,
))

# Flat figure focused on the x-axis: no y tick labels, bold zero line
fig.update_layout(
    title="Comparison of Linear vs Arc Length Evenly Spaced Points",
    xaxis_title="x",
    yaxis_title="",
    showlegend=True,
    xaxis=dict(range=[-0.05, 1.05], tickmode='linear', dtick=0.1),
    yaxis=dict(
        range=[-0.1, 0.1],
        showticklabels=False,
        zeroline=True,
        zerolinewidth=2,
        zerolinecolor='black',
    ),
    height=300,  # Reduce the height of the plot to focus on x-axis
)

# Thin vertical guides every 0.1 for better readability
for x in np.arange(0.1, 1, 0.1):
    fig.add_shape(type="line", x0=x, y0=-0.05, x1=x, y1=0.05, line=dict(color="Gray", width=1))

fig.show()


### Compute even surfaces in 2D

In [10]:
# Define the function
def custom_function_2d(x):
    """Average of `custom_function` applied to the first two columns of x.

    x is promoted with np.atleast_2d, so a single (x, y) pair is accepted as
    well as a 2-D array of points; one value per row is returned.
    """
    points = np.atleast_2d(x)
    along_first = custom_function(points[:, 0])
    along_second = custom_function(points[:, 1])
    return (along_first + along_second)/2

In [11]:
# Create a grid of x and y values
x = np.linspace(0, 1, 100)
y = np.linspace(0, 1, 100)
X, Y = np.meshgrid(x, y)

# Calculate Z values
XY = np.column_stack((X.ravel(), Y.ravel()))
Z = custom_function_2d(XY).reshape(X.shape)

# Create the 3D surface plot
surface = go.Surface(
    x=X, y=Y, z=Z, colorscale='viridis', opacity=1,
    colorbar=dict(len=0.5, title=dict(text='f(x,y)', side='bottom')),
    name='f(x,y)', showlegend=True
)

# Create the (x, y) plane
xy_plane = go.Surface(x=X, y=Y, z=np.zeros_like(Z), colorscale='Greys', showscale=False, opacity=0.5)

# Linear-even sampling: regular grid in the (x, y) plane, shown both on the
# plane (z = 0) and lifted onto the surface ("augmented" points)
X_linear, Y_linear = np.meshgrid(x_linear, x_linear)
XY_linear = np.column_stack((X_linear.ravel(), Y_linear.ravel()))
Z_linear = custom_function_2d(XY_linear).reshape(X_linear.shape)

# The legend names describe the sampling that produced the points: traces
# built from x_linear are "linear-even", traces built from x_arc are "arc-even".
linear_points_on_surface = go.Scatter3d(
    x=X_linear.flatten(),
    y=Y_linear.flatten(),
    z=Z_linear.flatten(),
    mode='markers',
    marker=dict(size=3, color=linear_color),
    name='Augmented linear-even points'
)

linear_points_on_xy = go.Scatter3d(
    x=X_linear.flatten(),
    y=Y_linear.flatten(),
    z=np.zeros_like(Z_linear.flatten()),
    mode='markers',
    marker=dict(size=3, color=linear_color),
    name='Linear-even points'
)

# Arc-even sampling: tensor grid of the abscissas that are evenly spaced in
# arc length along the 1D curve
X_arc, Y_arc = np.meshgrid(x_arc, x_arc)
XY_arc = np.column_stack((X_arc.ravel(), Y_arc.ravel()))
Z_arc = custom_function_2d(XY_arc).reshape(X_arc.shape)

arc_points_on_surface = go.Scatter3d(
    x=X_arc.flatten(),
    y=Y_arc.flatten(),
    z=Z_arc.flatten(),
    mode='markers',
    marker=dict(size=3, color=arc_color),
    name='Augmented arc-even points'
)

arc_points_on_xy = go.Scatter3d(
    x=X_arc.flatten(),
    y=Y_arc.flatten(),
    z=np.zeros_like(Z_arc.flatten()),
    mode='markers',
    marker=dict(size=3, color=arc_color),
    name='Arc-even points'
)

# Create the figure
fig = go.Figure(data=[surface, xy_plane, linear_points_on_surface, linear_points_on_xy, arc_points_on_surface, arc_points_on_xy])

# Update the layout
fig.update_layout(
    title='3D Visualization of custom_function_2d',
    scene=dict(
        xaxis_title='X',
        yaxis_title='Y',
        zaxis_title='Z',
        aspectratio=dict(x=1, y=1, z=1),
        # camera=dict(eye=dict(x=-2, y=-0.5, z=1.2))
        camera=dict(eye=dict(x=-0., y=-0.5, z=2.))
    ),
    width=800,
    height=800,
    margin=dict(r=20, l=10, b=10, t=40)
)

# Show the plot
fig.show()


### Example of meshing in 2D and 3D

tri.simplices: ndarray of ints, shape (nsimplex, ndim+1). Indices of the points forming the simplices in the triangulation. For 2-D, the points are oriented counterclockwise.

In Plotly, a Mesh3d is defined by a surface triangulation, which consists of two arrays: one that defines the vertices (3D points), of shape (n, 3), and another that defines the triangles, of shape (m, 3). A triangle is defined by a list of 3 ints giving the indices of the rows in the vertex array that contain its vertices.

In [12]:
def generate_points_2d(n_total):
    """Draw n_total random points in the unit square with uneven density.

    The four quadrants are filled differently, using NumPy's global RNG:
    upper left is uniform, upper right only covers [0.8, 1]^2, lower left is
    dense and lower right is sparse. Rows are stacked in that order.

    Returns:
        Array of shape (n_total, 2).
    """
    # Split the budget between the quadrants; the remainder of the division
    # by 4 goes to the dense quadrant
    n_per_quadrant = int(n_total / 4)
    n_sparse = int(n_per_quadrant/4)
    n_dense = 2*n_per_quadrant - n_sparse + n_total%4

    def _draw_box(x_range, y_range, count):
        # x coordinates are drawn before y coordinates
        xs = np.random.uniform(low=x_range[0], high=x_range[1], size=count)
        ys = np.random.uniform(low=y_range[0], high=y_range[1], size=count)
        return np.column_stack((xs, ys))

    # The order of the draws below fixes the random stream consumption
    upper_left = _draw_box((0, 0.5), (0.5, 1), n_per_quadrant)
    upper_right = np.random.uniform(low=0.8, high=1, size=(n_per_quadrant, 2))
    lower_left = np.random.uniform(low=0, high=0.5, size=(n_dense, 2))
    lower_right = _draw_box((0.5, 1), (0, 0.5), n_sparse)

    return np.vstack((upper_left, upper_right, lower_left, lower_right))

# Generate 2D points
# NOTE(review): `X` was previously bound to the meshgrid of the 3D surface
# cell; it is rebound here to the (100, 2) point cloud.
X = generate_points_2d(100)

# Create simplex mesh using Delaunay triangulation
tri = Delaunay(X)

# Get the simplices (triangles): each row holds the indices (into X) of the
# vertices of one triangle
simplices = tri.simplices


In [13]:

# Visualization for 2D
fig = go.Figure()

# Add the points
fig.add_trace(go.Scatter(
    x=X[:, 0],
    y=X[:, 1],
    mode='markers',
    marker=dict(color='blue', size=5),
    name='Points'
))

# Add vertical and horizontal lines to split the plane into 4 equal squares
fig.add_shape(
    type="line",
    x0=0.5, y0=0, x1=0.5, y1=1, opacity=0.5,
    line=dict(color="grey", width=1)
)
fig.add_shape(
    type="line",
    x0=0, y0=0.5, x1=1, y1=0.5, opacity=0.5,
    line=dict(color="grey", width=1)
)

# Add the Delaunay triangulation as a single trace. Each triangle is closed by
# repeating its first vertex (three points alone only draw two of the three
# edges), and triangles are separated by None so they are not joined together.
edges_x, edges_y = [], []
for simplex in simplices:
    closed = [*simplex, simplex[0]]
    edges_x.extend([*X[closed, 0], None])
    edges_y.extend([*X[closed, 1], None])

fig.add_trace(go.Scatter(
    x=edges_x,
    y=edges_y,
    mode='lines',
    line=dict(color='orange'),
    name='Simplex'
))

# Update the layout
fig.update_layout(
    title='2D Delaunay Triangulation',
    xaxis_title='X',
    yaxis_title='Y',
    showlegend=False,
    width=800,
    height=800
)

# Show the plot
fig.show()


### Example of ASVD

In [15]:
# Example usage
# NOTE(review): this rebinds `p`, which held the parameters loaded from the
# Kedro catalog in an earlier cell; neither `p` nor `k` is read again in the
# visible cells — consider renaming or removing.
p, k = 2, 1  # 3D points (2D base + 1D function value)

def f_2d_expl(points):
    """Example function: f(x, y) = x^2 + y^2

    Accepts a single point or a 2-D array of points (one per row) and returns
    a column vector with one value per row.
    """
    pts = np.atleast_2d(points)
    return (pts ** 2).sum(axis=1, keepdims=True)

# Generate some random 2D points (10 samples drawn with NumPy's global RNG)
features_2d = ['x1', 'x2']
targets_1d = ['y1']
data_expl = pd.DataFrame(np.random.rand(10, 2), columns=features_2d)

# Initialize ASVD object; the target values are presumably computed by calling
# `func` on the features since use_func=True — confirm against the ASVD class
asvd_expl = ASVD(data_expl, features_2d, targets_1d, use_func=True, func=f_2d_expl)
print(f"Matrix of simplices by vertex index: \n{asvd_expl.simplices_idx[:2]}\n{asvd_expl.simplices_idx.shape}\n")
print(f"Volumes of the original simplices in features space: \n{asvd_expl.simplices_volumes_x}\n {asvd_expl.simplices_volumes_x.shape}\n")
print(f"Volumes of the augmented simplices on the function curve: \n{asvd_expl.simplices_volumes_xy}\n{asvd_expl.simplices_volumes_xy.shape}\n")

# Compute statistics
# NOTE(review): the stored output lists 11 simplices but the statistics table
# reports count 10 — the saved output looks stale; re-run the notebook.
df_stats_expl = asvd_expl.compute_statistics()
print(f'ASVD metrics: \n{df_stats_expl}')


Matrix of simplices by vertex index: 
[[8 9 1]
 [2 6 4]]
(11, 3)

Volumes of the original simplices in features space: 
[0.03306416 0.06812032 0.02382122 0.15953705 0.07202073 0.00415471
 0.00212241 0.02032273 0.04940049 0.03579398 0.01474967]
 (11,)

Volumes of the augmented simplices on the function curve: 
[0.07964484 0.08520962 0.04451322 0.2207598  0.10281599 0.00725241
 0.00270492 0.04451044 0.07286519 0.08504027 0.02743763]
(11,)

ASVD metrics: 
              volumes_x  volumes_xy
count         10.000000   10.000000
sum            0.483107    0.772754
augmentation        NaN    1.599550
mean           0.048311    0.077275
std            0.038013    0.051380
min            0.002092    0.003319
25%            0.020075    0.040279
50%            0.034908    0.073175
75%            0.073467    0.111585
max            0.123368    0.173015
rsd            0.746462    0.630773
iqr            0.053392    0.071306
skewness       0.694753    0.367335
kurtosis      -0.582397   -0.706815


In [16]:
def set_meshes_trace(f, simplices_up, points_2d, reduce=False, color=None):
    """Build the Plotly traces (faces, edges, vertices) of a set of triangles.

    Args:
        f: Function evaluated on each 2D point to get its height.
        simplices_up: Array indexed as [simplex, vertex, coordinate] holding
            the 3D coordinates of the triangle vertices.
        points_2d: Iterable of 2D points shown as vertex markers.
        reduce: When True, everything is flattened onto the z = 0 plane
            ("Reduced" traces); otherwise the "Augmented" traces are built.
        color: Optional dict with keys 'simplex' and 'vertice'.

    Returns:
        List of traces: one Mesh3d per simplex, one Scatter3d with all the
        edges, one Scatter3d with the vertices.
    """
    if color is None:
        color=dict(simplex='red', vertice='blue')
    group_label = 'Reduced' if reduce else 'Augmented'

    # Lift every 2D point to 3D: (x, y) -> (x, y, f(x, y))
    vertices_3d = np.array([[*point, *f(point).ravel()] for point in points_2d])
    if reduce:
        # Work on a copy so that the caller's simplices are left untouched
        simplices_up = simplices_up.copy()
        simplices_up[:, :, 2] = 0
        vertices_3d[:, 2] = 0

    # Faces: one translucent mesh per simplex; only the first augmented one
    # gets a legend entry
    traces = []
    for index, triangle in enumerate(simplices_up):
        triangle = np.array(triangle)
        face = go.Mesh3d(
            x=triangle[:, 0],
            y=triangle[:, 1],
            z=triangle[:, 2],
            opacity=0.5,
            color=color['simplex'],
            name=f'{group_label} simplices',
            legendgroup=group_label,
            showlegend=(index==0 and not reduce)
        )
        traces.append(face)

    # Edges: all segments in a single trace, separated by None
    segments = [
        (triangle[corner], triangle[(corner + 1) % 3])
        for triangle in simplices_up
        for corner in range(3)
    ]
    Xe = [value for start, end in segments for value in (start[0], end[0], None)]
    Ye = [value for start, end in segments for value in (start[1], end[1], None)]
    Ze = [value for start, end in segments for value in (start[2], end[2], None)]
    traces.append(go.Scatter3d(
        x=Xe,
        y=Ye,
        z=Ze,
        mode='lines',
        line=dict(color='black', width=2),
        name=f'{group_label} edges',
        legendgroup=group_label,
        showlegend=False
    ))

    # Vertices: augmented vertices are kept out of any legend group so they
    # can be toggled independently
    vertex_group = group_label if reduce else ''
    traces.append(go.Scatter3d(
        x=vertices_3d[:, 0],
        y=vertices_3d[:, 1],
        z=vertices_3d[:, 2],
        mode='markers',
        marker=dict(size=5, color=color['vertice']),
        name=f'{group_label} vertices',
        legendgroup=vertex_group,
        showlegend=True
    ))
    return traces

def plot_augmented_2d_meshes(f, simplices_up, points_2d, color=None):
    """Plot the surface of f with the augmented and reduced simplices.

    The surface is evaluated on a 50 x 50 grid of the unit square; the
    simplices are drawn once on the surface and once flattened on z = 0.

    Returns:
        The Plotly figure (not shown).
    """
    fig = go.Figure()

    # Background surface of f over [0, 1]^2
    axis = np.linspace(0, 1, 50)
    X, Y = np.meshgrid(axis, axis)
    grid_points = np.column_stack((X.ravel(), Y.ravel()))
    Z = f(grid_points).reshape(X.shape)

    fig.add_trace(go.Surface(
        x=X, y=Y, z=Z,
        colorscale='Viridis',
        colorbar=dict(len=0.5, title=dict(text='f(x,y)', side='bottom')),
        opacity=1.,
        name='f(x,y)',
        showlegend=True
    ))

    # Simplices on the surface first, then their projection on the plane
    curve_traces = set_meshes_trace(f, simplices_up, points_2d, False, color)
    plane_traces = set_meshes_trace(f, simplices_up, points_2d, True, color)
    for trace_group in (curve_traces, plane_traces):
        fig.add_traces(trace_group)

    scene_settings = dict(
        xaxis_title='X',
        yaxis_title='Y',
        zaxis_title='Z',
        aspectmode='cube'
    )
    fig.update_layout(
        title='2D Meshes on Function Curve Plane',
        scene=scene_settings,
        width=800,
        height=800
    )

    return fig

In [17]:
# Plot the example simplices on the surface of f_2d_expl and on the z = 0 plane
fig = plot_augmented_2d_meshes(f_2d_expl, asvd_expl.simplices_xy, data_expl[features_2d].values)
fig.show()

### ASVD for `custom_function_2d`

In [18]:
# Set linear-even sampling case: tensor grid of x_linear with itself
# (X_linear / Y_linear are rebuilt here exactly as in the 3D surface cell)
X_linear, Y_linear = np.meshgrid(x_linear, x_linear)
points_2d_linear = np.column_stack((X_linear.ravel(), Y_linear.ravel()))
data_linear = pd.DataFrame(points_2d_linear, columns=features_2d)

asvd_linear = ASVD(data_linear, features_2d, targets_1d, use_func=True, func=custom_function_2d)

# Set arc-even sampling case: tensor grid of x_arc with itself
X_arc, Y_arc = np.meshgrid(x_arc, x_arc)
points_2d_arc = np.column_stack((X_arc.ravel(), Y_arc.ravel()))
data_arc = pd.DataFrame(points_2d_arc, columns=features_2d)

asvd_arc = ASVD(data_arc, features_2d, targets_1d, use_func=True, func=custom_function_2d)

In [19]:
# ASVD statistics of the linear-even sampling
df_stats_linear = asvd_linear.compute_statistics()
print(f"Linear-even sampling asvd scores: \n{df_stats_linear}")

# ASVD statistics of the arc-even sampling
df_stats_arc = asvd_arc.compute_statistics()
print(f"Arc-even sampling asvd scores: \n{df_stats_arc}")

Linear-even sampling asvd scores: 
               volumes_x  volumes_xy
count         100.000000  100.000000
sum             1.000000    1.524730
augmentation         NaN    1.524730
mean            0.010000    0.015247
std             0.004084    0.007643
min             0.002058    0.002419
25%             0.008230    0.009632
50%             0.010288    0.014547
75%             0.012346    0.020941
max             0.016461    0.039183
rsd             0.406367    0.498780
iqr             0.004115    0.011309
skewness        0.054642    0.524601
kurtosis       -0.927010   -0.065232
Arc-even sampling asvd scores: 
               volumes_x  volumes_xy
count         100.000000  100.000000
sum             1.000000    3.987711
augmentation         NaN    3.987711
mean            0.010000    0.039877
std             0.014155    0.029611
min             0.000323    0.001099
25%             0.002601    0.018802
50%             0.005371    0.030846
75%             0.012895    0.054297
max     

In [20]:
# Plot the simplices of the linear-even sampling on custom_function_2d
fig = plot_augmented_2d_meshes(
    custom_function_2d, asvd_linear.simplices_xy, points_2d_linear,
    color=dict(simplex=COLORS[-1], vertice=linear_color)
)
fig.show()

In [21]:
# Plot the simplices of the arc-even sampling on custom_function_2d
fig = plot_augmented_2d_meshes(
    custom_function_2d, asvd_arc.simplices_xy, points_2d_arc,
    color=dict(simplex=COLORS[-1], vertice=arc_color)
)
fig.show()

In [22]:
# Train 2 GP models:
#   - gp_linear.fit(points_2d_linear, f_2d(points_2d_linear))
#   - gp_arc.fit(points_2d_arc, f_2d(points_2d_arc))
# Generate points_2d_lhs: 100 random LHS samples in [0, 1]^2 for test 
# Set test set (points_2d_lhs, f_2d(points_2d_lhs))
# Compute train and test scores

In [25]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from scipy.stats import qmc

# Train GP models: one per sampling strategy, both built from the same
# kernel specification (constant * RBF) and the same random_state
kernel = C(1.0, (1e-3, 1e3)) * RBF(1, (1e-2, 1e2))
gp_linear = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, random_state=42)
gp_arc = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, random_state=42)
# gp_linear = GaussianProcessRegressor(random_state=42)
# gp_arc = GaussianProcessRegressor(random_state=42)

gp_linear.fit(points_2d_linear, custom_function_2d(points_2d_linear))
gp_arc.fit(points_2d_arc, custom_function_2d(points_2d_arc))

# Generate test data using Latin Hypercube Sampling (LHS)
n_test = 100
sampler = qmc.LatinHypercube(d=2, seed=42)
points_2d_lhs = sampler.random(n=n_test)

# Set test set: the same LHS points are used to evaluate both models
X_test = points_2d_lhs
y_test = custom_function_2d(X_test)

# Compute train and test scores
# NOTE(review): `score` is presumably the R^2 coefficient (scikit-learn's
# default for regressors), which would explain the negative test values —
# confirm.
train_score_linear = gp_linear.score(points_2d_linear, custom_function_2d(points_2d_linear))
test_score_linear = gp_linear.score(X_test, y_test)
train_score_arc = gp_arc.score(points_2d_arc, custom_function_2d(points_2d_arc))
test_score_arc = gp_arc.score(X_test, y_test)

# Print train and test scores
print(f"Linear GP Model - Train Score: {train_score_linear:.4f}, Test Score: {test_score_linear:.4f}")
print(f"Arc GP Model - Train Score: {train_score_arc:.4f}, Test Score: {test_score_arc:.4f}")


Linear GP Model - Train Score: 1.0000, Test Score: -0.3844
Arc GP Model - Train Score: 1.0000, Test Score: -8.5659


In [48]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from scipy.stats import qmc

def analyze_gp_model(gp, X_train, y_train, X_test, y_test, model_name):
    """Fit `gp` and print its scores, log marginal likelihood and hyperparameters.

    The kernel parameters and theta vector are read before fitting (from
    `gp.kernel`) and after fitting (from `gp.kernel_`), then printed side by
    side in a table. Nothing is returned.
    """
    # Snapshot of the kernel before the fit
    params_before = gp.kernel.get_params()
    theta_before = gp.kernel.theta

    gp.fit(X_train, y_train)

    # Snapshot of the fitted kernel
    params_after = gp.kernel_.get_params()
    theta_after = gp.kernel_.theta

    train_score = gp.score(X_train, y_train)
    test_score = gp.score(X_test, y_test)
    log_likelihood = gp.log_marginal_likelihood(gp.kernel_.theta)

    print(f"\n{model_name} GP Model Analysis:")
    print(f"Train Score: {train_score:.4f}")
    print(f"Test Score: {test_score:.4f}")
    print(f"Log Marginal Likelihood: {log_likelihood:.4f}")

    # One row per kernel parameter, followed by one row per theta component
    rows = [[name, value, params_after[name]] for name, value in params_before.items()]
    rows += [
        [f"theta_{index}", before, after]
        for index, (before, after) in enumerate(zip(theta_before, theta_after))
    ]

    df_hyperparameters = pd.DataFrame(
        rows, columns=["Hyperparameter", "Initial Value", "Final Value"]
    ).set_index("Hyperparameter")

    print(df_hyperparameters)

# Set training data: targets are the exact function values at the sample points
X_train_linear = points_2d_linear
y_train_linear = custom_function_2d(points_2d_linear)
X_train_arc = points_2d_arc
y_train_arc = custom_function_2d(points_2d_arc)

# Generate test data using Latin Hypercube Sampling (LHS)
# (same sampler settings as the previous GP cell: d=2, seed=42, n=100)
n_test = 100
sampler = qmc.LatinHypercube(d=2, seed=42)
points_2d_lhs = sampler.random(n=n_test)

# Set test set
X_test = points_2d_lhs
y_test = custom_function_2d(X_test)

# Define the GP models; they are fitted later by analyze_gp_model
kernel = C(1.0, (1e-3, 1e3)) * RBF(1, (1e-2, 1e2))

gp_linear = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, random_state=42)
gp_arc = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, random_state=42)

In [49]:
# Fit and analyze the GP model trained on the linear-even sampling
analyze_gp_model(gp_linear, X_train_linear, y_train_linear, X_test, y_test, "Linear")


Linear GP Model Analysis:
Train Score: 1.0000
Test Score: -0.3844
Log Marginal Likelihood: 74.1874

Hyperparameters (initial -> final):
                                 Initial Value             Final Value
Hyperparameter                                                        
k1                                        1**2                0.365**2
k2                         RBF(length_scale=1)  RBF(length_scale=0.14)
k1__constant_value                         1.0                0.133034
k1__constant_value_bounds      (0.001, 1000.0)         (0.001, 1000.0)
k2__length_scale                             1                0.139962
k2__length_scale_bounds          (0.01, 100.0)           (0.01, 100.0)
theta_0                                    0.0               -2.017153
theta_1                                    0.0               -1.966387


In [50]:
# Fit and analyze the GP model trained on the arc-even sampling
analyze_gp_model(gp_arc, X_train_arc, y_train_arc, X_test, y_test, "Arc")


Arc GP Model Analysis:
Train Score: 1.0000
Test Score: -8.5659
Log Marginal Likelihood: -51.5693

Hyperparameters (initial -> final):
                                 Initial Value               Final Value
Hyperparameter                                                          
k1                                        1**2                  0.479**2
k2                         RBF(length_scale=1)  RBF(length_scale=0.0391)
k1__constant_value                         1.0                  0.229692
k1__constant_value_bounds      (0.001, 1000.0)           (0.001, 1000.0)
k2__length_scale                             1                  0.039121
k2__length_scale_bounds          (0.01, 100.0)             (0.01, 100.0)
theta_0                                    0.0                 -1.471016
theta_1                                    0.0                 -3.241097


## TODO

### Next:
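* In the notebook, generate LHS test samples and compare the test scores of arc vs linear sampling with augm_vicinity.
* In the notebook, for the arc-length interpolation, compute integral(f, x_i, x_{i+1}) on each grid segment and accumulate, instead of recomputing integral(f, 0, x_{i+1}) from the origin for every grid point.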

### Later:
* ASVD is a special case of augmenting with a f_hat (model.predict)