<a href="https://colab.research.google.com/github/zw2788/LocalMinimaConstruction/blob/main/DwrtXGradientW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from typing import Tuple

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from IPython.display import Image
from torch.autograd import grad

In [9]:
class SimpleNN(nn.Module):
    def __init__(self, custom_W_0, custom_b, custom_V_0, custom_c):
        super(SimpleNN, self).__init__()

        # Ensure that the custom weights are tensors
        custom_W_0 = torch.tensor(custom_W_0, dtype=torch.float64)
        custom_b = torch.tensor(custom_b, dtype=torch.float64)
        custom_V_0 = torch.tensor(custom_V_0, dtype=torch.float64)
        custom_c = torch.tensor(custom_c, dtype=torch.float64)

        # Set the custom weights and biases
        self.W_0 = nn.Parameter(custom_W_0)
        self.b = nn.Parameter(custom_b)
        self.V_0 = nn.Parameter(custom_V_0)
        self.c = nn.Parameter(custom_c)

    def forward(self, x):
        x = F.sigmoid(torch.add(torch.matmul(x, self.W_0), self.b))
        x = F.sigmoid(torch.add(torch.matmul(x, self.V_0), self.c))
        return x

# Example usage
#custom_W_0 = [[0.1, 0.2], [0.3, 0.4]]  # Replace with your own initial values
#custom_b = [0.1, 0.2]  # Replace with your own initial values
#custom_V_0 = [[0.1], [0.2]]  # Replace with your own initial values
#custom_c = [0.1]  # Replace with your own initial values


def calculate_second_order_grad(model, X_raw_torch, Y_torch):
    # Forward pass
    output = model(X_raw_torch)
    # Compute loss
    loss = -torch.mean(Y_torch * torch.log(output) + (1 - Y_torch) * torch.log(1 - output))
    # Compute gradients of the loss w.r.t. weights
    loss.backward(create_graph=True)
    # Combine and compute the norm of all gradients
    all_grads = torch.cat([param.grad.flatten() for param in model.parameters()])
    grad_norm = torch.norm(all_grads)
    #print(all_grads)
    # Compute the derivative of the grad_norm with respect to X
    second_order_grad = torch.autograd.grad(grad_norm, X_raw_torch, retain_graph=True)[0]
    return second_order_grad

def perturb_weights(model, max_deviation=0.01):
    with torch.no_grad():
        for param in model.parameters():
            std_dev = param.abs().mean() * max_deviation
            noise = torch.randn(param.size()) * std_dev
            param[:] = param + noise

def restore_weights(model, saved_state):
    with torch.no_grad():
        for name, param in model.named_parameters():
            param[:] = saved_state[name]

def perturb_data(X, max_deviation=0.01):
    """Perturb the data tensor X."""
    with torch.no_grad():
        std_dev = X.abs().mean() * max_deviation
        noise = torch.randn(X.size()) * std_dev
        X.add_(noise)

In [3]:
# Re-executing the code to define the function for computing the Hessian matrix and its eigenvalues

def compute_hessian_and_eigenvalues(model, data, target):
    """
    Compute the Hessian matrix and its eigenvalues for the weights of a neural network model.

    :param model: The neural network model.
    :param data: Input data (X).
    :param target: Target data (Y).
    :return: Hessian matrix and its eigenvalues.
    """
    # Forward pass
    output = model(data)
    # Compute loss
    loss = -torch.mean(target * torch.log(output) + (1 - target) * torch.log(1 - output))

    # First-order gradients (w.r.t weights)
    first_order_grads = torch.autograd.grad(loss, model.parameters(), create_graph=True)

    # Flatten the first-order gradients
    grads_flatten = torch.cat([g.contiguous().view(-1) for g in first_order_grads])

    # Hessian computation
    hessian = []
    for grad in grads_flatten:
        # Compute second-order gradients (w.r.t each element in the first-order gradients)
        second_order_grads = torch.autograd.grad(grad, model.parameters(), retain_graph=True)

        # Flatten and collect the second-order gradients
        hessian_row = torch.cat([g.contiguous().view(-1) for g in second_order_grads])
        hessian.append(hessian_row)

    # Stack to form the Hessian matrix
    hessian_matrix = torch.stack(hessian)

    # Compute eigenvalues
    eigenvalues, _ = torch.linalg.eig(hessian_matrix)

    return hessian_matrix, eigenvalues

# Note: To use this function, you'll need to provide your neural network model, the input data (X), and the target data (Y).

def check_local_minimum(eigenvalues):
    # Check if all eigenvalues have a positive real part
    if all(eig.real > 0 for eig in eigenvalues):
        print("This is a local minimum.")
    else:
        print("This is not a local minimum.")


In [16]:
data = pd.read_csv(
    "https://raw.githubusercontent.com/zw2788/LocalMinimaConstruction/main/output(49830_100000).csv")

data.head()

# data , drop NaN values
X_raw,  Y, W_0, b, V_0, c = data[['x_2dvec']].dropna().values, data['y'].dropna().values, data[['W_0']].dropna().values, data[['b']].dropna().values, data[['V_0']].dropna().values, data[['c']].dropna().values

#convert string to array

X_raw = np.array([eval(s[0]) for s in X_raw])

W_0 = np.array([eval(s[0]) for s in W_0])

b = np.array([eval(s[0]) for s in b])

V_0 = np.array([eval(s[0]) for s in V_0])

c = np.array([eval(s[0]) for s in c])

# Standardize the input
# Leave blank to match the example in paper

# formatting
Y = Y.reshape((-1, 1))
print(X_raw)
print(Y)
print(W_0)
#print(X_raw.shape[0])
X_raw = torch.tensor(X_raw, requires_grad=True)
Y = torch.tensor(Y)
print(W_0, b, V_0, c)

[[-4.05673409 -1.57919633]
 [-4.08621693 -7.32655954]
 [ 5.7382369   6.51035595]
 [-3.54674959 -9.48523903]
 [-5.44635487 -1.49153161]
 [-3.05762291  7.25649118]
 [-9.07657528  3.435395  ]
 [-3.56827617 -2.80349135]
 [ 4.2420435   2.56337643]
 [-8.37438869  4.51332331]]
[[1.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]]
[[ 3.37760234 -8.4564867 ]
 [ 0.92768544  2.91912866]]
[[ 3.37760234 -8.4564867 ]
 [ 0.92768544  2.91912866]] [[ 4.70377207 -0.07312825]] [[ 4.8233037 ]
 [-4.45070505]] [[-4.8279748]]


In [17]:
nn_model = SimpleNN(W_0, b, V_0, c)
hessian_matrix_initial, eigenvalues_initial = compute_hessian_and_eigenvalues(nn_model, X_raw, Y)

print(eigenvalues_initial)
check_local_minimum(eigenvalues_initial)

tensor([-3.2549e+00+0.j,  1.0070e-01+0.j,  8.7098e-03+0.j,  1.8168e-03+0.j,
        -2.7687e-04+0.j,  4.1569e-05+0.j, -6.5372e-09+0.j,  1.3711e-10+0.j,
        -4.7058e-13+0.j], dtype=torch.complex128)
This is not a local minimum.


In [None]:
# Convert data to PyTorch tensors
X_raw_torch = torch.tensor(X_raw, requires_grad=True)
Y_torch = torch.tensor(Y)


# Set a threshold for the norm of the second-order gradient
threshold = 0.001 # Adjust this threshold as needed
max_iterations = 50 # Maximum number of iterations to prevent infinite loops

# Learning rate
learning_rate = 0.1

# Monte Carlo method sampling points
MC_num_samples = 1500

# Surrouning points' grads' propotion
surrounding_propotion = 0.8

# Weight perturbation
max_deviation_for_weight = 0.25

nn_model = SimpleNN(W_0, b, V_0, c)

#original_weights = W_0, b, V_0, c
original_weights = {
    'W_0': nn_model.W_0.data.clone(),
    'b': nn_model.b.data.clone(),
    'V_0': nn_model.V_0.data.clone(),
    'c': nn_model.c.data.clone()
}
print("Original weight is {}".format(original_weights))
print("Initial X_raw {}".format(X_raw_torch))
#max_deviation_for_X = 0.02  # You can adjust this value as needed
#perturb_data(X_raw_torch, max_deviation=max_deviation_for_X)
#print("Perturbed X_raw {}".format(X_raw_torch))

for i in range(max_iterations):

    # Calculate the gradient at the central point
    central_grad = calculate_second_order_grad(nn_model, X_raw_torch, Y_torch)
    central_grad_norm = torch.norm(central_grad)
    central_grad = central_grad / central_grad_norm
    #print(central_grad)
    # Surrouning points' grads
    surrounding_grads = []
    norms = []


    # Calculate the gradient at the surrounding points by MC
    for _ in range(MC_num_samples):

      nn_model_sample = SimpleNN(custom_W_0=original_weights['W_0'],custom_b=original_weights['b'],custom_V_0=original_weights['V_0'],custom_c=original_weights['c'])
      #print("W_0 (before perturbation):", nn_model_sample.W_0.data)
      # Perturb weights
      perturb_weights(nn_model_sample, max_deviation=max_deviation_for_weight)
      #print("W_0 (after perturbation):", nn_model_sample.W_0.data)
      # Calculate second-order gradient
      grad = calculate_second_order_grad(nn_model_sample, X_raw_torch, Y_torch)
      #grad_norm = torch.norm(grad)
      #grad = grad / grad_norm
      surrounding_grads.append(grad)
      norms.append(torch.norm(grad).item())

    # Calculate average norm
    average_norm = sum(norms) / len(norms)

    # Filter and sum gradients with norms above average
    above_average_grads = [grad for grad, norm in zip(surrounding_grads, norms) if norm > average_norm]
    above_average_grads = [grad / torch.norm(grad) for grad in above_average_grads]

    #print(above_average_grads)
    sum_above_average_grads = sum(above_average_grads)

    # Average the large norm gradients
    if above_average_grads:
      average_above_average_grad = sum_above_average_grads / len(above_average_grads)
      #print(average_above_average_grad)
    else:
    # Handle the case where no gradient is above average
      average_above_average_grad = torch.zeros_like(X_raw_torch)


    #print("Surrounding grad {}".format(surrounding_grads))

    # Combine gradients
    combined_grad = (1-surrounding_propotion) * central_grad + surrounding_propotion * average_above_average_grad
    #combined_grad =  average_surrounding_grad
    #print(combined_grad)
    # Calculate the norm of the combined gradient
    combined_grad_norm = torch.norm(combined_grad)

    # Check for a non-zero norm to avoid division by zero
    if combined_grad_norm > 0:
    # Normalize the gradient
      normalized_grad = combined_grad / combined_grad_norm

      # Update X_raw using the normalized gradient and learning rate
      X_raw_torch.data -= learning_rate * normalized_grad
    else:
      print("Gradient is zero; no update required.")
###############

    # Check if the norm of the second-order gradient is below the threshold
    if torch.norm(combined_grad) < threshold:
        print(f"Convergence reached at iteration {i}")
        break

    # Update X_raw using gradient descent
    X_raw_torch.data -= learning_rate * normalized_grad

    # Zero out gradients for the next iteration
    nn_model.zero_grad()
    X_raw_torch.grad = None

# Print final modified data
#print(surrounding_grads)
#print("Final modified X_raw:")
if len(above_average_grads) < 100:
  print("need more MC_num_samples")
else:
  print("Used surrounding points: {}".format(len(above_average_grads)))

print(X_raw_torch)

  X_raw_torch = torch.tensor(X_raw, requires_grad=True)
  Y_torch = torch.tensor(Y)
  custom_W_0 = torch.tensor(custom_W_0, dtype=torch.float64)
  custom_b = torch.tensor(custom_b, dtype=torch.float64)
  custom_V_0 = torch.tensor(custom_V_0, dtype=torch.float64)
  custom_c = torch.tensor(custom_c, dtype=torch.float64)


Original weight is {'W_0': tensor([[ 3.3776, -8.4565],
        [ 0.9277,  2.9191]], dtype=torch.float64), 'b': tensor([[ 4.7038, -0.0731]], dtype=torch.float64), 'V_0': tensor([[ 4.8233],
        [-4.4507]], dtype=torch.float64), 'c': tensor([[-4.8280]], dtype=torch.float64)}
Initial X_raw tensor([[-4.0567, -1.5792],
        [-4.0862, -7.3266],
        [ 5.7382,  6.5104],
        [-3.5467, -9.4852],
        [-5.4464, -1.4915],
        [-3.0576,  7.2565],
        [-9.0766,  3.4354],
        [-3.5683, -2.8035],
        [ 4.2420,  2.5634],
        [-8.3744,  4.5133]], dtype=torch.float64, requires_grad=True)


In [39]:
nn_model_final = SimpleNN(W_0, b, V_0, c)
hessian_matrix_final, eigenvalues_final = compute_hessian_and_eigenvalues(nn_model, X_raw_torch, Y_torch)
print(X_raw_torch)
print(Y_torch)
print(eigenvalues_final)
check_local_minimum(eigenvalues_final)

tensor([[-4.3797, -1.2802],
        [-3.9362, -7.3750],
        [ 5.8204,  6.4175],
        [-3.7817, -8.6317],
        [-5.4445, -1.4907],
        [-1.1108,  7.7896],
        [-9.0798,  3.4312],
        [-3.5607, -2.8020],
        [ 4.2450,  2.5582],
        [-8.3687,  4.5204]], dtype=torch.float64, requires_grad=True)
tensor([[1.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]], dtype=torch.float64)
tensor([ 1.0233e-01+0.j, -4.8506e-02+0.j,  1.2007e-03+0.j, -1.4050e-04+0.j,
        -9.4907e-05+0.j,  3.1041e-05+0.j, -3.1141e-08+0.j,  1.4997e-10+0.j,
        -4.0029e-13+0.j], dtype=torch.complex128)
This is not a local minimum.


# Set a threshold for the norm of the second-order gradient
threshold = 0.05 # Adjust this threshold as needed
max_iterations = 10  # Maximum number of iterations to prevent infinite loops

# Learning rate
learning_rate = 0.2

# Monte Carlo method sampling points
MC_num_samples = 10

# Surrouning points' grad
surrounding_grads = []

# parameters for the first layer
W_0 = np.array([[1.05954587,-0.05625762],[-0.03749863,1.09518945]])
b = np.array([[-0.050686,-0.06894291]])

# parameters for the second layer

V_0 = np.array([[3.76921058],[-3.72139955]])
c = np.array([[-0.0148436]])

nn_model = SimpleNN(W_0, b, V_0, c)

perturb_weights(nn_model, max_deviation=0.01)
restore_weights(nn_model, original_weights)  # Assuming perturb_weights is defined as before
print(perturb_weights)
K=calculate_second_order_grad(nn_model, X_raw_torch, Y_torch)
print(K)
print("W_0 (after perturbation):", nn_model.W_0.data)
print("b (after perturbation):", nn_model.b.data)
print("V_0 (after perturbation):", nn_model.V_0.data)
print("c (after perturbation):", nn_model.c.data)

# Forward pass
output = nn_model(X_raw_torch)

# Compute loss
loss = -torch.mean(Y_torch * torch.log(output) + (1 - Y_torch) * torch.log(1 - output))
print(loss)
# Compute gradients of the loss w.r.t. weights
loss.backward(create_graph=True)


# Combine and compute the norm of all gradients
all_grads = torch.cat([nn_model.W_0.grad.flatten(), nn_model.V_0.grad.flatten(), nn_model.b.grad.flatten(), nn_model.c.grad.flatten()])
print(all_grads)
grad_norm = torch.norm(all_grads)
print(grad_norm)
# Compute the derivative of the grad_norm with respect to X
second_order_grad = torch.autograd.grad(grad_norm, X_raw_torch, retain_graph=True)[0]
print(torch.norm(second_order_grad))
# If you want to perform gradient descent on X_raw
learning_rate = 0.01
#X_raw_torch.data -= learning_rate * second_order_grad