<a href="https://colab.research.google.com/github/theostoican/MastersThesis/blob/main/training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import copy
import csv
import torch.nn as nn
import numpy as np
import torch
from torch.autograd import Variable

# Various modelling parameters

In [None]:
# Problem dimensions shared by the cells below.
N = 1          # batch size
D_in = 2       # input dimension
H_teacher = 4  # hidden-layer width of the teacher network
H_student = 5  # hidden-layer width of the student network
D_out = 1      # output dimension
# Number of student networks trained, each from a fresh random initialisation.
num_experiments = 1000

# Dataset creation

In [None]:
def construct_dataset():
  """
  Build the regular 2-D grid of input points used as the training set.

  Each coordinate takes the values of np.arange(-5, 5.1, .25); the x
  coordinate varies fastest, the y coordinate slowest.

  Returns a list of [x, y] pairs.
  """
  axis = np.arange(-5, 5.1, .25)
  return [[x, y] for y in axis for x in axis]

# Build the grid once and turn it into a tensor with one row per (x, y) point.
data = torch.Tensor(construct_dataset())
# len() of the tensor is its number of rows, so the grid is not rebuilt just
# to report how many points it contains.
print(len(data))

1681


# Models

In [None]:
class TeacherNetwork(nn.Module):
  """
  Two-layer, bias-free sigmoid network with fixed, hard-coded weights.
  """
  def __init__(self, D_in, H, D_out):
    """
    Create the two linear layers and overwrite their weights with the fixed
    teacher values.

    D_in: input dimension
    H: dimension of hidden layer
    D_out: output dimension

    Note: the hard-coded weights have shapes (4, 2) and (1, 4), so the
    resulting network has 2 inputs, 4 hidden units and 1 output whatever
    sizes are passed in.
    """
    super().__init__()
    self.linear1 = nn.Linear(D_in, H, bias=False)
    self.linear2 = nn.Linear(H, D_out, bias=False)
    # First row: x-components of the hidden neurons; second row: y-components.
    incoming = torch.Tensor([[0.6, -0.5, -0.2, 0.1],
                             [0.5, 0.5, -0.6, -0.6]])
    # One outgoing weight per hidden neuron.
    outgoing = torch.Tensor([[1], [-1], [1], [-1]])
    # nn.Linear stores weights as (out_features, in_features), hence the
    # transposes.
    self.linear1.weight = nn.Parameter(incoming.t())
    self.linear2.weight = nn.Parameter(outgoing.t())

  def forward(self, x):
    """
    Apply the first linear layer, a sigmoid, then the second linear layer
    to x and return the result.
    """
    hidden = torch.sigmoid(self.linear1(x))
    return self.linear2(hidden)

In [None]:
class StudentNetwork(nn.Module):
  """
  Two-layer, bias-free sigmoid network with Xavier-uniform initial weights.
  """
  def __init__(self, D_in, H, D_out):
    """
    Create the two linear layers and re-initialise their weights with
    Xavier-uniform noise.

    D_in: input dimension
    H: dimension of hidden layer
    D_out: output dimension
    """
    super().__init__()
    self.linear1 = nn.Linear(D_in, H, bias=False)
    self.linear2 = nn.Linear(H, D_out, bias=False)
    # Both layers are created before either is re-initialised, so the random
    # number stream is consumed in a fixed order.
    for layer in (self.linear1, self.linear2):
      nn.init.xavier_uniform_(layer.weight)

  def forward(self, x):
    """
    Apply the first linear layer, a sigmoid, then the second linear layer
    to x and return the result.
    """
    hidden = torch.sigmoid(self.linear1(x))
    return self.linear2(hidden)

# Generation of the labels based on the teacher model.

In [None]:
# The teacher's outputs on the grid serve as fixed regression targets, so they
# are computed without recording an autograd graph.
teacher_model = TeacherNetwork(D_in, H_teacher, D_out)
with torch.no_grad():
  y_labels = teacher_model(data)

# Training

In [None]:
def _snapshot_weights(model):
  """Return independent NumPy copies of (linear1.weight, linear2.weight)."""
  return (model.linear1.weight.detach().cpu().numpy().copy(),
          model.linear2.weight.detach().cpu().numpy().copy())


def train(model, x, y_labels, N = 1000, Ninner = 10**3, Nstart = 10,
          maxtime = 10 ** 3, nlopt_threshold = 1e-7,
          collect_history = True, grad_tol = 2e-6):
  """
  Fit `model` to (x, y_labels) with Adam on the full-batch MSE loss.

  Training runs for at most N outer iterations of Ninner optimizer steps each
  and stops early once the gradient norm drops to grad_tol or below.

  model: module to train; when collect_history is True it must expose
         `linear1` and `linear2` layers with a `weight` attribute
  x, y_labels: inputs and regression targets, passed to the model / loss as is
  N: maximum number of outer iterations
  Ninner: optimizer steps per outer iteration (must be positive)
  Nstart: during the first outer iteration only, an extra loss value (mean of
          the previous Nstart step losses) and weight snapshot are recorded
          every Nstart steps (must be positive)
  maxtime, nlopt_threshold: accepted for backward compatibility; not used
  collect_history: whether to record weight snapshots in the returned trace
  grad_tol: gradient-norm threshold of the stopping criterion

  Returns (loss_vals, trace): the recorded mean losses and the list of
  (linear1 weights, linear2 weights) NumPy snapshots (empty when
  collect_history is False).

  Raises ValueError if Ninner or Nstart is not positive.
  """
  if Ninner <= 0 or Nstart <= 0:
    raise ValueError("Ninner and Nstart must be positive")

  params = list(model.parameters())
  optimizer = torch.optim.Adam(params)
  loss_fn = nn.MSELoss()
  loss_vals = []
  trace = []
  if collect_history:
    trace.append(_snapshot_weights(model))
  for i in range(N):
    loss_tmp = []
    for j in range(Ninner):
      loss = loss_fn(model(x), y_labels)
      loss_tmp.append(loss.item())
      optimizer.zero_grad()
      # A single backward pass feeds both the optimizer and the stopping test.
      loss.backward()
      if j == Ninner - 1:
        # Gradient at the parameters *before* the last step of this outer
        # iteration; captured here so the optimizer step cannot affect it.
        grad_norm = torch.norm(torch.cat(
            [p.grad.reshape(-1) for p in params if p.grad is not None]))
      optimizer.step()
      # Finer-grained history while the loss changes fastest.
      if i == 0 and j > 0 and j % Nstart == 0:
        loss_vals.append(np.mean(loss_tmp[j - Nstart : j]))
        if collect_history:
          trace.append(_snapshot_weights(model))
    mean_loss = np.mean(loss_tmp)
    loss_vals.append(mean_loss)
    if collect_history:
      trace.append(_snapshot_weights(model))
    print("Iteration: %d, loss: %s, gradient norm: %s" % (Ninner * i, mean_loss, grad_norm))
    # stopping criterion
    if grad_norm <= grad_tol:
      break
  return loss_vals, trace

# Hessian evaluation

In [None]:
# eval Hessian matrix
def eval_hessian(loss_grad, model):
    """
    Compute the gradient norm and the symmetrised Hessian of a loss.

    loss_grad: gradients of the loss with respect to model.parameters(), in
               that order, obtained with create_graph=True so that they can
               be differentiated a second time
    model: module whose parameters the Hessian is taken with respect to

    Returns (grad_norm, hessian) as NumPy values: the norm of the flattened
    gradient (0-d array) and the (P, P) Hessian, where P is the total number
    of gradient entries.

    Raises ValueError if loss_grad is empty.
    """
    grads = list(loss_grad)
    if not grads:
        raise ValueError("loss_grad must contain at least one gradient tensor")
    params = list(model.parameters())

    # Flatten all per-parameter gradients into a single vector.
    g_vector = torch.cat([g.contiguous().view(-1) for g in grads])
    grad_norm = torch.norm(g_vector)
    l = g_vector.size(0)
    # Same dtype/device as the gradient, so nothing is silently down-cast.
    hessian = g_vector.new_zeros(l, l)
    for idx in range(l):
        # retain_graph keeps the first-order graph alive for the next row;
        # no higher-order graph is needed, so create_graph stays off.
        second = torch.autograd.grad(g_vector[idx], params,
                                     retain_graph=True, allow_unused=True)
        # A parameter this gradient entry does not depend on contributes zeros.
        hessian[idx] = torch.cat([
            (torch.zeros_like(p) if g is None else g).contiguous().view(-1)
            for g, p in zip(second, params)])
    # Average with the transpose to remove numerical asymmetry.
    hessian = (hessian + hessian.T) / 2
    return grad_norm.detach().cpu().numpy(), hessian.detach().cpu().numpy()

In [None]:
# Column layout of experiments_data.csv: four summary columns, then for every
# student neuron its x/y weight trajectories and final outgoing weight, then
# the teacher's incoming weights.
file_experiment_header = ['loss', 'gradient norm', 'smallest eigenvalue', 'student size']

for i in range(0, H_student):
  file_experiment_header.extend(['neuron_' + str(i) + '_traj_x',
                                 'neuron_' + str(i) + '_traj_y',
                                 'neuron_' + str(i) + '_a'])

file_experiment_header.append('teacher_neurons_x')
file_experiment_header.append('teacher_neurons_y')

# Incoming weights of the teacher's hidden neurons; identical for every
# experiment, so they are defined once outside the loop.
teacher_neurons_x = [0.6, -0.5, -0.2, 0.1]
teacher_neurons_y = [0.5, 0.5, -0.6, -0.6]

# The context manager closes the file even if a run is interrupted;
# newline='' is what the csv module expects from files it writes to.
with open('experiments_data.csv', 'w', newline='') as file_experiment_data:
  writer = csv.writer(file_experiment_data)
  writer.writerow(file_experiment_header)

  for num_experiment in range(0, num_experiments):
    student_model = StudentNetwork(D_in, H_student, D_out)
    loss_vals, trace = train(student_model, data, y_labels)
    last_loss_val = loss_vals[-1]

    # create_graph=True so eval_hessian can differentiate the gradient again.
    loss_grad = torch.autograd.grad(nn.MSELoss()(student_model(data), y_labels), student_model.parameters(), create_graph=True)
    grad_norm, hessian = eval_hessian(loss_grad, student_model)
    # eval_hessian returns a symmetrised matrix, so the symmetric eigenvalue
    # solver applies and always yields real eigenvalues.
    smallest_eigenvalue = np.min(np.linalg.eigvalsh(hessian))

    row = [last_loss_val, grad_norm, smallest_eigenvalue, H_student]

    for i in range(0, H_student):
      # Trajectory of neuron i's incoming weights across all snapshots.
      neuron_w_x = [inp_weights[i][0] for (inp_weights, _) in trace]
      neuron_w_y = [inp_weights[i][1] for (inp_weights, _) in trace]
      row.append(neuron_w_x)
      row.append(neuron_w_y)
      # Only the final outgoing weight of neuron i is stored.
      row.append(trace[-1][1][0][i])

    row.append(teacher_neurons_x)
    row.append(teacher_neurons_y)

    writer.writerow(row)
    # Each experiment is long-running; flush so finished rows are on disk.
    file_experiment_data.flush()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Iteration: 5000, loss: 0.9713216862678528, gradient norm: tensor(1.9700)
Iteration: 6000, loss: 0.6166947178542614, gradient norm: tensor(1.5331)
Iteration: 7000, loss: 0.35503180128335954, gradient norm: tensor(1.1229)
Iteration: 8000, loss: 0.17708319865912198, gradient norm: tensor(0.7507)
Iteration: 9000, loss: 0.07077028541639448, gradient norm: tensor(0.4335)
Iteration: 10000, loss: 0.019779361163266004, gradient norm: tensor(0.1944)
Iteration: 11000, loss: 0.0030475669621955605, gradient norm: tensor(0.0554)
Iteration: 12000, loss: 0.00018881847929151264, gradient norm: tensor(0.0071)
Iteration: 13000, loss: 2.071104802598711e-05, gradient norm: tensor(0.0002)
Iteration: 14000, loss: 1.7513733353553106e-05, gradient norm: tensor(6.4096e-05)
Iteration: 15000, loss: 1.6439860997707002e-05, gradient norm: tensor(6.4526e-05)
Iteration: 16000, loss: 1.5796905536262784e-05, gradient norm: tensor(6.3936e-05)
Iteration: 17

KeyboardInterrupt: ignored

# Plotting

In [None]:
# teacher_neurons_x = [0.6, -0.5, -0.2, 0.1]
# teacher_neurons_y = [0.5, 0.5, -0.6, -0.6]

In [None]:

# neuron_0_w_x = []
# neuron_0_w_y = []
# neuron_0_a = []

# neuron_1_w_x = []
# neuron_1_w_y = []
# neuron_1_a = []

# neuron_2_w_x = []
# neuron_2_w_y = []
# neuron_2_a = []

# neuron_3_w_x = []
# neuron_3_w_y = []
# neuron_3_a = []

# neuron_4_w_x = []
# neuron_4_w_y = []
# neuron_4_a = []

# for (inp_weights, out_weights) in trace:
#   neuron_0_w_x.append(inp_weights[0][0])
#   neuron_0_w_y.append(inp_weights[0][1])
#   neuron_0_a.append(out_weights[0][0])

#   neuron_1_w_x.append(inp_weights[1][0])
#   neuron_1_w_y.append(inp_weights[1][1])
#   neuron_1_a.append(out_weights[0][1])

#   neuron_2_w_x.append(inp_weights[2][0])
#   neuron_2_w_y.append(inp_weights[2][1])
#   neuron_2_a.append(out_weights[0][2])

#   neuron_3_w_x.append(inp_weights[3][0])
#   neuron_3_w_y.append(inp_weights[3][1])
#   neuron_3_a.append(out_weights[0][3])

#   neuron_4_w_x.append(inp_weights[4][0])
#   neuron_4_w_y.append(inp_weights[4][1])
#   neuron_4_a.append(out_weights[0][4])

# plt.plot(neuron_0_w_x, neuron_0_w_y)
# plt.plot(neuron_1_w_x, neuron_1_w_y)
# plt.plot(neuron_2_w_x, neuron_2_w_y)
# plt.plot(neuron_3_w_x, neuron_3_w_y)
# plt.plot(neuron_4_w_x, neuron_4_w_y)

# plt.scatter(teacher_neurons_x, teacher_neurons_y, marker="*")

# outgoing_weights = [neuron_0_a[-1], neuron_1_a[-1], neuron_2_a[-1], neuron_3_a[-1], neuron_4_a[-1]]
# plt.scatter([neuron_0_w_x[-1], neuron_1_w_x[-1], neuron_2_w_x[-1], neuron_3_w_x[-1], neuron_4_w_x[-1]],
#             [neuron_0_w_y[-1], neuron_1_w_y[-1], neuron_2_w_y[-1], neuron_3_w_y[-1], neuron_4_w_y[-1]],
#             c = outgoing_weights,
#             cmap=matplotlib.cm.jet)
# plt.colorbar()

# # Teacher's neurons
# #[0.6, -0.5, -0.2, 0.1],
# #[0.5, 0.5, -0.6, -0.6],

# Checking for local minima

In [None]:
# class DummyNetwork(nn.Module):
#   def __init__(self, D_in, H, D_out, w_in, w_out):
#     """
#     In the constructor we instantiate two nn.Linear modules and assign them as
#     member variables.

#     D_in: input dimension
#     H: dimension of hidden layer
#     D_out: output dimension of the first layer
#     """
#     super(DummyNetwork, self).__init__()
#     self.linear1 = nn.Linear(D_in, H, bias=False) 
#     self.linear2 = nn.Linear(H, D_out, bias=False)
#     self.linear1.weight = torch.nn.Parameter(w_in)
#     self.linear2.weight = torch.nn.Parameter(w_out)
#   def forward(self, x):
#     """
#     In the forward function we accept a Variable of input data and we must
#     return a Variable of output data. We can use Modules defined in the
#     constructor as well as arbitrary operators on Variables.
#     """
#     h_sigmoid = torch.sigmoid(self.linear1(x))
#     y_pred = self.linear2(h_sigmoid)
#     return y_pred

In [None]:
# print (trace[-1])
# def loss_reducer(w_in, w_out):
#   dummy_model = DummyNetwork(D_in, H_teacher, D_out, w_in, w_out)
#   return nn.MSELoss()(dummy_model(data), y_labels)
  
# print(loss_reducer(torch.Tensor(trace[-1][0]), torch.Tensor(trace[-1][1])))
# H = torch.autograd.functional.hessian(loss_reducer, (torch.Tensor(trace[-1][0]), torch.Tensor(trace[-1][1])))
# eval Hessian matrix
# def eval_hessian(loss_grad, model):
#     cnt = 0
#     for g in loss_grad:
#         g_vector = g.contiguous().view(-1) if cnt == 0 else torch.cat([g_vector, g.contiguous().view(-1)])
#         cnt = 1
#     print(torch.norm(g_vector))
#     l = g_vector.size(0)
#     hessian = torch.zeros(l, l)
#     for idx in range(l):
#         grad2rd = torch.autograd.grad(g_vector[idx], model.parameters(), create_graph=True)
#         cnt = 0
#         for g in grad2rd: 
#             g2 = g.contiguous().view(-1) if cnt == 0 else torch.cat([g2, g.contiguous().view(-1)])
#             cnt = 1
#         hessian[idx] = g2
#     return hessian.cpu().data.numpy()