In [None]:
import numpy as np
import numpy.random as rand
import matplotlib.pyplot as plt

from numpy.linalg import svd

from sklearn.datasets import make_blobs, make_moons, make_circles
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Introduction to Neural Networks and Deep Learning


## The simplest *Neural Network*: the perceptron

We can think of the perceptron as the simplest neural network, composed of a single artificial neuron. An artificial neuron computes a weighted sum of the input $\mathbf{x}=(x_1, \ldots , x_M)$ using a vector of connection weights $\mathbf{w}=(w_1, \ldots, w_M)$, adds a neuron bias $b$, and passes the result $f = \mathbf{w}^\top \mathbf{x} + b$ to an **activation function** $\phi$ to produce the output $\hat{y}=\phi(f)$
<center>
<img src="img/perceptron.png" alt="Perceptron" width="350"/>
</center>
$$ \hat{y} = \phi(\mathbf{w}^\top \mathbf{x} + b )$$

Learning with the perceptron consists in updating the weights:

$$\mathbf{w}^{(t+1)}= \mathbf{w}^{(t)} + \eta (y_i -\hat{y}_i)\mathbf{x}_i $$ and
$$b^{(t+1)}= b^{(t)} + \eta (y_i -\hat{y}_i)$$

with $i \in \{1, \ldots, N\}$, where $\eta$ is the learning rate, $y_i$ is the correct output for the input $\mathbf{x}_i$, $\hat{y}_i$ is the output predicted by the perceptron for $\mathbf{x}_i$, and $N$ is the size of the training set.

In [None]:
# Read the first dataset from disk (comma-separated values)
fname = 'data1.txt'
data = np.loadtxt('data/%s' % fname, delimiter=',')

# The first two columns hold the point coordinates, the third one the label
X, y = data[:, :2], data[:, 2]

# Draw each class with its own marker: red '+' for label 1, blue 'x' for label 0
positives = y == 1
negatives = y == 0
plt.plot(X[positives, 0], X[positives, 1], 'r+')
plt.plot(X[negatives, 0], X[negatives, 1], 'bx')
plt.title(fname)
plt.show()

> **Task 1**: Implement the perceptron algorithm, using the step activation function:
$$
\phi(x) =
\begin{cases}
0, & x < 0 \\
1, & x \geq 0
\end{cases}
$$

In [None]:
def activation(x):
    """Step activation: return 1 when ``x`` is greater than or equal to 0, else 0."""
    return 1 if x >= 0 else 0

def perceptron(W,b,x,y,lr):
    """
    Apply one perceptron learning step to a single training example (Task 1).

    This is an exercise stub: the body has to be completed by the reader. As
    written, `err` is never assigned, so calling the function raises a
    NameError until the solution is filled in.

    Args:
      W (np.array): (INPUT_SHAPE, 1) vector of weights
      b : bias
      x (np.array): (INPUT_SHAPE, 1) input vector (point coordinates)
      y : output label (1 or 0)
      lr : learning rate
    Returns:
        (err, W, b) : err is 1 if the predicted label is different from y, 0 otherwise
        W, b : the updated weights and bias
    """
    # insert your solution here: it must assign `err` and the updated `W` and `b`
    
    
    return (err, W, b)

The following cell will run the perceptron on the dataset above and display the *decision boundary*:

$\mathbf{w}^\top \mathbf{x} + b = 0$

Which for our 2D points corresponds to: $w_0 x_1 + w_1 x_2 + b = 0$

Therefore, solving for $x_2$: $x_2 = -\frac{w_0}{w_1} x_1 - \frac{b}{w_1}$

In [None]:
# Initialise the bias to zero and draw one random starting weight per input feature
b = 0.0
W = rand.normal(size=(X.shape[1], 1))

# Main algorithm: sweep over the training set until a whole epoch produces no
# misclassification. The epoch cap prevents an endless loop when the classes
# are not linearly separable (the error count then never reaches zero).
max_epochs = 10_000
learning_rate = 0.1

errors = 1
epoch = 0
while errors > 0 and epoch < max_epochs:
    # start epoch
    errors = 0
    for i in range(X.shape[0]):
        xi = X[i, :].reshape(-1, 1)  # column vector, same layout as W
        yi = y[i]
        (err, W, b) = perceptron(W, b, xi, yi, learning_rate)
        errors += err
    epoch += 1

if errors > 0:
    print(f"Warning: stopped after {max_epochs} epochs with {errors} misclassified points; "
          "the data may not be linearly separable.")

# Plot the data together with the learned decision boundary
fig, ax = plt.subplots()
ax.plot(X[:, 0][y == 1], X[:, 1][y == 1], 'r+', label="$y=1$")
ax.plot(X[:, 0][y == 0], X[:, 1][y == 0], 'bx', label="$y=0$")

# Decision boundary: W0 * x_1 + W1 * x_2 + b = 0, solved for x_2.
# W is a column vector, so flatten it to work with plain scalars.
w0, w1 = W.ravel()
x_vals = np.linspace(X[:, 0].min(), X[:, 0].max(), 200)
y_vals = -(w0 * x_vals + b) / w1

ax.plot(x_vals, y_vals, 'g-', label="Decision boundary")

ax.legend()
plt.show()


> **Question 1**: Consider the dataset data2.txt shown below. What is going to be the problem with this dataset? Can you propose a solution?

In [None]:
# Read the second dataset from disk (comma-separated values)
fname = 'data2.txt'
data = np.loadtxt('data/%s' % fname, delimiter=',')

# Columns 0-1 are the point coordinates, column 2 is the label
X = data[:, :2]
y = data[:, 2]

# One marker style per label value: 1 -> red '+', 0 -> blue 'x'
for label_value, marker in ((1, 'r+'), (0, 'bx')):
    members = X[y == label_value]
    plt.plot(members[:, 0], members[:, 1], marker)
plt.title(fname)
plt.show()

In [None]:
import torch
import torch.nn as nn

# Introduction to Deep Learning Using PyTorch


In this part of the lab we will learn to use <a href="https://pytorch.org/">PyTorch</a> to build more complex neural networks. PyTorch is a high-level API for deep learning. It allows users to implement deep learning models very fast and with minimum effort.

The fundamental data structures in PyTorch are <a href="https://pytorch.org/docs/stable/tensors.html#torch.Tensor">Tensors</a>. If you are familiar with NumPy arrays, you will find it easy to adapt to working with tensors. You may also take a look at the [tensor_tutorial](extras/tensor_tutorial.ipynb) for some examples.

In this part of the lab, we will implement a simple feedforward neural network to perform classification on a synthetic dataset, of two classes. Your first objective is to create this dataset. It will consist of 200 points in the 2-dimensional space $(N = 200, d = 2)$. Each point will belong either to class 0 or to class 1 (100 points per class), drawn from a Gaussian distribution: 

$$ \mathbf{x}_i \sim \mathcal{N}(\boldsymbol{\mu}_k, \boldsymbol{\Sigma}_k) $$

for class $k$. For class 0, we have $\boldsymbol{\mu}_0 = [1,1]$ and the covariance matrix is:

$$ \boldsymbol{\Sigma}_0 = \begin{bmatrix} 0.5 & 0 \\ 0 & 0.5 \end{bmatrix} $$

For class 1, $\boldsymbol{\mu}_1 = [-1,-1]$ and $\boldsymbol{\Sigma}_1 = \boldsymbol{\Sigma}_0$. To generate these values make use of the [`randn`](http://docs.scipy.org/doc/numpy/reference/generated/numpy.random.randn.html) function of NumPy that returns a sample from the "standard normal" distribution as follows: 

```python
sd * np.random.randn(...) + mu
```

In [None]:
import numpy as np

# Dataset dimensions: N points in d dimensions, split evenly between the classes
N = 200
d = 2
num_classes = 2

N_per_class = N // 2

# Mean vectors of the two classes
mu0 = np.array([1, 1])
mu1 = np.array([-1, -1])

# Standard deviation per coordinate (the covariance is diagonal with 0.5 on the diagonal)
sd = np.sqrt(0.5)

# Fix the seed so the generated dataset is the same on every run
np.random.seed(101)

# Class 0 samples: scaled and shifted standard-normal draws, labelled 0
X0 = sd * np.random.randn(N_per_class, d) + mu0
y0 = np.zeros(N_per_class, dtype=np.int64)

# Class 1 samples, labelled 1
X1 = sd * np.random.randn(N_per_class, d) + mu1
y1 = np.ones(N_per_class, dtype=np.int64)

# Stack both classes into a single (N, d) feature matrix and an (N,) integer label vector
X = np.vstack((X0, X1))
y = np.hstack((y0, y1))

# Shuffle points and labels with the same permutation so they stay aligned
idx = np.random.permutation(N)
X = X[idx]
y = y[idx]


After generating the 200 points, we plot them in a 2-dimensional plane using [`scatter`](http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.scatter):

In [None]:
# Scatter plot of the two classes on a square figure
fig, ax = plt.subplots(figsize=(6, 6))
for class_id, colour, legend_text in ((0, 'blue', 'Class 0'), (1, 'red', 'Class 1')):
    in_class = y == class_id
    ax.scatter(X[in_class, 0], X[in_class, 1], color=colour, label=legend_text)
ax.legend()
ax.set_title("Synthetic Gaussian Dataset (2 classes)")
ax.set_xlabel("x1")
ax.set_ylabel("x2")
ax.grid(True)
plt.show()

> **Task 2**. Split the dataset into a training and a test set using the [`train_test_split`](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) function of scikit-learn. Set the proportion of the dataset to be included in the test set to 0.2.

In [None]:
from sklearn.model_selection import train_test_split

#your code here (set random state to 42 to have same split as instructors)


Now you will use PyTorch to implement a simple feedforward neural network. We first select the device to compute on (the GPU if you have CUDA available, otherwise the CPU) and move the data to that device.

In [None]:
# Use the first CUDA GPU when one is available (this may speed up computation), else the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Convert the splits to float32 tensors on the chosen device.
# torch.as_tensor accepts both NumPy arrays and tensors, so this cell can be
# re-run safely even after the names already hold tensors (torch.from_numpy
# would raise a TypeError on the second run).
X_train = torch.as_tensor(X_train, dtype=torch.float32, device=device)
X_test = torch.as_tensor(X_test, dtype=torch.float32, device=device)

# Labels become column vectors of shape (n_samples, 1); reshape(-1, 1) is a
# no-op when they already have that shape.
y_train = torch.as_tensor(y_train, dtype=torch.float32, device=device).reshape(-1, 1)
y_test = torch.as_tensor(y_test, dtype=torch.float32, device=device).reshape(-1, 1)

In PyTorch, of particular importance is the notion of a model. The model is the data structure upon which the neural network is built. The most common type of model is the Sequential model, which corresponds to a linear stack of layers. 

PyTorch provides various implementations for a great number of widely used layers. For instance, to apply a linear transformation:

`nn.Linear(input_size, output_size, bias=True)`

The layers are organized into <a href="https://pytorch.org/docs/stable/nn.html#containers">Containers</a>. A standard and flexible container is the <a href="https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module">Module</a> container. However, if we need a simple sequential network and don't need to reference the different layers, we can use the simplified <a href="https://pytorch.org/docs/stable/generated/torch.nn.Sequential.html#torch.nn.Sequential">Sequential</a> container.

> **Task 3.** Implement a sequential model (named `model`) with an input size of 2, a hidden layer of 64 units with [ReLU](https://en.wikipedia.org/wiki/Rectifier_(neural_networks)) activation. The output layer will contain 1 neuron. This neuron should be activated with a sigmoid function, that ensures that the output corresponds to the probability that an instance belongs to class 1 of our problem. 

In [None]:
#INSERT YOUR CODE HERE


You can visualize the parameters of the model here:

In [None]:
# List every entry of the model's state dict together with the size of its tensor
for name, tensor in model.state_dict().items():
    print(name, "\t", tensor.size())

# Snapshot the current (not yet trained) parameters so later cells can restore them
torch.save(model.state_dict(), "./model.st")

Now we choose the *loss* function and the *optimizer* and we start training our model.
Take a look at the various <a href="https://pytorch.org/docs/stable/optim.html">optimizers</a> and <a href="https://pytorch.org/docs/stable/nn.html#loss-functions">loss</a> functions available in PyTorch. In our case, we are going to use the Binary Cross Entropy: 

$$\mathcal{L}_{\text{BCE}} 
= - \frac{1}{N} \sum_{i=1}^N \Big( 
y_i \, \log p_\theta(\mathbf{x}_i) + 
(1 - y_i) \, \log \big( 1 - p_\theta(\mathbf{x}_i) \big) 
\Big)$$

where $p_\theta(\mathbf{x}_i)$ is the output value of our network after the sigmoid, indicating the predicted probability of $\mathbf{x}_i$ belonging to class $1$.

In [None]:
# Move the model's parameters to the selected device (faster if CUDA is available)
model.to(device)

# Binary cross-entropy as the training criterion
loss_function = nn.BCELoss()

# SGD with momentum; when every step uses the full training set,
# SGD is actually just (full-batch) gradient descent
optimizer = torch.optim.SGD(params=model.parameters(), momentum=0.9, lr=0.01)

With the loss function and the optimizer defined, we can train the model for 5 *epochs* (an epoch is one full pass of the model over the whole training set):

In [None]:
# Training loop: one gradient step per epoch, computed on the whole training set
epochs = 5
for epoch in range(epochs):
    # Forward pass. Call the module itself rather than .forward() so that any
    # registered hooks run, as recommended by the PyTorch documentation.
    outputs = model(X_train)
    loss = loss_function(outputs, y_train)

    # Backward pass and optimization: clear the old gradients, back-propagate
    # the loss, then update the parameters
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")

In this case we updated the weights at every pass by looking at the gradient estimated on the full training set. This is commonly known as **gradient descent**:

$$\theta \leftarrow \theta - \eta \, \frac{1}{N} \sum_{i=1}^{N} \nabla_\theta L(\mathbf{x_i}, y_i)$$

With large datasets, it is preferable to update the weights using smaller random subsets of the training set (the training **batches**). In this case we talk about **stochastic gradient descent**.

$$\theta \leftarrow \theta - \eta \, \frac{1}{B} \sum_{i\in \mathcal{B}_t} \nabla_\theta L(\mathbf{x_i}, y_i)$$

where $i \in \mathcal{B}_t \subset \{1,2,\dots,N\}, \quad |\mathcal{B}_t| = B$

the mini-batches $\{ \mathbf{x_i} | i \in \mathcal{B}_t \}$ are drawn without replacement within an epoch (the dataset is shuffled once per epoch). This way, each sample is seen exactly once per epoch.

In PyTorch, you can create training batches as follows:

In [None]:
from torch.utils.data import DataLoader, TensorDataset

# Pair every training point with its label so they are batched together
dataset = TensorDataset(X_train, y_train)

# Serve the pairs in mini-batches of 16, reshuffled each time the loader is iterated
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=16)

> **Task 4.** Re-run the training loop using SGD with batches of size 16

In [None]:
# resetting the model to pre-training state:
model.load_state_dict(torch.load("./model.st", weights_only=True))
# The optimizer keeps per-parameter momentum buffers from the previous training
# run, and restoring the weights does not clear them. Re-create it (same
# hyper-parameters as in the optimizer cell above) so training really starts fresh.
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
# YOUR CODE HERE


> **Question 2**: Can you suggest a reason for which the loss after every epoch is lower with SGD?

Once trained, we can use your model to generate predictions on new data. Predictions are real values between 0 and 1. Set predictions larger than 0.5 to 1 and predictions smaller than 0.5 to 0.

> **Task 5**. Evaluate the result of the model on X_test using `from sklearn.metrics import accuracy_score`. NB: take into account that the output of the model will be a Tensor and you will need to convert to numpy again to use `accuracy_score`. 

In [None]:
from sklearn.metrics import accuracy_score

#your code here

Now, we are going to evaluate the accuracy on the training and test sets during the training process and plot it.

> **Task 6**: Complete the following code to store the loss, the training accuracy and test accuracy, per batch, in the `losses` , `tr_acc` and `test_acc` arrays respectively.

In [None]:
# resetting the model to pre-training state:
model.load_state_dict(torch.load("./model.st", weights_only=True))
# The optimizer keeps per-parameter momentum buffers from earlier training
# runs, and restoring the weights does not clear them. Re-create it (same
# hyper-parameters as in the optimizer cell above) so training really starts fresh.
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

#init performance measures (one entry is meant to be appended per batch)
losses = []
tr_acc = []
test_acc = []

# Training loop: one parameter update per mini-batch
epochs = 5
for epoch in range(epochs):
    for batch_X, batch_y in dataloader:
        # Forward pass
        outputs = model(batch_X)
        loss = loss_function(outputs, batch_y)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # YOUR CODE HERE
        
    
    # `loss` here is the loss of the last mini-batch processed in this epoch
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")

# Two panels side by side: the recorded losses on the left, the accuracies on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16,8))

ax1.plot(losses)
ax1.set_title("Training Loss")
ax1.set_xlabel("Iterations")

# Test curve in red, train curve in blue
for series, colour, name in ((test_acc, 'r', 'test'), (tr_acc, 'b', 'train')):
    ax2.plot(series, c=colour, label=name)
ax2.set_title("Train and test accuracy")
ax2.set_xlabel("Iterations")
ax2.legend()
plt.show()

Using the `pcolormesh` function, we can plot the decision surface of the network with regard to the input space. This is demonstrated as follows. Use your code from above (using `scatter`) to plot the data points over this surface.

In [None]:
# Create a 100x100 meshgrid covering the data range, with a margin of 1 on every side
xx1, xx2 = np.meshgrid(
    np.linspace(X[:,0].min()-1, X[:,0].max()+1, num=100),
    np.linspace(X[:,1].min()-1, X[:,1].max()+1, num=100)
)

# Convert the grid to a tensor on the same device as the model's parameters;
# a CPU tensor would raise a device-mismatch error when the model is on the GPU
grid_points = np.c_[xx1.ravel(), xx2.ravel()]
model_device = next(model.parameters()).device
grid_tensor = torch.tensor(grid_points, dtype=torch.float32, device=model_device)

# Forward pass through the model
with torch.no_grad():  # no gradients needed for plotting
    p_y = model(grid_tensor)

# Convert back to NumPy and restore the grid layout for plotting
p_y = p_y.cpu().numpy().reshape(xx1.shape)


def _as_numpy(values):
    """Return `values` (tensor or array) as a NumPy array on the CPU for matplotlib."""
    return torch.as_tensor(values).detach().cpu().numpy()


# A single figure with two panels (no extra empty figure is created)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20,8))

panels = (
    (ax1, "Surface $P(Y | \\mathbf{X})$ (Train Data)", X_train, y_train),
    (ax2, "Surface $P(Y | \\mathbf{X})$ (Test Data)", X_test, y_test),
)
for ax, title, points, labels in panels:
    # --- Titles & labels for subplots ---
    ax.set_title(title)
    ax.set_xlabel("$x_1$")
    ax.set_ylabel("$x_2$")

    # --- Plot decision surface (drawn once per panel) ---
    mesh = ax.pcolormesh(xx1, xx2, p_y, cmap='RdBu', shading="auto")
    if ax is ax1:
        pcm = mesh  # both panels show the same surface; keep one handle for the colorbar

    # --- Plot data ---
    points_np = _as_numpy(points)
    ax.scatter(points_np[:,0], points_np[:,1], c=_as_numpy(labels).ravel(), edgecolor="k")

# Shared colorbar
fig.colorbar(pcm, ax=[ax1, ax2], shrink=0.8, label="$P(Y | \\mathbf{X})$")

#plt.tight_layout()  # avoid overlap
plt.show()