In [None]:
import copy

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt

# Seed PyTorch's global random number generator so that results are reproducible.
torch.manual_seed(1234)

# Sanity check of the seed; the expected output is:
#    tensor([0.0290, 0.4019, 0.2598, 0.3666])
sample = torch.rand(4)
print(sample)

## Part 1: Feedforward neural networks

## a simple neural network 

PyTorch provides various types of layers to build NNs:
    https://docs.pytorch.org/docs/stable/nn.html
    
 In this example, the layers are:
 1. input layer of width 1
 2. first hidden layer of width 3
 3. second hidden layer of width 4
 4. output layer of width 1
 
It defines a function $f$ from $\mathbb{R}$ to $\mathbb{R}$:
 $$f(x) = f_3(\psi(f_2(\psi(f_1(x))))),$$
 where $\psi$ is a (nonlinear) activation function, and
 1. $f_1: \mathbb{R}\rightarrow \mathbb{R}^3$
 2. $f_2: \mathbb{R}^3\rightarrow \mathbb{R}^4$
 3. $f_3: \mathbb{R}^4\rightarrow \mathbb{R}$
 
 are linear transformations.
 
 We have:
 $$
 f_i(x) = W_ix + b_i
 $$
 where $W_i$, $b_i$ are coefficients (parameters).
 
 **nn.Linear**: https://docs.pytorch.org/docs/stable/generated/torch.nn.Linear.html

In [None]:
class MyNet(nn.Module):
    """Feedforward network from R to R with hidden layers of width 3 and 4.

    Computes f(x) = f_3(tanh(f_2(tanh(f_1(x))))), where f_1, f_2, f_3 are
    the linear layers R -> R^3, R^3 -> R^4 and R^4 -> R.
    """

    def __init__(self):
        super().__init__()

        # The output width of each linear layer must equal the input width of
        # the next one. Append further (Linear, activation) pairs to this list
        # to make the network deeper.
        layers = [
            nn.Linear(1, 3),   # f_1: linear transformation
            nn.Tanh(),         # nonlinear activation
            nn.Linear(3, 4),   # f_2
            nn.Tanh(),
            nn.Linear(4, 1),   # f_3
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the model output for an input x whose last dimension is 1."""
        return self.net(x)

# Instantiate the network and display its layer structure.
model = MyNet()
print(model)

# List every trainable parameter of the model: the weights $W_i$ and
# biases $b_i$ of the linear functions $f_1, f_2, f_3$.
for param in model.parameters():
    print(param)

### we can evaluate the model on data

We can evaluate the model on multiple data points at once.

**Note**: The last dimension of the input tensor x (dim=1) needs to match 
the input dimension of the network, while the first dimension of $x$ (usually the batch-size)
can be any number.

In [None]:
# Build the inputs as a column of N = 101 points in [0, 1],
# i.e. a tensor of shape [N, dim] with dim = 1.
x = torch.linspace(0, 1, 101).unsqueeze(1)

print('shape of x:', x.shape)

# Evaluate the network on all N points in a single call.
y = model(x)

# matplotlib works with numpy arrays, so the tensors are detached from the
# autograd graph and converted before plotting.
plt.plot(x.detach().numpy(), y.detach().numpy(), color='r')

### Let's make the network more general:

The user can specify the width of the hidden layers and the activation function, such as:

1. identity (by default): no nonlinear activation
2. nn.Tanh() 
3. nn.ReLU() 

Many more are possible, see: https://docs.pytorch.org/docs/stable/nn.html#non-linear-activations-weighted-sum-nonlinearity

In [None]:
class MyNet1(nn.Module):
    """Feedforward network from R to R with two hidden layers of equal width.

    Architecture:
        Linear(1, w) -> act -> Linear(w, w) -> act -> Linear(w, 1)

    Args:
        layer_width: width w of both hidden layers.
        activation: an ``nn.Module`` instance applied after each hidden layer,
            e.g. ``nn.Tanh()`` or ``nn.ReLU()``. If ``None`` (default),
            ``nn.Identity()`` is used, i.e. there is no nonlinearity and the
            whole network is a linear model.
    """

    def __init__(self, layer_width, activation=None):

        super().__init__()

        def make_activation():
            # Every hidden layer gets its OWN activation module. Inserting the
            # same instance twice is harmless for stateless activations
            # (Tanh, ReLU), but a parameterised one (e.g. nn.PReLU()) would
            # silently share its weights between the two layers.
            if activation is None:
                return nn.Identity()
            return copy.deepcopy(activation)

        self.net = nn.Sequential(
            nn.Linear(1, layer_width),  # user-specified width
            make_activation(),          # activation or identity
            nn.Linear(layer_width, layer_width),
            make_activation(),
            nn.Linear(layer_width, 1),
        )

    def forward(self, x):
        """Return the model output for an input x whose last dimension is 1."""
        return self.net(x)

### examples 

We can define different networks by providing different parameters.

Please notice the differences in the outputs.

In [None]:
# Hidden width 3 and the default identity activation (a purely linear model).
model = MyNet1(layer_width=3)
print(model)

# Hidden width 4 with tanh applied after each hidden layer.
model = MyNet1(layer_width=4, activation=nn.Tanh())
print(model)

## Part 2: Training

#### We want to train neural networks to learn two functions on $x\in [0,1]$:

1. linear: $f_1(x)=3x+1$
2. nonlinear: $f_2(x)=\sin(\pi x)$


In [None]:
# the first function is linear
def linear_f(x):
    """Linear target function f(x) = 3x + 1."""
    slope, intercept = 3, 1
    return slope * x + intercept

# the second function is nonlinear
def nonlinear_f(x):
    """Nonlinear target function f(x) = sin(pi * x), evaluated with torch.sin."""
    angle = torch.pi * x
    return torch.sin(angle)

# Sample both target functions on a uniform grid of 101 points in [0, 1].
x = torch.linspace(0, 1, 101).unsqueeze(1)
linear_y = linear_f(x)
sin_y = nonlinear_f(x)

# Draw both curves on the same axes (tensors are converted to numpy first).
for values, color, label in ((linear_y, 'r', '3x+1'), (sin_y, 'b', 'sin(pi x)')):
    plt.plot(x.detach().numpy(), values.detach().numpy(), c=color, label=label)
plt.legend()

### we define the training procedure as a function so that we can reuse it.


**mini-batch**: a (randomly drawn) subset of data used to evaluate the loss in each training step.

**epoch**: consists of multiple training steps within which all data points have been drawn once. 

We use the **DataLoader** provided by PyTorch to sample mini-batches from the dataset. 

We train the neural network for multiple epochs.  

The user provides the following parameters:

1. model: network to be trained.
2. fun: the function to be learned (regression).
3. batch_size: batch-size in training. 
4. total_epochs: number of training epochs.

For DataLoader, see:
1. https://docs.pytorch.org/docs/stable/data.html
2. https://docs.pytorch.org/tutorials/beginner/basics/data_tutorial.html

For optimizer:

see 

1. torch.optim: https://docs.pytorch.org/docs/stable/optim.html
2. SGD: https://docs.pytorch.org/docs/stable/generated/torch.optim.SGD.html
3. Adam: https://docs.pytorch.org/docs/stable/generated/torch.optim.Adam.html


In [None]:
def training(model, fun,  batch_size=10, total_epochs=10):
    """Train ``model`` to approximate ``fun`` on [0, 1] and plot the result.

    The training inputs are 1001 equally spaced points in [0, 1]; the targets
    are computed on the fly as ``fun(x)``. After training, a figure with two
    panels is drawn: the true function together with the trained and untrained
    model on a grid of 50 points (left), and the loss per epoch (right).

    Args:
        model: network to be trained; its parameters are updated in place.
            It is called on tensors of shape [batch_size, 1].
        fun: the function to be learned (regression target); it is called on
            the same tensors as ``model``.
        batch_size: number of points per mini-batch. Incomplete final batches
            are dropped (``drop_last=True``).
        total_epochs: number of passes over the training set.

    Raises:
        ValueError: if ``total_epochs > 0`` and ``batch_size`` exceeds the
            number of training points, so that no complete mini-batch exists.

    Returns:
        None. The result is shown as a matplotlib figure.
    """

    # Squared-error loss. Note reduction='sum': this is the SUM of squared
    # errors over the mini-batch, not the mean.
    criterion = torch.nn.MSELoss(reduction='sum')

    # Alternative optimizer:
    #optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

    # tell PyTorch that we want to optimize parameters in the model.
    # we use Adam
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    ## Input data set
    x = torch.linspace(0, 1, 1001).reshape(-1,1)

    ## grid used for plotting
    small_grid = torch.linspace(0, 1, 50).reshape(-1,1)

    # model before training (evaluation only, so no autograd graph is needed)
    with torch.no_grad():
        y_pred_untrained = model(small_grid)

    # specify the dataset and the batch-size
    data_loader = DataLoader(x, batch_size=batch_size, shuffle=True, drop_last=True)

    # With drop_last=True a batch size larger than the dataset yields zero
    # mini-batches; fail early with a clear message instead of hitting an
    # unbound ``loss`` variable after an empty inner loop.
    num_batches = len(data_loader)
    if total_epochs > 0 and num_batches == 0:
        raise ValueError(
            f"batch_size={batch_size} exceeds the number of training points "
            f"({len(x)}); no complete mini-batch can be drawn."
        )

    # list to record the loss (one value per epoch)
    loss_list = []

    for _ in range(total_epochs):   # for each epoch

        epoch_loss = 0.0

        for data in data_loader:  # loop over all mini-batches

            # Forward pass: Compute predicted y by passing data to the model
            y_pred = model(data)
            # evaluate true function on mini-batch data
            y = fun(data)

            # Compute loss
            # Alternatively, we can write the loss function ourselves:
            #loss = ((y-y_pred)**2).sum()
            loss = criterion(y_pred, y)

            # zero gradients
            optimizer.zero_grad()
            # compute gradients (backpropagation)
            loss.backward()
            # update weights
            optimizer.step()

            epoch_loss += loss.item()

        # Record the mean mini-batch loss of this epoch. Averaging over all
        # mini-batches is less noisy than keeping only the last batch's loss.
        loss_list.append(epoch_loss / num_batches)

    # evaluate the true function and the trained model on the plotting grid
    with torch.no_grad():
        y = fun(small_grid)
        y_pred = model(small_grid)

    fig, ax = plt.subplots(1,2, figsize=(10, 4))

    ax[0].plot(small_grid.detach().numpy(), y.detach().numpy(), '.', c='r', label='true')
    ax[0].plot(small_grid.detach().numpy(), y_pred.detach().numpy(), '.', c='b', label='learned')
    ax[0].plot(small_grid.detach().numpy(), y_pred_untrained.detach().numpy(), '.', c='gray', label='untrained')
    ax[0].legend()
    ax[0].set_xlabel('x')

    ax[1].plot(loss_list)
    ax[1].set_xlabel('epoch')
    ax[1].set_ylabel('mean mini-batch loss')
    ax[1].set_title('loss vs epoch')


### Test 1: learn a linear function using a linear model

In [None]:
# Identity activation: the network is a composition of linear layers,
# so the model itself is linear, like the target 3x+1.
model = MyNet1(layer_width=3)
print(model)
training(model, fun=linear_f, batch_size=100, total_epochs=200)

### Test 2: learn a nonlinear function using a linear model

In [None]:
# Still no activation: the model stays linear whatever its width,
# while the target sin(pi x) is nonlinear.
model = MyNet1(layer_width=10)
print(model)
training(model, fun=nonlinear_f, batch_size=100, total_epochs=200)

### Test 3: Learn a nonlinear function using a nonlinear model 

(See how training depends on model size and number of epochs.)

#### Test 3.1: width=2, epoch=200

In [None]:
# Small nonlinear model: hidden width 2, tanh activation, 200 epochs.
model = MyNet1(layer_width=2, activation=nn.Tanh())
print(model)
training(model, fun=nonlinear_f, batch_size=100, total_epochs=200)

#### Test 3.2: width=2, epoch=600 

(more training epochs)

In [None]:
# Same small model (hidden width 2, tanh), trained three times longer: 600 epochs.
model = MyNet1(layer_width=2, activation=nn.Tanh())
print(model)
training(model, fun=nonlinear_f, batch_size=100, total_epochs=600)

#### Test 3.3: width=10 

In [None]:
# Wider nonlinear model: hidden width 10, tanh activation, 600 epochs.
model = MyNet1(layer_width=10, activation=nn.Tanh())
print(model)
training(model, fun=nonlinear_f, batch_size=100, total_epochs=600)