## 0. Setup

In [None]:
# Import dependencies
import torch
import torch.nn as nn
from plot_lib import set_default, show_scatterplot, plot_bases
from matplotlib.pyplot import plot, title, axis

In [None]:
# Pick the compute device: the first CUDA GPU when one is visible, otherwise the CPU
cuda = torch.cuda.is_available()
device = torch.device("cuda:0") if cuda else torch.device("cpu")

In [None]:
# Set up random seed to 1008. Do not change the random seed.
# Seeding happens before any tensor is sampled or any model is created, so the
# random draws that follow repeat from run to run.
seed = 1008
torch.manual_seed(seed)
if cuda:
    # A GPU is in use: seed the CUDA generators as well
    torch.cuda.manual_seed_all(seed)

## 1. Data generation
#### You'll be creating data points that are generated from a particular function.

### 1.1 Quadratic: $y = f(x) = x^2$

In [None]:
def quadratic_data_generator(n_samples):
    """
    Generate a noise-free data set for the quadratic target y = x^2.

    Args:
        n_samples: number of points to draw.

    Returns:
        x: tensor of size (n_samples, 1) with values uniformly
           distributed in the interval (-1, 1]
        y: tensor of size (n_samples, 1) equal to x^2
        Both tensors are moved to the module-level `device`.
    """
    # torch.rand() samples u from [0, 1); 1 - 2u maps that onto (-1, 1]
    x = 1 - 2 * torch.rand(n_samples, 1)
    y = torch.pow(x, 2)
    return x.to(device), y.to(device)

In [None]:
# Generate the data with n_samples = 128
# x_quadr, y_quadr = TODO

In [None]:
# Visualize the data
# TODO

### 1.2 Cubic: $y = f(x) = x^3 - 0.5x$

In [None]:
def cubic_data_generator(n_samples):
    """
    Generate a noise-free data set for the cubic target y = x^3 - 0.5x.

    Args:
        n_samples: number of points to draw.

    Returns:
        x: tensor of size (n_samples, 1) with values uniformly
           distributed in the interval (-1, 1]
        y: tensor of size (n_samples, 1) equal to (x^3 - 0.5x)
        Both tensors are moved to the module-level `device`.
    """
    # torch.rand() samples u from [0, 1); 1 - 2u maps that onto (-1, 1]
    x = 1 - 2 * torch.rand(n_samples, 1)
    y = torch.pow(x, 3) - torch.mul(x, 0.5)
    return x.to(device), y.to(device)

In [None]:
# Generate the data with n_samples = 128
# x_cubic, y_cubic = TODO

In [None]:
# Visualize the data 
# TODO

### 1.3 Sine: $y = f(x) = \sin(2.5x)$

In [None]:
def sine_data_generator(n_samples):
    """
    Generate a noise-free data set for the sine target y = sin(2.5 * x).

    Args:
        n_samples: number of points to draw.

    Returns:
        x: tensor of size (n_samples, 1) with values uniformly
           distributed in the interval (-1, 1]
        y: tensor of size (n_samples, 1) equal to sin(2.5 * x)
        Both tensors are moved to the module-level `device`.
    """
    # torch.rand() samples u from [0, 1); 1 - 2u maps that onto (-1, 1]
    x = 1 - 2 * torch.rand(n_samples, 1)
    y = torch.sin(2.5 * x)
    return x.to(device), y.to(device)

In [None]:
# Generate the data with n_samples = 128
# x_sine, y_sine = TODO

In [None]:
# Visualize the data 
# TODO

### 1.4 Absolute value: $y = f(x) = |x|$

In [None]:
def abs_data_generator(n_samples):
    """
    Generate a noise-free data set for the absolute-value target y = |x|.

    Args:
        n_samples: number of points to draw.

    Returns:
        x: tensor of size (n_samples, 1) with values uniformly
           distributed in the interval (-1, 1]
        y: tensor of size (n_samples, 1) equal to |x|
        Both tensors are moved to the module-level `device`.
    """
    # torch.rand() samples u from [0, 1); 1 - 2u maps that onto (-1, 1]
    x = 1 - 2 * torch.rand(n_samples, 1)
    y = torch.abs(x)
    return x.to(device), y.to(device)

In [None]:
# Generate the data with n_samples = 128
# x_abs, y_abs = TODO

In [None]:
# Visualize the data 
# TODO

### 1.5 Heaviside Step Function: $y = f(x) = \begin{cases} 0, & x < 0 \\ 1, & x \geq 0 \end{cases}$

In [None]:
def hs_data_generator(n_samples):
    """
    Generate a noise-free data set for the Heaviside step function.

    Args:
        n_samples: number of points to draw.

    Returns:
        x: tensor of size (n_samples, 1) with values uniformly
           distributed in the interval (-1, 1]
        y: float tensor of size (n_samples, 1), 0 where x < 0 and
           1 where x >= 0
        Both tensors are moved to the module-level `device`.
    """
    # torch.rand() samples u from [0, 1); 1 - 2u maps that onto (-1, 1]
    x = 1 - 2 * torch.rand(n_samples, 1)
    # The comparison yields a bool tensor; .float() turns it into 0.0 / 1.0
    y = (x >= 0).float()
    return x.to(device), y.to(device)

In [None]:
# Generate the data with n_samples = 128
# x_hs, y_hs = TODO

In [None]:
# Visualize the data 
# TODO

## 2. Models
#### You are going to approximate the functions above with fully connected models of different depths.  

### 2.1. Dimensionality
The models you define below will be predicting $y$ from $x$ and will use the data generated in Part 1 as training data. Fill in the input and output dimensions for each of the models.

Hint: These dimensions are independent from the number of samples. 

In [None]:
# input_dim = TODO
# output_dim = TODO

### 2.2. No Hidden 
Define a model with a single linear module `torch.nn.Linear(input_dim, output_dim)` and no non-linearity.

In [None]:
class Linear_0H(nn.Module):
    """
    Model with no hidden layer: a single nn.Linear and no non-linearity.

    Args:
        input_dim: number of features per sample. Defaults to 1, matching
            the (n_samples, 1) inputs documented by the data generators.
        output_dim: number of targets per sample. Defaults to 1, matching
            the (n_samples, 1) targets documented by the data generators.
    """

    def __init__(self, input_dim=1, output_dim=1):
        super().__init__()

        # Layers
        self.layers = nn.Linear(input_dim, output_dim)

    def forward(self, code):
        """Return the affine map of `code`, shape (n_samples, output_dim)."""
        return self.layers(code)

### 2.2. One Hidden 
Define a model with a single hidden layer of size 3 and one ReLU non-linearity.
Use `nn.Sequential()` for defining the layers.

Hint: Architecture should be `nn.Linear(input_dim, 3)` -> `nn.ReLU()` -> `nn.Linear(3, output_dim)`

In [None]:
class Linear_1H(nn.Module):
    """
    Model with one hidden layer and one ReLU non-linearity.

    Architecture: nn.Linear(input_dim, hidden_dim) -> nn.ReLU()
    -> nn.Linear(hidden_dim, output_dim).

    Args:
        input_dim: number of features per sample. Defaults to 1, matching
            the (n_samples, 1) inputs documented by the data generators.
        output_dim: number of targets per sample. Defaults to 1.
        hidden_dim: width of the hidden layer. Defaults to 3.
    """

    def __init__(self, input_dim=1, output_dim=1, hidden_dim=3):
        super().__init__()

        # Layers
        self.layers = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x):
        """Return predictions for `x`, shape (n_samples, output_dim)."""
        return self.layers(x)

### 2.3. Two Hidden 
Define a model with two hidden layers of size 3 and two ReLU non-linearities.
Use `nn.Sequential()` for defining the layers.

Hint: Architecture should be `nn.Linear(input_dim,3)` -> `nn.ReLU()` -> `nn.Linear(3,3)` -> `nn.ReLU()` -> `nn.Linear(3, output_dim)`

In [None]:
class Linear_2H(nn.Module):
    """
    Model with two hidden layers and two ReLU non-linearities.

    Architecture: nn.Linear(input_dim, hidden_dim) -> nn.ReLU()
    -> nn.Linear(hidden_dim, hidden_dim) -> nn.ReLU()
    -> nn.Linear(hidden_dim, output_dim).

    Args:
        input_dim: number of features per sample. Defaults to 1, matching
            the (n_samples, 1) inputs documented by the data generators.
        output_dim: number of targets per sample. Defaults to 1.
        hidden_dim: width of both hidden layers. Defaults to 3.
    """

    def __init__(self, input_dim=1, output_dim=1, hidden_dim=3):
        super().__init__()

        # Layers
        self.layers = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x):
        """Return predictions for `x`, shape (n_samples, output_dim)."""
        return self.layers(x)

## 3. Training

### 3.1 Train method
You are going to implement a training method which takes a model, the number of epochs, training data, and a loss threshold as input and returns the (detached) predictions from the last epoch. 

Make sure you understand what the method is doing and how early stopping works in this case.

In [None]:
# Training function
def train(model, epochs, x, y, loss_threshold=1e-2):
    """
    Fit `model` to (x, y) with full-batch SGD on the mean squared error.

    Args:
        model: an nn.Module mapping `x` to predictions shaped like `y`.
        epochs: maximum number of optimisation steps.
        x: input tensor, fed to the model as one batch every epoch.
        y: target tensor compared against model(x).
        loss_threshold: training stops early as soon as the loss of an
            epoch drops below this value.

    Returns:
        The detached predictions from the last epoch that ran. They come
        from the forward pass made before that epoch's parameter update.
        If `epochs` is 0 (or negative) no training happens and the current
        model's predictions are returned instead.
    """
    # Set model to training mode
    model.train()

    # Mean Squared Error as loss function
    criterion = nn.MSELoss()

    # SGD optimizer with learning rate of 0.01
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

    y_pred = None

    # Training loop
    for epoch in range(epochs):
        # Forward data through model
        y_pred = model(x)

        # Compute the loss
        loss = criterion(y_pred, y)

        # Zero-out the gradients left over from the previous step
        optimizer.zero_grad()

        # Backpropagate loss
        loss.backward()

        # Make a step with the optimizer
        optimizer.step()

        # Print out loss every 100 epochs
        if epoch == 0 or (epoch+1) % 100 == 0:
            print('Epoch {} loss: {}'.format(epoch+1, loss.item()))

        # Early stopping based on training loss
        if loss.item() < loss_threshold:
            print('Epoch {} loss: {}'.format(epoch+1, loss.item()))
            break

    if y_pred is None:
        # The loop never ran: still hand back predictions instead of failing
        with torch.no_grad():
            y_pred = model(x)

    # Return predictions from the last epoch.
    return y_pred.detach()

### 3.2. `Linear_0H`

In [None]:
# Define model
# NOTE(review): this one instance is passed to every train() call in this
# section, so the runs are not independent: each continues from whatever
# state the previous one left in the model.
model_0H = Linear_0H().to(device)

In [None]:
# Train model_0H on the quadratic data: at most 10000 epochs, loss_threshold 1e-2.
# y_pred keeps what train() returns, for the plotting cell below.
y_pred = train(model_0H, epochs=10000, x=x_quadr, y=y_quadr, loss_threshold=1e-2)

In [None]:
# Plot predictions vs actual data
# TODO

In [None]:
# Train model_0H on the cubic data: at most 10000 epochs, loss_threshold 1e-2.
# y_pred is overwritten with what train() returns, for the plotting cell below.
y_pred = train(model_0H, epochs=10000, x=x_cubic, y=y_cubic, loss_threshold=1e-2)

In [None]:
# Plot predictions vs actual data
# TODO

In [None]:
# Train model_0H on the sine data: at most 10000 epochs, loss_threshold 1e-2.
# y_pred is overwritten with what train() returns, for the plotting cell below.
y_pred = train(model_0H, epochs=10000, x=x_sine, y=y_sine, loss_threshold=1e-2)

In [None]:
# Plot predictions vs actual data
# TODO

In [None]:
# Train model_0H on the absolute value data: at most 10000 epochs, loss_threshold 1e-2.
# y_pred is overwritten with what train() returns, for the plotting cell below.
y_pred = train(model_0H, epochs=10000, x=x_abs, y=y_abs, loss_threshold=1e-2)

In [None]:
# Plot predictions vs actual data
# TODO

In [None]:
# Train model_0H on the Heaviside step function data: at most 10000 epochs,
# loss_threshold 1e-2. Expects x_hs / y_hs from the Heaviside data-generation cell.
y_pred = train(model_0H, epochs=10000, x=x_hs, y=y_hs, loss_threshold=1e-2)

In [None]:
# Plot predictions vs actual data
# TODO

### 3.3. `Linear_1H`

In [None]:
# Define model
# NOTE(review): this one instance is passed to every train() call in this
# section, so the runs are not independent: each continues from whatever
# state the previous one left in the model.
model_1H = Linear_1H().to(device)

In [None]:
# Train model_1H on the quadratic data: at most 10000 epochs, loss_threshold 1e-2.
# y_pred keeps what train() returns, for the plotting cell below.
y_pred = train(model_1H, epochs=10000, x=x_quadr, y=y_quadr, loss_threshold=1e-2)

In [None]:
# Plot predictions vs actual data
# TODO

In [None]:
# Train model_1H on the cubic data: at most 10000 epochs, loss_threshold 1e-2.
# y_pred is overwritten with what train() returns, for the plotting cell below.
y_pred = train(model_1H, epochs=10000, x=x_cubic, y=y_cubic, loss_threshold=1e-2)

In [None]:
# Plot predictions vs actual data
# TODO

In [None]:
# Train model_1H on the sine data: at most 10000 epochs, loss_threshold 1e-2.
# y_pred is overwritten with what train() returns, for the plotting cell below.
y_pred = train(model_1H, epochs=10000, x=x_sine, y=y_sine, loss_threshold=1e-2)

In [None]:
# Plot predictions vs actual data
# TODO

In [None]:
# Train model_1H on the absolute value data: at most 10000 epochs, loss_threshold 1e-2.
# y_pred is overwritten with what train() returns, for the plotting cell below.
y_pred = train(model_1H, epochs=10000, x=x_abs, y=y_abs, loss_threshold=1e-2)

In [None]:
# Plot predictions vs actual data
# TODO

In [None]:
# Train model_1H on the Heaviside step function data: at most 10000 epochs,
# loss_threshold 1e-2. Expects x_hs / y_hs from the Heaviside data-generation cell.
y_pred = train(model_1H, epochs=10000, x=x_hs, y=y_hs, loss_threshold=1e-2)

In [None]:
# Plot predictions vs actual data
# TODO

### 3.3. `Linear_2H`

In [None]:
# Define model
# NOTE(review): this one instance is passed to every train() call in this
# section, so the runs are not independent: each continues from whatever
# state the previous one left in the model.
model_2H = Linear_2H().to(device)

In [None]:
# Train model_2H on the quadratic data: at most 10000 epochs, loss_threshold 1e-2.
# y_pred keeps what train() returns, for the plotting cell below.
y_pred = train(model_2H, epochs=10000, x=x_quadr, y=y_quadr, loss_threshold=1e-2)

In [None]:
# Plot predictions vs actual data
# TODO

In [None]:
# Train model_2H on the cubic data: at most 10000 epochs, loss_threshold 1e-2.
# y_pred is overwritten with what train() returns, for the plotting cell below.
y_pred = train(model_2H, epochs=10000, x=x_cubic, y=y_cubic, loss_threshold=1e-2)

In [None]:
# Plot predictions vs actual data
# TODO

In [None]:
# Train model_2H on the sine data: at most 10000 epochs, loss_threshold 1e-2.
# y_pred is overwritten with what train() returns, for the plotting cell below.
y_pred = train(model_2H, epochs=10000, x=x_sine, y=y_sine, loss_threshold=1e-2)

In [None]:
# Plot predictions vs actual data
# TODO

In [None]:
# Train model_2H on the absolute value data: at most 10000 epochs, loss_threshold 1e-2.
# y_pred is overwritten with what train() returns, for the plotting cell below.
y_pred = train(model_2H, epochs=10000, x=x_abs, y=y_abs, loss_threshold=1e-2)

In [None]:
# Plot predictions vs actual data
# TODO

In [None]:
# Train model_2H on the Heaviside step function data: at most 10000 epochs,
# loss_threshold 1e-2. Expects x_hs / y_hs from the Heaviside data-generation cell.
y_pred = train(model_2H, epochs=10000, x=x_hs, y=y_hs, loss_threshold=1e-2)

In [None]:
# Plot predictions vs actual data
# TODO

### 3.4. Which of the models stopped early and on what data?
Please list the experiments where the `loss_threshold` of 1e-2 was reached early. 

List: 