In [1]:
# import pytorch libraries
%matplotlib inline
import torch 
import torch.autograd as autograd 
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

# Intro to Pytorch

PyTorch consists of 4 main packages:
* torch: a general purpose array library similar to Numpy that can do computations on GPU
* torch.autograd: a package for automatically obtaining gradients
* torch.nn: a neural net library with common layers and cost functions
* torch.optim: an optimization package with common optimization algorithms like SGD, Adam, etc

## Pytorch tensors
Like Numpy tensors but can utilize GPUs to accelerate its numerical computations. 

In [2]:
# creating tensors from lists or numpy arrays
x = torch.tensor([[1, 2],[3, 4]])
x.shape

torch.Size([2, 2])

In [3]:
# Create random tensor
N = 5
x = torch.randn(N, 10).type(torch.FloatTensor)

In [4]:
x.shape

torch.Size([5, 10])

In [5]:
# reshaping of tensors using .view()
x.view(1,-1) #-1 makes torch infer the second dim

tensor([[-0.5556,  1.0111, -0.3320,  0.5048,  0.4068,  0.8588,  0.2263,  0.0692,
         -1.0756, -0.1189, -1.1118,  1.3169,  0.0506,  1.1767, -0.4279, -1.2701,
         -1.3569,  1.3882,  0.4517,  0.1626, -0.8388,  0.4674,  0.3479, -0.1104,
          2.9179, -0.8408,  0.4950,  2.0096, -1.4413, -0.4001,  0.9010, -1.2915,
          1.0777, -0.7556, -0.1641, -0.1862, -0.1205,  0.7879, -0.1338,  1.7658,
          1.0678,  0.3406, -1.9920, -0.1941,  1.0800, -1.4676,  1.4172,  0.1523,
          0.4050,  1.3027]])

In [6]:
# reshaping of tensors using .view()
x.view(2,-1) #-1 makes torch infer the second dim

tensor([[-0.5556,  1.0111, -0.3320,  0.5048,  0.4068,  0.8588,  0.2263,  0.0692,
         -1.0756, -0.1189, -1.1118,  1.3169,  0.0506,  1.1767, -0.4279, -1.2701,
         -1.3569,  1.3882,  0.4517,  0.1626, -0.8388,  0.4674,  0.3479, -0.1104,
          2.9179],
        [-0.8408,  0.4950,  2.0096, -1.4413, -0.4001,  0.9010, -1.2915,  1.0777,
         -0.7556, -0.1641, -0.1862, -0.1205,  0.7879, -0.1338,  1.7658,  1.0678,
          0.3406, -1.9920, -0.1941,  1.0800, -1.4676,  1.4172,  0.1523,  0.4050,
          1.3027]])

In [7]:
x.view(1,-1).shape

torch.Size([1, 50])

In [8]:
# from tensors to numpy arrays
x.numpy()

array([[-0.5556204 ,  1.0111425 , -0.3320242 ,  0.5047684 ,  0.40676013,
         0.8588337 ,  0.22625639,  0.06918798, -1.0755863 , -0.11891922],
       [-1.1117711 ,  1.3168824 ,  0.05064068,  1.1766511 , -0.42786482,
        -1.2700926 , -1.3568871 ,  1.3882266 ,  0.45172557,  0.1625831 ],
       [-0.8388016 ,  0.4673902 ,  0.34794477, -0.11038603,  2.9178827 ,
        -0.84082204,  0.49498332,  2.0096323 , -1.441344  , -0.40009186],
       [ 0.9009545 , -1.2915255 ,  1.0777217 , -0.7555992 , -0.16405432,
        -0.18616329, -0.12052615,  0.787911  , -0.13377175,  1.7657547 ],
       [ 1.0678122 ,  0.34058693, -1.992028  , -0.19406235,  1.0799829 ,
        -1.4676211 ,  1.4172497 ,  0.15225224,  0.40498593,  1.3027384 ]],
      dtype=float32)

## Pytorch Autograd
The autograd package in PyTorch provides classes and functions implementing automatic differentiation of arbitrary scalar valued function. For example, the gradient of the error with respect to all parameters.

`requires_grad=True` tells PyTorch that it needs to calculate the gradient with respect to this tensor. Here is an example:

In [9]:
x = torch.tensor([1., 2., 3., 4., 5., 6.], requires_grad=True)

In [10]:
# this is equivalent
x = torch.tensor([1., 2., 3., 4., 5., 6.]).requires_grad_()

In [11]:
x.grad

In [12]:
2*x**2 + 1

tensor([ 3.,  9., 19., 33., 51., 73.], grad_fn=<AddBackward0>)

In [13]:
L = (2*x**2 +1).mean()
L

tensor(31.3333, grad_fn=<MeanBackward0>)

In [14]:
L.backward() # computes the grad of L with respect to x

In [15]:
x.grad

tensor([0.6667, 1.3333, 2.0000, 2.6667, 3.3333, 4.0000])

### Sanity Check 


$\dfrac{\partial L }{\partial x_i}  = \dfrac{\partial }{\partial x_i}n^{-1}\sum_{i=1}^n (2 x_i^2 + 1) = \dfrac{4 x_i}{n}$

When $n = 6,~x_1 = 1$, $\dfrac{\partial L }{\partial x_1} = 4/6 = 2/3 = 0.6667. ~~\checkmark $


In [16]:
# here is another example
x = torch.randn(2, 3)
x.requires_grad = True
x

tensor([[-0.7820,  0.2855, -0.5394],
        [-0.3869,  0.0409, -1.3156]], requires_grad=True)

In [17]:
L = (3*x).sum()
L

tensor(-8.0923, grad_fn=<SumBackward0>)

In [18]:
L.backward()
x.grad # note, it is the same shape as x

tensor([[3., 3., 3.],
        [3., 3., 3.]])

## detach()

Run `x.numpy()` on x after the previous computation. See what happens. How would you fix this error?

In [19]:
x.numpy()

RuntimeError: Can't call numpy() on Tensor that requires grad. Use tensor.detach().numpy() instead.

The detach() method constructs a new view on a tensor which is declared not to need gradients. This may be needed for example when you want to take the output to a model to numpy to compute a metric with sklearn.

In [20]:
x.detach().numpy()

array([[-0.78203183,  0.28550372, -0.53936857],
       [-0.38685572,  0.04091536, -1.3156031 ]], dtype=float32)

## with torch.no_grad()
Prevent the gradients from being calculated in a piece of code. It does this by temporarily seting all of the `requires_grad` flags to false This is useful at validation time.

In [21]:
x = torch.randn(3, requires_grad=True)
print(x.requires_grad)
print((x ** 2).requires_grad)

with torch.no_grad():
    print((x ** 2).requires_grad)

True
True
False


## torch.nn module
A neural net library with common layers and cost functions.

`nn.Linear(5, 3)` creates a linear transformation with parameters $A$ and $b$. ( The transformation looks like: $A\cdot X+b$). Given an input matrix   $X$ of $n$ observations and $5$ features ($X$ is $n \times 5$), `nn.Linear(5, 3)` transforms X into a $n \times 3$ matrix, where $n$ can be anything (number of observations).

In [22]:
D = 5 # number of input featutes
M = 3 # neurons in the first hidden layer
linear_map = nn.Linear(D, M)

In [23]:
# parameters are initialized randomly
list(linear_map.parameters())

[Parameter containing:
 tensor([[-0.3365, -0.1785, -0.0724, -0.1225, -0.2347],
         [-0.3199,  0.0450,  0.1699,  0.1412,  0.2326],
         [ 0.4101,  0.4020,  0.1323,  0.2950, -0.2079]], requires_grad=True),
 Parameter containing:
 tensor([0.4141, 0.1879, 0.1561], requires_grad=True)]

In [24]:
# shape of parameters
[p.shape for p in linear_map.parameters()]

[torch.Size([3, 5]), torch.Size([3])]

In [25]:
# total number of elements per parameter tensor. 
[p.numel() for p in linear_map.parameters()]

[15, 3]

**Exercise:** Create a layer with 20 input features  and 10 output features. Compute how many total parameters do you have. 

#  Linear Regression with Pytorch

The goal of linear regression is to fit a line to a set of points.

In [26]:
# Here we generate some fake data
def lin(a,b,x): return a*x+b

def gen_fake_data(n, a, b):
    x = np.random.uniform(0,1,n) 
    y = lin(a,b,x) + 0.1 * np.random.normal(0,3,n) #line plus noise
    return x, y

x, y = gen_fake_data(50, 3., 8.)

In [27]:
import matplotlib.pyplot as plt
plt.scatter(x,y, s=8); plt.xlabel("x"); plt.ylabel("y"); 

You want to find **parameters** (weights) $a$ and $b$ such that you minimize the *error* between the points and the line $a\cdot x + b$. Note that here $a$ and $b$ are unknown. For a regression problem the most common *error function* or *loss function* is the **mean squared error** ($\sum_i ( y_i - \hat{y}_i)^2$). 

In [28]:
def mse(y_hat, y): return ((y - y_hat) ** 2).mean()

Suppose we believe $a = 10$ and $b = 5$ then we can compute `y_hat` which is our *prediction* and then compute our error.

In [29]:
y_hat = lin(10,5,x)
mse(y_hat, y)

5.191551012350949

In [30]:
def mse_loss(a, b, x, y): return mse(lin(a,b,x), y)

In [31]:
mse_loss(10, 5, x, y)

5.191551012350949

So far we have specified the *model* (linear regression) and the *evaluation criteria* (or *loss function*). Now we need to handle *optimization*; that is, how do we find the best values for $a$ and $b$? How do we find the best *fitting* linear regression.

## Gradient Descent with Pytorch

For a fixed dataset $x$ and $y$ `mse_loss(a,b)` is a function of $a$ and $b$. We would like to find the values of $a$ and $b$ that minimize that function.

**Gradient descent** is an algorithm that minimizes functions. Given a function defined by a set of parameters, gradient descent starts with an initial set of parameter values and iteratively moves toward a set of parameter values that minimize the function. This iterative minimization is achieved by taking steps in the negative direction of the function gradient.

Here is gradient descent implemented in [PyTorch](http://pytorch.org/).

In [32]:
# generate some more data
x, y = gen_fake_data(10000, 3., 8.)
x.shape, y.shape

((10000,), (10000,))

In [33]:
# Wrap x and y as tensor 
x = torch.tensor(x)
y = torch.tensor(y)

In [34]:
# Create random Tensors for weights, and wrap them in tensors.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these tensors during the backward pass.
a, b = np.random.randn(1), np.random.randn(1)
a = torch.tensor(a, requires_grad=True)
b = torch.tensor(b, requires_grad=True)
a,b

(tensor([-1.0954], dtype=torch.float64, requires_grad=True),
 tensor([-0.5999], dtype=torch.float64, requires_grad=True))

In [35]:
learning_rate = 1e-3
for t in range(10000):
    # Forward pass: compute predicted y using operations on Variables
    loss = mse_loss(a,b,x,y)
    if t % 1000 == 0: print(loss.item())
    
    # Computes the gradient of loss with respect to all Variables with requires_grad=True.
    # After this call a.grad and b.grad will be Variables holding the gradient
    # of the loss with respect to a and b respectively
    loss.backward()
    
    # Update a and b using gradient descent; a.data and b.data are Tensors,
    # a.grad and b.grad are Variables and a.grad.data and b.grad.data are Tensors
    a.data -= learning_rate * a.grad.data
    b.data -= learning_rate * b.grad.data
    
    # Zero the gradients
    a.grad.data.zero_()
    b.grad.data.zero_()    

115.26338471349538
0.8059405225603997
0.1037140881017801
0.09745208879999998
0.09589891423174791
0.09472411490931112
0.09382054461481021
0.09312549092911954
0.09259083397659587
0.0921795591856202


In [36]:
# not that a and b should be close to 3 and 8 respectively
print(a,b)

tensor([3.1232], dtype=torch.float64, requires_grad=True) tensor([7.9327], dtype=torch.float64, requires_grad=True)


## Simplified GD Loop

In [37]:
# linear tranformation with input dimension=1 and output dimension=1
nn.Linear(1, 1)

Linear(in_features=1, out_features=1, bias=True)

### Models in Pytorch

In [38]:
# simple way of specifying a linear regression model
model = torch.nn.Sequential(
    nn.Linear(1, 1),
)
model

Sequential(
  (0): Linear(in_features=1, out_features=1, bias=True)
)

In [39]:
# equivalent way of specifiying the same model
class LinearRegression(nn.Module):
    def __init__(self):
        super(LinearRegression, self).__init__() #call parent class (nn.Module) to initialize model
        self.lin = nn.Linear(1, 1) #1 input (x), 1 output (y)
        
    def forward(self, x): #define the forward pass
        x = self.lin(x) #start with a , transform to ax+b through self.lin
        return x 
model =  LinearRegression()

In [40]:
# note here we have just two parameters, why?
print([p for p in model.parameters()])

[Parameter containing:
tensor([[-0.9756]], requires_grad=True), Parameter containing:
tensor([0.7696], requires_grad=True)]


In [41]:
x, y = gen_fake_data(10000, 3., 8.)
x = torch.tensor(x).float()
y = torch.tensor(y).float()
x.shape

torch.Size([10000])

In [42]:
# you have to be careful with the dimensions that your model is expecting
# unsqueeze dim=1 transforms [10000] to [10000, 1]
x = torch.unsqueeze(x, 1)
x.shape

torch.Size([10000, 1])

In [43]:
y_hat = model(x)
print(y_hat)

tensor([[-0.1926],
        [ 0.7411],
        [ 0.0954],
        ...,
        [ 0.1010],
        [ 0.7421],
        [ 0.3487]], grad_fn=<AddmmBackward0>)


In [44]:
y_hat.shape

torch.Size([10000, 1])

In [45]:
y = y.unsqueeze(1)
F.mse_loss(y_hat, y)

tensor(86.6403, grad_fn=<MseLossBackward0>)

In [46]:
# validation data
x_val, y_val = gen_fake_data(1000, 3., 8.)
x_val = torch.tensor(x_val).float().unsqueeze(1)
y_val = torch.tensor(y_val).float().unsqueeze(1)

### Optimizer
Use the optim package to define an Optimizer that will update the weights of the model for us. Here we will use AdamW.

In [47]:
learning_rate = 0.1
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [48]:
for t in range(10000):
    # Forward pass: compute predicted y using operations on Variables
    model.train() # some layers have different behavior during train/and evaluation
    y_hat = model(x) #perform forward pass
    loss = F.mse_loss(y_hat, y)
       
    # Before the backward pass, use the optimizer object to zero all of the
    # gradients for the variables so they dont accumulate (default pytorch behavior)
    optimizer.zero_grad()
    loss.backward() # computes gradients
    
    # Calling the step function on an Optimizer makes an update to its
    # parameters
    optimizer.step()
    
    # checking validation loss
    model.eval()  # some layers have different behavior during train/and evaluation
    y_hat_val = model(x_val)
    val_loss = F.mse_loss(y_hat_val, y_val)
    
    if t % 1000 == 0: print("train loss %.3f valid loss %.3f" % (loss.item(), val_loss.item()))

train loss 86.640 valid loss 83.247
train loss 0.106 valid loss 0.114
train loss 0.094 valid loss 0.103
train loss 0.091 valid loss 0.100
train loss 0.090 valid loss 0.099
train loss 0.090 valid loss 0.099
train loss 0.090 valid loss 0.098
train loss 0.090 valid loss 0.098
train loss 0.090 valid loss 0.098
train loss 0.090 valid loss 0.098


In [49]:
print([p for p in model.parameters()])

[Parameter containing:
tensor([[3.0366]], requires_grad=True), Parameter containing:
tensor([7.9822], requires_grad=True)]


# Logistic Regression

In [50]:
# generating fake data

def lin(a,b,x): return a*x+b

def gen_logistic_fake_data(n, a, b):
    x = np.random.uniform(-20,20, (n, 2))
    x2_hat = lin(a,b, x[:,0])
    y = x[:,1] > x2_hat
    return x, y.astype(int)

x, y = gen_logistic_fake_data(100, 1., 0.5)

In [51]:
y

array([1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0])

In [52]:
t = np.arange(-20, 20, 0.2)
import matplotlib.pyplot as plt
plt.scatter(x[:,0],x[:,1],c=y, s=8)
plt.xlabel("x1"); plt.ylabel("x2");
plt.plot(t, t + 0.5, '--', c='gray')

[<matplotlib.lines.Line2D at 0x1695faa20>]

In [53]:
x = torch.tensor(x)
y = torch.tensor(y)

In [54]:
model = torch.nn.Sequential(
    torch.nn.Linear(2, 1),
)
model

Sequential(
  (0): Linear(in_features=2, out_features=1, bias=True)
)

In [55]:
y_hat = model(x.float())

In [56]:
F.binary_cross_entropy_with_logits(y_hat, y.unsqueeze(1).float())

tensor(3.9327, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)

In [57]:
# generating train and val data
x, y = gen_logistic_fake_data(10000, 1., 0.5)
x = torch.tensor(x).float()
y = torch.tensor(y).float().unsqueeze(1)

x_val, y_val = gen_logistic_fake_data(1000, 1., 0.5)
x_val = torch.tensor(x_val).float()
y_val = torch.tensor(y_val).float().unsqueeze(1)

In [58]:
learning_rate = 0.1
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [59]:
for t in range(10000):
    # Forward pass: compute predicted y using operations on Variables
    model.train()
    y_hat = model(x)
    loss = F.binary_cross_entropy_with_logits(y_hat, y)
       
    # Before the backward pass, use the optimizer object to zero all of the
    # gradients for the variables
    optimizer.zero_grad()
    loss.backward()
    
    # Calling the step function on an Optimizer makes an update to its
    # parameters
    optimizer.step()
    
    model.eval() #switch to eval mode
    y_hat_val = model(x_val)
    val_loss = F.binary_cross_entropy(torch.sigmoid(y_hat_val), y_val)
    
    if t % 1000 == 0: print("train loss %.3f valid loss %.3f" % (loss.item(), val_loss.item()))

train loss 3.903 valid loss 3.819
train loss 0.020 valid loss 0.021
train loss 0.015 valid loss 0.016
train loss 0.013 valid loss 0.013
train loss 0.011 valid loss 0.011
train loss 0.009 valid loss 0.009
train loss 0.008 valid loss 0.008
train loss 0.007 valid loss 0.007
train loss 0.006 valid loss 0.006
train loss 0.005 valid loss 0.005


In [60]:
print([p for p in model.parameters()])

[Parameter containing:
tensor([[-8.7830,  8.8057]], requires_grad=True), Parameter containing:
tensor([-4.4387], requires_grad=True)]


Exercise: Instead of using `F.binary_cross_entropy(torch.sigmoid(y_hat), y)` try `F.binary_cross_entropy_with_logits(y_hat, y)`. Look at the documentation for `F.binary_cross_entropy_with_logits`.

## How to take a vector back to numpy?

In [61]:
x, y = gen_logistic_fake_data(10, 1., 0.5)
x = torch.tensor(x).float()

In [62]:
x.numpy()

array([[  5.9329267, -19.899004 ],
       [  0.373951 , -18.658577 ],
       [ 15.385204 ,  -2.3565853],
       [ -8.183436 , -15.361464 ],
       [ 19.594461 , -11.031542 ],
       [-18.951801 ,  18.229431 ],
       [ 18.444773 , -19.746264 ],
       [-11.439652 , -12.466691 ],
       [-11.289719 ,  -9.797044 ],
       [-11.107415 , -11.857159 ]], dtype=float32)

Exercise: Compute the accuracy of the validation logistic regression model

## Dataset and Data loaders 

Nearly all of deep learning is powered by one very important algorithm: **stochastic gradient descent (SGD)**. SGD can be seeing as an approximation of **gradient descent** (GD). In GD you have to run through *all* the samples in your training set to do a single iteration. In SGD you use *only one* or *a subset*  of training samples to do the update for a parameter in a particular iteration. The subset used in every iteration is called a **batch** or **minibatch**.


**Comparison of Batch, Mini-Batch, and Stochastic Gradient Descent (SGD)**

| Method                        | Batch Size        | Updates per Epoch    | Stability        | Memory Usage      | Speed       |
|-------------------------------|------------------|----------------------|------------------|------------------|-------------|
| **Batch Gradient Descent**     | Entire Dataset   | 1                    | ✅ Stable       | ❌ High          | ❌ Slow     |
| **Mini-Batch Gradient Descent**| Small Subset (e.g., $n_{sub}=$ 50, 100, etc.) | $n/n_{sub}$ | ✅ Balanced  | ✅ Medium       | ✅ Faster   |
| **Stochastic Gradient Descent (SGD)** | 1 Sample        | $n$                 | ❌ Noisy        | ✅ Low           | ✅ Fast     |

**Takeaway Messages:**
- **Batch Gradient Descent**: Uses the entire dataset at once. Slow but provides stable updates.
- **Mini-Batch Gradient Descent**: Processes a small subset at a time. Most commonly used because it balances speed and accuracy.
- **Stochastic Gradient Descent (SGD)**: Updates model parameters after every single sample, leading to **high variance** in updates.


In [63]:
model2 = torch.nn.Sequential(
    torch.nn.Linear(1, 1),
)

In [64]:
from torch.utils.data import Dataset, DataLoader

In [65]:
def lin(a,b,x): return a*x+b

def gen_fake_data(n, a, b):
    x = np.random.uniform(0,1,n) 
    y = lin(a,b,x) + 0.1 * np.random.normal(0,3,n)
    return x.astype(np.float32), y.astype(np.float32)

# create a dataset
class RegressionDataset(Dataset):
    def __init__(self, a=3, b=8, n=10000):
        x, y = gen_fake_data(n, a, b)
        x = torch.from_numpy(x).unsqueeze(1)
        y = torch.from_numpy(y)
        self.x, self.y = x, y
    
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]
    
fake_train_ds = RegressionDataset()
fake_valid_ds = RegressionDataset()

In [66]:
x, y = fake_train_ds[10000-1]

In [67]:
x, y

(tensor([0.5411]), tensor(9.6364))

Next we are going to create a data loader. The data loader provides the following features:
* Batching the data
* Shuffling the data
* Load the data in parallel using multiprocessing workers.

In [68]:
train_dl = DataLoader(fake_train_ds, batch_size=1000, shuffle=True)
valid_dl = DataLoader(fake_valid_ds, batch_size=1000, shuffle=False)

In [69]:
# getting a batch of data
x, y = next(iter(train_dl))

In [70]:
x.shape, y.shape

(torch.Size([1000, 1]), torch.Size([1000]))

In [71]:
model2(x).shape

torch.Size([1000, 1])

In [72]:
from sklearn.metrics import r2_score

def val_metric(model, valid_dl):
    model.eval()
    losses = []
    y_hats = []
    ys = []
    for x, y in valid_dl:
        y = y.unsqueeze(1)
        y_hat = model(x.float())
        loss = F.mse_loss(y_hat, y.float())
        y_hats.append(y_hat.detach().numpy())
        ys.append(y.numpy())
        losses.append(loss.item())
    
    ys = np.concatenate(ys)
    y_hats = np.concatenate(y_hats)
    return np.mean(losses), r2_score(ys, y_hats)

In [73]:
valid_loss, valid_r2 = val_metric(model2, valid_dl)
valid_loss, valid_r2

(93.8237060546875, -110.66011810302734)

Q: Why is R^2 negative???
What does it say about the current state of the model?

Quick note: when we use SGD or minibatch SGD, not all of the data goes into the loss at the same time. Once we have done enough minibatches so that the loss has seen and learned from all of the observations in the dataset, we call that an epoch. 

In [74]:
## train_loop function
def train_loop(model, train_dl, valid_dl, optimizer, epochs):
    losses = []
    for i in range(epochs):
        model.train()
        for x, y in train_dl:
            y = y.unsqueeze(1)
            y_hat = model(x.float())
            loss = F.mse_loss(y_hat, y.float())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
        
        train_loss = np.mean(losses)
        valid_loss, valid_auc = val_metric(model, valid_dl)
        print("train loss %.3f valid loss %.3f R^2 %.3f" % 
              (train_loss, valid_loss, valid_auc))

In [75]:
model2 = torch.nn.Sequential(
    torch.nn.Linear(1, 1),
)
learning_rate = 1
optimizer = torch.optim.AdamW(model2.parameters(), lr=learning_rate)

In [76]:
train_loop(model2, train_dl, valid_dl, optimizer, epochs=20)

train loss 26.455 valid loss 6.685 R^2 -6.956
train loss 16.588 valid loss 1.044 R^2 -0.242
train loss 11.741 valid loss 2.329 R^2 -1.772
train loss 9.034 valid loss 0.503 R^2 0.402
train loss 7.296 valid loss 0.262 R^2 0.688
train loss 6.130 valid loss 0.165 R^2 0.804
train loss 5.273 valid loss 0.119 R^2 0.858
train loss 4.631 valid loss 0.144 R^2 0.829
train loss 4.131 valid loss 0.115 R^2 0.863
train loss 3.730 valid loss 0.125 R^2 0.851
train loss 3.402 valid loss 0.117 R^2 0.861
train loss 3.128 valid loss 0.118 R^2 0.860
train loss 2.897 valid loss 0.116 R^2 0.862
train loss 2.698 valid loss 0.112 R^2 0.867
train loss 2.526 valid loss 0.113 R^2 0.866
train loss 2.375 valid loss 0.109 R^2 0.870
train loss 2.242 valid loss 0.110 R^2 0.869
train loss 2.123 valid loss 0.108 R^2 0.872
train loss 2.017 valid loss 0.106 R^2 0.873
train loss 1.922 valid loss 0.106 R^2 0.874


In [77]:
optimizer = torch.optim.Adam(model2.parameters(), lr=0.1)
train_loop(model2, train_dl, valid_dl, optimizer, epochs=10)

train loss 0.097 valid loss 0.090 R^2 0.893
train loss 0.094 valid loss 0.091 R^2 0.892
train loss 0.093 valid loss 0.089 R^2 0.894
train loss 0.093 valid loss 0.090 R^2 0.893
train loss 0.092 valid loss 0.089 R^2 0.894
train loss 0.092 valid loss 0.089 R^2 0.894
train loss 0.092 valid loss 0.090 R^2 0.893
train loss 0.091 valid loss 0.089 R^2 0.894
train loss 0.091 valid loss 0.089 R^2 0.894
train loss 0.091 valid loss 0.089 R^2 0.894


In [78]:
optimizer = torch.optim.Adam(model2.parameters(), lr=0.01)
train_loop(model2, train_dl, valid_dl, optimizer, epochs=20)

train loss 0.090 valid loss 0.089 R^2 0.894
train loss 0.090 valid loss 0.089 R^2 0.894
train loss 0.090 valid loss 0.089 R^2 0.894
train loss 0.090 valid loss 0.089 R^2 0.894
train loss 0.090 valid loss 0.089 R^2 0.894
train loss 0.090 valid loss 0.089 R^2 0.894
train loss 0.090 valid loss 0.089 R^2 0.894
train loss 0.090 valid loss 0.089 R^2 0.894
train loss 0.090 valid loss 0.089 R^2 0.894
train loss 0.090 valid loss 0.089 R^2 0.894
train loss 0.090 valid loss 0.089 R^2 0.894
train loss 0.090 valid loss 0.089 R^2 0.894
train loss 0.090 valid loss 0.089 R^2 0.894
train loss 0.090 valid loss 0.089 R^2 0.894
train loss 0.090 valid loss 0.089 R^2 0.894
train loss 0.090 valid loss 0.089 R^2 0.894
train loss 0.090 valid loss 0.089 R^2 0.894
train loss 0.090 valid loss 0.089 R^2 0.894
train loss 0.090 valid loss 0.089 R^2 0.894
train loss 0.090 valid loss 0.089 R^2 0.894


**Exercise**: Play with the training of the previous model to get the max R^2 possible. Can you use larger learning rates or more epochs? 

# References
* https://pytorch.org/docs/stable/index.html
* http://pytorch.org/tutorials/beginner/pytorch_with_examples.html
* https://hsaghir.github.io/data_science/pytorch_starter/