In [77]:
#source /home/scarlett/github/ipython_notebook/Coding_pythons/myenv/bin/activate
import sys
print(sys.executable)

import torch
import pandas as pd
import numpy as np
import torch.nn as nn
from torch.nn import functional as F
from torch.nn import CrossEntropyLoss

import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torchmetrics

/bin/python


To install new packages:
- source /home/scarlett/github/Coding_pythons/newenv/bin/activate
- pip install <package_name>
- pip list
- import <package_name>
- deactivate

## Deep learning
- deep learning is everywhere:
    - language translation
    - self-driving cars
    - medical diagnostics
    - chatbots

- used on multiple data types: images, text and audio
- traditional machine learning: relies on hand-crafted feature engineering
- deep learning: enables feature learning from raw data

## What is deep learning?
- An architecture of network consists of input, hidden layers and output
- A network can have one or many hidden layers
- Require 100K data
- pytorch supports tabular data, also unstructral data like image (torchvision), audio data (torchaudio), text data (torchtext)
- deep learning often requires a GPU, which, compard to a CPU can offer: parallel computing capabilities, faster training times and better performance
- tensors are multidimensional representations of their elements

#### make tensor: load from list

In [48]:
lst = [[1,2,3],[4,5,6],[7,8,9]]
tensor = torch.tensor(lst)
tensor 

tensor([[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]])

In [49]:
tensor.shape

torch.Size([3, 3])

In [50]:
tensor.dtype

torch.int64

In [51]:
tensor.device

device(type='cpu')

#### load from NumPy array

In [52]:
array = [[1,2,3],[4,5,6],[7,8,9]]
np_array = np.array(array)
np_tensor = torch.from_numpy(np_array)

### tensor operations - most numpy array operations can be performedon PyTorch tensors

In [53]:
tensor + np_tensor

tensor([[ 2,  4,  6],
        [ 8, 10, 12],
        [14, 16, 18]])

In [54]:
tensor * np_tensor

tensor([[ 1,  4,  9],
        [16, 25, 36],
        [49, 64, 81]])

## neural net
### basic two layer NN without hidden layer
- first layer: input, second layer: linear layer
- each linear layer has a .weight and .bias associated
- nn.linear performs
    - when input_tensor is passed to linear_layer, the linear operation performed is matrix multiplication of input_tensor and the weights, followed by adding in the bias
    - input X, weights W0, bias b0 --> y0 = W0 * X +b0; in pytorch: output = W0 * input + b0
    - initially, when we call nn.Linear(), weights and biases are initialized randomly, so they are not yet useful
    - we have to tune these weights and biases
- two-layer network summary:
    - took 1 by 3 input as the first layer (1 linear layer with specific arguments as the second layer, and retured a 1 by 2 output)
    - linear layers have connections (or arrows) between each input and output neuron, making them fully connected
        - networks with only linear layers are called: "fully connected networks"
        - each neuron in one layer is connected to each neuron in the next layer

In [55]:
## create input_tensor with 3 features
input_tensor = torch.tensor([[0.3471, 0.4547, -0.2356]], dtype=torch.float32)

## define first linear layer ( a linear layer takes an input, applies a linear fxn, and returns output)
linear_layer = nn.Linear(in_features=3, out_features=2)

## pass input through linear layer
output = linear_layer(input_tensor)
print(output)

tensor([[ 0.4958, -0.3643]], grad_fn=<AddmmBackward0>)


In [None]:
linear_layer.weight

In [None]:
linear_layer.bias

### Multi-layer model: Stacking layers with nn.Sequential()
- this model takes input, passes it to each linear layer in sequence, and returns output
    - the first layer takes input with 10 features, and output with a tensor with 18 features
    - the second layer takes input with 18 features, and output with a tensor with 20 features
    - the third layer takess input with 20 featuresl and output with a tensor with 5 features
- output is not linear until each layer has tuned weights and biases

In [None]:
model = nn.Sequential(
    nn.Linear(10,18), 
    nn.Linear(18,20), 
    nn.Linear(20,5))

In [None]:
# make a input_tensor with ten features (neurons)
input_tensor = torch.rand(1,10)
input_tensor

In [None]:
# pass input_tensor through model
output = model(input_tensor)

#### In this exercise, you will implement a small neural network containing two linear layers. The first layer takes an eight-dimensional input, and the last layer outputs a one-dimensional tensor.

In [None]:
input_tensor = torch.Tensor([[2, 3, 6, 7, 9, 3, 2, 1]])

# Implement a small neural network with exactly two linear layers
model = nn.Sequential(nn.Linear(8,5),
                      nn.Linear(5,1)
                     )

output = model(input_tensor)
print(output)

### Activation functions
-  used in the last layer of NN

## Chapter 2: Training First NN with PyTorch
### Section 2.1 Running a forward pass
- input data is **passed forward** or **propagated** through a network
- computations performed at each layer
- outputs of each layer passed to each subsequent layer
- output of final layer: "prediction"
- used for both **training** and prediction
- some possible outputs:
    - binary classification (sum up to 1)
    - multiclass classification (sum up to 1, value with highest probability is assigned predicted label in each row)
    - regression values
- results would not be meaningful untill we use backpropagation to update weights and biases
- there is also a backward pass
    - used to update weights and baises during training
    - in the training loop: 
        1. propagate data forward
        2. compare outputs to the true values (ground-truth)
        3. backpropagate to update model weights and baises
        4. repeaat weights and biases are tuned to produce useful outputs

#### binary classificationl forward pass

In [3]:
# create input data of shape 5*6
data = torch.randn(5, 6)
input_data = data
input_data

tensor([[ 0.3303,  0.2862, -1.0250,  0.2587, -0.0196, -0.1723],
        [-0.8667,  1.0218, -2.2905, -0.2409, -0.4707,  1.5993],
        [ 0.0264,  0.5633,  1.2191, -0.4299,  0.9548, -0.7300],
        [ 0.1807,  0.9939,  1.2226, -0.6087,  0.9974, -1.3976],
        [ 1.4309, -0.6085,  0.1766, -0.1891,  0.3819,  0.0889]])

In [4]:
# create binary classification model
model = nn.Sequential(
    nn.Linear(6, 3),
    nn.Linear(3, 1),
    nn.Sigmoid()
)

# pass input data through model
output = model(input_data)
print(output)

tensor([[0.4012],
        [0.3571],
        [0.3702],
        [0.3924],
        [0.4166]], grad_fn=<SigmoidBackward0>)


#### multi-class classificationl forward pass

In [5]:
# specifcy model has 3 clases
n_classes = 3

# create 2-layer multi-class classification model
model = nn.Sequential(
    nn.Linear(6, 4),
    nn.Linear(4, n_classes),
    nn.Softmax(dim=-1)
)

# pass input data through model
output = model(input_data)
print(output.shape)

torch.Size([5, 3])


#### regression forward pass
- no activation layer

In [6]:
# create regression model
model = nn.Sequential(
    nn.Linear(6, 4),
    nn.Linear(4, 1)
)

# pass input data through model
output = model(input_data)

# return output
print(output)

tensor([[-0.2892],
        [-0.1950],
        [-0.1918],
        [-0.2454],
        [-0.2936]], grad_fn=<AddmmBackward0>)


### Section 2.2 Using loss function to assess model predictions
#### Loss function
- gives feedback to model during training
- takes in model prediction y_hat and ground truth y
- outputs a float
- example: predicted class = 0
    - if correct = low loss
    - if wrong = high loss
- Goal: minimize loss
- one-hot encoding concepts: loss = F(y, y_hat)
    - **y is a single integer** (class label) e.g. y = 0 when y is mammal
    - **y_hat is a tensor** (output of softmax)
        - if N is the number of classes, e.g. N = 3
        - y_hat is a tensor with N dimensions, e.g. y_hat = [0.5,0.8,0.1]
    - transform true label to tensor of zeros and ones
        - ground truth y = 0
        - number of classes N = 3
        - class: 0  1  2 
        - one-hot encoding: 1  0  0 / 
                            0  1  0 /
                            0  1  1 

In [8]:
F.one_hot(torch.tensor([0, 1, 2]), num_classes=3)

tensor([[1, 0, 0],
        [0, 1, 0],
        [0, 0, 1]])

In [9]:
F.one_hot(torch.tensor(0), num_classes=3)

tensor([1, 0, 0])

#### loss function takes
- scores
    - model predictions before the final softmax function
- one_hot_target
    - one hot encoded ground truth label
#### loss function output
- loss
    - a single float

In [29]:
# Correct way to create tensors
scores = torch.tensor([[-0.1211, 0.1059]])  # Raw scores (logits), not probabilities
# Target should be the index of the correct class, not one-hot encoded
class_target = torch.tensor([0])  # Class index, e.g., class 0

# Instantiate the CrossEntropyLoss
criterion = nn.CrossEntropyLoss()

# Compute the loss
loss = criterion(scores, class_target)

print("Loss:", loss.item())


Loss: 0.81307452917099


**Creating one-hot encoded labels**
- Manually create a one-hot encoded vector of the ground truth label y by filling in the NumPy array provided.
- Create a one-hot encoded vector of the ground truth label y using PyTorch.

In [32]:
y = 1
num_classes = 3

# Create the one-hot encoded vector using NumPy
one_hot_numpy = np.array([0, 1, 0])

# Create the one-hot encoded vector using PyTorch
one_hot_pytorch = F.one_hot(torch.tensor(y), num_classes)

**Calculating cross entropy loss**
- Create the one-hot encoded vector of the ground truth label y and assign it to one_hot_label.
- Create the cross entropy loss function and store it as criterion.
- Calculate the cross entropy loss using the one_hot_label vector and the scores vector, by calling the loss_function you created.

In [35]:
y = [2]
scores = torch.tensor([[0.1, 6.0, -2.0, 3.2]])

# Create a one-hot encoded vector of the label y
one_hot_label = F.one_hot(torch.tensor(y), scores.shape[1])

# Create the cross entropy loss function
criterion = CrossEntropyLoss()

# Calculate the cross entropy loss
loss = criterion(scores.double(), one_hot_label.double())
print(loss)

tensor(8.0619, dtype=torch.float64)


### Section 2.3 Using derivatives to update model parameters
- goal: minimize the loss
    - high loss: wrong prediction
    - low loss: correct prediction
- hiking down a mountain to the valley floor
    - steep slopes: a step makes us lose a lot of elevation = derivative is high (red arrows)
    - gentle slopes: a step makes us lose a little bit of elevation = detivative is low (green arrows)
    - valley floor: not losing elevation by taking a step = derivative is null
- connecting derivatives and model training
    - model training: updating a model's parameters to minimize the loss
        - we take a dataset with features X, and groud truth y. We run a forward pass using X and calculate loss by comparing model output, y_hat, with y. we compute gradients of the loss function and use them to update the model parameters with backpropagation so that weights are no longer random and biases are useful. we repeat until the layers are tuned.


#### Backpropagation concepts
- consider a network made of 3 layers. L0, L1, L2
    - calculate local gradients for L0, L1 and L2 using backpropagation
    - calculate loss gradients with respect to L2, then use L2 gradients to calcualte L1 gradients, and so on

In [38]:
# create the model and run a forward pass
model = nn.Sequential(
    nn.Linear(6, 4),
    nn.Linear(4, 3),
    nn.Linear(3,2))
prediction = model(input_data)

In [39]:
# calculate the loss and compute the gradients
criterion = CrossEntropyLoss()
loss = criterion(prediction, target)
loss.backward()

IndexError: Target 2 is out of bounds.

In [40]:
# access each layer's gradidents
model[0].weight.grad, model[0].bias.grad
model[1].weight.grad, model[1].bias.grad
model[2].weight.grad, model[2].bias.grad

(None, None)

#### updating model parametes
- update the weights by subtracting local gradients scaled by the learning rate

In [41]:
# learning rate is typically small
lr = 0.001

# update the weights
weight = model[0].weight
weight_grad = model[0].weight.grad
weight = weight - lr * weight_grad

# update the bias
bias = model[0].bias
bias_grad = model[0].bias.grad
bias = bias - lr * bias_grad

TypeError: unsupported operand type(s) for *: 'float' and 'NoneType'

#### convex and non-convex funtions
- convex function has one global minimum
- non-convex function has more than 1 global minimum (loss function in DL)

#### gradient descent
- for non-convex functins, the iterative process used is GD
- in PyTorch, an optimizer takes care of weight updates
- the most common optimizer is stochastic gradient descent (SGD)
- optimizer handles updating model parameters (or weights) after calculation of local gradients

In [44]:
# create the optimizer
optimizer = optim.SGD(model.parameters(), lr=0.0001)
optimizer.step()

Question:
- The following tensors are provided:
    - weight: a 2 by 9 element tensor
    - bias: a 2 element tensor
    - preds: a 1 by 2 element tensor containing the model predictions
    - target: a 1 by 2 element one-hot encoded tensor containing the ground-truth label

- Use the criterion you have defined to calculate the loss value with respect to the predictions and target values.
- Compute the gradients of the cross entropy loss.
- Display the gradients of the weight and bias tensors, in that order.

In [None]:
criterion = nn.CrossEntropyLoss()

# Calculate the loss
loss = criterion(preds, target)

# Compute the gradients of the loss
loss.backward()

# Display gradients of the weight and bias tensors in order
print(weight.grad)
print(bias.grad)

Question: A PyTorch model created with the nn.Sequential() is a module that contains the different layers of your network. Recall that each layer parameter can be accessed by indexing the created model directly. In this exercise, you will practice accessing the parameters of different linear layers of a neural network. You won't be accessing the sigmoid.
- Access the weight parameter of the first linear layer.
- Access the bias parameter of the second linear layer.

In [45]:
model = nn.Sequential(nn.Linear(16, 8),
                      nn.Sigmoid(),
                      nn.Linear(8, 2))

# Access the weight of the first linear layer
weight_0 = model[0].weight

# Access the bias of the second linear layer
bias_1 = model[2].bias

Question: Updating the weights manually
- Create the gradient variables by accessing the local gradients of each weight tensor.
- Update the weights using the gradients scaled by the learning rate.
- Use optim to create an SGD optimizer with a learning rate of your choice (must be less than one) for the model provided.
- Update the model's parameters using the optimizer.



In [47]:
weight0 = model[0].weight
weight1 = model[1].weight
weight2 = model[2].weight

# Access the gradients of the weight of each linear layer
grads0 = weight0.grad
grads1 = weight1.grad
grads2 = weight2.grad

# Update the weights using the learning rate and the gradients
weight0 = weight0 - lr*grads0
weight1 = weight1 - lr*grads1
weight2 = weight2 - lr*grads2

# Create the optimizer
optimizer = optim.SGD(model.parameters(), lr=0.001)

loss = criterion(pred, target)
loss.backward()

# Update the model's parameters using the optimizer
optimizer.step()

AttributeError: 'Sigmoid' object has no attribute 'weight'

### Section 2.4 Writing our first training loop
- create a model
- choose a loss function
- create a dataset (normalize, missing, imbalance)
- define an optimizer
- run a training loop, where for each sample of the dataset, we repeat:
    - calculating loss(forward pass)
    - calculating local gradients
    - updating model parameters

Example: data science salary dataset (regression problem)
- no softmax or sigmoid as last activation function
- last layer is linear layer
- **loss function: MSE loss** --> mean (sum(prediction - target)^2)
    - prediction and targets must be a float tensor
- regression problem does not need one-hot encoding and the final linear layer outputs a single float

In [None]:
# create the dataset and the dataloader
dataset = TensorDataset(torch.tensor(features).float(), torch.tensor(target).float())
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# create the model
model = nn.Sequential(nn.Linear(6, 4),
                      nn.Linear(2,1))

# create the loss and optimizer
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)

In [None]:
# loop through the dataset multiple times - loop through data once is called an epoch
# each iteration of the dataoader provides a batch of samples
for epoch in range(num_epochs):
    for data in dataLoader:
        # set the gradients to zero
        optimizer.zero_grad()
        # get feature and target from the data loader
        features, target = data
        # run a forward pass
        pred = model(features)
        # calculate the loss and gradients
        loss = criterion(pred, target)
        loss.backward()
        # update the weights
        optimizer.step()


#### Practice: using the MSELoss
- Calculate the MSELoss using NumPy.
- Create a MSELoss function using PyTorch.
- Convert y_hat and y to tensors and then float data types, and then use them to calculate MSELoss using PyTorch as mse_pytorch.

In [None]:
y_hat = np.array(10)
y = np.array(1)

# Calculate the MSELoss using NumPy
mse_numpy = np.mean((y_hat - y)**2)

# Create the MSELoss function
criterion = nn.MSELoss()

# Calculate the MSELoss using the created loss function
mse_pytorch = criterion(torch.tensor(y_hat).float(), torch.tensor(y).float())
print(mse_pytorch)

#### Practice: writing a training loop
- Write a for loop that iterates over the dataloader; this should be nested within a for loop that iterates over a range equal to the number of epochs.
- Set the gradients of the optimizer to zero.
- Write the forward pass.
- Compute the MSE loss value using the criterion() function provided.
- Compute the gradients.
- Update the model's parameters.

In [None]:
# Loop over the number of epochs and the dataloader
for i in range(num_epochs):
  for data in dataloader:
    # Set the gradients to zero
    optimizer.zero_grad()
    # Run a forward pass
    feature, target = data
    prediction = model(feature)    
    # Calculate the loss
    loss = criterion(prediction, target)    
    # Compute the gradients
    loss.backward()
    # Update the model's parameters
    optimizer.step()


## Chapter 3: NN architecture and hyperparameters
### Section 3.1 discover activation functions between layers
- sigmoid: output will be bounded between 0 and 1
    - gradient vanishing problem that gradient is close to zero for low or high values
- softmax can only be used in the last layer
- ReLU: rectified linear unit
    - fx = max(x, 0)
    - overcome vanishing gradients problem
- leaky ReLU: for negative inputs, it multiplies the input by a small coefficient (default to 0.01)

In [57]:
relu = nn.ReLU()
leaky_relu = nn.LeakyReLU(negative_slope = 0.05)

Question: In this exercise, you'll begin with a ReLU implementation in PyTorch. Next, you'll calculate the gradients of the function.
- Calculate the gradient of the ReLU function for x using the relu_pytorch() function you defined, then running a backward pass.
- Find the gradient at x.

In [None]:
# Create a ReLU function with PyTorch
relu_pytorch = nn.ReLU()

# Apply your ReLU function on x, and calculate gradients
x = torch.tensor(-1.0, requires_grad=True)
y = relu_pytorch(x)
y.backward()

# Print the gradient of the ReLU function for x
gradient = x.grad
print(gradient)

Question: In this exercise, you will implement the leaky ReLU function in NumPy and PyTorch and practice using it. The numpy as np package, the torch package as well as the torch.nn as nn have already been imported.
- Create a leaky ReLU function in PyTorch with a negative slope of 0.05.
- Call the function on the tensor x, which has already been defined for you.

In [None]:
# Create a leaky relu function in PyTorch
leaky_relu_pytorch = nn.LeakyReLU(negative_slope = 0.05)

x = torch.tensor(-2.0)
# Call the above function on the tensor x
output = leaky_relu_pytorch(x)
print(output)

### Section 3.2 A deeper dive into NN architecture

#### Counting the number of parameters
- Iterate through the model's parameters to update the total variable with the total number of parameters in the model.

In [58]:
model = nn.Sequential(nn.Linear(16, 4),
                      nn.Linear(4, 2),
                      nn.Linear(2, 1))

total = 0

# Calculate the number of parameters in the model
for parameter in model.parameters():
  total += parameter.numel()

print(total)

81


#### Manipulating the capacity of a network
- Create a neural network with exactly three linear layers and less than 120 parameters, which takes n_features as inputs and outputs n_classes.


In [59]:
def calculate_capacity(model):
  total = 0
  for p in model.parameters():
    total += p.numel()
  return total

In [61]:
n_features = 8
n_classes = 2

input_tensor = torch.Tensor([[3, 4, 6, 2, 3, 6, 8, 9]])

# Create a neural network with less than 120 parameters
model = nn.Sequential(nn.Linear(8, 6),
                      nn.Linear(6, 4),
                      nn.Linear(4, 2))

output = model(input_tensor)

print(calculate_capacity(model))

92


### Section 3.3 Learning rate and momentum
- model's architecture impacts the training process and the model's performance
- training a NN is actualy solving an optimization problem:
    - minimize loss function by tweaking model's parameters (SGD)
- study the impact of the learning rate and momentum on the training process
    - right learning rate:
        - find the minimum of the function shown
        - run SGD optimizer for 10 steps and start at X equals -2
        - after 10 steps, the optimizer has almost found the minimum of the function 
        - the step size taken by the optimizer is getting smaller as we are getting closer to X = 0
        - step size = gradient multiplied by the learning rate
        - around zero, this function is not as steep and therefore the gradient is smaller
    - Low: if we use the same algorithm for a learning rate 10 times smaller, we are still far from the minimum of the function after 10 steps. The optimizer will take much longer to find the functions minimum
    - High: if high value for learning rate, the optimizer cannot find the minimum and bounces back and forth on both sides of the function 
- One challenege is that when try to find the minimum of a non-convex function is getting stuck in a local minimum
    - without momentum - can only find local minimum 
    - with momentum - can find global minimum

Experimenting with learning rate
- In this exercise, your goal is to find the optimal learning rate such that the optimizer can find the minimum of the non-convex function 
 in ten steps.
- You will experiment with three different learning rate values. For this problem, try learning rate values between 0.001 to 0.1.
- You are provided with the optimize_and_plot() function that takes the learning rate for the first argument. This function will run 10 steps of the SGD optimizer and display the results.

In [None]:
# smaller value - hard to find global minima    
lr2 = 0.09
optimize_and_plot(lr=lr2)

Experimenting with momentum
- In this exercise, your goal is to find the optimal momentum such that the optimizer can find the minimum of the following non-convex function 
 in 20 steps. You will experiment with two different momentum values. For this problem, the learning rate is fixed at 0.01.
- You are provided with the optimize_and_plot() function that takes the learning rate for the first argument. This function will run 20 steps of the SGD optimizer and display the results.

In [None]:
# lower momentum value - harder to find global minimum
mom1 = 0.95
optimize_and_plot(momentum=mom1)

### Section 3.4 Layer initialization and transfer 
- layer initialization
    - initialize small values
    - sampling from uniform distribution (nn.init.uniform)
- transfer learning: reusing a model trained on a first task for a second similar task, to accelerate the training process
    - fine-tuning: smaller learning rate, not every layer is trained
        - find a model trained on a similar task
        - load pre-trained weights
        - free none or some of the layers in the model
        - train with a smaller learning rate
        - look at the loss values and see if the learning rate needs to be adjusted

#### Freeze layers of a model
- You are about to fine-tune a model on a new task after loading pre-trained weights. The model contains three linear layers. However, because your dataset is small, you only want to train the last linear layer of this model and freeze the first two linear layers.
- The model has already been created and exists under the variable model. You will be using the named_parameters method of the model to list the parameters of the model. Each parameter is described by a name. This name is a string with the following naming convention: x.name where x is the index of the layer.
- Remember that a linear layer has two parameters: the weight and the bias.

In [None]:
for name, param in model.named_parameters():
  
    # Check if the parameters belong to the first layer
    if name == '0.weight' or name == '0.bias':
   
        # Freeze the parameters
        param.requires_grad = False
        
    # Check if the parameters belong to the second layer
    if name == '1.weight' or name == '1.bias':
      
        # Freeze the parameters
        param.requires_grad = False

#### Layer initialization
- The initialization of the weights of a neural network has been the focus of researchers for many years. When training a network, the method used to initialize the weights has a direct impact on the final performance of the network.
- As a machine learning practitioner, you should be able to experiment with different initialization strategies. In this exercise, you are creating a small neural network made of two layers and you are deciding to initialize each layer's weights with the uniform method.

In [None]:
layer0 = nn.Linear(16, 32)
layer1 = nn.Linear(32, 64)

# Use uniform initialization for layer0 and layer1 weights
nn.init.uniform_(layer0.weight)
nn.init.uniform_(layer1.weight)

model = nn.Sequential(layer0, layer1)

## Chapter 4: Evaluating and improving models
### Section 4.1 A deeper dive into loading data

In [65]:
from sklearn.datasets import load_iris
data = load_iris()
X = data.data  # Features
y = data.target  # Class labels
X

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [73]:
# Instantiate the dataset class
dataset = TensorDataset(torch.tensor(X).float(), torch.tensor(y))

# access an individual sample
sample = dataset[0]
input_sample, label_sample = sample
print(input_sample, label_sample)

tensor([5.1000, 3.5000, 1.4000, 0.2000]) tensor(0)


In [74]:
batch_size = 2
shuffle = True

# create a dataloader
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

In [75]:
# itearate through the dataloader
for batch_inputs, batch_labels in dataloader:
    print(batch_inputs, batch_labels)

tensor([[4.8000, 3.0000, 1.4000, 0.1000],
        [6.2000, 3.4000, 5.4000, 2.3000]]) tensor([0, 2])
tensor([[5.2000, 3.4000, 1.4000, 0.2000],
        [6.3000, 3.4000, 5.6000, 2.4000]]) tensor([0, 2])
tensor([[6.3000, 2.7000, 4.9000, 1.8000],
        [4.8000, 3.1000, 1.6000, 0.2000]]) tensor([2, 0])
tensor([[5.6000, 2.9000, 3.6000, 1.3000],
        [6.6000, 2.9000, 4.6000, 1.3000]]) tensor([1, 1])
tensor([[5.7000, 3.8000, 1.7000, 0.3000],
        [7.2000, 3.0000, 5.8000, 1.6000]]) tensor([0, 2])
tensor([[4.9000, 3.1000, 1.5000, 0.2000],
        [6.7000, 3.1000, 4.7000, 1.5000]]) tensor([0, 1])
tensor([[5.0000, 3.2000, 1.2000, 0.2000],
        [5.7000, 2.9000, 4.2000, 1.3000]]) tensor([0, 1])
tensor([[4.6000, 3.4000, 1.4000, 0.3000],
        [5.7000, 2.5000, 5.0000, 2.0000]]) tensor([0, 2])
tensor([[5.7000, 4.4000, 1.5000, 0.4000],
        [4.8000, 3.4000, 1.6000, 0.2000]]) tensor([0, 0])
tensor([[5.1000, 3.8000, 1.6000, 0.2000],
        [7.6000, 3.0000, 6.6000, 2.1000]]) tensor([0, 2])


Using the TensorDataset class
- Convert the NumPy arrays provided to PyTorch tensors.
- Create a TensorDataset using the torch_features and the torch_target tensors provided (in this order).
- Return the last element of the dataset.

In [None]:
import numpy as np
import torch
from torch.utils.data import TensorDataset

np_features = np.array(np.random.rand(12, 8))
np_target = np.array(np.random.rand(12, 1))

# Convert arrays to PyTorch tensors
torch_features = torch.tensor(np_features).float()
torch_target = torch.tensor(np_target)

# Create a TensorDataset from two tensors
dataset = TensorDataset(torch_features,torch_target)

# Return the last element of this dataset
print(dataset[-1])

From data loading to running a forward pass
- Extract the features (ph, Sulfate, Conductivity, Organic_carbon) and target (Potability) values and load them into the appropriate tensors to represent features and targets.
- Use both tensors to create a PyTorch dataset using the dataset class that's quickest to use when tensors don't require any additional preprocessing.
- Create a PyTorch DataLoader from the created TensorDataset; this DataLoader should use a batch_size of two and shuffle the dataset.
- Implement a small, fully connected neural network using exactly two linear layers and the nn.Sequential() API, where the final output size is 1.



In [None]:
# Load the different columns into two PyTorch tensors
features = torch.tensor(dataframe[['ph', 'Sulfate', 'Conductivity', 'Organic_carbon']].to_numpy()).float()
target = torch.tensor(dataframe['Potability'].to_numpy()).float()

# Create a dataset from the two generated tensors
dataset = TensorDataset(features, target)

# Create a dataloader using the above dataset
dataloader = DataLoader(dataset, shuffle=True, batch_size=2)
x, y = next(iter(dataloader))

# Create a model using the nn.Sequential API
model = nn.Sequential(
  nn.Linear(4, 16), 
  nn.Linear(16, 1)
)

output = model(features)
print(output)

### Section 4.2 Evaluating model performance

Writing the evaluation loop
- Set the model to evaluation mode.
- Sum the current batch loss to the validation_loss variable.
- Calculate the mean loss value for the epoch.
- Set the model back to training mode.

In [None]:
# Set the model to evaluation mode
model.eval()
validation_loss = 0.0

with torch.no_grad():
  
  for data in validationloader:
    
      outputs = model(data[0])
      loss = criterion(outputs, data[1])
      
      # Sum the current loss to the validation_loss variable
      validation_loss += loss.item()
      
# Calculate the mean loss value
validation_loss_epoch = validation_loss/len(validationloader)
print(validation_loss_epoch)

# Set the model back to training mode
model.train()

Calculating accuracy using torchmetrics
- Create an accuracy metric for a "multiclass" problem with three classes.
- Calculate the accuracy for each batch of the dataloader.
- Calculate accuracy for the epoch.
- Reset the metric for the next epoch.

In [None]:
# Create accuracy metric using torch metrics
metric = torchmetrics.Accuracy(task="multiclass", num_classes=3)
for data in dataloader:
    features, labels = data
    outputs = model(features)
    
    # Calculate accuracy over the batch
    acc = metric(outputs.softmax(dim=-1), labels.argmax(dim=-1))
    
# Calculate accuracy over the whole epoch
acc = metric.compute()

# Reset the metric for the next epoch 
metric.reset()
plot_errors(model, dataloader)

### Section 4.3 Fighting overfitting
- causes: did not train model correctly, it memorize the training data
- ways: drop out, weight decay, and data augmentation

XYZ
- Create a small neural network with one linear layer, one ReLU function, and one dropout layer, in that order. The model should take input_tensor as input and return an output of size 16.

In [None]:
# Create a small neural network
model = nn.Sequential(nn.Linear(3072, 16),
                      nn.ReLU(),
                      nn.Dropout())
model(input_tensor)

Using the same neural network, set the probability of zeroing out elements in the dropout layer to 0.8.

In [None]:
# Using the same model, set the dropout probability to 0.8
model = nn.Sequential(nn.Linear(3072, 16),
                      nn.ReLU(),
                      nn.Dropout(p=0.8))
model(input_tensor)

### Section 4.4 Improving model performance
- overfit the training set
    - minimize the training loss, create large enough model
- reduce overfitting
    - dropout
    - data augmentation
    - weight decay
    - reduce model capacity
- fine-tune hyperparameters
    - grid search
    - random search

Implementing random search
- Randomly sample a learning rate factor between 2 and 4 so that the learning rate (lr) is bounded between 0.01 and 0.0001.
- Randomly sample a momentum between 0.85 and 0.99.

In [None]:
values = []
for idx in range(10):
    # Randomly sample a learning rate factor 2 and 4
    factor = np.random.uniform(2, 4)
    lr = 10 ** -factor
    
    # Randomly sample a momentum between 0.85 and 0.99
    momentum = np.random.uniform(0.85, 0.99)
    
    values.append((lr, momentum))