In [None]:
# !conda install numpy pandas matplotlib --yes

In [None]:
# !pip install torch torchvision torchaudio

In [1]:
import torch
import numpy as np
import random

**Loss function**. The logistic regression does not use the predicted class labels during the learning (training) phase. Instead, it uses the predicted probabilities to optimize a   
1) surrogate loss - general term that refers to a “proxy” loss that is optimized instead of the target evaluation metric (like classification accuracy). We use surrogate loss if we cannot optimize the target metric directly  
2) negative log-likelihood loss - a different term for Binary cross-entropy.  
3) binary cross-entropy loss - a different term for negative log-likelihood loss.  

The logistic regression loss function increases exponentially the farther the predicted probability is from the true target label. There is a steep loss increase for wrong predictions; the loss approaches infinity for wrong predictions.

**Stochastic gradient descent** is based on calculus: we compute the loss function's derivatives (or gradients) with respect to the model weights. Why? The loss measures “how wrong” the predictions are. And the gradient tells us how we have to change the weights to minimize (improve) the loss.  

The loss is correlated to the accuracy, but sadly, we cannot optimize the accuracy directly using stochastic gradient descent. That's because accuracy is not a smooth function.

Computing the loss gradients is based on the chain rule from calculus. Introduce PyTorch functions that can handle the differentiation (that is, the calculation of the gradients) automatically for us. This is known as automatic differentiation or autograd.

**Derivatives vs Gradients**

Partial derivative - function(multiple variables) and we need to compute individual partial derivatives with respect to one variable.

Gradient of the function - the way of writing down partial derivatives in a vector form. 

f(x,y) = x^2 + y  
Gradient - slope in the 2 dimensions  


We can think of a “gradient” as a fancy term to describe the concept of a derivative in multiple dimensions. If we have a function with multiple inputs, we can compute a gradient to capture the slope in multiple dimensions. E.g., if the function takes 2 inputs, we have a 2D slope.

Back propagation - learning algorithm for Deep Neural Networks.  

Gradient descent - 1 model update per training epoch  
Stochastic gradient descent - N model updates for each training epoch, n - number samples    
Minibatch gradient descent - form small groupd of training examples, make 1 update after each batch.  


**Stochastic gradient descent** is a flavor of gradient descent that introduces a certain level of randomness into the training process. For each weight update, we compute the loss based on a single training example or a minibatch, which introduces a certain level of noise (or randomness) compared to regular gradient descent, which computes the weight update based on the whole training set. In this sense, the gradient for the weight update in stochastic gradient descent is an approximation of the full-gradient from regular gradient descent.

## Autograd

In [2]:
# Model params
w_1 = torch.tensor([0.23], requires_grad=True)
b = torch.tensor([0.1], requires_grad=True)

# Inputs and targets
x_1 = torch.tensor([1.23])
y = torch.tensor([1.])

In [3]:
u = x_1.dot(w_1)
z = u + b
print(z)

tensor([0.3829], grad_fn=<AddBackward0>)


In [4]:
a = torch.sigmoid(z)
print(a)

tensor([0.5946], grad_fn=<SigmoidBackward0>)


In [5]:
import torch.nn.functional as F

l = F.binary_cross_entropy(a, y)
print(l)

tensor(0.5199, grad_fn=<BinaryCrossEntropyBackward0>)


In [7]:
# Best practice - use logits - for computation efficiency and numerical stability
l = F.binary_cross_entropy_with_logits(z, y)
print(l)

tensor(0.5199, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


In [8]:
# Compute gradients
from torch.autograd import grad

grad_L_w1 = grad(l, w_1, retain_graph=True) # retain_graph - keep the computation graph in memory
print(grad_L_w1)

(tensor([-0.4987]),)


In [9]:
grad_L_b = grad(l, b, retain_graph=True)
print(grad_L_b)

(tensor([-0.4054]),)


In [10]:
# Compute partial derivatives (gradients) of the loss automatically
l.backward()

In [11]:
w_1.grad

tensor([-0.4987])

In [12]:
b.grad

tensor([-0.4054])

In [13]:
x = torch.tensor(3.)
y = torch.tensor(2., requires_grad=True)
z = torch.tensor(5.)
f = 4 * x**3 + 3 * y**2 + 2*z
print(grad(f, y))

(tensor(12.),)




---



---



## Model training

In the PyTorch Module context, it's a unique method of the Module API that will implement a backward method automatically for us (we don't see it because it happens behind the scenes.)

Why is this useful? Using the Module class comes with certain benefits. If we use it, we can use the `loss.backward()` call in our training loop together with `optimizer.step()`. The `.backward()` method computes all the gradients for us. Then, using the `.step()` method will use the loss gradients to update the model weights automatically for us.  



Python classes typically require an `__init__` if we want to define class attributes upon creating new objects from that class. In this context, we define the model parameters here.


In the `forward` method we define how the model computes the outputs. In the case of logistic regression, this could be the computation of the class-membership probabilities.

In [None]:
""" Define the model """

class MyClassifier(torch.nn.Module):
    def __init__(self, num_features):
        # Define model parameters
        pass

    def forward(self, x):
        # Define how the model produces outputs
        return outputs

In [None]:
'''
model = MyClassifier()
optimizer = torch.optim.SGD()   # init optimizer

for epoch in range(num_epochs):
    for x, y in train_dataloader: # mini-batch

        # forward pass
        outputs = model(x)
        loss = loss_fn(outputs, y)

        # backward pass
        optimizer.zero_grad() # reset the gradients from previous iteration (do not accumulate)
        loss.backward()       # compute gradients

        # update model params
        optimizer.step()   
'''

In [14]:
torch.manual_seed(123)

linear = torch.nn.Linear(in_features=2, out_features=1)

In [15]:
print(linear.weight)

Parameter containing:
tensor([[-0.2883,  0.0234]], requires_grad=True)


In [16]:
print(linear.bias)

Parameter containing:
tensor([-0.3512], requires_grad=True)


In [17]:
x = torch.tensor([[1.2, 0.5]])
x

tensor([[1.2000, 0.5000]])

In [18]:
w = linear.weight.detach()   # detach from the computation graph
b = linear.bias.detach()
z = x.matmul(w.T) + b

print(z)

tensor([[-0.6855]])


In [19]:
z = linear(x)
print(z)

tensor([[-0.6855]], grad_fn=<AddmmBackward0>)


In [20]:
# How many trainable parameters does this layer have?
list(torch.nn.Linear(in_features=5, out_features=1).parameters())

[Parameter containing:
 tensor([[ 0.1687, -0.3811,  0.3278, -0.3251, -0.3556]], requires_grad=True),
 Parameter containing:
 tensor([-0.2826], requires_grad=True)]

## Logistic Regression

In [21]:
class LogisticRegression(torch.nn.Module):
    
    def __init__(self, num_features):
        super().__init__()     # call the __init__ of the (torch.nn.Module)
        self.linear = torch.nn.Linear(num_features, 1)
    
    def forward(self, x):
        logits = self.linear(x)
        probas = torch.sigmoid(logits)
        return probas

In [22]:
torch.manual_seed(1)

model = LogisticRegression(num_features=2)

x = torch.tensor([1.1, 2.1])

# Best practice to run faster
with torch.no_grad():    # within context - disable the construction of computation graph in the background
    proba = model(x)
    
print(proba)

tensor([0.4033])


## Define the Dataset and Dataloaders

In [23]:
from torch.utils.data import Dataset, DataLoader

In [24]:
class MyDataset(Dataset):
    def __init__(self, X, y):
        self.features = torch.tensor(X, dtype=torch.float32)
        self.labels = torch.tensor(y, dtype=torch.float32)

    def __getitem__(self, index):  # Fetch individual training record
        x = self.features[index]
        y = self.labels[index]        
        return x, y

    def __len__(self):
        return self.labels.shape[0]

In [None]:
# Init Dataset
train_ds = MyDataset(X_train, y_train)

# Wrap dataset into DataLoader
train_loader = DataLoader(
    dataset=train_ds,
    batch_size=10,
    shuffle=True,
)

## Train Logistic Regression

In [None]:
# Training Model

torch.manual_seed(1)

model = LogisticRegression(num_features=2)
optimizer = torch.optim.SGD(model.parameters(),    # return all the relevant model parameters to update 
                            lr=0.05
                            )

num_epochs = 20

for epoch in range(num_epochs):
    
    model = model.train()    # set the model into the training mode
    
    for batch_idx, (features, class_labels) in enumerate(train_loader):

        probas = model(features)
        
        loss = F.binary_cross_entropy(probas, class_labels.view(probas.shape))
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        ### LOGGING
        print(f'Epoch: {epoch+1:03d}/{num_epochs:03d}'
               f' | Batch {batch_idx:03d}/{len(train_loader):03d}'
               f' | Loss: {loss:.2f}')

When we work with real-world datasets, features often come in different scales. For example, think of the alcohol content of wine (when measured in percent, it's typically a value between 10 and 15). On the other hand, the proline content of wine is measured in mg/L (milligram per Liter) and can be 100 times larger, typically ranging between 300 and 1300.   

Working with features that have vastly different numeric scales can often result in suboptimal training. To make it easier to find a good learning rate and get good convergence (that means, successfully minimizing the loss), **feature normalization** can help.  

This lecture covered two of the most widely used feature normalization methods: min-max scaling and standardization (also known as z-score standardization). In certain scenarios, one normalization scheme might work slightly better than another. Still, the most important lesson is that we use a normalization scheme to ensure that the features are all roughly on the same scale.   


----
If features on different scales --> partial derivateives on different scales --> hard to find a good learning rate working well for different weights.

**Advantages**: 
- easier to find a good learning rate for all gradients
- numerically stable gradients
- faster convergence (fewer epochs)

# Implementing a Logistic Regression Classifies

we are applying logistic regression to a banknote authentication dataset to distinguish between genuine and forged bank notes.

In [27]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

## Loading the Dataset

The dataset consists of 1372 examples and 4 features for binary classification. The features are:

- variance of a wavelet-transformed image (continuous)
- skewness of a wavelet-transformed image (continuous)
- kurtosis of a wavelet-transformed image (continuous)
- entropy of the image (continuous)

In [32]:
df = pd.read_csv("/content/drive/MyDrive/ds-ml/data_banknote_authentication.txt", header=None)
df.head()

Unnamed: 0,0,1,2,3,4
0,3.6216,8.6661,-2.8073,-0.44699,0
1,4.5459,8.1674,-2.4586,-1.4621,0
2,3.866,-2.6383,1.9242,0.10645,0
3,3.4566,9.5228,-4.0112,-3.5944,0
4,0.32924,-4.4552,4.5718,-0.9888,0


In [33]:
X_features = df[[0, 1, 2, 3]].values
y_labels = df[4].values

In [34]:
X_features.shape

(1372, 4)

In [35]:
y_labels.shape

(1372,)

In [37]:
# look at the label distribution
np.bincount(y_labels)

array([762, 610])

In [None]:
# X_train = (X_train - X_train.mean(axis=0)) / X_train.std(axis=0)

In [None]:
# standardization

In [41]:
X_features

array([[  3.6216 ,   8.6661 ,  -2.8073 ,  -0.44699],
       [  4.5459 ,   8.1674 ,  -2.4586 ,  -1.4621 ],
       [  3.866  ,  -2.6383 ,   1.9242 ,   0.10645],
       ...,
       [ -3.7503 , -13.4586 ,  17.5932 ,  -2.7771 ],
       [ -3.5637 ,  -8.3827 ,  12.393  ,  -1.2823 ],
       [ -2.5419 ,  -0.65804,   2.6842 ,   1.1952 ]])

In [42]:
X_features.shape

(1372, 4)

In [43]:
def standardize(X_data, X_mean, X_std):
    return (X_data - X_mean) / X_std

In [44]:
X_mean, X_str = np.mean(X_features, axis=0), np.std(X_features, axis=0)

# mean, std per feature column will be used to standardize any test/live dataset

In [45]:
X_features_std = standardize(X_features, X_mean, X_str)

In [46]:
X_features_std

array([[ 1.12180565,  1.14945512, -0.97597007,  0.35456135],
       [ 1.44706568,  1.06445293, -0.89503626, -0.12876744],
       [ 1.20780971, -0.77735215,  0.12221838,  0.61807317],
       ...,
       [-1.47235682, -2.62164576,  3.75901744, -0.75488418],
       [-1.40669251, -1.75647104,  2.552043  , -0.04315848],
       [-1.04712236, -0.43982168,  0.29861555,  1.1364645 ]])

In [47]:
train_size = int(X_features.shape[0]*0.80)
train_size

1097

In [48]:
val_size = X_features.shape[0] - train_size
val_size

275

## Define Dataloaders

In [50]:
class BanknoteAuthenDataset(Dataset):
    def __init__(self, X, y):
        self.features = torch.tensor(X, dtype=torch.float32)
        self.labels = torch.tensor(y, dtype=torch.float32)

    def __getitem__(self, index):
        x = self.features[index]
        y = self.labels[index]        
        return x, y

    def __len__(self):
        return self.labels.shape[0]

In [51]:
# Using torch.utils.data.random_split, we generate the training and validation sets along with the respective data loaders

full_dataset = BanknoteAuthenDataset(X_features_std, y_labels)

train_set, val_set = torch.utils.data.random_split(full_dataset, [train_size, val_size])

In [52]:
train_loader = DataLoader(
    dataset=train_set,
    batch_size=10,
    shuffle=True,
)

val_loader = DataLoader(
    dataset=val_set,
    batch_size=10,
    shuffle=False,
)

## Implementing the model

In [53]:
class LogisticRegression(torch.nn.Module):    
    def __init__(self, num_features):
        super().__init__()
        self.linear = torch.nn.Linear(num_features, 1)
    
    def forward(self, x):
        logits = self.linear(x)
        probas = torch.sigmoid(logits)
        return probas

In [54]:
model = LogisticRegression(num_features=4)

In [55]:
x = torch.tensor([1.1, 2.1, 1.4, 2.3])

with torch.no_grad():
    proba = model(x)
    
print(proba)

tensor([0.9005])


## The training loop

In [56]:
optimizer = torch.optim.SGD(model.parameters(), 
                            lr=0.1)

In [58]:
num_epochs = 25

In [59]:
for epoch in range(num_epochs):
    
    model = model.train()
    for batch_idx, (features, class_labels) in enumerate(train_loader):

        probas = model(features)
        
        loss = F.binary_cross_entropy(probas, class_labels.view(probas.shape))
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        ### LOGGING
        print(f'Epoch: {epoch+1:03d}/{num_epochs:03d}'
               f' | Batch {batch_idx:03d}/{len(train_loader):03d}'
               f' | Loss: {loss:.2f}')

Epoch: 001/025 | Batch 000/110 | Loss: 0.95
Epoch: 001/025 | Batch 001/110 | Loss: 1.01
Epoch: 001/025 | Batch 002/110 | Loss: 0.96
Epoch: 001/025 | Batch 003/110 | Loss: 0.88
Epoch: 001/025 | Batch 004/110 | Loss: 0.85
Epoch: 001/025 | Batch 005/110 | Loss: 0.89
Epoch: 001/025 | Batch 006/110 | Loss: 0.76
Epoch: 001/025 | Batch 007/110 | Loss: 0.84
Epoch: 001/025 | Batch 008/110 | Loss: 0.77
Epoch: 001/025 | Batch 009/110 | Loss: 0.79
Epoch: 001/025 | Batch 010/110 | Loss: 0.74
Epoch: 001/025 | Batch 011/110 | Loss: 0.73
Epoch: 001/025 | Batch 012/110 | Loss: 0.68
Epoch: 001/025 | Batch 013/110 | Loss: 0.72
Epoch: 001/025 | Batch 014/110 | Loss: 0.75
Epoch: 001/025 | Batch 015/110 | Loss: 0.65
Epoch: 001/025 | Batch 016/110 | Loss: 0.70
Epoch: 001/025 | Batch 017/110 | Loss: 0.65
Epoch: 001/025 | Batch 018/110 | Loss: 0.76
Epoch: 001/025 | Batch 019/110 | Loss: 0.65
Epoch: 001/025 | Batch 020/110 | Loss: 0.62
Epoch: 001/025 | Batch 021/110 | Loss: 0.71
Epoch: 001/025 | Batch 022/110 |

## Evaluate the results

In [60]:
probas

tensor([[2.8237e-01],
        [1.5443e-04],
        [9.6026e-01],
        [2.8236e-01],
        [4.1159e-03],
        [8.8513e-01],
        [1.2630e-03]], grad_fn=<SigmoidBackward0>)

In [61]:
pred = torch.where(probas > 0.5, 1, 0)
pred

tensor([[0],
        [0],
        [1],
        [0],
        [0],
        [1],
        [0]])

In [62]:
class_labels.view(pred.shape).to(pred.dtype)

tensor([[0],
        [0],
        [1],
        [0],
        [0],
        [1],
        [0]])

In [63]:
def compute_accuracy(model, dataloader):

    model = model.eval()
    
    correct = 0.0
    total_examples = 0
    
    for idx, (features, class_labels) in enumerate(dataloader):
        
        with torch.no_grad():
            probas = model(features)
        
        pred = torch.where(probas > 0.5, 1, 0)
        lab = class_labels.view(pred.shape).to(pred.dtype)

        compare = lab == pred
        correct += torch.sum(compare)
        total_examples += len(compare)

    return correct / total_examples

In [64]:
train_acc = compute_accuracy(model, train_loader)

In [65]:
print(f"Accuracy: {train_acc*100}%")

Accuracy: 98.085693359375%


In [66]:
val_acc = compute_accuracy(model, val_loader)
print(f"Accuracy: {val_acc*100:.2f}%")

Accuracy: 97.82%
