In [6]:
import torch 
import numpy as np

In [7]:
#!pip install ipywidgets

In [8]:
# Check whether PyTorch can see a CUDA-capable GPU.
torch.cuda.is_available()

True

In [9]:
# Use the first GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print("Device: ", device)

Device:  cuda:0


Plan for today:

1. ETL 
   1. Specifying some random input
   2. PyTorch Dataset and DataLoader
2. EDA - we gonna just skip because we are lazy...
3. Feature Engineering / Cleaning - which we don't need to....
4. Modeling 
   1. `nn.Linear` (luckily, you already understand this!  Yay!)
   2. Define loss function (MSE for regression, cross entropy for classification)
   3. Define the optimizer function (gradient descent ; adam)
   4. Train the model
5. Inference / Testing

Consider this data:

<img src = "figures/japan.png" width="400">

In a linear regression model, each target variable is estimated to be a weighted sum of the input variables, offset by some constant, known as a bias :

$$\text{yield}_\text{apple}  = w_{11} * \text{temp} + w_{12} * \text{rainfall} + w_{13} * \text{humidity} + b_{1}$$

$$\text{yield}_\text{orange} = w_{21} * \text{temp} + w_{22} * \text{rainfall} + w_{23} * \text{humidity} + b_{2}$$

Visually, it means that the yield of apples is a linear or planar function of temperature, rainfall and humidity:

<img src = "figures/japan2.png" width="400">

The learning part of linear regression is to figure out a set of weights and biases <code>w11, w12, ..., w23, b1, b2</code> using gradient descent.


In [10]:
# Inputs: one row per observation, columns = (temp, rainfall, humidity).
# The same five observations are repeated three times, giving 15 training rows.
X_train = np.array(
    [
        [73, 67, 43],
        [91, 88, 64],
        [87, 134, 58],
        [102, 43, 37],
        [69, 96, 70],
    ] * 3,
    dtype='float32',
)

# Targets: columns = (apples, oranges), in the same row order as X_train.
Y_train = np.array(
    [
        [56, 70],
        [81, 101],
        [119, 133],
        [22, 37],
        [103, 119],
    ] * 3,
    dtype='float32',
)

In [11]:
# Turn the NumPy arrays into tensors (torch.tensor copies the data).
inputs, targets = torch.tensor(X_train), torch.tensor(Y_train)

# Show both shapes, one per line.
print(inputs.shape, targets.shape, sep="\n")

torch.Size([15, 3])
torch.Size([15, 2])


### 1.2 Dataset

- We are going to create a TensorDataset on top of these tensors, so we can access each row of the inputs and targets as a tuple.

- Note - This is needed if we want to use the DataLoader

In [12]:
from torch.utils.data import TensorDataset

In [13]:
# inputs has shape (m, n) and targets has shape (m, k): the same number of rows m,
# so the dataset can pair them up row by row.
ds = TensorDataset(inputs, targets)

In [14]:
# Indexing the dataset returns one sample in the format PyTorch wants:
# a tuple of two tensors, the x row and its corresponding y row.
ds[1]

(tensor([91., 88., 64.]), tensor([ 81., 101.]))

In [15]:
# First element of the sample tuple: the input row.
ds[1][0]

tensor([91., 88., 64.])

### 1.3 DataLoader

- By default, PyTorch works in batches.

- In simple words, it will ALWAYS take a mini-batch and perform gradient descent on it.

- Why mini-batches? Because you may not be able to fit, say, 1M samples into GPU RAM at once.

In [16]:
# A DataLoader is an iterable over mini-batches, so a plain `for` loop walks through
# the dataset one batch at a time. Without it we would have to select (and shuffle)
# the mini-batches by hand.
from torch.utils.data import DataLoader

# Trade-off: too small a batch is slow, too large a batch can run out of memory.
batch_size = 3  # can be any number
dl = DataLoader(ds, batch_size=batch_size, shuffle=True)


In [17]:
# dl is an iterable, so we can loop over it; `break` stops after the first batch.
for something in dl:
    print(something)   # one [x_batch, y_batch] pair, each tensor holding batch_size = 3 rows
    break

[tensor([[ 69.,  96.,  70.],
        [102.,  43.,  37.],
        [ 73.,  67.,  43.]]), tensor([[103., 119.],
        [ 22.,  37.],
        [ 56.,  70.]])]


In [18]:
# Unpack the batch directly into its inputs (x) and targets (y).
for x, y in dl:
    print(f"X : {x}")
    print(f"Y : {y}")
    break

# Every `for` loop over dl starts a fresh pass over the dataset (one pass = one "epoch").
# Because dl was built with shuffle=True, the order is redrawn on each pass,
# which is why this first batch differs from the one printed in the previous cell.

X : tensor([[102.,  43.,  37.],
        [ 91.,  88.,  64.],
        [ 87., 134.,  58.]])
Y : tensor([[ 22.,  37.],
        [ 81., 101.],
        [119., 133.]])


## 2. EDA

## 3. Modeling

### 3.1 Define our neural network

- How many layers we want

In [19]:
import torch.nn as nn
# Define the network: a single linear layer for now (a second layer may be added later).
# Signature: nn.Linear(in_features, out_features)
#   in_features  = 3 -> temp, rainfall, humidity
#   out_features = 2 -> apples, oranges
model = nn.Linear(in_features=3, out_features=2)  # the only layer, so it is also the output layer

# A linear layer is a matrix multiplication plus a bias.
# The same layer goes by other names: Keras calls it Dense, and it is also known as a
# "fully connected" layer.
#
# Lecture notes on frameworks (the instructor's opinion, not a benchmark):
#   - Keras is very high level, which makes it less suited to research and development.
#   - TensorFlow is developed by Google and is quite good; it is lower level than PyTorch
#     and well optimised for very large, complex, high-performance models.
#   - For almost any general model, including research work, PyTorch is the better fit,
#     thanks to its computational graph.
#   - TensorFlow Lite is the way to go when targeting mobile phones.

In [None]:
# I wonder whether having one extra layer will reduce the loss.

# model = nn.Sequential(
#     nn.Linear()
# )

In [76]:
# Subclassing nn.Module is the best practice for defining a neural network.
#
# Template:
#
#   class AnyNameCapitalized(nn.Module):   # inherit from nn.Module
#       def __init__(self, ...):
#           super().__init__()             # run nn.Module's own initialiser first
#           # define the layers here
#
#       def forward(self, x):
#           # describe how x flows through the layers
#           return ...

class NeuralNetwork(nn.Module):
    """Two stacked linear layers: input -> hidden -> output.

    Args:
        input_size: number of input features.
        hidden_size: number of units in the hidden layer.
        output_size: number of outputs.
    """

    def __init__(self, input_size, hidden_size, output_size):
        # Note the parentheses: super() must be *called* to get the parent proxy.
        # nn.Module's initialiser has to run before any layer is assigned to self.
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Pass x through fc1 and then fc2 and return the result."""
        out = self.fc1(x)
        out = self.fc2(out)
        return out




In [1]:
# model = NeuralNetwork (3, 99, 2)


In [20]:
model.weight # Default init: drawn uniformly from (-1/sqrt(in_features), 1/sqrt(in_features)), so small values around 0
# For the two-layer NeuralNetwork class variant, inspect each layer instead:
# model.fc1.weight
# model.fc2.weight

Parameter containing:
tensor([[-0.5724,  0.1971, -0.0043],
        [-0.3552,  0.4748, -0.0542]], requires_grad=True)

In [21]:
model.weight.shape # The weight is stored as (out_features, in_features)

# The layer computes X @ W^T + b.
# Transposing W gives (in_features, out_features), so (m, 3) @ (3, 2) -> (m, 2).

torch.Size([2, 3])

In [22]:
model.bias

# Why two biases? One per output: a bias for y1 and a bias for y2

Parameter containing:
tensor([0.5489, 0.0992], requires_grad=True)

In [23]:
# All parameters of the model: the weight matrix followed by the bias vector.
list(model.parameters())

[Parameter containing:
 tensor([[-0.5724,  0.1971, -0.0043],
         [-0.3552,  0.4748, -0.0542]], requires_grad=True),
 Parameter containing:
 tensor([0.5489, 0.0992], requires_grad=True)]

In [24]:
# p.numel() is the number of elements in a parameter tensor, whatever its shape
sum(p.numel() for p in model.parameters() if p.requires_grad)

# Why 8? 6 weights (2 x 3) + 2 biases

8

In [25]:
# Run the (still untrained) model on every training row at once.
print(inputs.shape)

output = model(inputs) # X @ W^T + b: (15, 3) @ (3, 2) + (2,) = (15, 2)

print(output)
print(output.shape)

torch.Size([15, 3])
tensor([[-28.2163,   3.6504],
        [-34.4712,   6.0897],
        [-23.0889,  29.6762],
        [-49.5201, -17.7209],
        [-20.3278,  17.3778],
        [-28.2163,   3.6504],
        [-34.4712,   6.0897],
        [-23.0889,  29.6762],
        [-49.5201, -17.7209],
        [-20.3278,  17.3778],
        [-28.2163,   3.6504],
        [-34.4712,   6.0897],
        [-23.0889,  29.6762],
        [-49.5201, -17.7209],
        [-20.3278,  17.3778]], grad_fn=<AddmmBackward0>)
torch.Size([15, 2])


### 3.2 Define the loss fn.

- Should be MSE (regression) or cross entropy (classification)

In [26]:
# The nn module ships many loss functions; this is a regression problem, so use
# mean squared error. reduction="mean" is the default, spelled out for clarity.
J_fn = nn.MSELoss(reduction="mean")

# It is called as J_fn(predictions, targets) inside the training loop.

### 3.3 Define the optimizer
1. Predict
2. Loss


In [27]:
# In scikit-learn, fit() performs gradient descent for us. When we write the loop
# ourselves we must say how the parameters are updated; that is the optimizer's job.
#   Gradient descent:            w = w - alpha * gradient
#   Stochastic gradient descent: the same update, computed on a mini-batch
#                                rather than on a single sample

optim = torch.optim.SGD(model.parameters(), lr=1e-4)


### 3.4 Actually train the model

In [None]:
num_epochs = 5 # It depends ... trial and error...
def fit():
    """Outline of the training loop (pseudo-code only, nothing is executed).

    For each of the num_epochs epochs:
        for each mini-batch from the DataLoader:
            1. predict (forward pass)
            2. calculate the loss
            3. calculate the gradients
            4. update the parameters using the optimizer
    """
    # The docstring is the function body: a `def` followed only by comments is a
    # syntax error, which made this cell fail to run.

In [28]:
num_epochs = 10 # It depends ... trial and error...

# The batches have to be on the same device as the model's parameters, so look that
# device up once. (To train on the GPU, call model.to(device) before creating the optimizer.)
model_device = next(model.parameters()).device

for epoch in range(num_epochs):
    # One full pass over the DataLoader = one epoch.
    for x, y in dl:
        # x and y are one mini-batch of X_train and Y_train (batch size is 3):
        #   x: (batch, feature) = (3, 3)
        #   y: (batch, target)  = (3, 2)
        # Tensor.to() is NOT in-place: it returns the moved tensor, so re-assign the result.
        x = x.to(model_device)
        y = y.to(model_device)

        #1. Predict (forward pass)
        yhat = model(x)

        #2. Calculate loss
        # format: J_fn(predictions, targets)
        loss = J_fn(yhat, y)

        #3. Calculate gradients
        #3.1 Clear out the gradients left over from the previous step
        optim.zero_grad()
        #3.2 Call backward() on the loss to compute all the gradients (backpropagation)
        loss.backward()
        # backward() does NOT adjust the weights yet; it only computes the gradient of the
        # loss with respect to every parameter (8 in total: 6 weights and 2 biases):
        # dJ/dw11, dJ/dw12, dJ/dw13, ..., dJ/db1, dJ/db2

        #4. Update the parameters using the optimizer
        # W = W - alpha * gradient -- no need to write this by hand
        optim.step() # the optimizer already knows the learning rate and holds the parameters

        print(f"Epoch: {epoch} - Loss: {loss}")


Epoch: 0 - Loss: 9863.8564453125
Epoch: 0 - Loss: 5102.54052734375
Epoch: 0 - Loss: 4382.27587890625
Epoch: 0 - Loss: 3251.746337890625
Epoch: 0 - Loss: 860.6041870117188
Epoch: 1 - Loss: 331.7251281738281
Epoch: 1 - Loss: 78.15120697021484
Epoch: 1 - Loss: 115.98177337646484
Epoch: 1 - Loss: 58.624481201171875
Epoch: 1 - Loss: 122.74605560302734
Epoch: 2 - Loss: 45.06119918823242
Epoch: 2 - Loss: 58.6786003112793
Epoch: 2 - Loss: 35.11232376098633
Epoch: 2 - Loss: 56.438167572021484
Epoch: 2 - Loss: 26.329345703125
Epoch: 3 - Loss: 68.3761978149414
Epoch: 3 - Loss: 69.85672760009766
Epoch: 3 - Loss: 60.91935348510742
Epoch: 3 - Loss: 68.0940933227539
Epoch: 3 - Loss: 43.65385055541992
Epoch: 4 - Loss: 91.66065216064453
Epoch: 4 - Loss: 114.28926849365234
Epoch: 4 - Loss: 56.47148513793945
Epoch: 4 - Loss: 72.1084213256836
Epoch: 4 - Loss: 117.1617660522461
Epoch: 5 - Loss: 141.39515686035156
Epoch: 5 - Loss: 22.76177978515625
Epoch: 5 - Loss: 34.39036178588867
Epoch: 5 - Loss: 37.7647

## 4. Inference / Testing

Test some data

In [29]:
# First training sample as an (inputs, targets) tuple, for reference.
ds[0]

(tensor([73., 67., 43.]), tensor([56., 70.]))

In [31]:
# Two test rows, each as (temp, rainfall, humidity).
X_test_np = np.array(
    [
        [74, 68, 42],
        [92, 88, 65],
    ],
    dtype=np.float32,
)

# Convert the array to a tensor ...
X_test = torch.tensor(X_test_np)

# ... and use the model to predict the number of apples and oranges.
yhat = model(X_test)

In [32]:
# Predictions for the two test rows, one row of outputs per input row.
yhat

tensor([[ 59.8395,  75.8474],
        [ 84.1226, 104.9896]], grad_fn=<AddmmBackward0>)

In [33]:
# Reference targets for the loss: the test rows are close to the inputs of ds[0] and ds[1],
# so their targets are used as the expected values.
# Slicing the dataset returns an (inputs, targets) tuple; [1] picks the targets.
ytest = ds[0:2][1]
ytest


tensor([[ 56.,  70.],
        [ 81., 101.]])

In [34]:
# Loss (J_fn) between the predictions and the reference targets.
loss = J_fn(yhat, ytest)
print(loss)

tensor(18.6502, grad_fn=<MseLossBackward0>)
