In [2]:
import torch 
import numpy as np

In [3]:
torch.cuda.is_available()

True

In [4]:
device = torch.device("cuda:0" if (torch.cuda.is_available()) else "cpu")
print("Device: ", device)

Device:  cuda:0


1. ETL
    1. Specifying some random input
    2. Pytorch dataset and dataloader
2. EDA ... will not be done today
3. Feature Engineering/ cleaning ... no need to do
4. Modeling
    3. nn.linear
    4. Define loss fn.
    5. Define the optimizer fn.
    6. Train the model
5. Inference/ Testing

Consider this data:

<img src = "figures/japan.png" width="400">

In a linear regression model, each target variable is estimated to be a weighted sum of the input variables, offset by some constant, known as a bias :

$$\text{yield}_\text{apple}  = w_{11} * \text{temp} + w_{12} * \text{rainfall} + w_{13} * \text{humidity} + b_{1}$$

$$\text{yield}_\text{orange} = w_{21} * \text{temp} + w_{22} * \text{rainfall} + w_{23} * \text{humidity} + b_{2}$$

Visually, it means that the yield of apples is a linear or planar function of temperature, rainfall and humidity:

<img src = "figures/japan2.png" width="400">

The learning part of linear regression is to figure out a set of weights <code>w11, w12,... w23, b1 \& b2</code> using gradient descent


In [29]:
#X(temp, rainfall, hum)

X_train = np.array([[73, 67, 43], [91, 88, 64], [87, 134, 58], 
                   [102, 43, 37], [69, 96, 70], [73, 67, 43], 
                   [91, 88, 64], [87, 134, 58], [102, 43, 37], 
                   [69, 96, 70], [73, 67, 43], [91, 88, 64], 
                   [87, 134, 58], [102, 43, 37], [69, 96, 70]], 
                  dtype='float32')

# Targets (apples, oranges)
Y_train = np.array([[56, 70], [81, 101], [119, 133], 
                    [22, 37], [103, 119], [56, 70], 
                    [81, 101], [119, 133], [22, 37], 
                    [103, 119], [56, 70], [81, 101], 
                    [119, 133], [22, 37], [103, 119]], 
                   dtype='float32')

In [30]:
#Create tensors from the np. arr.

inputs = torch.tensor(X_train)
targets = torch.tensor(Y_train)

#Print the shape
print(inputs.shape)
print(targets.shape)

torch.Size([15, 3])
torch.Size([15, 2])


### 1.2 Dataset

- We are going to crate a TensorDataset on top of these tensors, so we can access each row from the input and target tuples.

- Note - This is neded if we want to use the DataLoader

In [31]:
from torch.utils.data import TensorDataset

In [32]:
# X.shape (m,n) Y.sahpe (m,k)
ds = TensorDataset(inputs, targets)

In [33]:
# This is the format PyTorch wants
# A tuple of two tensors, the x and coresponding  y
ds[1]

(tensor([91., 88., 64.]), tensor([ 81., 101.]))

In [34]:
ds[1][0]

tensor([91., 88., 64.])

### 1.3 DataLoader

- By default PyTorch works in batch.

- In simple words it will ALWAYS take some mini-batch and performs gradient descent

- Mini-batch - beacuse assume you won't be able be able to fit in 1 M samples into GPU RAM

In [35]:
# This will automatically createsa an enumarator , look at each batch
# Means you can simply perform a for loop onto DataLoader
# If DataLoader is not used we have to manually selectthe mini-batch
# Randomized
from torch.utils.data import DataLoader

batch_size = 3 # Can be any number
# Too small - slow
# Too large - run out of memory
dl = DataLoader(ds, batch_size, shuffle=True)


In [22]:
# Now this dl is basically an enumerator, in which we can loop on....
for something in dl:
    print(something)   # Three X+Y tuples
    break

[tensor([[69., 96., 70.],
        [73., 67., 43.],
        [73., 67., 43.]]), tensor([[103., 119.],
        [ 56.,  70.],
        [ 56.,  70.]])]


In [24]:
for x, y in dl:
    print(f"X : {x}")
    print(f"Y : {y}")
    break

# The dl keeps an internal counter
# This dl is keep on running : which is intentional, we have the concept of "epochs"

X : tensor([[ 73.,  67.,  43.],
        [ 87., 134.,  58.],
        [ 73.,  67.,  43.]])
Y : tensor([[ 56.,  70.],
        [119., 133.],
        [ 56.,  70.]])


## 2. EDA

## 3. Modeling

### 3.1 Define our neural network

- How many layers we want

In [37]:
import torch.nn as nn
#Define our NN here
#Just one layer at the moment
# Later add one more layer
# Format nn.Linear(in_features, out_features)
# Format nn.Linear(temp;rainfalll;hum, oranges;apples)
model = nn.Linear(3,2) # <-- Hidden later
 
# Linerar layers are simple matrix multiplication...
# Many other names, In Keras we called Dense, In TensorFlow we called FullyConnected
# Keras very high level, not good for research, devlopment 
# TensorFlow is developed by Google, it is quite good, 

# Fore very huge complex high performance models TensorFlow is much better, optimized
# Very low-level than Pytorch
#For very general almost any model that we use in even un reserach - PyTorch is much beteer
# Due to its computational graph

# TensorFlow something called TensorFlowLite which is the way.
# You want to use for mobile phones

In [None]:
# I wonder whether having one extra layer will reduce the loss.

# model = nn.Sequential(
#     nn.Linear()
# )

In [76]:
#Class is the best practice for creating a neural network

#format
'''
class AnyNameCapitalized(nn.Module): # It is basically inherin nn.Module
    def __init()__:
        super(.__init()__ #super is pasically inheriting nn.Module init
        #We define the layers here
    
'''

# class NeuralNetwork(nn.Module):
    
#     def __init__(self, input_size, hidden_size, output_size):
#         super.__init__()
#         self.fc1 = nn.Linear(input_size, hidden_size)
#         self.fc2 = nn.Linear(hidden_size, output_size)
    
#     def forward(self, x):
#         out = self.fc1(x)
#         out = self.fc2(out)
#         return out




In [1]:
# model = NeuralNetwork (3, 99, 2)


In [69]:
#model.weight # By default these weights are uniformly close to 0
model.fc1.weight
model.fc2.weight

In [39]:
model.weight.shape #Tis one is basically in the shape (out_feature, in_feature)

# You can imagine X @ W^T
# After you transpose W, 

torch.Size([2, 3])

In [40]:
model.bias

# Why two bias, beacause y1 bias and y2 bias

Parameter containing:
tensor([-0.0958,  0.4890], requires_grad=True)

In [42]:
list(model.parameters())

[Parameter containing:
 tensor([[ 0.2432,  0.4726,  0.4454],
         [ 0.4577,  0.5533, -0.2382]], requires_grad=True),
 Parameter containing:
 tensor([-0.0958,  0.4890], requires_grad=True)]

In [44]:
#p.numel() just flattern everythin
sum(p.numel() for p in model.parameters() if p.requires_grad)

# Why 8 - 6 weights and 2 bias

8

In [48]:
print(inputs.shape)

output = model(inputs) # (15,3) @ (3, 2) = (15, 2)

print(output)
print(output.shape)

torch.Size([15, 3])
tensor([[ 68.4779,  60.7320],
        [ 92.1343,  75.5885],
        [110.2310, 100.6378],
        [ 61.5157,  62.1564],
        [ 93.2368,  68.5156],
        [ 68.4779,  60.7320],
        [ 92.1343,  75.5885],
        [110.2310, 100.6378],
        [ 61.5157,  62.1564],
        [ 93.2368,  68.5156],
        [ 68.4779,  60.7320],
        [ 92.1343,  75.5885],
        [110.2310, 100.6378],
        [ 61.5157,  62.1564],
        [ 93.2368,  68.5156]], grad_fn=<AddmmBackward0>)
torch.Size([15, 2])


### 3.2 Define the loss fn.

- Should ber MSE or Cross Entrophy

In [50]:
# Under the nn mudule , there are many loss function

J_fn = nn.MSELoss()

#Later on you will know how to use this

### 3.3 Define the optimizer
1. Predict
2. Loss


In [51]:
# Normally in skleran we call fit, it will perform gradient descent
# In code from scratch we need to like specify how we want to update the gradients
# Optimizer handles how we update the parameters
# If we use w = w -alpha (gradient) ==> gradient descent
#Stochastic gradient descent ==>  is not one sample - mini-batch

optim = torch.optim.SGD(model.parameters(), lr=0.0001)


### 3.4 Actually tran the model

In [None]:
num_epochs = 5 # It depends ... trian and error...
def fit():
    #For num_epochs
        #for dataloader
         #1. predict(forward pass )
         #2. calculate loss
         #3. calculate gradient
         #4. update the parameters using the optim

In [52]:
num_epochs = 10 # It depends ... trian and error...
for epoch in range(num_epochs):
    #for dataloader
    for x, y in dl: #What is the shape of x and y here
        #x and y are the size of the mini-batch of X_train and y_train
        #batch size is 3
        # x (3,3) y (3,2)
        # X: (batch, feature) = (3,3)
        # y: (batch, target) = (3,2)
        x.to(device)# Device is either cpu or cuda
        y.to(device)

        #1. Predict (forward pass)
        yhat = model(x)

        #2. Calculate loss
        #format J_fn(inputs, targets)
        loss = J_fn(yhat, y)

        #3. Calculate gradient
        #3.1 clear out the previous gradients
        optim.zero_grad()
        #3.2 Call backwars() on loss to retrive all the gradients (backpropagation)
        loss.backward()
        #backward DOES not adjust the weight Yet, just backpropagation
        #We want to calculate the gradients of all the parameters (8-6 weights and 2 bias)
        #IN RESPECT TO the LOSS... dJ/dw11, dJ/dw2, dJ/dw13..., dJ/db1, dJ/db2

        #4. Update the parameters using the optim
        # W = W - alpha * gradient - no need to do this here
        optim.step() # optim already kow the learning rate and has the parameters

        print(f"Epoch: {epoch} - Loss: {loss}")


Epoch: 0 - Loss: 756.319091796875
Epoch: 0 - Loss: 2455.9990234375
Epoch: 0 - Loss: 2432.283935546875
Epoch: 0 - Loss: 790.4541015625
Epoch: 0 - Loss: 121.9244613647461
Epoch: 1 - Loss: 126.046630859375
Epoch: 1 - Loss: 436.5860595703125
Epoch: 1 - Loss: 837.3534545898438
Epoch: 1 - Loss: 462.3238525390625
Epoch: 1 - Loss: 742.0138549804688
Epoch: 2 - Loss: 306.7709045410156
Epoch: 2 - Loss: 118.33515167236328
Epoch: 2 - Loss: 470.9854431152344
Epoch: 2 - Loss: 730.1071166992188
Epoch: 2 - Loss: 730.603759765625
Epoch: 3 - Loss: 633.5977172851562
Epoch: 3 - Loss: 618.3212890625
Epoch: 3 - Loss: 296.24188232421875
Epoch: 3 - Loss: 222.3563690185547
Epoch: 3 - Loss: 106.81807708740234
Epoch: 4 - Loss: 142.64190673828125
Epoch: 4 - Loss: 115.96732330322266
Epoch: 4 - Loss: 47.31418228149414
Epoch: 4 - Loss: 93.90081787109375
Epoch: 4 - Loss: 176.44415283203125
Epoch: 5 - Loss: 41.66023635864258
Epoch: 5 - Loss: 31.12462043762207
Epoch: 5 - Loss: 89.86382293701172
Epoch: 5 - Loss: 126.4339

## 4. Inference / Testing

Test some data

In [53]:
ds[0]

(tensor([73., 67., 43.]), tensor([56., 70.]))

In [61]:
# Create a np array of 
# [74, 68, 42] , [92, 88, 65]

X_test_np = np.array([[74., 68., 42.],[92., 88., 65.]], dtype='float32')


# Please make a tensor
X_test = torch.tensor(X_test_np)

# Then use our model to predic the number of orenges and apples
yhat = model(X_test)

In [62]:
yhat

tensor([[58.2936, 70.0953],
        [83.0424, 94.3229]], grad_fn=<AddmmBackward0>)

In [63]:
# Print the loss comparing ds[0] and ds[1]
ytest = ds[0:2][1]
ytest


tensor([[ 56.,  70.],
        [ 81., 101.]])

In [64]:
loss = J_fn(yhat, ytest)
print(loss)

tensor(13.5063, grad_fn=<MseLossBackward0>)
