# Introduction to PyTorch

**Tensor**: Like a NumPy array, but it can live and be computed on either the CPU or the GPU

**Autograd**: Package for building computational graphs out of Tensors, and automatically computing gradients

**Module**: A neural network layer; it may store state or learnable weights


In [None]:
# Bring the PyTorch package into scope
import torch

# Report whether PyTorch can see a CUDA-capable GPU
cuda_available = torch.cuda.is_available()
print("Cuda available:", cuda_available)


# Linear regression example

Imagine that we have a cloud of points and want to draw the line that minimizes the sum of squared vertical distances to all the points.

![linear regression](https://camo.githubusercontent.com/1152fe558592a8f67ce2d590a513899fa94b7df7/687474703a2f2f7777772e61746d6f732e77617368696e67746f6e2e6564752f7e726f62776f6f642f7465616368696e672f3435312f6c6162732f696d616765732f636f6e636570747331322e6a7067)

y' = w*x + b

loss function = Sum( (y - y')^2 )



Let's randomly create a cloud of points

In [None]:
%reset
import numpy as np
trainX = np.linspace(-1,1, 200 ) #200 evenly spaced samples
realW=2.0
realb=1.0
trainY = (realW * trainX + realb + np.random.randn(*trainX.shape)*0.8)

%matplotlib inline
import matplotlib.pyplot as plt
fig = plt.figure( figsize=(8, 8))
plt.plot(trainX, trainY, 'bo', markeredgecolor='none')
plt.ylabel('Line with noise')
plt.show()

In [None]:
import torch
device = torch.device("cpu")

# Start from random values for the parameters of y' = w*x + b.
w = torch.randn(1, device=device)
b = torch.randn(1, device=device)

# Tensor.to() is not in-place: it returns the tensor on the target device,
# so the result must be kept (a bare `X.to(device)` has no effect on X).
X = torch.FloatTensor(trainX).to(device)
Y = torch.FloatTensor(trainY).to(device)

print("X shape", X.shape)
print("Y shape", Y.shape)

learning_rate = 0.3

# Plain gradient descent on the mean squared error, with hand-derived gradients.
for epoch in range(15):
    prediction = w*X + b
    loss = (prediction - Y).pow(2).mean()
    print("loss", loss)

    # d(loss)/dw = mean(2 * (y' - y) * x)   and   d(loss)/db = mean(2 * (y' - y))
    grad_w = (2*(prediction - Y)*X).mean()
    grad_b = (2*(prediction - Y)).mean()

    # Step against the gradient.
    w -= learning_rate * grad_w
    b -= learning_rate * grad_b
    print("w", w, "b", b)



In [None]:

%matplotlib inline
import time
import pylab as plt
from IPython import display

fig = plt.figure( figsize=(10, 10))


w=torch.randn(1,device=device)
b=torch.randn(1,device=device)

for epoch in range(15):
    prediction = w*X + b
    loss = (prediction - Y).pow(2).mean()
    #print("loss", loss)

    grad_w = (2*(prediction - Y )*X).mean()
    grad_b = (2*(prediction - Y)).mean()

    #print(grad_w.shape)
    #print(grad_b.shape)

    w -= learning_rate * grad_w
    b -= learning_rate * grad_b
    #print("w", w, "b", b)
    current_w = (w.data).cpu().numpy()
    current_b = (b.data).cpu().numpy()

    plt.clf()
    plt.plot(trainX, trainY,'o',color='b', markeredgecolor='none')
    plt.plot(trainX, trainX*current_w + current_b, color='r',)
    print('Cost: ', loss,' w:',current_w, ' b:',current_b)
    display.clear_output(wait=True)
    display.display(plt.gcf())

print('Cost: ', loss,' w:',current_w, ' b:',current_b)



# Task #1
Test the last code block with different learning rates.

- What happens if the learning rate is too high?
- What happens if the learning rate is too low?



# Autograd


In [None]:
fig = plt.figure( figsize=(10, 10))


# We do not want gradients (of the loss) with respect to the data, so X and Y
# are plain tensors. Tensor.to() returns the moved tensor rather than changing
# it in place, so its result must be assigned.
X = torch.FloatTensor(trainX).to(device)
Y = torch.FloatTensor(trainY).to(device)

# We do want gradients with respect to the weights.
w = torch.randn(1, device=device, requires_grad=True)
b = torch.randn(1, device=device, requires_grad=True)

for epoch in range(15):
    prediction = w*X + b
    loss = (prediction - Y).pow(2).mean()

    # Autograd computes d(loss)/d(param) for every tensor created with
    # requires_grad=True that took part in computing the loss.
    loss.backward()

    # Take the gradient step on the weights, then zero their gradients.
    # torch.no_grad() means "do not build a computational graph for this part".
    with torch.no_grad():
        w -= learning_rate * w.grad
        b -= learning_rate * b.grad
        # PyTorch methods that end in an underscore modify the tensor
        # in place; methods without it return a new tensor.
        w.grad.zero_()
        b.grad.zero_()

    # Just visualization
    current_w = (w.data).cpu().numpy()
    current_b = (b.data).cpu().numpy()

    plt.clf()
    plt.plot(trainX, trainY,'o',color='b', markeredgecolor='none')
    plt.plot(trainX, trainX*current_w + current_b, color='r',)

    # Clear first, then print: text printed before clear_output(wait=True) is
    # erased by the next output, so the cost would never be visible.
    display.clear_output(wait=True)
    print('Cost: ', loss,' w:',current_w, ' b:',current_b)
    display.display(plt.gcf())

# PyTorch: nn

Great documentation https://pytorch.org/docs/stable/nn.html


In [None]:
# A single linear unit: one input feature, one output, with a bias term
# (bias is enabled by default, so it is not spelled out).
model = torch.nn.Linear(in_features=1, out_features=1)
print("Model parameters", list(model.parameters()))

In [None]:
for epoch in range(15):
    # The linear layer works on a (batch, features) input, hence the column
    # view of X; Y gets the same shape so the loss compares like with like.
    prediction = model(X.view(-1,1))
    # Same value as (prediction - Y.view(-1,1)).pow(2).mean()
    loss = torch.nn.functional.mse_loss(prediction, Y.view(-1,1))

    # Autograd computes the gradient for every parameter involved in the loss.
    loss.backward()

    # Take the gradient step on the weights, then zero their gradients.
    # torch.no_grad() means "do not build a computational graph for this part".
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad
    model.zero_grad()

    # Just visualization
    params = list(model.parameters())
    current_w = params[0].data.cpu().numpy()
    current_b = params[1].data.cpu().numpy()

    plt.clf()
    plt.plot(trainX, trainY,'o',color='b', markeredgecolor='none')
    plt.plot(trainX, prediction.view(-1).data.cpu().numpy(), color='r',)

    # Clear first, then print: text printed before clear_output(wait=True) is
    # erased by the next output. For the same reason the per-epoch dumps of
    # the whole prediction tensor were dropped; they were never readable.
    display.clear_output(wait=True)
    print('Cost: ', loss.data.cpu().numpy(),' w:',current_w, ' b:',current_b)
    display.display(plt.gcf())

# The status line from the last iteration stays visible, so no separate
# summary print is needed after the loop.

In [None]:

class MyOptimizer():
    """Minimal gradient-descent optimizer: ``param -= lr * param.grad``.

    Args:
        params: Iterable of tensors to optimize. It is materialized into a
            list so a generator such as ``model.parameters()`` can be
            traversed on every call.
        lr: Learning rate that scales each gradient step.
    """

    def __init__(self,  params, lr):
        self.learning_rate = lr
        self.params = list(params)

    def step(self):
        """Apply one in-place gradient-descent update to every parameter."""
        # no_grad: the update itself must not be recorded by autograd.
        with torch.no_grad():
            for param in self.params:
                # A parameter that took no part in the last backward pass
                # (or whose gradient was reset to None) has nothing to apply.
                if param.grad is None:
                    continue
                param -= self.learning_rate * param.grad

    def zero_grad(self):
        """Reset the accumulated gradient of every parameter to zero."""
        with torch.no_grad():
            for param in self.params:
                # Before the first backward pass .grad is None, not a tensor.
                if param.grad is not None:
                    param.grad.zero_()


# Fresh one-input / one-output linear model, trained with the hand-written optimizer
model = torch.nn.Linear(1,1,bias=True)
optimizer = MyOptimizer(   model.parameters(), lr=learning_rate)

for epoch in range(15):
    prediction = model(X.view(-1,1))
    # Same value as (prediction - Y.view(-1,1)).pow(2).mean()
    loss = torch.nn.functional.mse_loss(prediction, Y.view(-1,1))

    # Autograd computes the gradient for every parameter involved in the loss.
    loss.backward()

    # Update the parameters, then reset their gradients for the next pass.
    optimizer.step()
    optimizer.zero_grad()

    # Just visualization
    params = list(model.parameters())
    current_w = params[0].data.cpu().numpy()
    current_b = params[1].data.cpu().numpy()

    plt.clf()
    plt.plot(trainX, trainY,'o',color='b', markeredgecolor='none')
    plt.plot(trainX, prediction.view(-1).data.cpu().numpy(), color='r',)

    # Clear first, then print: text printed before clear_output(wait=True) is
    # erased by the next output, so the status line would never stay visible.
    display.clear_output(wait=True)
    print('Cost: ', loss.data.cpu().numpy(),' w:',current_w, ' b:',current_b)
    display.display(plt.gcf())

# The status line from the last iteration stays visible, so no separate
# summary print is needed after the loop.

# PyTorch optimizers torch.optim

https://pytorch.org/docs/stable/optim.html

Nice blog post on optimizers: **An overview of gradient descent optimization algorithms**
https://ruder.io/optimizing-gradient-descent/

![linear regression](https://miro.medium.com/max/1240/1*Y2KPVGrVX9MQkeI8Yjy59Q.gif)


In [None]:

class MyOptimizer():
    """Minimal gradient-descent optimizer: ``param -= lr * param.grad``.

    Args:
        params: Iterable of tensors to optimize. It is materialized into a
            list so a generator such as ``model.parameters()`` can be
            traversed on every call.
        lr: Learning rate that scales each gradient step.
    """

    def __init__(self,  params, lr):
        self.learning_rate = lr
        self.params = list(params)

    def step(self):
        """Apply one in-place gradient-descent update to every parameter."""
        # no_grad: the update itself must not be recorded by autograd.
        with torch.no_grad():
            for param in self.params:
                # A parameter that took no part in the last backward pass
                # (or whose gradient was reset to None) has nothing to apply.
                if param.grad is None:
                    continue
                param -= self.learning_rate * param.grad

    def zero_grad(self):
        """Reset the accumulated gradient of every parameter to zero."""
        with torch.no_grad():
            for param in self.params:
                # Before the first backward pass .grad is None, not a tensor.
                if param.grad is not None:
                    param.grad.zero_()


model = torch.nn.Linear(1,1,bias=True)
# torch.optim.SGD is used here in place of the hand-written optimizer:
# optimizer = MyOptimizer(   model.parameters(), lr=learning_rate)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

for epoch in range(15):
    prediction = model(X.view(-1,1))
    # Same value as (prediction - Y.view(-1,1)).pow(2).mean()
    loss = torch.nn.functional.mse_loss(prediction, Y.view(-1,1))

    # Autograd computes the gradient for every parameter involved in the loss.
    loss.backward()

    # Update the parameters, then reset their gradients for the next pass.
    optimizer.step()
    optimizer.zero_grad()

    # Just visualization
    params = list(model.parameters())
    current_w = params[0].data.cpu().numpy()
    current_b = params[1].data.cpu().numpy()

    plt.clf()
    plt.plot(trainX, trainY,'o',color='b', markeredgecolor='none')
    plt.plot(trainX, prediction.view(-1).data.cpu().numpy(), color='r',)

    # Clear first, then print: text printed before clear_output(wait=True) is
    # erased by the next output, so the status line would never stay visible.
    display.clear_output(wait=True)
    print('Cost: ', loss.data.cpu().numpy(),' w:',current_w, ' b:',current_b)
    display.display(plt.gcf())

# The status line from the last iteration stays visible, so no separate
# summary print is needed after the loop.

# Logistic Regression


<img src="http://diffsharp.github.io/DiffSharp/img/examples-neuralnetworks-neuron.png" alt="Single Neuron Logistic Function" style="width: 400px;">


<img src=https://upload.wikimedia.org/wikipedia/commons/8/88/Logistic-curve.svg style="width: 300px;" ALIGN="right" >
Sigmoid function: $$\sigma ( z ) = \frac{1}{1+e^{-z}}$$ 

Output a: $$ \hat{y} = \sigma(  \sum_{i=1}^{n}{x_{i} \cdot w_{i}} + b )$$ 
$$ \hat{y} = \sigma( {\bf x} \cdot {\bf W} + b) $$

where $ {\bf x} = \begin{pmatrix} x_1 \cdots  x_n  \end{pmatrix} $

and $  {\bf W} = \begin{pmatrix} w_1 \cdots  w_n  \end{pmatrix}^T $


PyTorch Loss functions:
https://pytorch.org/docs/stable/nn.html#loss-functions

<img src="https://miro.medium.com/max/2328/1*n1T0iYxmckzLGMMpRH6TuA.png" alt="Binary Cross Entropy Loss" width="400">







Let's create our dataset

In [None]:
# Binary cross-entropy written out by hand: the negated mean log-likelihood.
# The prediction variable is `y_pred` (not `y.pred`), and the leading minus
# sign matches the loss used in the training cells.
# NOTE(review): this needs `Y` (0/1 labels) and `y_pred` (probabilities) to
# exist already; they are created in the cells further down.
-1*torch.mean(Y*torch.log(y_pred)+((1-Y)*torch.log(1-y_pred)))

In [None]:

import numpy as np

# Create dataset
%matplotlib inline
import matplotlib.pyplot as plt

XX1 = np.concatenate( (np.random.randn( 500,2) + [2,3], 
                       np.random.randn( 500,2) + [0,-1]), axis=0)
YY1 = np.concatenate( (np.zeros((500,1)), np.ones((500,1))), axis=0)
indexes=np.arange(1000)
np.random.shuffle(indexes)
YY1=YY1[indexes,:]#.astype(np.float)
XX1=XX1[indexes,:]#.astype(np.float)
print(XX1.shape, YY1.shape)
fig = plt.figure( figsize=(8, 8))
plt.scatter(XX1[:, 0], XX1[:, 1], marker='o', c=YY1[:,0], s=50, lw = 0)
plt.show()

In [None]:
class MyLogisticRegression(torch.nn.Module):
    """Single-neuron binary classifier: a linear layer followed by a sigmoid."""

    def __init__(self, dim_in):
        """Build the classifier for inputs with ``dim_in`` features."""
        super().__init__()

        # Sequential applies its layers to the input one after the other.
        linear = torch.nn.Linear(dim_in, 1)
        squash = torch.nn.Sigmoid()
        self.classifier = torch.nn.Sequential(linear, squash)

        # Binary cross-entropy on the sigmoid output
        self.loss_function = torch.nn.BCELoss()

    def forward(self, x):
        """Return the sigmoid output for ``x``; autograd supplies the backward pass."""
        return self.classifier(x)

    def loss(self, x, y):
        """Binary cross-entropy between predictions ``x`` and targets ``y``."""
        return self.loss_function(x, y)

    def accuracy(self, predictions, y_true):
        """Percentage of predictions that match ``y_true`` after thresholding at 0.5."""
        hits = ((predictions > 0.5).float() == y_true).float()
        return hits.mean() * 100


model = MyLogisticRegression(dim_in = 2)
learning_rate=0.3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


# Tensor.to() is not in-place: it returns the tensor on the target device,
# so the result must be kept (a bare `X.to(device)` has no effect on X).
X = torch.FloatTensor(XX1).to(device)
Y = torch.FloatTensor(YY1).to(device)


for epoch in range(15):
    y_pred = model(X)
    loss = model.loss(y_pred, Y)
    # Hand-written equivalent of the line above (binary cross-entropy):
    #     loss = -1*torch.mean(Y*torch.log(y_pred)+((1-Y)*torch.log(1-y_pred)))

    # Autograd computes the gradient for every parameter involved in the loss.
    loss.backward()

    # Update the parameters, then reset their gradients for the next pass.
    optimizer.step()
    optimizer.zero_grad()

    # Just visualization
    print('Loss: ', loss.data.cpu().numpy())

# Accuracy on the training data with the final parameters
y_pred = model(X)
print('Training accuracy:', model.accuracy(y_pred,Y))


# Task 2

Write the loss computation yourself in PyTorch, in the previous code cell (instead of calling `model.loss`).

Hints:

*   log -> torch.log()
*   sum -> torch.sum()
*   mean -> torch.mean()



In [None]:
# Binary cross-entropy by hand: the two log-likelihood terms (one active for
# label 1, the other for label 0), averaged over all samples and negated.
positive_term = Y * torch.log(y_pred)
negative_term = (1 - Y) * torch.log(1 - y_pred)
loss = -1 * torch.mean(positive_term + negative_term)

#### Decision boundary
<img src=https://upload.wikimedia.org/wikipedia/commons/8/88/Logistic-curve.svg style="width: 300px;" ALIGN="right" >

$ sigmoid(z) = 0.5$

which means that:

$z=0.0$

In our regression model we have:

$z = x1 \cdot w1 + x2 \cdot w2 + b = 0$

We can draw the line in the x1 , x2 plane:

$x2 = (-b -x1 \cdot w1)/w2$


In [None]:
from IPython import display

# x1 positions at which the boundary line is evaluated
x1 = np.linspace(-4, 6, 200)
fig = plt.figure(figsize=(8, 8))

# Fresh classifier and optimizer so the animation starts from random weights
learning_rate = 0.3
model = MyLogisticRegression(dim_in=2)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(15):
    # Forward pass and loss on the whole dataset
    y_pred = model(X)
    loss = model.loss(y_pred, Y)

    # Backward pass, parameter update, gradient reset
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    # Pull the current weights out as NumPy values for plotting
    params = list(model.parameters())
    current_W = params[0].data.cpu().numpy()[0]
    current_b = params[1].data.cpu().numpy()

    # Boundary: w1*x1 + w2*x2 + b = 0  =>  x2 = (-b - x1*w1) / w2
    x2 = (-current_b - x1 * current_W[0]) / current_W[1]

    # Redraw the samples with the current boundary on top
    plt.clf()
    plt.scatter(XX1[:, 0], XX1[:, 1], marker='o', c=YY1[:, 0], s=50, lw=0)
    plt.plot(x1, x2, "g--", lw=4)
    display.clear_output(wait=True)
    display.display(plt.gcf())


# Multi-class Classification Network

<img src="https://raw.githubusercontent.com/segurac/deeplearning-tutorials/dece1ad401a91fd3f7db1aa48a8ebfc6da2841e5/intro_theano/2layersGeneric.png" alt="Multiple classes" style="width: 500px;" ALIGN="right" >
Layer outputs:

$${\bf x}^2 = h^1(  {\bf x}^1 \cdot {\bf W}^1 + {\bf b}^1 )  $$

$${\bf x}^3 = h^2(  {\bf x}^2 \cdot {\bf W}^2 + {\bf b}^2 )  $$

where $ {\bf x} = \begin{pmatrix} x_1 \cdots  x_n  \end{pmatrix} $ 

$ {\bf b} = \begin{pmatrix} b_1 \cdots  b_m  \end{pmatrix} $ 

$  {\bf W} = \begin{pmatrix} w_{1,1} & \cdots & w_{1,m} \\ 
\vdots & \ddots & \vdots \\
w_{n,1} & \cdots & w_{n,m} \\
\end{pmatrix} $

$ h(  ) $ is the activation function (ReLU, sigmoid, Softmax etc)

For Multi-class classification: $h^2( \cdot ) = softmax$ 


# MNIST data
<img src="https://www.researchgate.net/profile/Steven_Young11/publication/306056875/figure/fig1/AS:393921575309346@1470929630835/Example-images-from-the-MNIST-dataset.png" >



In [None]:
# Load data
import pandas as pd

# Directory holding the MNIST CSV files (presumably the Colab sample-data folder)
base_dir = '/content/sample_data/'

# header=None: the first row of each file is read as data, not as column names
train = pd.read_csv(f"{base_dir}mnist_train_small.csv", header=None)
test = pd.read_csv(f"{base_dir}mnist_test.csv", header=None)

# Preview the first rows of the training table
train.head()

In [None]:
from sklearn.model_selection import train_test_split
# Convert Dataframe into format ready for training
def createImageData(raw: pd.DataFrame):
    """Split a raw MNIST table into image and label arrays.

    Args:
        raw: Table whose first column holds the label and whose remaining
            columns hold the pixel values of one image per row. The pixel
            columns must reshape to 28x28, i.e. 784 values per row.

    Returns:
        Tuple ``(x, y)``: ``x`` is a float array of shape (N, 1, 28, 28) and
        ``y`` is an int array of shape (N,).
    """
    labels = raw.iloc[:, 0].values
    pixels = raw.iloc[:, 1:].values

    # Images as (N, channels=1, height=28, width=28) floats
    x = pixels.reshape([-1, 1, 28, 28]).astype(float)
    # Flat integer label vector. No in-place resize of the extracted column:
    # it was undone by this reshape anyway, and ndarray.resize raises
    # ValueError when the column is a non-contiguous slice of the table.
    y = labels.astype(int).reshape(-1)
    return x, y
## Convert to One Hot Encoding
def one_hot_embedding(labels, num_classes=10):
    """Return one-hot rows for ``labels`` by indexing an identity matrix.

    Row ``k`` of the ``num_classes`` x ``num_classes`` identity matrix is the
    one-hot vector for class ``k``, so indexing it with the labels selects one
    such row per label.
    """
    return torch.eye(num_classes)[labels]
# Convert the training table to image/label arrays and report their shapes
x_train, y_train = createImageData(train)
print(x_train.shape, y_train.shape)
# Same conversion for the test table
x_test, y_test = createImageData(test)
print(x_test.shape, y_test.shape)

In [None]:
# Display the test labels (a bare last expression is echoed as the cell output)
y_test

# MNIST visualization

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
show_data = np.copy(x_test[0:16,:])
fig, axes = plt.subplots(4, 4, figsize=(10,10))
vmin, vmax = show_data.min(), show_data.max()
for coef, ax in zip(show_data, axes.ravel()):
    ax.matshow(coef.reshape(28, 28), cmap=plt.cm.gray, vmin=.5 * vmin,
           vmax=.5 * vmax)
ax.set_xticks(())
ax.set_yticks(())
print(y_test[0:16])

# Data normalization


In [None]:
# Normalize data: both splits are standardised with the mean and standard
# deviation of the training set, so the test set is treated exactly like
# the training data.
m_train, std_train = x_train.mean(), x_train.std()

x_train = (x_train - m_train) / std_train
x_test = (x_test - m_train) / std_train

# Sanity check: the training split should now have mean ~0 and std ~1
print(
    x_train.mean(),
    x_train.std(),
    x_test.mean(),
    x_test.std(),
)

# Define our MLP

In [None]:
import torch
import torch.nn as nn
from torchsummary import summary


class MLPClassifier(nn.Module):
    """Multi-layer perceptron with one hidden layer of 128 ReLU units.

    The network outputs raw class scores (logits); use ``raw_to_probs`` to
    turn them into probabilities.

    Args:
        dim_in: Number of input features per sample.
        num_classes: Number of output classes.
    """

    def __init__(self, dim_in, num_classes):
        super().__init__()
        self.classifier = nn.Sequential(
            nn.Linear(dim_in, 128),
            nn.ReLU(inplace=True),
            nn.Linear(128, num_classes),
        )

        # CrossEntropyLoss takes raw scores and integer class labels, which
        # is why the network itself has no softmax layer.
        self.loss_function = nn.CrossEntropyLoss()

    # Not necessary to define backward, autograd takes care of it
    def forward(self, x):
        """Return the raw class scores for the batch ``x``."""
        return self.classifier(x)

    def loss(self, x, y):
        """Cross-entropy between raw scores ``x`` and integer labels ``y``."""
        return self.loss_function(x, y)

    def raw_to_probs(self, raw):
        """Convert raw class scores into probabilities over the last dimension."""
        # An explicit dim avoids the deprecated implicit-dimension behaviour
        # (and its warning); for 1-D and 2-D scores the last dimension is the
        # one the implicit rule selected.
        return torch.nn.functional.softmax(raw, dim=-1)

    def accuracy(self, predictions, y_true):
        """Percentage of samples whose highest-scoring class equals ``y_true``."""
        y_pred = predictions.argmax(dim=1)
        acc_pred = (y_pred == y_true).float().mean()
        return acc_pred * 100


# Train on the first GPU when CUDA is available; otherwise fall back to the
# CPU instead of failing on machines without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = MLPClassifier(28*28, 10)
model.to(device)
print(model)
summary(model, input_size=(1, 28*28))
learning_rate = 0.3
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

mbatch_size = 50
n_epochs = 50
# Samples that do not fill a whole mini-batch are left out of each epoch.
n_batches_train = x_train.shape[0] // mbatch_size

# The test set is evaluated in a single forward pass: flatten each image to a
# 784-vector and move everything to the training device once.
test_data = torch.FloatTensor(x_test).view(-1,28*28).to(device)
test_label = torch.LongTensor(y_test).view(-1).to(device)


# Curves sampled every 100 mini-batches, stored as plain Python floats so
# they can be plotted directly.
train_losses = []
test_losses = []
train_accs = []
test_accs= []

for epoch in range(n_epochs):
    for i in range(n_batches_train):
        # Slice the i-th mini-batch and move it to the device
        start_idx = mbatch_size*i
        end_idx = mbatch_size*(i+1)
        data = x_train[start_idx:end_idx,:]
        label = y_train[start_idx:end_idx]
        data = torch.FloatTensor(data).view(-1,28*28).to(device)
        label = torch.LongTensor(label).view(-1).to(device)

        raw_preds = model(data)
        loss = model.loss(raw_preds, label)
        if i%100 == 0:
            # Monitoring only: no graph is needed for these measurements
            with torch.no_grad():
                preds = model.raw_to_probs(raw_preds)
                acc = model.accuracy( preds , label ).cpu().item()
                train_losses.append(loss.cpu().item())
                train_accs.append(acc)

                test_raw_preds = model(test_data)
                test_loss = model.loss(test_raw_preds, test_label)
                test_preds = model.raw_to_probs(test_raw_preds)
                test_acc = model.accuracy( test_preds , test_label ).cpu().item()
                # Store a float like the other three lists: appending the
                # tensor itself keeps it on the device and cannot be plotted
                # when that device is a GPU.
                test_losses.append(test_loss.cpu().item())
                test_accs.append(test_acc)

        # Autograd computes the gradient for every parameter involved in the loss.
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

    # End-of-epoch report: metrics of the last mini-batch versus the full test set
    with torch.no_grad():
        preds = model.raw_to_probs(raw_preds)
        acc = model.accuracy( preds , label ).cpu().item()

        model.eval()
        test_raw_preds = model(test_data)
        test_loss = model.loss(test_raw_preds, test_label)
        test_preds = model.raw_to_probs(test_raw_preds)
        test_acc = model.accuracy( test_preds , test_label ).cpu().item()
        print("Epoch:", epoch, "Training/Testing Loss:", loss.cpu().item(), " / ", test_loss.cpu().item(), " Training/Testing Acc:", acc, " / ", test_acc)
        model.train()

In [None]:
# Loss curves: blue for training, red for testing
plt.figure(figsize=(15, 15))
for curve, line_style, legend_label in (
    (train_losses, '-b', 'train loss'),
    (test_losses, '-r', 'test loss'),
):
    plt.plot(curve, line_style, label=legend_label)

plt.xlabel("n iter * 100")
plt.legend(loc='upper right')
plt.title("Loss")

plt.show()


# Accuracy curves, same colour convention
plt.figure(figsize=(15, 15))
for curve, line_style, legend_label in (
    (train_accs, '-b', 'train accuracy'),
    (test_accs, '-r', 'test accuracy'),
):
    plt.plot(curve, line_style, label=legend_label)

plt.xlabel("n iter * 100")
plt.legend(loc='lower right')
plt.title("Accuracy")

# show
plt.show()