<a href="https://colab.research.google.com/github/zazuetaz/Capstone_project/blob/master/pyTorchTutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Objectives
You will be able to 
* Translate a Neural Network from Keras to PyTorch

# Pytorch vs Keras
## Pros 
Keras 
* easy to code 
* less code 
* great documentation
* been used longer 
* easy for new comers

PyTorch
* more flexible
* dynamic graphs - more pythonic (doesn't compile)
* great for research and debugging algorithm
* easier on memory

## Cons
Keras
* compiles models (static graphs) 
* hard to debug specific parts of your model
* hard for research
* heavier on memory 

## PyTorch
* hard for new comers
* write your own training loops
* you have to know about deep learning to use it

In [None]:
# Load in libraries
import pandas as pd 
import numpy as np 

import keras 
from keras.models import Sequential
from keras.layers import Input, Dense 
from keras.losses import CategoricalCrossentropy
from keras.optimizers import Adam 

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

np.random.seed(42)

Using TensorFlow backend.


# Load in Iris Data

In [None]:
iris = load_iris()
data = iris.data 
target = iris.target
columns = iris.feature_names

In [None]:
df = pd.DataFrame(data, columns=columns)
df['target'] = target 
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


# Build a single layer perceptron

In [None]:
model = Sequential()
model.add(Dense(input_dim=4, units=8, activation='relu'))
model.add(Dense(3, activation='softmax'))

In [None]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 8)                 40        
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 27        
Total params: 67
Trainable params: 67
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.compile(optimizer=Adam(), loss=CategoricalCrossentropy())

# Split Data into training and test sets

In [None]:
df_train, df_test = train_test_split(df, test_size=0.15)
X_train, y_train = df_train.drop(columns=['target']), df_train['target']
X_test, y_test = df_test.drop(columns=['target']), df_test['target']
y_train_dummies = pd.get_dummies(y_train)
y_test_dummies = pd.get_dummies(y_test)

# Fit the Keras Model and Evaluate it

In [None]:
%timeit
model.fit(X_train, y_train_dummies, epochs=50, batch_size=50)

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 8.58 µs
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.callbacks.History at 0x7f559b027198>

In [None]:
y_train_preds = model.predict_classes(X_train)
y_test_preds = model.predict_classes(X_test)

In [None]:
accuracy_score(y_train, y_train_preds), accuracy_score(y_test, y_test_preds)

(0.6614173228346457, 0.6956521739130435)

# Replicating this in PyTorch
* import necessary libraries
* build model architecture
* define loss function
* train model

In [None]:
import tensorflow as tf
import torch
import torch.nn as nn 
import torch.nn.functional as F  

import torchsummary
from torch.autograd import Variable 
from torch.optim import Adam as Adam_torch

## Build Model Architecture

In [None]:
torch_model = nn.Sequential(
    nn.Linear(4,  8),
    nn.ReLU(),
    nn.Linear(8, 3),
    nn.Softmax(dim=1)
)

In [None]:
# Notice we can see our weights here
weights = list(torch_model.parameters())
weights

[Parameter containing:
 tensor([[-0.4179,  0.0135,  0.0296, -0.1518],
         [ 0.3872, -0.1206, -0.4764,  0.3640],
         [-0.0504,  0.3720, -0.3487,  0.2162],
         [-0.0890,  0.4887,  0.2884,  0.3531],
         [-0.0863,  0.2321,  0.3914, -0.3927],
         [ 0.1562,  0.2368, -0.1866,  0.2092],
         [-0.2788,  0.3466, -0.1761,  0.0908],
         [ 0.0889,  0.4292, -0.4459,  0.1175]], requires_grad=True),
 Parameter containing:
 tensor([-0.3588, -0.1922, -0.1841, -0.3577,  0.4194, -0.0551, -0.2723, -0.4355],
        requires_grad=True),
 Parameter containing:
 tensor([[-0.0824, -0.0930, -0.0353,  0.2544, -0.0875, -0.2417, -0.0550, -0.2578],
         [-0.0805, -0.0113,  0.1739,  0.1341,  0.0944, -0.0948, -0.3415,  0.1262],
         [-0.0433,  0.1298, -0.1252, -0.0362,  0.0518, -0.2474, -0.3164, -0.2150]],
        requires_grad=True),
 Parameter containing:
 tensor([-0.0196,  0.3166, -0.1459], requires_grad=True)]

## Define loss function

In [None]:
torch_loss = nn.CrossEntropyLoss()

## Before we fit our model to our data we have to convert our data to PyTorch Tensors
* This seems a bit extraneous but this is part of what makes PyTorch so fast and perform so well.

In [None]:
X_train_tensor = torch.tensor(X_train.values, requires_grad=True)
X_test_tensor = torch.tensor(X_test.values, requires_grad=True)
 # notice we're not using the dummy labels
y_train_tensor = torch.tensor(y_train.values)
y_test_tensor = torch.tensor(y_test.values)

### Let's test that our model can take our training data and return a prediction

In [None]:
torch_model(X_train_tensor.float())[:10] # first 10 y_preds of our model

tensor([[0.2932, 0.5164, 0.1904],
        [0.3262, 0.5256, 0.1483],
        [0.2296, 0.5786, 0.1918],
        [0.3007, 0.5179, 0.1813],
        [0.2090, 0.6143, 0.1767],
        [0.3029, 0.5192, 0.1780],
        [0.3088, 0.5193, 0.1720],
        [0.3193, 0.5234, 0.1574],
        [0.3172, 0.5263, 0.1565],
        [0.2933, 0.5110, 0.1957]], grad_fn=<SliceBackward>)

### Setup your optimizer

In [None]:
# the optimizer takes in the parameters of your model as inputs to update them
# We'll leave the other parameters to their default value
optimizer = Adam_torch(torch_model.parameters())

## Let's go through one iteration of training and then convert that to a loop

In [None]:
# print original weights for reference
weights_original =  list(torch_model.parameters())
weights_original

[Parameter containing:
 tensor([[-0.4179,  0.0135,  0.0296, -0.1518],
         [ 0.3872, -0.1206, -0.4764,  0.3640],
         [-0.0504,  0.3720, -0.3487,  0.2162],
         [-0.0890,  0.4887,  0.2884,  0.3531],
         [-0.0863,  0.2321,  0.3914, -0.3927],
         [ 0.1562,  0.2368, -0.1866,  0.2092],
         [-0.2788,  0.3466, -0.1761,  0.0908],
         [ 0.0889,  0.4292, -0.4459,  0.1175]], requires_grad=True),
 Parameter containing:
 tensor([-0.3588, -0.1922, -0.1841, -0.3577,  0.4194, -0.0551, -0.2723, -0.4355],
        requires_grad=True),
 Parameter containing:
 tensor([[-0.0824, -0.0930, -0.0353,  0.2544, -0.0875, -0.2417, -0.0550, -0.2578],
         [-0.0805, -0.0113,  0.1739,  0.1341,  0.0944, -0.0948, -0.3415,  0.1262],
         [-0.0433,  0.1298, -0.1252, -0.0362,  0.0518, -0.2474, -0.3164, -0.2150]],
        requires_grad=True),
 Parameter containing:
 tensor([-0.0196,  0.3166, -0.1459], requires_grad=True)]

In [None]:
# Step 1: get your y_preds
y_preds_torch = torch_model(X_train_tensor.float())

# Step 2: Calculate your loss
loss = torch_loss(y_preds_torch, y_train_tensor)

# Step 3: Perform gradient descent using the built in method of PyTorch
torch_model.zero_grad()
loss.backward()

# Step 4: Step your optimizer
optimizer.step()

In [None]:
# Check if your weights are updated by comparing this to what you printed above
# when we printed 'weights_original'
weights_new = list(torch_model.parameters())
weights_new

[Parameter containing:
 tensor([[-0.4179,  0.0135,  0.0296, -0.1518],
         [ 0.3882, -0.1216, -0.4754,  0.3650],
         [-0.0514,  0.3710, -0.3497,  0.2152],
         [-0.0900,  0.4877,  0.2874,  0.3521],
         [-0.0853,  0.2311,  0.3924, -0.3917],
         [ 0.1552,  0.2358, -0.1856,  0.2102],
         [-0.2788,  0.3466, -0.1761,  0.0908],
         [ 0.0879,  0.4282, -0.4469,  0.1165]], requires_grad=True),
 Parameter containing:
 tensor([-0.3588, -0.1912, -0.1851, -0.3587,  0.4184, -0.0561, -0.2723, -0.4365],
        requires_grad=True),
 Parameter containing:
 tensor([[-0.0824, -0.0920, -0.0343,  0.2534, -0.0885, -0.2407, -0.0550, -0.2568],
         [-0.0805, -0.0123,  0.1729,  0.1331,  0.0934, -0.0958, -0.3415,  0.1252],
         [-0.0433,  0.1288, -0.1262, -0.0352,  0.0528, -0.2464, -0.3164, -0.2160]],
        requires_grad=True),
 Parameter containing:
 tensor([-0.0206,  0.3156, -0.1449], requires_grad=True)]

# Now that we have a basic understanding let's train our model
To do this you have to write your own loop to run epochs and batches

In [None]:
for epoch in range(50):
  # Step 1: get your y_preds
  y_preds_torch = torch_model(X_train_tensor.float())

  # Step 2: Calculate your loss
  loss = torch_loss(y_preds_torch, y_train_tensor)

  # Step 3: Perform gradient descent using the built in method of PyTorch
  torch_model.zero_grad()
  loss.backward()

  # Step 4: Step your optimizer
  optimizer.step()
  if epoch%10==0:
    print(f"Epoch {epoch} - loss: {loss}")

Epoch 0 - loss: 1.1460719108581543
Epoch 10 - loss: 1.1351532936096191
Epoch 20 - loss: 1.1250324249267578
Epoch 30 - loss: 1.1159694194793701
Epoch 40 - loss: 1.107917070388794


## Let's get the accuracy score for our train and test data

In [None]:
# get our train predictions
y_train_torch_preds = torch_model(X_train_tensor.float())
y_test_torch_preds = torch_model(X_test_tensor.float())
y_train_torch_preds[:10] # print out the first ten rows of predictions

tensor([[0.2529, 0.4146, 0.3325],
        [0.2701, 0.4135, 0.3165],
        [0.2604, 0.4164, 0.3232],
        [0.2597, 0.4139, 0.3264],
        [0.2568, 0.4254, 0.3178],
        [0.2593, 0.4124, 0.3283],
        [0.2629, 0.4160, 0.3211],
        [0.2650, 0.4092, 0.3258],
        [0.2447, 0.4143, 0.3411],
        [0.2559, 0.4121, 0.3319]], grad_fn=<SliceBackward>)

In [None]:
# convert our predictions into labels using the argmax function
y_train_torch_labels = torch.argmax(y_train_torch_preds, axis=1)
y_test_torch_labels = torch.argmax(y_test_torch_preds, axis=1)
y_train_torch_labels

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1])

In [None]:
# score them using accuracy metric with the original y_train
train_accuracy = torch.sum(y_train_tensor==y_train_torch_labels).float()/y_train_tensor.size()[0]
test_accuracy = torch.sum(y_test_tensor==y_test_torch_labels).float()/y_test_tensor.size()[0]
train_accuracy, test_accuracy

(tensor(0.3386), tensor(0.3043))

# These scores are not that great...why? 
Well in our keras model we updated our weights with batches of 50, so how can we do that in our torch model?

In [None]:
BATCH_SIZE=50
BATCHES = int(X_train_tensor.size()[0]/BATCH_SIZE) + 1 
BATCHES

3

In [None]:
# Let's do this again but with batches in our training process

In [None]:
# Instatiate a model
torch_model_2 = nn.Sequential(
    nn.Linear(4,  8),
    nn.ReLU(),
    nn.Linear(8, 3),
    nn.Softmax(dim=1)
)

In [None]:
# Set your optimizer again and this time put in the same default parameters as the 
# Adam optimizer from Keras
optimizer_2 = Adam_torch(torch_model_2.parameters(), lr=0.01, eps=1e-07, betas=[0.9, 0.999], amsgrad=False)

In [None]:
# Print the weights
weights = list(torch_model_2.parameters())
weights

[Parameter containing:
 tensor([[-0.4661,  0.2455, -0.0401, -0.3149],
         [ 0.0669,  0.0920,  0.4099,  0.0289],
         [ 0.3300,  0.4426, -0.3211,  0.4648],
         [ 0.0346, -0.4099,  0.0281, -0.0412],
         [ 0.4326,  0.4621,  0.1303, -0.2941],
         [-0.1068, -0.2049, -0.4479, -0.1129],
         [-0.2391, -0.0022,  0.3952, -0.3517],
         [ 0.1564, -0.3748, -0.2015,  0.0601]], requires_grad=True),
 Parameter containing:
 tensor([ 0.2731, -0.0801, -0.0281,  0.4276, -0.4954,  0.0574,  0.3185,  0.0614],
        requires_grad=True),
 Parameter containing:
 tensor([[ 0.0965, -0.1093, -0.1519, -0.1744,  0.1534, -0.0845,  0.0024,  0.1230],
         [-0.2349, -0.3046,  0.1011,  0.0566,  0.1607,  0.2407,  0.3435, -0.1156],
         [-0.0148, -0.2547, -0.2807, -0.3118,  0.1278,  0.2637,  0.0572,  0.3393]],
        requires_grad=True),
 Parameter containing:
 tensor([ 0.2538, -0.2690, -0.2696], requires_grad=True)]

In [None]:
%timeit
gradient_descent_counter = 0
for epoch in range(50):
  # we need to perform gradient descent in our batches loop!
  for batch in range(BATCHES):
    starting_index = batch*BATCH_SIZE 
    ending_index = (batch+1)*BATCH_SIZE 
    # because our batches don't evenly divide our row we need to do a try/except here
    try:
      X_train_batch = X_train_tensor[starting_index:ending_index]
      y_train_batch = y_train_tensor[starting_index:ending_index]
    except:
      X_train_batch = X_train_tensor[starting_index:]
      y_train_batch = y_train_tensor[starting_index:]
    y_preds_torch = torch_model_2(X_train_batch.float())
    loss = torch_loss(y_preds_torch, y_train_batch)
    torch_model_2.zero_grad()
    loss.backward()
    optimizer_2.step() 
    gradient_descent_counter += 1
  if epoch%10==0:
    print(f"Epoch {epoch} - loss: {loss}")

print(f"Gradient Descent Performed {gradient_descent_counter} times")

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.48 µs
Epoch 0 - loss: 1.1238813400268555
Epoch 10 - loss: 0.9576315879821777
Epoch 20 - loss: 0.8014950752258301
Epoch 30 - loss: 0.6788737177848816
Epoch 40 - loss: 0.6187527775764465
Gradient Descent Performed 150 times


In [None]:
def torch_scoring(model, X, y):
  y_preds = model(X.float())
  y_preds = torch.argmax(y_preds, axis=1)
  score = torch.sum(y==y_preds).float()/y.size()[0]  
  return score

In [None]:
train_score = torch_scoring(torch_model_2, X_train_tensor, y_train_tensor)
test_score = torch_scoring(torch_model_2, X_test_tensor, y_test_tensor)
train_score, test_score 

(tensor(0.9764), tensor(1.))

# Let's compare the weights of our 2 models

In [None]:
for keras_layer, torch_layer in zip(model.weights, torch_model_2.parameters()):
  print("Keras Layer")
  print(keras_layer.numpy().T)
  print("PyTorch Layer")
  print(torch_layer)
  print("Diffs")
  print(torch.tensor(keras_layer.numpy().T) - torch_layer)
  print("\n\n")

Keras Layer
[[ 0.3996323   0.3691409  -0.500964    0.22652559]
 [ 0.14377505 -0.1819542  -0.37560984  0.36558813]
 [ 0.5485858  -0.43272293 -0.09885065  0.5367494 ]
 [ 0.21328695 -0.5406608   0.5270672  -0.35551763]
 [-0.67407614 -0.03242999  0.49590892  0.34737724]
 [ 0.47560582 -0.12535208  0.16390562  0.20797464]
 [ 0.11479455 -0.06462967 -0.51813376  0.6005848 ]
 [ 0.40960282  0.40233585  0.16305096 -0.43853295]]
PyTorch Layer
Parameter containing:
tensor([[ 0.0221, -0.3198,  0.7998,  0.8409],
        [ 0.3466,  0.5842, -0.7123, -0.9262],
        [ 0.6306,  0.7011, -0.0882, -1.1945],
        [ 0.7451,  0.4183, -0.8575, -0.5783],
        [ 0.1850,  0.9270, -0.5356, -0.7872],
        [ 0.2716,  0.2639, -0.1019,  0.2651],
        [-0.4291,  0.1793,  0.7335,  1.1916],
        [ 0.1464, -0.6341,  1.0616,  0.6809]], requires_grad=True)
Diffs
tensor([[ 0.3775,  0.6890, -1.3008, -0.6144],
        [-0.2028, -0.7661,  0.3367,  1.2918],
        [-0.0820, -1.1339, -0.0106,  1.7312],
        [-

# Let's save each model 

In [None]:
import os
from joblib import dump, load 

In [None]:
os.mkdir("lesson/")
os.mkdir("lesson/model/")

In [None]:
model.save("./lesson/model/keras_model.pkl")

In [None]:
torch.save(torch_model_2, "./lesson/model/torch_model.pkl")

In [None]:
!cd lesson/model && ls -la

total 44
drwxr-xr-x 2 root root  4096 Jul  9 18:23 .
drwxr-xr-x 3 root root  4096 Jul  9 18:23 ..
-rw-r--r-- 1 root root 21288 Jul  9 18:23 keras_model.pkl
-rw-r--r-- 1 root root  9766 Jul  9 18:23 torch_model.pkl


In [None]:
%%timeit
model_loaded = keras.models.load_model("./lesson/model/keras_model.pkl")

1 loop, best of 3: 378 ms per loop


In [None]:
%%timeit
torch_loaded = torch.load("./lesson/model/torch_model.pkl")

100 loops, best of 3: 6 ms per loop


# Viewing on tensorboard (WIP)

In [None]:
import torchvision
from torch.utils.tensorboard import SummaryWriter

In [None]:
writer = SummaryWriter("./lesson/runs/")

In [None]:
writer.flush()

In [None]:
writer.add_graph(torch_model_2, X_train_tensor.float())
writer.close()



In [None]:
tensorboard --logdir="./lesson/runs/"