In [26]:
import torch
import numpy as np
import torch.nn.functional as F
from torch.autograd import Variable

## 2D Convolutional Operation

In [None]:
# 6x6 example matrix used as the convolution input (one row per line)
B = [
    [1, 3, 1, 3, 5, 4],
    [0, 3, 1, 3, 0, 0],
    [20, 3, 1, 3, -1, -1],
    [2, 0, 1, -3, 5, 4],
    [-2, 0, 0, -7, 1, 2],
    [10, 0, 0, 0, 1, 8],
]

In [None]:
# the 6x6 matrix B as a float tensor
A = torch.FloatTensor(B)
# 3x3 filter (kernel): every row is [1, 0, -1]
f = torch.FloatTensor([[1, 0, -1]] * 3)

In [None]:
f

In [None]:
A

Here is the convolution of `A` and `f` which is a 4x4 tensor.

In [None]:
# Apply the 3x3 filter to the 6x6 matrix; both are reshaped to 4-D
# (two leading dimensions of size 1) before being passed to conv2d.
F.conv2d(
    Variable(A.view(1, 1, 6, 6)),
    Variable(f.view(1, 1, 3, 3)),
)

To compute the first element of the output tensor, we take the element-wise product of the top-left 3x3 sub-matrix of `A` (defined below) and `f`, and then sum the results.

In [None]:
# top-left 3x3 sub-matrix of the 6x6 example matrix
A1 = [[1, 3, 1], [0, 3, 1], [20, 3, 1]]
# Element-wise product with the filter `f`, then sum: this reproduces the
# first (top-left) entry of the convolution output computed above.
(torch.FloatTensor(A1) * f).sum()

**Question:** What is the size of the output tensor after a convolution of an $n \times n$ tensor with an $f \times f$ filter?

You can check that it is $(n - f + 1) \times (n - f + 1)$. That is why we get $6 - 3 + 1 = 4$, that is, a $4 \times 4$ tensor.

## Edge detector
Here is an illustration of how the filter `f` can be seen as an edge detector.

In [None]:
import matplotlib.pyplot as plt

In [None]:
# one image row: bright (10) on the left half, dark (0) on the right half
b = [10, 10, 10, 0, 0, 0]
# stack six copies of that row into a 6x6 array
B = np.array([b] * 6)
A = torch.FloatTensor(B)
B

In [None]:
plt.imshow(B, cmap=plt.cm.gray)

In [None]:
plt.imshow(f, cmap=plt.cm.gray)

In [None]:
# Apply the 3x3 filter to the half-bright / half-dark 6x6 image
C = F.conv2d(
    Variable(A.view(1, 1, 6, 6)),
    Variable(f.view(1, 1, 3, 3)),
)
C

In [None]:
# unwrap the Variable, convert to NumPy and reshape to a 4x4 array for plotting
D = C.data.numpy().reshape((4, 4))

Here is our edge.

In [None]:
plt.imshow(D, cmap=plt.cm.gray)

## Padding 
Padding is an operation that adds a border with zeros around the image. Padding is important for these reasons:

* Padding allows the size of the output of a convolution to be the same as the size of the input. This is especially important when building deep neural networks.
* Without padding, the interior pixels are used more often than the edge pixels.

In [None]:
# same convolution as before, now with padding=1
F.conv2d(
    Variable(A.view(1, 1, 6, 6)),
    Variable(f.view(1, 1, 3, 3)),
    padding=1,
)

**Question:** What is the size of the output tensor after a convolution of an $n \times n$ tensor with an $f \times f$ filter if we use padding $p$?

You can check that it is $(n - f + 1 + 2p) \times (n - f + 1 + 2p)$. With $p = 1$ we get $6 - 3 + 1 + 2 \cdot 1 = 6$, that is, a $6 \times 6$ tensor.

**Choice of convolution type:**
* "Valid": means no padding.
* "Same": pad so that the output size is the same as the input size.
    
For a "Same" convolution you want $n - f + 1 + 2p = n$, which implies $p = \frac{f-1}{2}$. Since $p$ must be an integer, $f$ has to be odd; that is one of the reasons you may want to use an odd filter size. Filter sizes are typically 3, 5, 7, 9, 11.

## Stride
Stride controls how the filter convolves around the input. In the previous examples, the filter convolves around the input by shifting one unit at a time. The amount by which the filter shifts is the stride. 

In [None]:
# no padding, and the filter moves two positions at a time
F.conv2d(
    Variable(A.view(1, 1, 6, 6)),
    Variable(f.view(1, 1, 3, 3)),
    padding=0,
    stride=2,
)

The final output has dimensions $\left(\left\lfloor \frac{n - f + 2p}{s} \right\rfloor + 1\right) \times \left(\left\lfloor \frac{n - f + 2p}{s} \right\rfloor + 1\right)$; that is, if the fraction is not an integer we take the floor of that number.

## Color images

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
# Read the photo from the working directory and display it.
# `img` is indexed as img[:, :, channel] in the cells below.
img = plt.imread("Ari.jpg")
plt.imshow(img)

In [None]:
img.shape

In [None]:
# take one channel
A = img[:,:,0]
A.shape

In [None]:
plt.imshow(A, cmap=plt.cm.gray)

In [None]:
# Add two leading dimensions of size 1 in front of the single-channel image.
# The height and width are taken from `A` itself instead of being hard-coded
# as 4032 x 3024, so the cell also works for an image of a different size.
A1 = torch.FloatTensor(A).view(1, 1, *A.shape)
# the 3x3 filter in the 4-D layout conv2d expects
f1 = f.view(1, 1, 3, 3)
C = F.conv2d(Variable(A1), Variable(f1))

In [None]:
C.size()

In [None]:
# Convert the convolution result to NumPy and keep only its last two
# dimensions for plotting. The 2-D size is read from the array itself rather
# than hard-coded as (4030, 3022), so the cell does not depend on the image size.
D = C.data.numpy()
D = D.reshape(D.shape[-2:])

In [None]:
plt.imshow(D, cmap=plt.cm.gray)

In [None]:
# stack three copies of the single-channel filter along dimension 1
f3 = torch.cat([f1] * 3, dim=1)

In [None]:
f3.size()

In [None]:
# `img` is indexed as (height, width, channel), while conv2d expects
# (batch, channel, height, width). The axes must be reordered with permute():
# a plain .view(1, 3, H, W) only reinterprets the memory layout and would mix
# pixels from different channels and positions.
A3 = torch.FloatTensor(img).permute(2, 0, 1).contiguous().unsqueeze(0)
C3 = F.conv2d(Variable(A3), Variable(f3))

In [None]:
C3.size()

In [None]:
# Convert the 3-channel convolution result to NumPy and keep only its last two
# dimensions for plotting. The 2-D size is read from the array itself rather
# than hard-coded as (4030, 3022).
D = C3.data.numpy()
D = D.reshape(D.shape[-2:])

In [None]:
plt.imshow(D, cmap=plt.cm.gray)

Note the dimensions of the convolution.

## Max pooling 

In [None]:
# the same 6x6 example matrix used in the convolution section
B = [
    [1, 3, 1, 3, 5, 4],
    [0, 3, 1, 3, 0, 0],
    [20, 3, 1, 3, -1, -1],
    [2, 0, 1, -3, 5, 4],
    [-2, 0, 0, -7, 1, 2],
    [10, 0, 0, 0, 1, 8],
]
A = torch.FloatTensor(B)
A

In [None]:
# 2x2 max pooling with stride 2
F.max_pool2d(
    Variable(A.view(1, 1, 6, 6)),
    kernel_size=2,
    stride=2,
)

In [None]:
# 3x3 max pooling with stride 1
F.max_pool2d(
    Variable(A.view(1, 1, 6, 6)),
    kernel_size=3,
    stride=1,
)

## Simplest Transfer learning pipeline with CNN 

In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils, models, datasets
from torch.autograd import Variable

### Import data

In [2]:
# Root folder of the dataset; the `train/` and `valid/` sub-folders are read
# with ImageFolder below. Change this one line to point at another location.
DATA_DIR = "/data/yinterian/dogscats"

# Per-channel normalisation shared by the training and validation pipelines.
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])

# Training pipeline: random crop and random horizontal flip (data augmentation).
data_transform = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
# Validation pipeline: deterministic resize + center crop. Random augmentation
# must not be applied here, otherwise validation metrics change from run to run.
valid_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        normalize,
    ])

train = datasets.ImageFolder(root=DATA_DIR + "/train/",
                             transform=data_transform)
train_loader = torch.utils.data.DataLoader(train,
                                           batch_size=64, shuffle=True,
                                           num_workers=4)
valid = datasets.ImageFolder(root=DATA_DIR + "/valid/",
                             transform=valid_transform)
# No shuffling for validation: the order does not matter for evaluation and a
# fixed order keeps the results reproducible.
valid_loader = torch.utils.data.DataLoader(valid,
                                           batch_size=64, shuffle=False,
                                           num_workers=4)

In [3]:
len(train)

23000

In [4]:
len(valid)

2000

In [5]:
train[0]

(
 ( 0 ,.,.) = 
   2.2318  2.1633  2.1290  ...   2.1633  2.1633  2.1633
   2.2318  2.1633  2.1290  ...   2.1633  2.1633  2.1633
   2.2318  2.1633  2.1462  ...   2.1633  2.1633  2.1633
            ...             ⋱             ...          
   2.2318  2.2318  2.2318  ...  -0.3198 -0.3198 -0.3027
   2.2489  2.2489  2.2489  ...  -0.3027 -0.3198 -0.3027
   2.2489  2.2489  2.2489  ...  -0.2856 -0.3027 -0.2856
 
 ( 1 ,.,.) = 
   1.8508  1.7983  1.7633  ...   1.8508  1.8508  1.8508
   1.8508  1.7808  1.7458  ...   1.8508  1.8508  1.8508
   1.8508  1.7808  1.7108  ...   1.8508  1.8508  1.8508
            ...             ⋱             ...          
   1.6057  1.7108  1.7633  ...  -0.7052 -0.7052 -0.6877
   1.7458  1.8333  1.8859  ...  -0.6877 -0.7052 -0.6877
   1.8333  1.8859  1.9384  ...  -0.6702 -0.6877 -0.6702
 
 ( 2 ,.,.) = 
   0.5485  0.5136  0.4788  ...   0.7751  0.7925  0.7925
   0.5485  0.4962  0.4439  ...   0.7751  0.7925  0.7925
   0.5485  0.4614  0.3916  ...   0.7751  0.7925  0.7925


### Load pre-trained VGG16 model

In [6]:
vgg16 = models.vgg16(pretrained=True)

This implementation of VGG has two blocks: `features` (convolutional layers) and `classifier` (fully connected layers).

In [7]:
vgg16

VGG(
  (features): Sequential(
    (0): Conv2d (3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace)
    (2): Conv2d (64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace)
    (4): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), dilation=(1, 1))
    (5): Conv2d (64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace)
    (7): Conv2d (128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace)
    (9): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), dilation=(1, 1))
    (10): Conv2d (128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace)
    (12): Conv2d (256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace)
    (14): Conv2d (256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace)
    (16): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), dilation=(1, 1))
    (17): Conv2d (256, 512, kernel_size=(3, 3), 

### Predict one image

In [8]:
vgg16 = vgg16.cuda()

In [9]:
x, y = train[0]
print(x.shape, y)

torch.Size([3, 224, 224]) 0


In [10]:
x[0]


 1.8208  1.8379  1.8722  ...   2.1119  2.0948  2.0777
 1.8208  1.8550  1.8893  ...   2.1119  2.1119  2.0777
 1.8550  1.8893  1.9064  ...   2.0948  2.0948  2.0777
          ...             ⋱             ...          
 0.8618  0.8618  0.8789  ...  -2.0665 -2.0665 -2.0665
 0.8447  0.8447  0.8618  ...  -2.0665 -2.0665 -2.0665
 0.8276  0.8447  0.8447  ...  -2.0665 -2.0665 -2.0665
[torch.FloatTensor of size 224x224]

In [11]:
X = Variable(x.unsqueeze(0).cuda())

In [12]:
# Put both sub-modules of the network in evaluation mode before predicting
# (the classifier block contains Dropout layers).
vgg16.features.eval()
vgg16.classifier.eval()

Sequential(
  (0): Linear(in_features=25088, out_features=4096)
  (1): ReLU(inplace)
  (2): Dropout(p=0.5)
  (3): Linear(in_features=4096, out_features=4096)
  (4): ReLU(inplace)
  (5): Dropout(p=0.5)
  (6): Linear(in_features=4096, out_features=1000)
)

In [13]:
x_feature = vgg16.features(X)
x_feature.size()

torch.Size([1, 512, 7, 7])

In [14]:
x_flatten = x_feature.view(x_feature.size(0), -1)
x_flatten.size()

torch.Size([1, 25088])

In [15]:
y_hat = vgg16.classifier(x_flatten)
y_hat

Variable containing:
-2.2531  1.6427 -1.8693  ...  -2.1631  3.3355  4.3211
[torch.cuda.FloatTensor of size 1x1000 (GPU 0)]

In [16]:
y_np = y_hat.data.cpu().numpy()
y_np[0].shape

(1000,)

### Use the network as a feature extractor

In [30]:
# to keep features and outputs
F = []
Y = []

In [31]:
for i, (imgs, labels) in enumerate(train_loader):  
    images = Variable(imgs.cuda())
    features = vgg16.features(images)
    x_flatten = features.view(features.size(0), -1)
    y_hat = vgg16.classifier(x_flatten) ## linear layer with 1000 classes
    f = y_hat.data.cpu().numpy()
    F.append(f)
    Y.append(labels)    

In [32]:
YY = np.concatenate(Y)

In [33]:
FF = np.concatenate(F)
FF.shape

(23000, 1000)