In [3]:
import numpy as np
import torch as tc
from torch.autograd import Variable
from torchvision import datasets, transforms

# 9. Softmax Classifier

For given number (logit) $z_j$ with index $j$, the softmax classifier $\sigma(z_j)$ returns the probability of $z_j$ out of all numbers $z_i$ where $i\in{N}$, and $N$ is the total number of indices.

<center>$\sigma(z_j) = \frac{e^{z_j}}{\sum_{k=1}^{N}e^{z_k}}$</center>

The output of softmax is equivalent to the probability $p(j)$ of $j$:

$$\sigma(z_j) = p(j).$$

For given $j$, we compute the loss by comparing the label with **one-hot label**, and we use **cross-entropy loss:**

$$D(\hat{Y_j}, Y_j) = -Y_j\log\hat{Y_j}$$

where $\hat{Y_j}$ is our predicted $p(j)$, and $Y_j$ is the one-hat label. Cross-entropy measures **differences between the distributions** of $Y$ and $\hat{Y}$. Total loss is the sum of all cross entropies:

$$L = \sum_{j \in N} D(\hat{Y_j}, Y_j)$$

## 9.1 Example: Cross entropy for 0, 1 and 2

In [3]:
Y = np.array([1,0,0])
# One hot
# 0: [1,0,0]
# 1: [0,1,0]
# 2: [0,0,1]

Y_pred1 = np.array([0.7, 0.2, 0.1]) # CORRECT PREDICTION: has highest probability for 0
Y_pred2 = np.array([0.1, 0.3, 0.6]) # WRONG PREDICTION: has highest probability for 2

print("loss1 = ", np.sum(-Y*np.log(Y_pred1)))
print("loss2 = ", np.sum(-Y*np.log(Y_pred2)))

loss1 =  0.356674943939
loss2 =  2.30258509299


`loss1` has smaller loss than `loss2` because `Y_pred1` gives a better prediction.


## 9.2 `CrossEntropyLoss()`: PyTorch implementation of Cross entropy

In PyTorch, we can implement `CrossEntropyLoss()`. It is different from the way we calculated in the above cell:

1. Label (`Y`) for `CrossEntropyLoss()` is **NOT ONE-HOT LABEL, BUT CLASS LABEL.**
2. Predicted label (`Y_pred`) is **not softmax, but logit** ($z_j$ in the above description).

In other words, `CrossEntropyLoss()` is the combination of softmax and cross entropy. (To be more precise, `CrossEntropyLoss` combines **`LogSoftMax` and `NLLLoss` in one single class**).

In [181]:
loss = tc.nn.CrossEntropyLoss()
Y = Variable(tc.LongTensor([0]), requires_grad = False) # zeroth class
Y_pred1 = Variable(tc.Tensor([[2.0, 1.0, 0.1]]))
Y_pred2 = Variable(tc.Tensor([[0.5, 2.0, 0.3]]))

l1 = loss(Y_pred1, Y)
l2 = loss(Y_pred2, Y)

print("Loss1 = ", l1)
print("Loss2 = ", l2)

Loss1 =  Variable containing:
 0.4170
[torch.FloatTensor of size 1]

Loss2 =  Variable containing:
 1.8406
[torch.FloatTensor of size 1]



In [70]:
loss = tc.nn.CrossEntropyLoss()
Y = Variable(tc.LongTensor([2, 0, 1]), requires_grad = False) # [2,0,1] corresponds to second, zeroth, first class.
Y_pred1 = Variable(tc.Tensor([[0.1, 0.2, 0.9], [0.7, 0.2, 0.1], [0.3, 0.6, 0.1]]))
Y_pred2 = Variable(tc.Tensor([[0.3, 0.3, 0.4], [0.4, 0.4, 0.2], [0.33, 0.33, 0.34]]))

l1 = loss(Y_pred1, Y)
l2 = loss(Y_pred2, Y)

print("Loss1 = ", l1.data)
print("Loss2 = ", l2.data)

Loss1 =  
 0.7623
[torch.FloatTensor of size 1]

Loss2 =  
 1.0571
[torch.FloatTensor of size 1]



You can use `NLLLoss()` (**negative log-likelihood**) to use softmax classifier. It uses log-softmax.

In [71]:
loss = tc.nn.NLLLoss()
Y = Variable(tc.LongTensor([2, 0, 1]), requires_grad = False)
Y_pred1 = Variable(tc.Tensor([[0.1, 0.2, 0.9], [0.7, 0.2, 0.1], [0.3, 0.6, 0.1]]))
Y_pred2 = Variable(tc.Tensor([[0.3, 0.3, 0.4], [0.4, 0.4, 0.2], [0.33, 0.33, 0.34]]))

l1 = loss(Y_pred1, Y)
l2 = loss(Y_pred2, Y)

print("Loss1 = ", l1.data)
print("Loss2 = ", l2.data)

Loss1 =  
-0.7333
[torch.FloatTensor of size 1]

Loss2 =  
-0.3767
[torch.FloatTensor of size 1]



## 9.3 Cross-entropy on MNIST Dataset

In [346]:
class Model(tc.nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.linear1 = tc.nn.Linear(784, 520)
        self.linear2 = tc.nn.Linear(520, 320)
        self.linear3 = tc.nn.Linear(320, 240)
        self.linear4 = tc.nn.Linear(240, 120)
        self.linear5 = tc.nn.Linear(120, 10)
        
    def forward(self, x):
        x = x.view(-1, 784) # YOU NEED TO FLATTEN THE VECTOR FIRST: (n, 1, 28, 28) -> (n, 784)
        x = tc.nn.functional.relu(self.linear1(x))
        x = tc.nn.functional.relu(self.linear2(x))
        x = tc.nn.functional.relu(self.linear3(x))
        x = tc.nn.functional.relu(self.linear4(x))
        x = self.linear5(x) # We don't need activation here because we use LOGIT.
        return x

In [348]:
train_dataset = datasets.MNIST(root='./Data/MNIST_Data/', train=True, download=True,
               transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]))
# transforms.Compose: Composes several transforms together.
# transforms.Normalize: Normalize an tensor image with mean and standard deviation.
test_dataset = datasets.MNIST(root='./Data/MNIST_Data/', train=False,
             transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]))

train_loader = tc.utils.data.DataLoader(dataset=train_dataset, shuffle=True, batch_size=64)
test_loader = tc.utils.data.DataLoader(dataset=test_dataset, shuffle=True, batch_size=64)

In line 2, you normalize with mean 0.1307 and standard deviation 0.3081. The mean and standard deviation are calculated from the entire distribution of data.

In [205]:
entire_train_data = train_loader.dataset.train_data.view(-1)
entire_train_data = entire_train_data.type(tc.Tensor)
print(entire_train_data.mean()/255) # 255 is the highest value of all elements (0~255).
print(entire_train_data.std()/255)

0.1306604762738429
0.30810780717887876


In [349]:
model = Model()
criterion = tc.nn.CrossEntropyLoss()
optimizer = tc.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

In [350]:
for epoch in range(10):
    model.train()
    for batch_idx, (inputs, outputs) in enumerate(train_loader):
        inputs, outputs = Variable(inputs), Variable(outputs)
        outputs_pred = model(inputs)
        
        loss = criterion(outputs_pred, outputs)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if batch_idx % 10 == 0:
            print('Train Epoch {} [{}/{} ({:.0f}%)]\t Loss: {:.6f}'
                  .format(epoch, batch_idx*len(inputs), len(train_loader.dataset),
                  100.*batch_idx/len(train_loader), loss.data[0]))



KeyboardInterrupt: 

It would make the code more readable if you define `train` module and `test` module.

When making `train` and `test` modules, **try to explicitly state your intent and set model.train() and model.eval() when necessary.**

Explicitly indicating `model.train()` and `model.eval()` matters for some layers; some layers (like BatchNorm, Dropout) have different behavior during train/and evaluation so setting it matters.

In [351]:
def train(epoch):
    # SETS MODEL IN TRAINING MODEL
    model.train()
    for batch_idx, (inputs, outputs) in enumerate(train_loader):
        inputs, outputs = Variable(inputs), Variable(outputs)
        
        optimizer.zero_grad()
        outputs_pred = model(inputs)
        loss = criterion(outputs_pred, outputs)
        loss.backward()
        optimizer.step()
        if batch_idx % 10 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'
                 .format(epoch, batch_idx*len(inputs), len(train_loader.dataset), 100.*batch_idx/len(train_loader),
                 loss.data[0]))

In [357]:
def test():
    # SETS MODEL IN EVALUATION MODEL
    model.eval()
    test_loss = 0
    correct = 0
    for data, target in test_loader:
        data, target = Variable(data, volatile=True), Variable(target)
        # volatile: Boolean indicating that the Variable should be used in
        # inference mode, i.e. don't save the history. See
        # :ref:`excluding-subgraphs` for more details.
        # Can be changed only on leaf Variables.

        output = model(data)
        # sum up batch loss
        # test_loss += criterion(output, target).data[0]
        test_loss += criterion(output, target, size_average=False).data[0]
        # get the index of the max log-prob
        pred = output.data.max(1, keepdim=True)[1]
        correct += pred.eq(target.data.view_as(pred)).cpu().sum()
        # What if you don't change the shape of target.data? Check at the bottom of the page.
        
    test_loss /= len(test_loader.dataset)
    print ('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'
           .format(test_loss, correct, len(test_loader.dataset), 100.*correct/len(test_loader.dataset)))

In [353]:
for epoch in range(3):
    train(epoch)
    test()



TypeError: forward() got an unexpected keyword argument 'size_average'

In [368]:
train_dataset = datasets.MNIST(root="./Data/MNIST_Data/", download=True, train=True,
                              transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]))
test_dataset = datasets.MNIST(root="./Data/MNIST_Data/", train=False,
                             transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]))


train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=64, num_workers=2)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=64, num_workers=2)

class Model(tc.nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.linear1 = tc.nn.Linear(28*28, 520)
        self.linear2 = tc.nn.Linear(520, 420)
        self.linear3 = tc.nn.Linear(420, 250)
        self.linear4 = tc.nn.Linear(250, 100)
        self.linear5 = tc.nn.Linear(100, 10)

        self.relu = tc.nn.functional.relu
    
    def forward(self, x):
        x = x.view(-1, 784)
        l1 = self.relu(self.linear1(x))
        l2 = self.relu(self.linear2(l1))
        l3 = self.relu(self.linear3(l2))
        l4 = self.relu(self.linear4(l3))
        return self.linear5(l4)
    
model = Model()
criterion = tc.nn.CrossEntropyLoss()
optimizer = tc.optim.SGD(model.parameters(), lr=0.01)

def train(epoch):
    model.train()
    for batch_idx, (inputs, outputs) in enumerate(train_loader, 0):
        inputs, outputs = Variable(inputs), Variable(outputs)
        pred_outputs = model(inputs)
        loss = criterion(pred_outputs, outputs)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if batch_idx % 10 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'
                 .format(epoch, batch_idx*len(inputs), len(train_loader.dataset), 100.*batch_idx/len(train_loader),
                 loss.data[0]))

        
def test():
    model.eval()
    correct = 0
    test_loss = 0
    for inputs, outputs in test_loader:
        inputs, outputs = Variable(inputs, volatile=True), Variable(outputs)
        pred_outputs = model(inputs)
        test_loss += criterion(pred_outputs, outputs).data[0]
        pred = pred_outputs.data.max(1, keepdim=True)[1]
        correct += pred.eq(outputs.data.view_as(pred)).sum()
        
    test_loss /= len(test_loader)
    print ('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'
       .format(test_loss, correct, len(test_loader.dataset), 100.*correct/len(test_loader.dataset)))

for epoch in range(2):
    train(epoch)
    test()


Test set: Average loss: 0.6052, Accuracy: 8282/10000 (83%)




Test set: Average loss: 0.3211, Accuracy: 9030/10000 (90%)



# Supplementary

## 1. What if you don't change the shape of target.data?

In [293]:
pred.eq(outputs.data)



Columns 0 to 12 
    1     0     0     0     0     0     0     0     0     0     0     0     0
    0     1     1     1     0     0     0     0     0     0     0     1     0
    0     0     0     0     0     0     1     1     0     0     0     0     0
    0     1     1     1     0     0     0     0     0     0     0     1     0
    0     0     0     0     1     0     0     0     0     0     0     0     0
    0     0     0     0     0     1     0     0     0     0     1     0     1
    1     0     0     0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     1     1     0     0     0     0     0
    0     0     0     0     0     0     0     0     1     1     0     0     0
    0     0     0     0     0     0     0     0     1     1     0     0     0
    0     0     0     0     0     1     0     0     0     0     1     0     1
    0     1     1     1     0     0     0     0     0     0     0     1     0
    0     0     0     0     0     1     0    

The above is the comparison between `pred` and `output`, which are different matrices (one is row and the other is column) but the same in size.

In [287]:
outputs.data


 9
 3
 3
 3
 6
 1
 2
 2
 0
 0
 1
 3
 1
 6
 7
 8
[torch.LongTensor of size 16]

In [366]:
pred


    4
    4
    4
    4
    4
    4
    4
    4
    4
    4
    4
    4
    4
    4
    4
    4
[torch.LongTensor of size 16x1]

In [291]:
pred.eq(outputs.data).numpy().diagonal()

array([1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=uint8)

In [292]:
pred.eq(outputs.data.view_as(pred))


    1
    1
    0
    1
    1
    1
    0
    1
    1
    1
    1
    1
    1
    1
    1
    1
[torch.ByteTensor of size 16x1]