### topics

* cross entropy loss
* rnn example (see: https://wizardforcel.gitbooks.io/pytorch-zero-to-all/content/12_3_hello_rnn_seq.html)

### Cross Entropy

* cross-entropy is a measure from information theory
* quantifies the difference between two probability distributions

$$H(X;P;Q) = -\sum_{x \in \Omega} P(X=x) \log Q(X=x)$$

* in deep learning, Q is the distribution predicted by the model, i.e. the likelihood of the data given the model parameters
* P is the ground truth (the gold label)
* Q is obtained by applying softmax to the network outputs (in the case of more than 2 classes)
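* because P is one-hot (1 for the gold class, 0 for every other class), only one term of the sum survives and the formula reduces to: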

$$-\log\left(\frac{e^{x_i}}{\sum_j e^{x_j}}\right)$$
 where $x_i$ is the output of the neural net (the logit) for the gold class $i$ and $j$ runs over all classes
 
 
see: https://gombru.github.io/2018/05/23/cross_entropy_loss/ for a detailed discussion


### Cross Entropy Loss

according to the documentation:

* this criterion combines nn.LogSoftmax() and nn.NLLLoss() in one single class.
* it is useful when training a classification problem with C classes. 


In [None]:
# illustration

import torch
import torch.nn as nn
from torch.autograd import Variable
import numpy as np

# Worked example: nn.CrossEntropyLoss on one sample with two classes,
# cross-checked against a manual -log(softmax(x)) computation.
criterion = nn.CrossEntropyLoss()

# Raw network outputs (logits) with shape (batch=1, classes=2).
# Named `logits` rather than `input` so the builtin input() is not shadowed.
logits = torch.tensor([[1.0, 0.5]], requires_grad=True)

# Gold class index for each sample in the batch, shape (batch=1,).
target = torch.tensor([0], dtype=torch.long)

# Scalar loss: -log(softmax(logits)[target]), averaged over the batch.
loss = criterion(logits, target)

# Manual check: negative log-softmax for every class of that single sample ...
sloss = -torch.log(torch.softmax(torch.tensor([1.0, 0.5]), dim=0))
# ... of which only the entry at the gold index (0) is the actual loss.
closs = sloss[0]

# Notebook display: `loss` and `closs` should show the same value.
loss, sloss, closs

In [None]:
# Seed the RNG once so weight initialisation (and hence training) is
# reproducible. Nothing random happens in this setup section.
torch.manual_seed(0)

# Vocabulary: a character's position in this list is its class index.
idx2char = ['h', 'i', 'e', 'l', 'o']

# Task: teach the network to map "hihell" -> "ihello" (next-character
# prediction). The input in index form is [0, 1, 0, 2, 3, 3]; below it is
# spelled out as one-hot rows with shape (batch=1, seq_len=6, vocab=5).
x_one_hot = [[[1, 0, 0, 0, 0],   # h 0
              [0, 1, 0, 0, 0],   # i 1
              [1, 0, 0, 0, 0],   # h 0
              [0, 0, 1, 0, 0],   # e 2
              [0, 0, 0, 1, 0],   # l 3
              [0, 0, 0, 1, 0]]]  # l 3

# Targets as class indices, one per time step: "ihello".
# CrossEntropyLoss uses each index to pick the entry of the log-softmax
# output whose negative is the loss for that step; e.g. the first target is 1
# because the expected character is "i", so the loss is read at position 1.
y_data = [1, 0, 2, 3, 3, 4]

# There is only one batch, so the tensors are built once up front.
# (The Variable wrapper is unnecessary since PyTorch 0.4; tensors carry
# autograd state themselves.)
inputs = torch.tensor(x_one_hot, dtype=torch.float32)
labels = torch.tensor(y_data, dtype=torch.long)

# Model/data dimensions, derived from the data so they cannot drift apart.
num_classes = len(idx2char)      # 5 distinct characters
input_size = num_classes         # one-hot size
hidden_size = num_classes        # RNN output is used directly as class scores
batch_size = len(x_one_hot)      # one sentence
sequence_length = len(y_data)    # |ihello| == 6
num_layers = 1                   # one-layer rnn

class RNN(nn.Module):
    """Plain (Elman) RNN whose hidden state is used directly as class scores.

    There is no output projection layer: the hidden vector at each time step
    is returned as the logits for that step, so ``hidden_size`` must equal
    ``num_classes``.

    Args:
        num_classes: number of target classes (width of the returned logits).
        input_size: size of each input vector (the one-hot width here).
        hidden_size: size of the RNN hidden state; must equal ``num_classes``.
        num_layers: number of stacked RNN layers.

    Raises:
        ValueError: if ``hidden_size`` differs from ``num_classes``.
    """

    def __init__(self, num_classes: int, input_size: int,
                 hidden_size: int, num_layers: int) -> None:
        super().__init__()

        # forward() reinterprets the hidden state as per-class scores; with
        # mismatched sizes the reshape would silently scramble the rows.
        if hidden_size != num_classes:
            raise ValueError(
                "hidden_size (%d) must equal num_classes (%d): the hidden "
                "state is used directly as the class scores"
                % (hidden_size, num_classes)
            )

        self.num_classes = num_classes
        self.num_layers = num_layers
        self.hidden_size = hidden_size

        # num_layers is forwarded so the module matches the h_0 built in
        # forward(); batch_first=True means inputs are (batch, seq, feature).
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the RNN over ``x`` and return per-step logits.

        Args:
            x: input of shape (batch, seq_len, input_size).

        Returns:
            Tensor of shape (batch * seq_len, num_classes), rows ordered
            batch-major, ready to be passed to ``CrossEntropyLoss``.
        """
        # Initial hidden state: (num_layers, batch, hidden_size). This shape
        # does not depend on batch_first. new_zeros keeps x's dtype/device.
        h_0 = x.new_zeros(self.num_layers, x.size(0), self.hidden_size)

        # out: (batch, seq_len, hidden_size); the final hidden state is unused.
        out, _ = self.rnn(x, h_0)

        # reshape (not view): the batch_first output may be non-contiguous.
        return out.reshape(-1, self.num_classes)


# Instantiate RNN model
rnn = RNN(num_classes, input_size, hidden_size, num_layers)

# Set loss and optimizer function
# CrossEntropyLoss = LogSoftmax + NLLLoss, so the model outputs raw scores.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(rnn.parameters(), lr=0.1)

# Train the model on the single batch for a fixed number of epochs.
num_epochs = 10
for epoch in range(num_epochs):
    outputs = rnn(inputs)              # one row of class scores per time step
    optimizer.zero_grad()              # clear gradients from the previous epoch
    loss = criterion(outputs, labels)  # labels give the index whose log-prob is the loss
    loss.backward()
    optimizer.step()

    # Predicted class per time step = index of the largest score in each row.
    # These come from the forward pass made before this epoch's update.
    idx = outputs.argmax(dim=1).numpy()

    result_str = [idx2char[c] for c in idx]
    print("Predicted string: ", ''.join(result_str))

print("\n\nthe final suggestion",idx,"=",result_str,"according to vocab",idx2char)
