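#### Beware: Keras takes targets first, then predictions! Also note that `binary_crossentropy` scores every element as an independent binary problem, so the result can look surprising: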

In [119]:
import keras
from keras import backend as K

# `a` holds the predicted probabilities, `b` the one-hot targets.
a = K.variable([0.1749, 0.1749, 0.1749, 0.4754])
b = K.variable([0.0, 0.0, 0.0, 1.0])

# Backend op: the argument order is (target, output), i.e. targets come first.
# Evaluating it yields one loss value per element.
c = K.binary_crossentropy(target=b, output=a, from_logits=False)
print(K.eval(c))

# Loss-function API: same targets-first ordering; it collapses the
# per-element values into one scalar (their mean over the last axis).
print(K.eval(keras.losses.binary_crossentropy(y_true=b, y_pred=a)))

[0.19225068 0.19225068 0.19225068 0.7435987 ]
0.3300877


#### You can also use the canonical formula and compute the cross-entropy directly from probabilities, where $p$ is the target distribution and $q$ the predicted one:
$$
\displaystyle H(p,q)=-\sum _{x\in {\mathcal {X}}}p(x)\,\log q(x)
$$

In [120]:
import numpy as np
# q(x): predicted probabilities; p(x): one-hot target distribution.
predictions = np.asarray([0.1749, 0.1749, 0.1749, 0.4754])
targets = np.asarray([0.0, 0.0, 0.0, 1.0])

# H(p, q) = -sum_x p(x) * log q(x), evaluated as an inner product.
# Left as the final expression so the notebook displays the value.
-(targets @ np.log(predictions))

0.7435987240560435

### Be careful with PyTorch as well: `nn.CrossEntropyLoss` takes LOGITS (not probabilities) as predictions, and class labels (not target probabilities) as targets:

In [124]:
import torch
from torch import nn
# Criterion that consumes raw scores (logits) plus integer class indices;
# the softmax/log step happens inside the loss itself.
loss = nn.CrossEntropyLoss()

# One sample with four classes: reshape the flat logits to (batch=1, classes=4).
logits = [0.0, 0.0, 0.0, 1.0]
input = torch.tensor(logits, requires_grad=True).view(1, len(logits))

# The target is the index of the correct class (3), one entry per sample.
target = torch.tensor([3], dtype=torch.long)

output = loss(input, target)

# Softmax over the class dimension, shown only to compare the implied
# probabilities with the probabilities used in the other examples.
probabilities = nn.functional.softmax(input, dim=1)

print("predicted logits: ", input)
print("predicted probabilities: ", probabilities)
print(target)
print(output)


predicted logits:  tensor([[0., 0., 0., 1.]], grad_fn=<ViewBackward>)
predicted probabilities:  tensor([[0.1749, 0.1749, 0.1749, 0.4754]], grad_fn=<SoftmaxBackward>)
tensor([3])
tensor(0.7437, grad_fn=<NllLossBackward>)
