In [2]:
import torch
import torch.nn as nn

# Toy setup: a target vocabulary of 5 tokens and a batch that holds a single
# sequence of 4 positions (batch_size = 1, seq_len = 4).
vocab_size_tgt = 5
batch_size = 1
seq_len = 4

# Assume the ID of the [PAD] token is 0
pad_id = 0

# Create example tensors for projection_output and decoder_target
torch.manual_seed(68)  # fixed seed so the random logits are reproducible
projection_output = torch.randn(batch_size, seq_len, vocab_size_tgt)  # size: (batch_size, seq_len, vocab_size)
decoder_target = torch.tensor([[1, 2, 3, 1]])  # size: (batch_size, seq_len)

# The targets are hard-coded, so fail loudly if they drift out of sync with
# the dimensions declared above.
assert decoder_target.shape == (batch_size, seq_len), (
    f"decoder_target has shape {tuple(decoder_target.shape)}, "
    f"expected {(batch_size, seq_len)}"
)

# Initialize loss function: positions whose target equals pad_id are ignored,
# and the remaining targets are smoothed with factor 0.1.
# None of the targets in this example equal pad_id, so all 4 positions count.
loss_function = nn.CrossEntropyLoss(ignore_index=pad_id, label_smoothing=0.1)

# Calculate loss: flatten the batch and sequence dimensions so that each row
# of `output` holds the logits for one position and `target` holds its token ID.
output = projection_output.view(-1, vocab_size_tgt)
target = decoder_target.view(-1)
loss = loss_function(output, target)

# Print loss
print("output: \n", output)
print("target: \n", target)
print("loss: \n", loss)

output: 
 tensor([[-0.6377,  0.7509,  0.3260, -0.8004,  0.8127],
        [-0.6685, -1.6276,  0.8615, -0.2605,  1.0766],
        [ 0.3043, -0.9517, -1.0953,  0.2755,  0.6838],
        [ 0.5590, -0.1398,  0.0232,  0.6718, -0.6836]])
target: 
 tensor([1, 2, 3, 1])
loss: 
 tensor(1.4373)


## Simple example

In [1]:
import torch
import torch.nn as nn

# Model scores for a batch of 2 samples over 3 classes:
# rows index samples, columns index classes.
# CrossEntropyLoss treats these values as raw scores (logits), not as
# probabilities, even though each row here happens to sum to 1.
output = torch.tensor([
    [0.1, 0.2, 0.7],
    [0.8, 0.1, 0.1],
])

# Ground-truth class index for each sample (one entry per row of `output`).
target = torch.tensor([2, 0])

# Cross-entropy criterion with PyTorch's default settings.
loss_fn = nn.CrossEntropyLoss()

# Evaluate the criterion on the batch and show the scalar result.
loss = loss_fn(output, target)
print(loss)

tensor(0.7288)


In [2]:
import numpy as np

# One-hot ground truth: the second class is the correct one.
y_true = np.array([0, 1])

# Probability the model assigns to each class.
y_pred = np.array([0.3, 0.7])

# Cross-entropy: weight the log-probabilities by the true distribution,
# add them up, and flip the sign.
log_pred = np.log(y_pred)
loss = -(y_true * log_pred).sum()
print(loss)

0.35667494393873245


In [5]:
# Sum of two negative-log terms, -log(0.7) and -log(0.8).
# NOTE(review): presumably the summed cross-entropy of two samples whose
# true-class probabilities are 0.7 and 0.8 — confirm the intended example.
-np.log(0.7) - np.log(0.8)

0.5798184952529422

In [6]:
import torch
import torch.nn.functional as F

# Model scores for a batch of 2 samples over 3 classes
# (rows index samples, columns index classes).
output = torch.tensor([
    [0.1, 0.2, 0.7],
    [0.8, 0.1, 0.1],
])

# Ground-truth class index for each sample.
target = torch.tensor([2, 0])

# Step 1: normalise every row of scores into a probability distribution.
probabilities = torch.softmax(output, dim=1)

# Step 2: for row i, keep only the probability of its true class target[i].
row_indices = range(len(target))
selected_probabilities = probabilities[row_indices, target]

# Step 3: negative log-likelihood, averaged over the batch.
mean_log_prob = torch.log(selected_probabilities).mean()
manual_loss = -mean_log_prob

print(manual_loss)

tensor(0.7288)


In [7]:
# Inspect the softmax output: one row per sample, one column per class.
probabilities

tensor([[0.2546, 0.2814, 0.4640],
        [0.5017, 0.2491, 0.2491]])

In [8]:
# Inspect the probability picked out for each sample's true class.
selected_probabilities

tensor([0.4640, 0.5017])

In [10]:
# Row indices used in the selection: one index per entry of `target`.
range(len(target))

range(0, 2)

In [11]:
# Inspect the true class indices, used as the column index in the selection.
target

tensor([2, 0])

In [13]:
# Hand check of the mean negative log-likelihood, using the selected
# probabilities as rounded in their printed form (0.4640 and 0.5017).
# Because the inputs are rounded, the result only agrees with the torch
# loss to the displayed precision (0.7288).
(-np.log(0.4640) - np.log(0.5017))/2

0.7288118371239057