In [1]:
# https://www.d2l.ai/chapter_appendix-mathematics-for-deep-learning/information-theory.html
import torch
from torch.nn import NLLLoss

In [4]:
def nansum(x):
    """Sum the elements of tensor ``x`` that are not NaN.

    NaN entries are filtered out with a boolean mask before summing, so a
    tensor consisting only of NaNs sums to zero.

    Args:
        x: a tensor accepted by ``torch.isnan``.

    Returns:
        A 0-dim tensor holding the sum of the non-NaN elements.
    """
    keep = torch.isnan(x).logical_not()
    finite_part = x[keep]
    return finite_part.sum()

def self_information(p):
    """Return the self-information ``-log2(p)`` of an event, in bits.

    Args:
        p: the event's probability; it is wrapped with ``torch.tensor`` before
            the logarithm is taken.

    Returns:
        A plain Python float.
    """
    prob = torch.tensor(p)
    log_prob = torch.log2(prob).item()
    # The sign flip is applied to the extracted Python float, not the tensor.
    return -log_prob

# 1/64 == 2**-6, so this event carries exactly 6 bits of information.
self_information(1/64)

6.0

In [5]:
def entropy(p):
    """Entropy, in bits, of the probabilities stored in tensor ``p``.

    Computes ``-p * log2(p)`` elementwise and adds the terms up with
    ``nansum``. A zero probability gives ``0 * -inf = nan``; ``nansum`` skips
    that term, so it contributes nothing to the total.

    Args:
        p: tensor of probabilities.

    Returns:
        The tensor scalar produced by ``nansum``.
    """
    per_outcome_bits = -(p * torch.log2(p))
    return nansum(per_outcome_bits)

# Entropy of a four-outcome distribution; the probabilities sum to 1.
entropy(torch.Tensor([0.1, 0.5, 0.1, 0.3]))

tensor(1.6855)

In [7]:
def joint_entropy(p_xy):
    """Joint entropy, in bits, of the probability table ``p_xy``.

    Every cell contributes ``-p_xy * log2(p_xy)``; the terms are added with
    ``nansum``, which skips cells whose term is NaN (for example a zero
    probability, where ``0 * -inf`` is NaN).

    Args:
        p_xy: tensor of joint probabilities.

    Returns:
        The tensor scalar produced by ``nansum``.
    """
    per_cell_bits = -(p_xy * torch.log2(p_xy))
    return nansum(per_cell_bits)

# The four cell probabilities sum to 1; the 2x2 layout does not affect the
# result because every cell is summed.
joint_entropy(torch.Tensor([[0.1, 0.5], [0.1, 0.3]]))

tensor(1.6855)

In [11]:
def conditional_entropy(p_xy, p_x):
    """Conditional entropy, in bits, from a joint table and a marginal.

    The conditional probabilities are formed as ``p_xy / p_x`` using ordinary
    tensor broadcasting, so ``p_x`` must be broadcastable against ``p_xy``;
    which axis of ``p_xy`` it lines up with is decided by the shapes the
    caller passes. The terms ``-p_xy * log2(p_xy / p_x)`` are then added with
    ``nansum``, which skips NaN terms.

    Args:
        p_xy: tensor of joint probabilities.
        p_x: tensor of marginal probabilities, broadcastable against ``p_xy``.

    Returns:
        The tensor scalar produced by ``nansum``.
    """
    ratio = p_xy / p_x
    weighted_logs = p_xy * torch.log2(ratio)
    return nansum(-weighted_logs)
    

    
# p_x has shape (2,), so it broadcasts along the last axis of p_xy: column j of
# the table is divided by p_x[j].
# NOTE(review): the p_xy entries sum to 1.1, and p_x = (0.2, 0.8) equals neither
# the column sums (0.3, 0.8) nor the row sums (0.6, 0.5) of this table --
# confirm the intended example values.
conditional_entropy(torch.Tensor([[0.1, 0.5], [0.2, 0.3]]), torch.Tensor([0.2, 0.8]))

tensor(0.8635)

In [15]:
def mutual_information(p_xy, p_x, p_y):
    """Mutual information, in bits, from a joint table and two marginals.

    Sums ``p_xy * log2(p_xy / (p_x * p_y))`` with ``nansum``, which skips NaN
    terms. ``p_x * p_y`` is a plain broadcast product, so the caller is
    responsible for passing shapes that broadcast to the intended table of
    marginal products.

    Args:
        p_xy: tensor of joint probabilities.
        p_x: tensor of marginal probabilities for the first variable.
        p_y: tensor of marginal probabilities for the second variable.

    Returns:
        The tensor scalar produced by ``nansum``.
    """
    marginal_product = p_x * p_y
    ratio = p_xy / marginal_product
    return nansum(p_xy * torch.log2(ratio))

# NOTE(review): p_x has shape (2,) and p_y has shape (1, 2), so `p_x * p_y`
# inside mutual_information is the elementwise product [[0.15, 0.2]], not a
# 2x2 outer product of the marginals. Also, p_x equals the column sums of p_xy
# (0.2, 0.8), while p_y = (0.75, 0.25) equals neither its row sums (0.6, 0.4)
# nor its column sums -- confirm the intended example values and shapes.
mutual_information(torch.Tensor([[0.1, 0.5], [0.1, 0.3]]), torch.Tensor([0.2, 0.8]), torch.Tensor([[0.75, 0.25]]))

tensor(0.7195)

In [16]:
def kl_divergence(p, q):
    """Absolute value of ``sum(p * log2(p / q))`` as a Python float.

    The elementwise terms are added with ``nansum``, so any term that
    evaluates to NaN (for example when ``p / q`` is negative) is left out of
    the total. The absolute value of that total is returned.

    Args:
        p: tensor for the first distribution.
        q: tensor for the second distribution, broadcastable against ``p``.

    Returns:
        A non-negative Python float.
    """
    log_ratio = torch.log2(p / q)
    total = nansum(p * log_ratio)
    return total.abs().item()

In [22]:
# Seed the global RNG so the three sample sets below are reproducible.
torch.manual_seed(1)

tensor_len = 10000

# Draw tensor_len samples from N(0, 1), N(-1, 1) and N(1, 1) and sort each set
# in ascending order. The draws happen in the order p, q1, q2; sorting does not
# consume any random numbers.
p = torch.normal(0, 1, (tensor_len, )).sort().values
q1 = torch.normal(-1, 1, (tensor_len, )).sort().values
q2 = torch.normal(1, 1, (tensor_len, )).sort().values

# NOTE(review): kl_divergence is given sorted raw samples here, not normalized
# probability vectors; terms where p / q is negative become NaN and are dropped
# by its nansum -- confirm this is the intended demonstration.
kl_pq1, kl_pq2 = kl_divergence(p, q1), kl_divergence(p, q2)

# Gap between the two values, expressed as a percentage of their mean.
similar_percentage = abs(kl_pq1 - kl_pq2) / ((kl_pq1 + kl_pq2) / 2) * 100

kl_pq1, kl_pq2, similar_percentage

(8582.0341796875, 8828.3095703125, 2.8290698237936858)