In [1]:
import scipy.stats
import numpy as np

### data

In [2]:
# Joint probability table taken from "Elements of Information Theory"
# (Cover & Thomas), Example 2.2.1. All 16 entries sum to 1.
joint = np.array(
    [
        [1 / 8, 1 / 16, 1 / 32, 1 / 32],
        [1 / 16, 1 / 8, 1 / 32, 1 / 32],
        [1 / 16, 1 / 16, 1 / 16, 1 / 16],
        [1 / 4, 0, 0, 0],
    ]
)
joint

array([[0.125  , 0.0625 , 0.03125, 0.03125],
       [0.0625 , 0.125  , 0.03125, 0.03125],
       [0.0625 , 0.0625 , 0.0625 , 0.0625 ],
       [0.25   , 0.     , 0.     , 0.     ]])

In [3]:
# Marginal distributions obtained from the joint table.
p = joint.sum(axis=0)  # one total per column (summed over the rows)
q = joint.sum(axis=1)  # one total per row (summed over the columns)

### entropy

In [4]:
def entropy(p):
    """Return the Shannon entropy, in bits, of a discrete distribution.

    Parameters
    ----------
    p : array-like
        Probabilities of the outcomes. The values are used as given; they are
        not normalized to sum to 1.

    Returns
    -------
    float
        ``-sum(p_i * log2(p_i))`` taken over the non-zero entries of ``p``.
    """
    p = np.asarray(p, dtype=float)
    # Convention 0 * log(0) = 0: drop zero entries first, otherwise
    # 0 * log2(0) evaluates to 0 * -inf = nan and poisons the whole sum.
    support = p[p != 0]
    total = -np.sum(support * np.log2(support))
    # Adding 0.0 turns a possible -0.0 (e.g. for a point mass) into 0.0.
    return float(total) + 0.0

entropy(p), entropy(q)

(1.75, 2.0)

In [5]:
scipy.stats.entropy(p, base=2), scipy.stats.entropy(q, base=2)

(1.75, 2.0)

### KL Divergence

https://www.countbayesie.com/blog/2017/5/9/kullback-leibler-divergence-explained

In [6]:
def kl_divergence(p, q):
    """Return the Kullback-Leibler divergence D(p || q) in bits.

    Parameters
    ----------
    p, q : array-like
        Two discrete distributions with the same shape; entry ``i`` of each
        refers to the same outcome. The values are used as given (no
        normalization).

    Returns
    -------
    float
        ``sum(p_i * log2(p_i / q_i))`` over the entries where ``p_i != 0``.
        ``inf`` when some outcome has ``p_i != 0`` but ``q_i == 0``.

    Raises
    ------
    ValueError
        If ``p`` and ``q`` do not have the same shape.
    """
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    if p.shape != q.shape:
        # A longer q used to be silently truncated, giving a meaningless value.
        raise ValueError(
            f"p and q must have the same shape, got {p.shape} and {q.shape}"
        )

    # Terms with p_i == 0 contribute nothing (convention 0 * log(0 / q) = 0).
    support = p != 0
    p_s, q_s = p[support], q[support]

    # p puts mass where q has none: the divergence is infinite. Returning it
    # explicitly avoids a divide-by-zero RuntimeWarning.
    if np.any(q_s == 0):
        return float("inf")

    return float(np.sum(p_s * np.log2(p_s / q_s)))

In [7]:
kl_divergence(p, q), kl_divergence(q, p)

(0.25, 0.25)

In [8]:
scipy.stats.entropy(pk=p, qk=q, base=2), scipy.stats.entropy(pk=q, qk=p, base=2)

(0.24999999999999997, 0.25)

### mutual information

#### definition of mutual information
https://en.wikipedia.org/wiki/Mutual_information#Definition

In [9]:
def mutual_information(joint):
    """Return the mutual information, in bits, of a 2-D joint distribution.

    Computed directly from the definition: the sum over all cells of
    ``joint[j, i] * log2(joint[j, i] / (p[i] * q[j]))``, where ``p`` holds the
    column totals and ``q`` the row totals of ``joint``.

    Parameters
    ----------
    joint : array-like, 2-D
        Joint probability table (an ndarray or nested sequences).

    Returns
    -------
    float
        The mutual information between the row and column variables.

    Raises
    ------
    ValueError
        If ``joint`` is not two-dimensional.
    """
    joint = np.asarray(joint, dtype=float)
    if joint.ndim != 2:
        raise ValueError(f"joint must be 2-D, got {joint.ndim} dimension(s)")

    p = joint.sum(axis=0)  # column totals
    q = joint.sum(axis=1)  # row totals
    # product[j, i] = q[j] * p[i]: the table the joint would equal if the
    # row and column variables were independent.
    product = np.outer(q, p)

    # Empty cells contribute nothing (convention 0 * log(0) = 0). Skipping
    # them also avoids evaluating log2(0).
    nonzero = joint != 0
    return float(np.sum(joint[nonzero] * np.log2(joint[nonzero] / product[nonzero])))

In [10]:
# Elements of Information Theory, Cover, Thomas Example 2.4.1
mutual_information(joint)

0.375

#### mutual information vs. KL divergence

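Mutual information is the KL divergence between the joint distribution and the product of its marginals: $I(X;Y) = D\big(p(x,y)\,\|\,p(x)\,p(y)\big)$ — Elements of Information Theory, Cover & Thomas, Eq. (2.29)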

In [11]:
def mutual_information_kl(joint):
    """Mutual information expressed as a KL divergence.

    Flattens the joint table and the product of its marginals in the same
    cell order, then returns ``kl_divergence(joint, product of marginals)``.

    Parameters
    ----------
    joint : numpy.ndarray, 2-D
        Joint probability table.

    Returns
    -------
    The value returned by ``kl_divergence`` for the two flattened tables.
    """
    col_totals = joint.sum(axis=0)
    row_totals = joint.sum(axis=1)

    # Visit cells column by column so both flat lists line up entry for entry.
    cells = [
        (col, row)
        for col in range(len(col_totals))
        for row in range(len(row_totals))
    ]
    product_flat = [col_totals[col] * row_totals[row] for col, row in cells]
    joint_flat = [joint[row, col] for col, row in cells]

    return kl_divergence(joint_flat, product_flat)

In [12]:
mutual_information_kl(joint)

0.375

Reference:
- Elements of Information Theory, Cover, Thomas
- https://www.countbayesie.com/blog/2017/5/9/kullback-leibler-divergence-explained
- https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.entropy.html