In [2]:
import numpy as np
import scipy
from scipy import stats
import matplotlib.pyplot as plt
%matplotlib inline

# Seed NumPy's global RNG: the Monte Carlo cell in Problem 8 (2) draws its
# samples through np.random.multivariate_normal, which reads this global state.
np.random.seed(1234)

## Problem 1

Because conditional entropies of discrete variables are nonnegative ($H(X|Y) \geq 0$ and $H(Y|X) \geq 0$),
$$
I(X;Y) = H(X) - H(X|Y) \leq H(X)
$$
$$
I(X;Y) = H(Y) - H(Y|X) \leq H(Y)
$$
so
$$
I(X;Y) \leq \min(H(X), H(Y))
$$

## Problem 2

$$
\begin{aligned}
I(XZ;Y) &= H(XZ) - H(XZ|Y) \\
&= H(Z) + H(X|Z) - (H(Z|Y) + H(X|Y,Z)) \\
&= I(Z;Y) + I(X;Y|Z)
\end{aligned}
$$
Iteratively applying the above equation to $I(X_1,\dots,X_n;Y)$, we get
$$
I(X_1,\dots,X_n;Y) = \sum_{i=1}^n I(X_i;Y|X_1,\dots,X_{i-1})
$$

## Problem 3

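Assuming $X \to Y \to Z$ forms a Markov chain (so $X \perp Z \mid Y$ and hence $H(X|Y,Z) = H(X|Y)$),
$$
\begin{aligned}
I(X;Y) - I(X;Z) &= (H(X) - H(X|Y)) - (H(X) - H(X|Z)) \\
&= H(X|Z) - H(X|Y) \\
&= H(X|Z) - H(X|Y,Z) \\
&= I(X;Y|Z) \geq 0
\end{aligned}
$$
so $I(X;Y) \geq I(X;Z)$.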
Similarly $I(Y;Z) \geq I(X;Z)$.

## Problem 4

First we know that for binary variable X,
$$
0 \leq H(X) \leq 1\;\mathrm{bit}
$$
From $I(X;Y) = 0$, we know $X \perp Y$. And
$$
I(X;Y|Z) = H(Y|Z) - H(Y|X,Z) = 1 \mathrm{bit}
$$
Because $H(Y|Z) \leq 1\;\mathrm{bit}$ and $H(Y|X,Z) \geq 0$, the difference can equal 1 bit only if $H(Y|X,Z) = 0$ and
$$
H(Y|Z) = 1 \mathrm{bit}
$$
And because $H(Y|Z) \leq H(Y) \leq 1 \mathrm{bit}$, we get
$$
H(Y) = H(Y|Z)
$$
which means $Y \perp Z$. In the same way we have $X \perp Z$. Thus every pair is independent, but given $Z$, $X$ and $Y$ are not independent. In fact, because $H(Y|X,Z) = 0$, the relation between $Y$ and $X$ is deterministic given $Z$.
One example is 
$$
p(X=1) = 0.5,\;
p(Y=1) = 0.5,\;
X \perp Y,\;
Z = X \oplus Y
$$

| X,Y,Z | probability |
| --- | :---: |
| 0,0,0 | 0.25 |
| 0,0,1 | 0 |
| 0,1,0 | 0 |
| 0,1,1 | 0.25 |
| 1,0,0 | 0 |
| 1,0,1 | 0.25 |
| 1,1,0 | 0.25 |
| 1,1,1 | 0 |


## Problem 5

$$
\begin{aligned}
p(x|y,z) &= \frac{p(x,y,z)}{p(y,z)} \\
&= \frac{p(x)p(y|x)p(z|y)}{p(z|y)p(y)} \\
&= \frac{p(x)p(y|x)}{p(y)} \\
&= p(x|y)
\end{aligned}
$$
Thus it's direct to show that
$$
H(X|Y,Z) = H(X|Y)
$$
So
$$
\begin{aligned}
I(X;Y) - I(X;Y|Z) &= H(X) - H(X|Y) - H(X|Z) + H(X|Y,Z) \\
&= H(X) - H(X|Z) \\
&\geq 0
\end{aligned}
$$

## Problem 6

The Lagrangian is
$$
\mathcal{L}(p, \lambda) = -\int_a^b p(x)\log p(x)dx + \lambda(\int_a^b p(x)dx - 1)
$$
According to Euler-Lagrange formula
$$
\log p(x) + 1 - \lambda = 0
$$
so we get
$$
p(x) = e^{\lambda - 1}
$$
which means that $p(x)$ is a uniform distribution. We can further determine
$$
\lambda = 1 - \ln(b-a)
$$

## Problem 7

#### (1)
$$
\begin{aligned}
H(X_1,X_2) &= -\int p(x_1,x_2)\log p(x_1,x_2)\;dx_1dx_2 \\
&= -\int p(x_1,x_2)\left[-\ln 2\pi - \frac{1}{2}\ln |K| - \frac{1}{2}(x-\mu)^TK^{-1}(x-\mu)\right]\;dx_1dx_2 \\
&= \ln 2\pi + \frac{1}{2}\ln |K| + \frac{1}{2}\mathbb{E}\left[(x-\mu)^TK^{-1}(x-\mu)\right]
\end{aligned}
$$
Note that $(x-\mu)^TK^{-1}(x-\mu)$ is the sum of the squares of two independent standard normal variables, so it follows a $\chi^2$ distribution with 2 degrees of freedom, whose mean is 2. Therefore
$$
H(X_1,X_2) = \ln 2\pi + \frac{1}{2}\ln |K| + \frac{1}{2}\cdot 2 = \ln 2\pi + \frac{1}{2}\ln |K| + 1
$$

#### (2)
$$
X_1 \sim \mathcal{N}(\mu_1, K_{00})
$$
So $H(X_1) = \frac{1}{2}\ln (2\pi K_{00}e)$
$$
X_2 \sim \mathcal{N}(\mu_2, K_{11})
$$
$H(X_2) = \frac{1}{2}\ln (2\pi K_{11}e)$
$$
I(X_1,X_2) = H(X_1) + H(X_2) - H(X_1,X_2) = \frac{1}{2}\ln \frac{K_{00}K_{11}}{|K|}
$$

## Problem 8

#### (1)
$$
K = \begin{pmatrix}
1 & \rho \\
\rho & 1
\end{pmatrix}
$$
According to results in Problem 7
$$
I(X_1,X_2) = \frac{1}{2}\ln\frac{1}{1-\rho^2}
$$
So for different values of $\rho$, $I(X_1, X_2)$ are

In [6]:
def gaussian_mutual_information(rho):
    """Closed-form mutual information of a unit-variance bivariate Gaussian.

    Implements the Problem 7 result I(X1; X2) = -1/2 * ln(1 - rho^2).

    Parameters
    ----------
    rho : float or array-like
        Correlation coefficient between X1 and X2.

    Returns
    -------
    Mutual information in nats (natural logarithm), same shape as ``rho``.
    """
    return -0.5 * np.log(1. - rho**2)


for rho in [.2, .5, .9]:
    # Single-argument print(...) parses identically on Python 2 and Python 3.
    print('Corr = %s: I(X1, X2) = %.2f' % (rho, gaussian_mutual_information(rho)))

Corr = 0.2: I(X1, X2) = 0.02
Corr = 0.5: I(X1, X2) = 0.14
Corr = 0.9: I(X1, X2) = 0.83


#### (2)
$$
I(X_1, X_2) = \mathbb{E}_{p(x_1, x_2)}\ln\frac{p(x_1,x_2)}{p(x_1)p(x_2)}
$$
So we use 1000 samples to estimate the expectation

In [14]:
def estimate_gaussian_mutual_information(rho, n_samples=1000):
    """Monte Carlo estimate of I(X1; X2) for a unit-variance bivariate Gaussian.

    Draws ``n_samples`` points from the joint distribution using NumPy's
    global RNG and averages ln p(x1, x2) - ln p(x1) - ln p(x2) over them.

    Parameters
    ----------
    rho : float
        Correlation coefficient between X1 and X2.
    n_samples : int, optional
        Number of joint samples to draw (default 1000).

    Returns
    -------
    float
        Estimated mutual information in nats.
    """
    mean = [0, 0]
    cov = [[1, rho], [rho, 1]]
    samples = np.random.multivariate_normal(mean, cov, size=n_samples)
    # logpdf works in log space directly; log(pdf(...)) would give -inf
    # if a far-tail sample's density underflowed to 0.
    log_ratio = (stats.multivariate_normal.logpdf(samples, mean, cov)
                 - stats.norm.logpdf(samples[:, 0])
                 - stats.norm.logpdf(samples[:, 1]))
    return log_ratio.mean()


for rho in [.2, .5, .9]:
    # Single-argument print(...) parses identically on Python 2 and Python 3.
    print('Corr = %s, Estimated I(X1, X2) = %.2f'
          % (rho, estimate_gaussian_mutual_information(rho)))

Corr = 0.2, Estimated I(X1, X2) = 0.02
Corr = 0.5, Estimated I(X1, X2) = 0.15
Corr = 0.9, Estimated I(X1, X2) = 0.81
