### Office hours : covariance and mutual information

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Suppose we're interested in studying the association between smoking and developing cancer. We perform a random experiment by sampling adults of the US.

Let $X$ be the indicator of being a current smoker and $Y$ the indicator of him/her developing cancer during his/her life. 

In [14]:
arr = np.array([
    [0.72, 0.03],
    [0.2 , 0.05]]
)

In [16]:
arr

array([[0.72, 0.03],
       [0.2 , 0.05]])

Let's make a dataframe for adding labels. 

In [18]:
df_joint = pd.DataFrame(
    arr, 
    index = pd.Index(data = ['X = 0', 'X = 1'], name = 'smoker'),
    columns = pd.Index(data = ['Y = 0', 'Y = 1'], name = 'cancer')
)

In [19]:
df_joint

cancer,Y = 0,Y = 1
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1
X = 0,0.72,0.03
X = 1,0.2,0.05


Let's get the marginals. 

In [20]:
df_x = df_joint.sum(axis = 'columns')

In [21]:
df_x

smoker
X = 0    0.75
X = 1    0.25
dtype: float64

In [26]:
df_y = df_joint.sum(axis = 'rows')

In [27]:
df_y

cancer
Y = 0    0.92
Y = 1    0.08
dtype: float64

In [30]:
df_x[1]

0.25

In [31]:
df_y

cancer
Y = 0    0.92
Y = 1    0.08
dtype: float64

In [32]:
mean_x = df_x[1]
mean_y = df_y[1]

In [34]:
mean_x

0.25

The covariance between $X$ and $Y$, denoted $\text{Cov}(X,Y)$ is defined as
$$
\text{Cov}(X,Y) = \mathbb{E} \left[ (X - \mathbb{E}(X))( Y - \mathbb{E}(Y)) \right] \\[1em]
= \mathbb{E}_{P(X,Y)}(XY) - \mathbb{E}_X(X) \mathbb{E}_Y (Y)
$$
but a slightly easier to compute formula is the following: 


$$
\text{Cov}(X,Y) = \frac{1}{n} \sum_{i = 1}^n \left( (x_i - \mathbb{E}(X)) (y_i - \mathbb{E}(y)) \right)
$$

In [40]:
cov = 1/4 * ((1-mean_x)*(1-mean_y) + (0-mean_x)*(0 - mean_y))

cov = np.round(cov, 4)

print(f'The covariance is {cov}')

The covariance is 0.1775


Remember that we can get the joints by: 

$$
P(X|Y) = \frac{P(X,Y)}{P(Y)}\\[2em]
P(Y|X) = \frac{P(X,Y)}{P(X)}
$$

In [42]:
df_joint

cancer,Y = 0,Y = 1
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1
X = 0,0.72,0.03
X = 1,0.2,0.05


In [45]:
df_xgy = df_joint.div(df_y, axis = 1)

In [46]:
df_xgy

cancer,Y = 0,Y = 1
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1
X = 0,0.782609,0.375
X = 1,0.217391,0.625


In [47]:
df_ygx = df_joint.div(df_x, axis = 0)

In [48]:
df_ygx

cancer,Y = 0,Y = 1
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1
X = 0,0.96,0.04
X = 1,0.8,0.2


In [56]:
df_joint

cancer,Y = 0,Y = 1
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1
X = 0,0.72,0.03
X = 1,0.2,0.05


In [53]:
df_xgy.values * df_y.values

array([[0.72, 0.03],
       [0.2 , 0.05]])

In [55]:
df_ygx.values * df_x.values.reshape(-1,1)

array([[0.72, 0.03],
       [0.2 , 0.05]])

Let's now calculate the mutual information by computing first the conditional entropy $H(X|Y)$ , and then the mutual information. 

In [57]:
def element_wise_information(px):
    """
    Returns a numpy array with element wise information calculated as -log_2(p_i).
    This quantity is also know as information or self-information:
    https://en.wikipedia.org/wiki/Information_content
    
    Params
    ------
    px (np.array)
        Array of individual probabilities, i.e. a probability vector or distribution.
    
    Returns
    -------
    information (np.array)
        Array of element-wise information content.
    """
    if isinstance(px, list):
        px = np.array(px)
        
    # Make a copy of input array
    information_content = px.copy()
    
    # Get indices of nonzero probability values
    nz = np.nonzero(information_content)
    
    # Compute -log_2(p_i) element-wise
    information_content[nz] *= - np.log2(information_content[nz])
    
    return information_content

In [62]:
def entropy(px): 
    
    # Get nonzero indices 
    nz = np.nonzero(px)
    
    # Compute entropy 
    entropy = np.sum(-px[nz]*np.log2(px[nz]))
    
    return np.round(entropy, 4)

In [67]:
df_joint

cancer,Y = 0,Y = 1
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1
X = 0,0.72,0.03
X = 1,0.2,0.05


In [80]:
h_x = entropy(df_x.values)

print(f'Entropy of smoker random variable is: {h_x} bits.')

Entropy of smoker random variable is: 0.8113 bits.


In [83]:
p_y = df_y.values
p_xgy = df_xgy.values
h_xgy = np.sum(p_y*element_wise_information(p_xgy))

In [84]:
h_xgy

0.7713000997905117

In [85]:
mi = np.round(h_x - h_xgy, 5)

In [86]:
print(f'Mutual information of smoking and cancer random variates is : {mi} bits')

Mutual information of smoking and cancer random variates is : 0.04 bits


For a perfectly correlated pair of Bernoulli   random variables we have 

In [87]:
bern = np.array([
    [0.5, 0], 
    [0, 0.5]
])

In [88]:
bern_x = bern.sum(axis = 1)

In [89]:
bern_x

array([0.5, 0.5])

In [90]:
bern_y = bern.sum(axis = 0)

In [91]:
bern_y

array([0.5, 0.5])

In [92]:
bern_xgy = bern / bern_y

In [93]:
bern_xgy

array([[1., 0.],
       [0., 1.]])

In [94]:
h_xgy = np.sum(bern_y*element_wise_information(bern_xgy))

In [95]:
h_xgy

0.0

In [96]:
h_x = entropy(bern_x)

In [97]:
h_x

1.0

In [98]:
mi = h_x - h_xgy
mi

1.0