# Mutual information

In [16]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from concurrent.futures import ProcessPoolExecutor, as_completed

In [17]:
# Directory that contains the protein-coding input files used below.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
path_data_prot_coding = '/home/thomas/Documents/git/medulloblastoma_cavalli_kaggle/data/in/protein_coding/'

# Full paths of the two CSV files, built by appending each file name to the directory.
path_metadata_prot_coding = f'{path_data_prot_coding}GSE85217_Cavalli_subgroups_information_protein_coding.csv'
path_exp_mat_prot_coding_nocorr = f'{path_data_prot_coding}GSE85217_M_exp_763_MB_SubtypeStudy_TaylorLab_protein_coding_nocorr.csv'

In [18]:
# Load the expression table; the first CSV column becomes the row index.
expr_mat = pd.read_csv(
    filepath_or_buffer=path_exp_mat_prot_coding_nocorr,
    index_col=0,
)

In [19]:
# Replace the loaded table by a plain ndarray of its values (labels are dropped).
expr_mat = np.array(expr_mat)

# Small working subset: the first 500 columns only.
toys_mat = expr_mat[:, :500]

In [20]:
def joint_entropies(data, nbins=None):
    """Pairwise joint entropies (in bits) between the columns of ``data``.

    Each pair of columns is binned with ``np.histogram2d`` into an
    ``nbins`` x ``nbins`` grid, the counts are turned into probabilities and
    the entropy ``-sum(p * log2(p))`` of that grid is computed.

    Parameters
    ----------
    data : 2-D array
        Rows are samples, columns are variables.
    nbins : int, optional
        Number of bins per axis. When omitted, ``int(sqrt(n_samples / 5))``
        is used, with a floor of 1 so that very small inputs do not request
        zero bins (which ``np.histogram2d`` rejects).

    Returns
    -------
    numpy.ndarray
        Symmetric ``(n_variables, n_variables)`` matrix. Entry ``[i, j]`` is
        the joint entropy of columns ``i`` and ``j``; the diagonal is computed
        from the histogram of a column against itself.
    """
    n_samples = data.shape[0]
    n_variables = data.shape[-1]
    if nbins is None:
        nbins = max(1, int((n_samples / 5) ** .5))

    entropies = np.zeros((n_variables, n_variables))
    for i in range(n_variables):
        # histogram2d(a, b) is the transpose of histogram2d(b, a), and the
        # entropy sum does not depend on cell order, so one evaluation per
        # unordered pair is enough.
        for j in range(i, n_variables):
            counts = np.histogram2d(data[:, i], data[:, j], bins=nbins)[0]
            # The tiny offset keeps log2 finite for empty cells.
            probs = counts / n_samples + 1e-100
            pair_entropy = -(probs * np.log2(probs)).sum()
            entropies[i, j] = pair_entropy
            entropies[j, i] = pair_entropy
    return entropies

def mutual_info_matrix(df, nbins=None, normalized=True):
    """Pairwise mutual information between the columns of a DataFrame.

    The joint entropies of all column pairs are obtained from
    ``joint_entropies``; its diagonal is used as the per-column entropy, and
    the mutual information is ``H(i) + H(j) - H(i, j)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows are samples, columns are variables.
    nbins : int, optional
        Forwarded unchanged to ``joint_entropies``.
    normalized : bool, default True
        When true, each entry is multiplied by 2 and divided by
        ``H(i) + H(j)``.

    Returns
    -------
    pandas.DataFrame
        Square frame labelled on both axes with ``df.columns``.
    """
    values = df.to_numpy()
    pair_entropy = joint_entropies(values, nbins)
    marginal = pair_entropy.diagonal()

    # marginal_sum[i, j] = H(i) + H(j), built by broadcasting.
    marginal_sum = marginal[:, np.newaxis] + marginal[np.newaxis, :]
    result = marginal_sum - pair_entropy
    if normalized:
        result = result * 2 / marginal_sum

    labels = df.columns
    return pd.DataFrame(result, index=labels, columns=labels)

In [21]:
def joint_emtropies(x,y):
    """Joint entropies (in bits) between every column of ``x`` and every column of ``y``.

    Each pair ``(x[:, i], y[:, j])`` is binned with ``np.histogram2d`` on a
    square grid, the counts are converted to probabilities and the entropy
    ``-sum(p * log2(p))`` of the grid is stored at ``[i, j]``.

    Parameters
    ----------
    x, y : 2-D arrays
        Rows are samples, columns are variables. Both must have the same
        number of rows; they may have different numbers of columns.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(x.shape[-1], y.shape[-1])``.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` do not have the same number of rows.

    Notes
    -----
    Mirroring an entry from ``[i, j]`` to ``[j, i]`` is only correct when
    ``x`` and ``y`` contain the same variables, so that shortcut is applied
    only when the two blocks are element-wise equal. In every other case
    each pair is evaluated on its own.
    """
    x = np.asarray(x)
    y = np.asarray(y)

    n_samples = x.shape[0]
    if y.shape[0] != n_samples:
        raise ValueError(
            "x and y must have the same number of rows, got "
            f"{n_samples} and {y.shape[0]}"
        )

    n_vars_x = x.shape[-1]
    n_vars_y = y.shape[-1]

    # Floor of 1: np.histogram2d rejects a bin count of zero (fewer than 5 rows).
    n_bins = max(1, int((n_samples / 5) ** .5))

    same_block = x.shape == y.shape and np.array_equal(x, y)

    entropies = np.zeros((n_vars_x, n_vars_y))
    for i in range(n_vars_x):
        # For identical blocks the matrix is symmetric, so start at the diagonal.
        first_j = i if same_block else 0
        for j in range(first_j, n_vars_y):
            counts, _, _ = np.histogram2d(x[:, i], y[:, j], bins=n_bins)
            # The tiny offset keeps log2 finite for empty cells.
            probs = counts / n_samples + 1e-100
            pair_entropy = -(probs * np.log2(probs)).sum()
            entropies[i, j] = pair_entropy
            if same_block:
                entropies[j, i] = pair_entropy

    return entropies

In [22]:
# Quick check: the first ten columns of the expression matrix against themselves.
joint_emtropies(x=expr_mat[:, :10], y=expr_mat[:, :10])

array([[2.10917168, 4.05118404, 4.06754949, 4.42740567, 4.82868203,
        4.62563451, 4.37622491, 4.55168455, 4.315386  , 4.46203186],
       [4.05118404, 2.00113194, 3.97553811, 4.37402397, 4.73728055,
        4.57222393, 4.29366451, 4.4960331 , 4.2587107 , 4.38521888],
       [4.06754949, 3.97553811, 2.02957698, 4.36921036, 4.74570411,
        4.5736708 , 4.33403581, 4.52819363, 4.26582651, 4.40839873],
       [4.42740567, 4.37402397, 4.36921036, 2.43728789, 5.1315224 ,
        4.92059574, 4.70664034, 4.88281827, 4.6335852 , 4.78250227],
       [4.82868203, 4.73728055, 4.74570411, 5.1315224 , 2.80030109,
        5.3456241 , 5.06284049, 5.27511703, 5.01826948, 5.17423853],
       [4.62563451, 4.57222393, 4.5736708 , 4.92059574, 5.3456241 ,
        2.62896627, 4.87097201, 5.07966543, 4.83032238, 4.93839758],
       [4.37622491, 4.29366451, 4.33403581, 4.70664034, 5.06284049,
        4.87097201, 2.36022609, 4.84782667, 4.56621329, 4.70320886],
       [4.55168455, 4.4960331 , 4.5281936

In [23]:
def chunkify_expr_mat(exp_mat,start=0,step=500):
    """Yield consecutive ``(begin, end)`` column boundaries over ``exp_mat``.

    Boundaries start at ``start`` and advance by ``step``; the number of
    columns (``exp_mat.shape[1]``) is always appended as the final boundary,
    so the last pair may span fewer than ``step`` columns.
    """
    n_cols = exp_mat.shape[1]
    bounds = [*range(start, n_cols, step), n_cols]
    # Pair every boundary with the one that follows it.
    yield from zip(bounds[:-1], bounds[1:])

In [24]:
# (begin, end) column boundaries of the toy matrix, 100 columns per chunk.
chunks = list(chunkify_expr_mat(toys_mat, start=0, step=100))

# Every chunk paired with itself and with each chunk that follows it.
chunks_array = np.array([
    [first, second]
    for position, first in enumerate(chunks)
    for second in chunks[position:]
])

In [25]:
def gen_chunk_array(expr_mat,chunks_array):
    """Yield the two column blocks of ``expr_mat`` described by each entry.

    Every entry of ``chunks_array`` holds two ``(begin, end)`` pairs; for each
    entry a tuple ``(expr_mat[:, begin1:end1], expr_mat[:, begin2:end2])`` is
    produced.
    """
    for pair in chunks_array:
        first, second = pair[0], pair[1]
        cols_first = slice(first[0], first[1])
        cols_second = slice(second[0], second[1])
        yield expr_mat[:, cols_first], expr_mat[:, cols_second]

In [26]:
# Materialise the (block, block) pairs described by chunks_array.
chunks_expr_mat = list(gen_chunk_array(expr_mat, chunks_array))

In [27]:
# Display the whole list of block pairs (every array is printed in the cell output).
chunks_expr_mat

[(array([[2.09136039, 2.00955258, 2.23931435, ..., 2.80325351, 2.72554551,
          2.81117572],
         [2.23847366, 2.56127124, 2.23058575, ..., 3.57149086, 3.18151574,
          2.75082082],
         [2.08200658, 1.99494175, 2.24246076, ..., 3.31454708, 2.66996392,
          2.90301979],
         ...,
         [4.58311727, 2.42024706, 3.37500261, ..., 2.94409582, 2.85645582,
          2.93011417],
         [2.43373795, 2.30724349, 2.42414621, ..., 3.14292281, 2.8271654 ,
          3.23565709],
         [2.092384  , 2.37511898, 2.62565216, ..., 3.16080361, 3.09407803,
          2.91714686]]),
  array([[2.09136039, 2.00955258, 2.23931435, ..., 2.80325351, 2.72554551,
          2.81117572],
         [2.23847366, 2.56127124, 2.23058575, ..., 3.57149086, 3.18151574,
          2.75082082],
         [2.08200658, 1.99494175, 2.24246076, ..., 3.31454708, 2.66996392,
          2.90301979],
         ...,
         [4.58311727, 2.42024706, 3.37500261, ..., 2.94409582, 2.85645582,
          2.9

In [28]:
%%time
with ProcessPoolExecutor(max_workers=7) as executor:
    # One task per (block, block) pair, submitted in the order of chunks_expr_mat.
    futures = [
        executor.submit(joint_emtropies, chunk[0], chunk[1])
        for chunk in chunks_expr_mat
    ]
    # Calling result() gathers the computed matrices and re-raises any
    # exception that occurred in a worker; without it failures go unnoticed.
    # The list is in the same order as chunks_expr_mat.
    joint_entropy_blocks = [future.result() for future in futures]

CPU times: user 40.4 ms, sys: 81.2 ms, total: 122 ms
Wall time: 3.25 s


In [None]:
#with ProcessPoolExecutor(max_workers=7) as executor: 
#[joint_emtropies(x=,y=)  for chunk in chunks_array]

In [None]:
%%time
# Reference run of the single-matrix implementation on the first ten columns.
res = joint_entropies(data=expr_mat[:, :10])