In [1]:
%load_ext autoreload
%autoreload 2

In [5]:
import numpy as np
import scipy.sparse
from sklearn.metrics.cluster.supervised import contingency_matrix
from sklearn.metrics.cluster.expected_mutual_info_fast import (
    expected_mutual_information,
)
from sklearn.metrics.cluster import adjusted_mutual_info_score

In [8]:
C = scipy.sparse.load_npz('test.npz')
n_samples = C.sum()
n_samples, C.shape

(14975, (5372, 5361))

In [7]:
seed = 42
k = 10
n_samples = 10000
random_labels = np.random.RandomState(seed).randint
distribution_a = random_labels(low=0, high=k, size=n_samples)
distribution_b = random_labels(low=0, high=k, size=n_samples)
distribution_b[distribution_b > 5] = 5
C = contingency_matrix(distribution_a, distribution_b, sparse=True)
n_samples = C.sum()
C.shape, n_samples

((10, 6), 10000)

In [9]:
emi = expected_mutual_information(C, n_samples)
emi

6.4158670938439775

In [10]:
#adjusted_mutual_info_score(distribution_a, distribution_b)

In [None]:
from childes_mi.information_theory.expected_mutual_information_numpy import expected_mutual_information as expected_mutual_information_numpy
emi_numpy = expected_mutual_information_numpy(C, n_samples)
emi_numpy


 types {'gln_nij': [<class 'numpy.ndarray'>], 'gln_N': [<class 'numpy.float64'>], 'gln_Nb': [<class 'numpy.ndarray'>], 'gln_Na': [<class 'numpy.ndarray'>], 'gln_b': [<class 'numpy.ndarray'>], 'gln_a': [<class 'numpy.ndarray'>], 'log_Nnij': [<class 'numpy.ndarray'>], 'log_ab_outer': [<class 'numpy.ndarray'>], 'term1': [<class 'numpy.ndarray'>], 'nijs': [<class 'numpy.ndarray'>], 'a': [<class 'numpy.ndarray'>], 'C': [<class 'int'>], 'R': [<class 'int'>], 'n_samples': [<class 'numpy.int64'>], 'contingency': [<class 'scipy.sparse.csr.csr_matrix'>], 'b': [<class 'numpy.ndarray'>], 'N': [<class 'float'>]}
sizes {'gln_nij': ['0.0 Gb', (128,)], 'gln_Nb': ['0.0 Gb', (5361,)], 'gln_Na': ['0.0 Gb', (5372,)], 'gln_b': ['0.0 Gb', (5361,)], 'gln_a': ['0.0 Gb', (5372,)], 'log_Nnij': ['0.0 Gb', (128,)], 'log_ab_outer': ['0.23 Gb', (5372, 5361)], 'term1': ['0.0 Gb', (128,)], 'nijs': ['0.0 Gb', (128,)], 'a': ['0.0 Gb', (5372,)], 'b': ['0.0 Gb', (5361,)]}


HBox(children=(IntProgress(value=0, max=5372), HTML(value='')))

In [12]:
from math import log
from scipy.special import gammaln
import numpy as np
import time
from tqdm.autonotebook import tqdm
from joblib import Parallel, delayed

In [13]:
def nij_op(s1i, s2, l2, N, term1, nijs, i, gln_a, gln_b, gln_Na, gln_Nb, gln_N, gln_nij):
    emif = 0
    for j in range(l2):
        s2j = s2[j]
        min_nij = np.max([1, s1i + s2j - N])
        max_nij = np.min([s1i, s2j])
        nij = np.arange(min_nij, max_nij) + 1
        t1 = term1[nij]

        t2 = np.log(N * nijs[nij]) - np.log(s1i * s2j)

        gln = (
            gln_a[i] + 
            gln_b[j] + 
            gln_Na[i] + 
            gln_Nb[j] - 
            gln_N - 
            gln_nij[nij] - 
            gammaln(s1i - nij + 1) - 
            gammaln(s2j - nij + 1) -
            gammaln(N - s1i- s2j + nij + 1)
        )

        t3 = np.exp(gln)
        emi = sum(t1 * t2 * t3)
        emif += emi
    return emif

In [14]:
def emi_parallel(contingency, n_samples):
    """
    EMI without pregenerating lookup table for reduced memory
    https://github.com/clajusch/ClEvaR/blob/master/R/Calculations.R
    """
    
    print("EMI reduced memory parallel")
    s1 = np.array(np.sum(contingency, axis=1, dtype="int").flatten()).flatten()
    s2 = np.array(np.sum(contingency, axis=0, dtype="int").flatten()).flatten()
    N = n_samples
    l1 = len(s1)
    l2 = len(s2)

    nijs = np.arange(0, max(np.max(s1), np.max(s2)) + 1, dtype="float")
    nijs[0] = 1
    term1 = nijs / N

    gln_a = gammaln(s1 + 1)
    gln_b = gammaln(s2 + 1)
    gln_Na = gammaln(N - s1 + 1)
    gln_Nb = gammaln(N - s2 + 1)
    gln_N = gammaln(N + 1)
    gln_nij = gammaln(nijs + 1)

    with Parallel(n_jobs=-1, verbose=0, prefer=None) as parallel:
        emi = parallel(
            delayed(nij_op)(s1[i], s2, l2, N, term1, nijs, i, gln_a, gln_b, gln_Na, gln_Nb, gln_N, gln_nij)
            for i in tqdm(range(l1), desc="compute emi", mininterval=0.25)
        )

    return np.sum(emi)

In [15]:
emi_parallel(C, n_samples)

EMI reduced memory parallel


HBox(children=(IntProgress(value=0, description='compute emi', max=5372, style=ProgressStyle(description_width…




0.02389203129531103