In [1]:
import operator as op
from collections import Counter
from functools import reduce

import numpy as np

# Simplest case: all classes have equal size

In [2]:
def ncr(n, r):
    """
    Binomial coefficient C(n, r): the number of r-element subsets of an n-element set.

    Computed exactly with integer arithmetic.

    Parameters
    ----------
    n : int
        Size of the set.
    r : int
        Size of the subsets.

    Returns
    -------
    int
        C(n, r); 0 when r < 0 or r > n (no such subsets exist).
    """
    # Without this guard both ranges below are empty for out-of-range r and
    # the function would report 1 instead of 0.
    if r < 0 or r > n:
        return 0
    # C(n, r) == C(n, n - r); use the smaller one to keep the products short.
    r = min(r, n - r)
    numer = reduce(op.mul, range(n, n - r, -1), 1)
    denom = reduce(op.mul, range(1, r + 1), 1)
    return numer // denom

ncr(4, 2)

6

In [3]:
def pe(e, size, p):
    """
    Probability of having exactly e links out of size possible ones.

    This is the binomial probability mass C(size, e) * p**e * (1-p)**(size-e),
    evaluated in log space. Multiplying the exact (arbitrarily large) integer
    coefficient by a float raises OverflowError once the coefficient exceeds
    the float range (e.g. size around 1200), and p**e can silently underflow;
    summing logarithms avoids both problems.

    Parameters
    ----------
    e : int
        Number of links present, 0 <= e <= size.
    size : int
        Number of possible links.
    p : float
        Probability of a single link.

    Returns
    -------
    float
        The probability mass at e.

    Raises
    ------
    AssertionError
        If e is outside [0, size].
    ValueError
        From math.log / math.log1p if p lies outside [0, 1] and the
        corresponding exponent is non-zero.
    """
    import math

    assert 0 <= e <= size
    misses = size - e
    # A zero base raised to a positive exponent makes the whole product zero.
    if (e and p == 0) or (misses and p == 1):
        return 0.0
    # log C(size, e) via log-gamma: lgamma(k + 1) == log(k!).
    log_prob = math.lgamma(size + 1) - math.lgamma(e + 1) - math.lgamma(misses + 1)
    # Skip factors with a zero exponent: x**0 == 1 for every x (NaN included),
    # exactly as in the direct formula.
    if e:
        log_prob += e * math.log(p)
    if misses:
        log_prob += misses * math.log1p(-p)
    return math.exp(log_prob)
    
pe(30, 100, 0.3)

0.08678386475342761

In [4]:
def p_interval(l, h, size, p):
    """
    Total probability of having between l and h links (both inclusive)
    out of size possible ones, each present with probability p.

    The individual terms come from pe(); an empty range (h < l) sums to 0.0.
    """
    terms = []
    for links in range(l, h + 1):
        terms.append(pe(links, size, p))
    return np.sum(terms)

p_interval(0, 9, 9, 0.01)

0.9999999999999999

In [5]:
def score(n, k, p_in, p_out):
    """
    Score for n nodes split into k clusters of equal size n // k.

    For every possible count e of links inside a node's own cluster
    (size - 1 candidates, each with probability p_in) the probability of
    that count is multiplied by the probability that each of the other
    k - 1 clusters receives at most e - 1 links (size candidates each,
    probability p_out); the products are summed over e.

    NOTE: a later cell of this notebook defines another function that is
    also called `score`, so this one is only reachable until that cell runs.
    """
    cluster_size = n // k
    n_other = k - 1
    total = 0
    for links in range(cluster_size):
        prob_links_inside = pe(links, cluster_size - 1, p_in)
        prob_fewer_outside = p_interval(0, links - 1, cluster_size, p_out)
        total += prob_links_inside * (prob_fewer_outside ** n_other)
    return total

In [6]:
score(100, 2, 0.3, 0.15)

0.9499097082948508

# General case: clusters of arbitrary sizes, given by a partition

In [7]:
# Toy undirected graph on 6 nodes, stored as a symmetric 0/1 adjacency matrix
# with a zero diagonal: two triangles (nodes 0-2 and nodes 3-5) joined by the
# edges 2-3 and 2-4.
A = np.array([
    [0, 1, 1, 0, 0, 0],
    [1, 0, 1, 0, 0, 0],
    [1, 1, 0, 1, 1, 0],
    [0, 0, 1, 0, 1, 1],
    [0, 0, 1, 1, 0, 1],
    [0, 0, 0, 1, 1, 0]
])
# Cluster label of every node, in the same order as the rows of A:
# the first triangle is cluster 1, the second is cluster 2.
partition = [1, 1, 1, 2, 2, 2]

In [53]:
def score(A, partition):
    """
    Size-weighted average, over clusters, of the probability that a node has
    more links inside its own cluster than into each of the other clusters.

    Link densities are estimated from the graph: for every pair of clusters
    the number of edges (entries of A equal to 1 above the diagonal) is
    divided by the number of possible node pairs. For a cluster ki the score
    sums, over every in-cluster link count e, pe(e, size_ki - 1, p_in) times,
    for every other cluster kj, the total pe() mass of 0 .. e - 1 links into kj.

    Parameters
    ----------
    A : 2-D array indexable as A[i, j] and exposing .shape
        Adjacency matrix. Only entries above the diagonal are read, so the
        graph is treated as undirected.
    partition : sequence of hashable labels
        Cluster label of every node, in the row order of A.

    Returns
    -------
    float
        Per-cluster scores weighted by cluster size and divided by the
        number of nodes.

    Raises
    ------
    ValueError
        If A is not square with exactly one row/column per entry of partition.
    """
    n_nodes = len(partition)
    if tuple(A.shape) != (n_nodes, n_nodes):
        raise ValueError(
            "A must be a square matrix with one row and column per entry of partition"
        )

    # Map cluster labels (in order of first appearance) to indices 0..K-1.
    sizes_by_label = Counter(partition)
    class_mapping = {label: idx for idx, label in enumerate(sizes_by_label)}
    cluster_sizes = list(sizes_by_label.values())
    n_clusters = len(cluster_sizes)

    # incidence[a, b]: number of edges between clusters a and b
    # (on the diagonal: edges inside the cluster, counted once).
    incidence = np.zeros((n_clusters, n_clusters))
    for i in range(n_nodes):
        ki = class_mapping[partition[i]]
        for j in range(i + 1, n_nodes):
            if A[i, j] == 1:
                kj = class_mapping[partition[j]]
                incidence[ki, kj] += 1
                if ki != kj:
                    incidence[kj, ki] += 1

    # p[a, b]: empirical link density between clusters a and b.
    p = np.zeros((n_clusters, n_clusters))
    for i in range(n_clusters):
        pairs_inside = cluster_sizes[i] * (cluster_sizes[i] - 1) / 2
        # A singleton cluster has no internal pairs; use density 0 instead of
        # dividing 0 by 0 (which produced NaN and a RuntimeWarning).
        p[i, i] = incidence[i, i] / pairs_inside if pairs_inside else 0.0
        for j in range(i + 1, n_clusters):
            p[i, j] = p[j, i] = incidence[i, j] / (cluster_sizes[i] * cluster_sizes[j])

    # Calculate the score of every cluster, then weight it by cluster size.
    cluster_scores = np.zeros((n_clusters,))
    for ki in range(n_clusters):
        size_i = cluster_sizes[ki]

        # For every other cluster, tabulate once the probability of having
        # at most e - 1 links into it, for e = 0 .. size_i - 1. The link-count
        # distribution does not depend on e, so it is evaluated a single time
        # per cluster pair instead of once per value of e.
        fewer_links = []
        for kj in range(n_clusters):
            if kj == ki:
                continue
            size_j = cluster_sizes[kj]
            highest = min(size_i - 2, size_j)  # largest upper bound needed below
            pmf = [pe(x, size_j, p[ki, kj]) for x in range(highest + 1)]
            fewer_links.append(
                [np.sum(pmf[:min(e - 1, size_j) + 1]) for e in range(size_i)]
            )

        for e in range(size_i):
            class_score_e = pe(e, size_i - 1, p[ki, ki])
            for table in fewer_links:
                class_score_e *= table[e]
            cluster_scores[ki] += class_score_e

    overall = np.sum(
        [sc * size for sc, size in zip(cluster_scores, cluster_sizes)]
    ) / n_nodes
    return overall

In [54]:
score(A, partition)

0.9382061983750067

In [55]:
import sys
sys.path.append('../../pygkernels')
from pygkernels.data import Datasets

In [56]:
# Take the 'eurosis' entry of Datasets() and unpack it; this rebinds the toy
# A and partition defined earlier (presumably to that dataset's adjacency
# matrix and labels - confirm against pygkernels).
# NOTE(review): the recorded output of score(A, partition) after this cell is
# identical to the toy-graph result, which suggests a stale run - re-execute
# from a fresh kernel to confirm.
(A, partition), info = Datasets()['eurosis']

In [57]:
score(A, partition)

0.9382061983750067