In [1]:
import numpy as np
import pandas as pd
import glob

## Read

In [134]:
path_list = glob.glob('data/cluster*')

In [136]:
# read partition data
partitions = pd.read_table('data/partitions.txt', sep =' ', header=None, index_col=0)
partitions.columns = ['partition']

# read clustering and partitions data
data_list = []
for path in path_list:
    clustering = pd.read_table(path, sep=' ', header=None, index_col=0)
    clustering.columns = ['cluster']
    
    # merge partition and clustering
    data = partitions.merge(clustering, left_index=True, right_index=True)
    data_list.append(data)

## Prob

In [209]:
def cal_nmi(data):
    n = data.shape[0]
    
    # create partition x cluster prob matrix
    unique_partition = np.unique(data.partition)
    unique_cluster = np.unique(data.cluster)

    pij = np.zeros([len(unique_partition), len(unique_cluster)])
    
    # calculate pij for each cluster
    pij_pd = (data.groupby(['cluster','partition']).size().to_frame('count')/n).reset_index() #.reset_index()

    # the probability of getting cluter i: #ni/n
    pci = data.groupby('cluster').count()/n

    # the probability of getting partition j : #nj/n
    ptj = data.groupby('partition').count()/n
    
    # fill prob matrix
    for partition in unique_partition:
        for cluster in unique_cluster:
            count = pij_pd.iloc[np.where((pij_pd.partition==partition) & (pij_pd.cluster==cluster))]['count']
            if len(count)>0:
                pij[partition, cluster] = count

    # fill zero with smallest value
    pij[np.where(pij==0)]=1e-7
    
    # calculate mi, H(T), H(C)
    # pij * log(pij/ pci* ptj)
    mi = np.sum(pij*np.log2(np.divide(pij ,np.dot(pci, ptj.T).T)))
    HC = -np.sum(pci*np.log2(pci))
    HT = -np.sum(ptj*np.log2(ptj))

    nmi = mi/np.sqrt(HT[0] *HC[0])
    
    return nmi

In [210]:
nmi =[cal_nmi(data) for data in data_list]

TypeError: return arrays must be of ArrayType

In [208]:
mi/np.sqrt(HT[0]*HC[0])

0.6456368113477221

## Save

In [205]:
with open('socres.txt', 'w')as fp:
    [fp.write(str(line)+ ' '+'0'+'\n') for line in nmi]