In [1]:
import networkx as nx
import networkx.algorithms.community as nx_comm
from networkx.generators.community import LFR_benchmark_graph
from networkx.algorithms import bipartite
import numpy as np
import pandas as pd
import scipy as sp
import scipy.io as spio
from scipy.sparse import coo_array
from scipy import sparse
from cdlib import algorithms
from cdlib import evaluation
import sklearn
# Star imports stay in their original order and ahead of the stdlib imports below,
# so that any name they re-export is still overridden exactly as before.
from utils import *
from distances import *
from consensus import *
import math
import itertools
import random
import time
from pathlib import Path
from sklearn.metrics.cluster import normalized_mutual_info_score

Note: to be able to use all crisp methods, you need to install some additional packages:  {'ASLPAw'}


In [2]:
# Short tag for the consensus approach evaluated in this notebook.
# NOTE(review): not referenced by any visible cell — presumably it mirrors the "lf" in the
# "LFR-lf-cons-evaluation/" paths and the ".lf-louvain" file suffix; confirm or remove.
cons_name = "lf"

## Parameter configurations for clustering generation

In [2]:
# Parameter grid, keyed by algorithm name. Each inner mapping lists the candidate
# values per keyword argument; key order is significant because the enumeration
# cell expands the grid as a Cartesian product in that order.
alg_params = dict(
    louvain=dict(
        resolution=[0.5, 0.75, 1.0, 1.25, 1.5],
        randomize=[314159, 2718, 1234, 4321, 987654321],
    ),
)

## Enumerate clusterings

In [3]:
# Expand every algorithm's parameter grid into call expressions of the form
# "algorithms.<alg>(G,<name>=<value>,...)", each paired with a running 0-based index.
clustering_enumeration = []
count = 0
for alg, params in alg_params.items():
    param_names = [] if params is None else list(params.keys())
    if params is None:
        param_combinations = []
    else:
        value_lists = [list(params[name]) for name in param_names]
        param_combinations = list(itertools.product(*value_lists))

    # With no usable combination the algorithm is still enumerated once, as a bare
    # "algorithms.<alg>(G)" call; an empty tuple produces exactly that expression.
    if len(param_combinations) == 0:
        param_combinations = [()]

    for param_combination in param_combinations:
        keyword_part = "".join(
            "," + name + "=" + str(value)
            for name, value in zip(param_names, param_combination)
        )
        expr = "algorithms." + alg + "(G" + keyword_part + ")"
        clustering_enumeration.append((expr, count))
        count += 1
print(clustering_enumeration)

[('algorithms.louvain(G,resolution=0.5,randomize=314159)', 0), ('algorithms.louvain(G,resolution=0.5,randomize=2718)', 1), ('algorithms.louvain(G,resolution=0.5,randomize=1234)', 2), ('algorithms.louvain(G,resolution=0.5,randomize=4321)', 3), ('algorithms.louvain(G,resolution=0.5,randomize=987654321)', 4), ('algorithms.louvain(G,resolution=0.75,randomize=314159)', 5), ('algorithms.louvain(G,resolution=0.75,randomize=2718)', 6), ('algorithms.louvain(G,resolution=0.75,randomize=1234)', 7), ('algorithms.louvain(G,resolution=0.75,randomize=4321)', 8), ('algorithms.louvain(G,resolution=0.75,randomize=987654321)', 9), ('algorithms.louvain(G,resolution=1.0,randomize=314159)', 10), ('algorithms.louvain(G,resolution=1.0,randomize=2718)', 11), ('algorithms.louvain(G,resolution=1.0,randomize=1234)', 12), ('algorithms.louvain(G,resolution=1.0,randomize=4321)', 13), ('algorithms.louvain(G,resolution=1.0,randomize=987654321)', 14), ('algorithms.louvain(G,resolution=1.25,randomize=314159)', 15), ('al

# v5

In [4]:
from scipy.sparse import coo_array
from scipy.sparse import csr_array

# DeltaSOD is calculated following the paper titled "Integrating Microarray Data by Consensus Clustering"
# by Filkov and Skiena
# Assumes the elements of the cluster are named as 0-based indices
def v5_consensus(P_list, niter=10, starting_partition=None, verbose=False):
    """Refine a consensus partition with neighbour-guided vertex moves.

    The graph of ``P_list[0]`` is re-weighted so that every stored edge carries the
    number of input clusterings whose assignments agree on its two endpoints.
    Starting from ``starting_partition`` (or from singleton clusters), each sweep lets
    every vertex ``u`` consider moving into the cluster of one of its graph
    neighbours. A move is a candidate when it lowers DeltaSOD (computed following
    "Integrating Microarray Data by Consensus Clustering" by Filkov and Skiena).
    The surviving candidates of a sweep are applied together.

    Vertices are assumed to be named as 0-based indices; cluster members are stored
    as the string form of that index.

    Parameters
    ----------
    P_list : list of dict
        Input clusterings; each entry provides a ``"graph"`` and a ``"partition"``
        (a list of clusters). Only the graph of the first entry is used.
    niter : int, optional
        Maximum number of sweeps.
    starting_partition : list of list of str, optional
        Initial partition. It is copied, so the caller's lists are left untouched.
        When falsy, every vertex starts in its own cluster.
    verbose : bool, optional
        Print every applied move.

    Returns
    -------
    dict
        ``{"graph": <re-weighted graph>, "partition": <list of non-empty clusters>}``.
    """
    G = nx.Graph(P_list[0]["graph"])
    n = len(list(G.nodes()))
    k = len(P_list)
    print("Number of edges in G:", len(list(G.edges())))

    t1 = time.time()
    A = nx.to_scipy_sparse_array(G, format="coo")
    t2 = time.time()
    print("Time to get sparse matrix of the graph:", t2-t1)

    nz_rows = A.row
    nz_cols = A.col

    t1 = time.time()
    # Column i holds the per-vertex assignment of the i-th input clustering,
    # as returned by clust_lst_to_asn.
    c = np.zeros((n,k))
    for i in range(k):
        clust_lst = P_list[i]["partition"]
        clust_asn = clust_lst_to_asn(clust_lst)
        c[:,i] = np.array(clust_asn)
    t2 = time.time()
    print("Time to generate cluster assignment matrix:", t2-t1)

    # The stored edge values are overwritten in place with the agreement counts.
    Aw_vals = A.data
    t1 = time.time()
    for i in range(len(nz_rows)):
        Aw_vals[i] = np.sum( c[nz_rows[i],:] == c[nz_cols[i],:] )
    Aw = csr_array((Aw_vals, (nz_rows, nz_cols)), shape=(n, n))
    t2 = time.time()
    print("Time to generate weighted consensus graph:", t2-t1)
    print("Number of non-zeroes in Aw:", Aw.count_nonzero())

    t1 = time.time()
    if starting_partition:
        # Copy each cluster: moves below mutate the cluster lists, and the caller's
        # partition must not change as a side effect.
        refined_partition = [list(cluster) for cluster in starting_partition]
    else:
        refined_partition = [[str(i)] for i in range(n)]

    refined_partition_map = clust_lst_to_map(refined_partition)
    t2 = time.time()
    print("Time to initialize:", t2-t1)

    def disagreement(u, members):
        # Sum, over every member other than u, of (k - 2 * number of input
        # clusterings whose assignments of u and that member are equal).
        total = 0
        for elem in members:
            if str(elem) != str(u):
                total = total + (k - 2 * np.sum( (c[int(u),:] == c[int(elem),:]) ) )
        return total

    tSearch = 0
    tUpdate = 0
    count = 0
    it = 1
    last_valid = np.zeros(n)
    last_deltaS = np.zeros(n)
    while(it <= niter):
        # One slot per vertex; "valid" marks vertices that have a candidate move.
        potential_moves = {
            "from": np.arange(n),
            "to": np.arange(n),
            "attractor": np.arange(n),
            "deltaS": np.zeros(n),
            "valid": np.zeros(n)
        }
        for u in range(n):
            row_start = Aw.indptr[u]
            row_end = Aw.indptr[u+1]
            if row_start == row_end:
                # No stored neighbours: nothing to evaluate for u.
                continue

            # The cost of u staying in its own cluster does not depend on the
            # neighbour, so it is computed once per vertex.
            t1 = time.time()
            a = refined_partition_map[str(u)]
            Mua = disagreement(u, refined_partition[a])
            t2 = time.time()
            tSearch = tSearch + t2-t1

            for j in range(row_start, row_end):
                v = Aw.indices[j]

                t1 = time.time()
                b = refined_partition_map[str(v)]
                if a != b:
                    deltaS = disagreement(u, refined_partition[b]) - Mua
                    if deltaS < 0:
                        # A later improving neighbour overwrites an earlier one, so
                        # the last improving neighbour in row order is kept.
                        potential_moves["from"][u] = a
                        potential_moves["to"][u] = b
                        potential_moves["attractor"][u] = v
                        potential_moves["deltaS"][u] = deltaS
                        potential_moves["valid"][u] = 1
                t2 = time.time()
                tSearch = tSearch + t2-t1

        t1 = time.time()
        # When two vertices attract each other, keep only the move with the lower
        # (more negative) deltaS so they do not simply swap clusters.
        for u in range(n):
            v = potential_moves["attractor"][u]
            if (potential_moves["valid"][u] == 1) and (potential_moves["attractor"][v] == u) and (potential_moves["valid"][v] == 1):
                if potential_moves["deltaS"][u] < potential_moves["deltaS"][v]:
                    potential_moves["valid"][v] = 0
                    potential_moves["deltaS"][v] = 0
                else:
                    potential_moves["valid"][u] = 0
                    potential_moves["deltaS"][u] = 0
        t2 = time.time()
        tSearch = tSearch + (t2 - t1)

        # Stop when the same set of vertices would move again without a strictly
        # better total deltaS than in the previous sweep.
        flag = False
        if np.sum( (potential_moves["valid"] != last_valid) ) == 0:
            if np.sum(potential_moves["deltaS"]) < np.sum(last_deltaS):
                last_valid = potential_moves["valid"]
                last_deltaS = potential_moves["deltaS"]
                flag = True
        else:
            last_valid = potential_moves["valid"]
            last_deltaS = potential_moves["deltaS"]
            flag = True

        if flag == False:
            break

        for u in range(n):
            if potential_moves["valid"][u] == 1:
                a = potential_moves["from"][u]
                b = potential_moves["to"][u]

                t1 = time.time()
                if verbose:
                    print("---")
                    print("Iteration:", it, "Move Count:", count+1, ">> results in deltaS", potential_moves["deltaS"][u])
                    print("Move:", u)
                    print("From partition", a, ":", refined_partition[a])
                    print("To partition", b, ":", refined_partition[b])
                    print("---")
                refined_partition[a].remove(str(u))
                refined_partition[b].append(str(u))
                refined_partition_map[str(u)] = b
                t2 = time.time()
                tUpdate = tUpdate + (t2-t1)

                count = count + 1

        it = it + 1
    print("Time to search moves:", tSearch)
    print("Time to update M:", tUpdate)

    t1 = time.time()
    # Drop clusters emptied by the moves; delete from the back so indices stay valid.
    empty_clusters = []
    for i in range(len(refined_partition)):
        if len(refined_partition[i]) == 0:
            empty_clusters.append(i)

    empty_clusters.sort(reverse=True)
    for e in empty_clusters:
        del refined_partition[e]
    t2 = time.time()
    print("Time to delete empty partitions:", t2-t1)

    Gw = nx.from_scipy_sparse_array(Aw)
    return {"graph": nx.Graph(Gw), "partition": list(refined_partition)}

# BOEM

In [5]:
# DeltaSOD is calculated following the paper titled "Integrating Microarray Data by Consensus Clustering"
# by Filkov and Skiena
# Assumes the elements of the cluster are named as 0-based indices
def boem_consensus(P_list, niter=10, starting_partition=None, verbose=False):
    """Best-one-element-move (BOEM) consensus over a list of clusterings.

    A dense pair matrix ``K`` is built with ``K[x, y] = k - 2 * r[x, y]``, where
    ``r[x, y]`` counts the input clusterings that put ``x`` and ``y`` in the same
    cluster and ``k`` is the number of clusterings. ``M[x, p]`` accumulates ``K[x, y]``
    over the other members ``y`` of cluster ``p``. Each iteration applies the single
    move with the most negative ``M[x, b] - M[x, a]``, so ``niter`` also caps the
    total number of moves. DeltaSOD follows "Integrating Microarray Data by
    Consensus Clustering" by Filkov and Skiena.

    Vertices are assumed to be named as 0-based indices.

    Parameters
    ----------
    P_list : list of dict
        Input clusterings; each entry provides a ``"graph"`` and a ``"partition"``
        (a list of clusters). The first graph only determines the vertex count.
    niter : int, optional
        Maximum number of iterations (one move each).
    starting_partition : list of list, optional
        Initial partition. It is copied, so the caller's lists are left untouched.
        When falsy, every vertex starts in its own cluster.
    verbose : bool, optional
        Print the pair-count total and every applied move.

    Returns
    -------
    dict
        ``{"graph": <graph built from the pair-count matrix>,
        "partition": <list of non-empty clusters>}``.
    """
    G = nx.Graph(P_list[0]["graph"])
    n = len(list(G.nodes()))
    k = len(P_list)

    # One (min, max) entry per co-clustered pair and clustering; duplicates are
    # summed when the COO matrix is converted, giving upper-triangular counts.
    row = []
    col = []
    val = []
    for x in P_list:
        partition = x["partition"]
        for cluster in partition:
            for i in range(len(cluster)):
                for j in range(i+1, len(cluster)):
                    item_1 = cluster[i]
                    item_2 = cluster[j]
                    row.append(min(int(item_1), int(item_2)))
                    col.append(max(int(item_1), int(item_2)))
                    val.append(int(1))

    r = coo_array((val, (row, col)), shape=(n, n))
    r = r.tocsr()
    R = r.sum()
    if verbose:
        print("R:", R)

    rDense = r.toarray() # Should be upper triangular
    rDense = rDense + rDense.T # Making symmetric

    K = k - 2 * rDense
    np.fill_diagonal(K, -k) # Adding diagonal entries

    if starting_partition:
        # Copy each cluster: moves below mutate the cluster lists, and the caller's
        # partition must not change as a side effect.
        refined_partition = [list(cluster) for cluster in starting_partition]
    else:
        refined_partition = [[str(i)] for i in range(n)]

    refined_partition_map = clust_lst_to_map(refined_partition)

    items = list(refined_partition_map.keys())
    # M[x, p]: sum of K[x, y] over the members y of cluster p other than x.
    M = np.zeros((n, len(refined_partition)))
    for item_1 in items:
        for partition_id in range(len(refined_partition)):
            for item_2 in refined_partition[partition_id]:
                if(item_1 != item_2):
                    M[int(item_1),partition_id] = M[int(item_1),partition_id] + K[int(item_1), int(item_2)]

    # mb[x]: cluster with the smallest M value for x.
    mb = np.argmin(M, axis=1)

    count = 0
    it = 1
    while(it <= niter):
        # Scan for the single move with the most negative deltaS.
        opt_item = items[0]
        opt_deltaS = 0
        opt_a = refined_partition_map[items[0]]
        opt_b = refined_partition_map[items[0]]
        opt_x = int(opt_item)
        for item in items:
            a = refined_partition_map[item]
            x = int(item)
            b = mb[x]
            deltaS = M[x,b] - M[x,a]
            if deltaS < opt_deltaS:
                opt_item = item
                opt_deltaS = deltaS
                opt_a = a
                opt_b = b
                opt_x = x
        if (opt_deltaS < 0) and (opt_a != opt_b):
            if verbose:
                print("---")
                print("Move Count:", count+1, "Optimum move results in", opt_deltaS)
                print("Move:", opt_item)
                print("From", opt_a, ":", refined_partition[opt_a])
                print("To", opt_b, ":", refined_partition[opt_b])
            # Only the columns of the two affected clusters change: every other item
            # loses its K term with the moved item in opt_a and gains it in opt_b.
            for item in items:
                y = int(item)
                if y != opt_x:
                    M[y, opt_a] = M[y, opt_a] - K[y, opt_x]
                    M[y, opt_b] = M[y, opt_b] + K[y, opt_x]

            mb = np.argmin(M, axis=1)

            refined_partition[opt_a].remove(opt_item)
            refined_partition[opt_b].append(opt_item)
            refined_partition_map[opt_item] = opt_b
            if verbose:
                print("---")

            count = count + 1
        else:
            # No improving move left.
            break

        it = it + 1
    print("Move count:", count)

    # Drop clusters emptied by the moves; delete from the back so indices stay valid.
    empty_clusters = []
    for i in range(len(refined_partition)):
        if len(refined_partition[i]) == 0:
            empty_clusters.append(i)

    empty_clusters.sort(reverse=True)
    for e in empty_clusters:
        del refined_partition[e]

    G = nx.from_scipy_sparse_array(r)

    return {"graph": nx.Graph(G), "partition": list(refined_partition)}

In [10]:
# Run every enumerated clustering on each LFR benchmark graph and store the result
# as "<fname>.<enumeration index>" under LFR-lf-cons-evaluation/.
n = 5000
fileprefix = "LFR/" + "n" + str(n) + "/"
mus = [1, 2, 3, 4]
#mus = [1]
gammas = [30]
betas = [11]
for mu in mus:
    for gamma in gammas:
        for beta in betas:
            fname = "LFR_n" + str(n) + "_mu0" + str(mu) + "_gamma" + str(gamma) + "_beta" + str(beta)
            graph_file = fileprefix + fname + ".mtx"
            with open(graph_file) as f:
                G = nx.from_scipy_sparse_array(spio.mmread(f), create_using=nx.Graph)

            count = 0  # number of clusterings written for this graph
            # Iterate the shared enumeration so the file suffix is always the
            # enumeration index that later cells look up, even when a run fails
            # (a success counter would shift every later file onto the wrong label).
            for expr, clustering_id in clustering_enumeration:
                try:
                    # eval runs a call expression built from this notebook's own
                    # alg_params; never feed it externally supplied strings.
                    coms = eval(expr)
                    print(clustering_id, expr, len(coms.communities))
                    write_clust_lst(coms.communities, "LFR-lf-cons-evaluation/" + "n" + str(n) + "/" + fname + "." + str(clustering_id))
                    count = count + 1
                except Exception as err:
                    # Best effort: report the failing run with its cause and carry on.
                    print("UNSUCCESSFUL", expr, repr(err))

0 algorithms.louvain(G,resolution=0.5,randomize=314159) 201
1 algorithms.louvain(G,resolution=0.5,randomize=2718) 202
2 algorithms.louvain(G,resolution=0.5,randomize=1234) 203
3 algorithms.louvain(G,resolution=0.5,randomize=4321) 203
4 algorithms.louvain(G,resolution=0.5,randomize=987654321) 201
5 algorithms.louvain(G,resolution=0.75,randomize=314159) 100
6 algorithms.louvain(G,resolution=0.75,randomize=2718) 95
7 algorithms.louvain(G,resolution=0.75,randomize=1234) 97
8 algorithms.louvain(G,resolution=0.75,randomize=4321) 99
9 algorithms.louvain(G,resolution=0.75,randomize=987654321) 97
10 algorithms.louvain(G,resolution=1.0,randomize=314159) 113
11 algorithms.louvain(G,resolution=1.0,randomize=2718) 116
12 algorithms.louvain(G,resolution=1.0,randomize=1234) 115
13 algorithms.louvain(G,resolution=1.0,randomize=4321) 113
14 algorithms.louvain(G,resolution=1.0,randomize=987654321) 112
15 algorithms.louvain(G,resolution=1.25,randomize=314159) 127
16 algorithms.louvain(G,resolution=1.25,r

# Iterative LF runs

In [16]:
def prep_consensus_graph(P_list):
    """Build the thresholded co-clustering graph of a list of clusterings.

    For every pair of vertices, counts the clusterings in ``P_list`` that place both
    in the same cluster, zeroes every count below ``len(P_list) / 2`` and returns the
    graph whose edge weights are the remaining counts.

    Vertices are assumed to be named as 0-based indices.

    Parameters
    ----------
    P_list : list of dict
        Each entry provides a ``"graph"`` and a ``"partition"`` (a list of clusters).
        The first graph only determines the vertex count.

    Returns
    -------
    networkx.Graph
        Graph built from the thresholded pair-count matrix.
    """
    G = nx.Graph(P_list[0]["graph"])
    n = len(list(G.nodes()))
    k = len(P_list)

    row = []
    col = []
    val = []
    for x in P_list:
        partition = x["partition"]
        for cluster in partition:
            members = [int(item) for item in cluster]
            for i in range(len(members)):
                for j in range(i+1, len(members)):
                    # Store each pair as (min, max), as boem_consensus does. Otherwise a
                    # pair listed in a different order by different clusterings is
                    # split across two matrix cells and can miss the threshold.
                    row.append(min(members[i], members[j]))
                    col.append(max(members[i], members[j]))
                    val.append(int(1))

    # Duplicate (row, col) entries are summed when the COO matrix is densified.
    r = coo_array((val, (row, col)), shape=(n, n))
    rDense = r.toarray()
    threshold = k / 2
    rDense[np.abs(rDense) < threshold] = 0

    G = nx.from_numpy_array(rDense)
    return G

# Iterative consensus: cluster the current graph with every enumerated configuration,
# rebuild the thresholded co-clustering graph, and repeat until the adjacency matrix
# stops changing (Frobenius norm of the difference below the tolerance).
n = 5000
fileprefix = "LFR-lf-cons-evaluation/" + "n" + str(n) + "/"
mus = [1, 2, 3, 4]
#mus = [1]
gammas = [30]
betas = [11]
max_iterations = 20
convergence_tol = 1e-3

stats = []

for mu in mus:
    for gamma in gammas:
        for beta in betas:
            fname = "LFR_n" + str(n) + "_mu0" + str(mu) + "_gamma" + str(gamma) + "_beta" + str(beta)
            graph_file = "LFR/" + "n" + str(n) + "/" + fname + ".mtx"
            print(graph_file)
            with open(graph_file) as f:
                G = nx.from_scipy_sparse_array(spio.mmread(f), create_using=nx.Graph)

            old_adj_mat = nx.to_numpy_array(G)
            P_star = None
            converged = False
            for it in range(max_iterations):
                P_list = []
                if it > 0:
                    # Later iterations re-cluster the current consensus graph G.
                    for k in clustering_enumeration:
                        try:
                            # eval runs a call expression built from this notebook's
                            # own alg_params; never feed it external strings.
                            coms = eval(k[0])
                            print("mu:", mu, "it:", it, k[0], len(coms.communities))
                            P_list.append({"graph": nx.Graph(G), "partition": list(coms.communities)})
                        except Exception as err:
                            # Report the expression that actually failed, with its cause.
                            print("UNSUCCESSFUL", k[0], repr(err))
                else:
                    # Iteration 0 reuses the clusterings already stored on disk.
                    for k in clustering_enumeration:
                        clust_file = fileprefix + fname + "." + str(k[1])
                        if Path(clust_file).is_file():
                            partition = read_clust_lst(clust_file)
                            print("mu:", mu, "it:", it, k[0], len(partition))
                            P_list.append({"graph": nx.Graph(G), "partition": list(partition)})

                if not P_list:
                    # prep_consensus_graph needs at least one clustering.
                    print("mu:", mu, "it:", it, "no clusterings available; stopping")
                    break

                G = prep_consensus_graph(P_list)

                new_adj_mat = nx.to_numpy_array(G)
                norm = np.linalg.norm(old_adj_mat - new_adj_mat)
                old_adj_mat = new_adj_mat
                print("mu:", mu, "it:", it, "norm:", norm)

                # Remember the latest result so a run that hits the iteration cap
                # still has something to write.
                P_star = P_list[0]
                if norm < convergence_tol:
                    converged = True
                    break

            if P_star is None:
                print("mu:", mu, "no consensus partition produced; nothing written")
                continue
            if not converged:
                print("mu:", mu, "did not converge within", max_iterations, "iterations; writing the last partition")
            write_clust_lst(P_star["partition"], fileprefix + fname + ".lf-louvain")

#df = pd.DataFrame(stats)
#df.to_csv("benchmark-lf-convergence-multi-alg.csv", index=False)

LFR/n5000/LFR_n5000_mu01_gamma30_beta11.mtx
mu: 1 it: 0 algorithms.louvain(G,resolution=0.5,randomize=314159) 201
mu: 1 it: 0 algorithms.louvain(G,resolution=0.5,randomize=2718) 202
mu: 1 it: 0 algorithms.louvain(G,resolution=0.5,randomize=1234) 203
mu: 1 it: 0 algorithms.louvain(G,resolution=0.5,randomize=4321) 203
mu: 1 it: 0 algorithms.louvain(G,resolution=0.5,randomize=987654321) 201
mu: 1 it: 0 algorithms.louvain(G,resolution=0.75,randomize=314159) 100
mu: 1 it: 0 algorithms.louvain(G,resolution=0.75,randomize=2718) 95
mu: 1 it: 0 algorithms.louvain(G,resolution=0.75,randomize=1234) 97
mu: 1 it: 0 algorithms.louvain(G,resolution=0.75,randomize=4321) 99
mu: 1 it: 0 algorithms.louvain(G,resolution=0.75,randomize=987654321) 97
mu: 1 it: 0 algorithms.louvain(G,resolution=1.0,randomize=314159) 113
mu: 1 it: 0 algorithms.louvain(G,resolution=1.0,randomize=2718) 116
mu: 1 it: 0 algorithms.louvain(G,resolution=1.0,randomize=1234) 115
mu: 1 it: 0 algorithms.louvain(G,resolution=1.0,randomi

mu: 2 it: 1 algorithms.louvain(G,resolution=1.25,randomize=314159) 157
mu: 2 it: 1 algorithms.louvain(G,resolution=1.25,randomize=2718) 157
mu: 2 it: 1 algorithms.louvain(G,resolution=1.25,randomize=1234) 157
mu: 2 it: 1 algorithms.louvain(G,resolution=1.25,randomize=4321) 157
mu: 2 it: 1 algorithms.louvain(G,resolution=1.25,randomize=987654321) 157
mu: 2 it: 1 algorithms.louvain(G,resolution=1.5,randomize=314159) 157
mu: 2 it: 1 algorithms.louvain(G,resolution=1.5,randomize=2718) 157
mu: 2 it: 1 algorithms.louvain(G,resolution=1.5,randomize=1234) 157
mu: 2 it: 1 algorithms.louvain(G,resolution=1.5,randomize=4321) 157
mu: 2 it: 1 algorithms.louvain(G,resolution=1.5,randomize=987654321) 157
mu: 2 it: 1 norm: 2482.549495981903
mu: 2 it: 2 algorithms.louvain(G,resolution=0.5,randomize=314159) 157
mu: 2 it: 2 algorithms.louvain(G,resolution=0.5,randomize=2718) 157
mu: 2 it: 2 algorithms.louvain(G,resolution=0.5,randomize=1234) 157
mu: 2 it: 2 algorithms.louvain(G,resolution=0.5,randomize=4

mu: 4 it: 0 algorithms.louvain(G,resolution=0.75,randomize=314159) 55
mu: 4 it: 0 algorithms.louvain(G,resolution=0.75,randomize=2718) 51
mu: 4 it: 0 algorithms.louvain(G,resolution=0.75,randomize=1234) 56
mu: 4 it: 0 algorithms.louvain(G,resolution=0.75,randomize=4321) 53
mu: 4 it: 0 algorithms.louvain(G,resolution=0.75,randomize=987654321) 52
mu: 4 it: 0 algorithms.louvain(G,resolution=1.0,randomize=314159) 56
mu: 4 it: 0 algorithms.louvain(G,resolution=1.0,randomize=2718) 52
mu: 4 it: 0 algorithms.louvain(G,resolution=1.0,randomize=1234) 52
mu: 4 it: 0 algorithms.louvain(G,resolution=1.0,randomize=4321) 54
mu: 4 it: 0 algorithms.louvain(G,resolution=1.0,randomize=987654321) 54
mu: 4 it: 0 algorithms.louvain(G,resolution=1.25,randomize=314159) 72
mu: 4 it: 0 algorithms.louvain(G,resolution=1.25,randomize=2718) 77
mu: 4 it: 0 algorithms.louvain(G,resolution=1.25,randomize=1234) 72
mu: 4 it: 0 algorithms.louvain(G,resolution=1.25,randomize=4321) 78
mu: 4 it: 0 algorithms.louvain(G,reso

# Run consensus (the cell below currently runs BOEM)

In [6]:
# Load the stored clusterings of each benchmark graph, run the consensus routine on
# them, report the cluster count and wall-clock time, and save the partition.
n = 5000
fileprefix = f"LFR-lf-cons-evaluation/n{n}/"
mus = [1, 2, 3, 4]
#mus = [1]
gammas = [30]
betas = [11]

stats = []

for mu, gamma, beta in itertools.product(mus, gammas, betas):
    fname = f"LFR_n{n}_mu0{mu}_gamma{gamma}_beta{beta}"
    graph_file = f"LFR/n{n}/{fname}.mtx"
    print(graph_file)
    with open(graph_file) as f:
        G = nx.from_scipy_sparse_array(spio.mmread(f), create_using=nx.Graph)
    coms = None

    # Collect only the clusterings whose file exists on disk.
    P_list = []
    for _, clustering_id in clustering_enumeration:
        clust_file = f"{fileprefix}{fname}.{clustering_id}"
        if not Path(clust_file).is_file():
            continue
        partition = read_clust_lst(clust_file)
        P_list.append({"graph": nx.Graph(G), "partition": list(partition)})

    t1 = time.time()
    P_star = boem_consensus(P_list, niter=100, starting_partition=None, verbose=False)
    t2 = time.time()
    print("mu", mu, ", number of clusters", len(P_star["partition"]))
    print("Time:", t2-t1)
    write_clust_lst(P_star["partition"], f"{fileprefix}{fname}.boem")

LFR/n5000/LFR_n5000_mu01_gamma30_beta11.mtx
Move count: 100
mu 1 , number of clusters 4900
Time: 28.411213397979736
LFR/n5000/LFR_n5000_mu02_gamma30_beta11.mtx
Move count: 100
mu 2 , number of clusters 4900
Time: 31.276736736297607
LFR/n5000/LFR_n5000_mu03_gamma30_beta11.mtx
Move count: 100
mu 3 , number of clusters 4900
Time: 34.40173625946045
LFR/n5000/LFR_n5000_mu04_gamma30_beta11.mtx
Move count: 100
mu 4 , number of clusters 4902
Time: 41.37412118911743


### Quality of all clusterings of all benchmark graphs

In [8]:
# Score every stored consensus partition against the ground truth of its benchmark
# graph (F-score, precision, recall, NMI) and save one CSV row per partition.
stats = []

distance_metrics = ["split_joint_distance", "mirkin_distance", "variation_of_info_distance"]
consensus_methods = ["lf-louvain", "v5", "boem"]
#consensus_methods = ["v4"]
ns = [5000]
mus = [1, 2, 3, 4]
#mus = [2]
gammas = [30]
betas = [11]
for n, mu, gamma, beta in itertools.product(ns, mus, gammas, betas):
    P_list = []
    fileprefix = f"LFR-lf-cons-evaluation/n{n}/"
    fname = f"LFR_n{n}_mu0{mu}_gamma{gamma}_beta{beta}"
    graph_file = f"{fileprefix}{fname}.mtx"
    print(graph_file)

    gt_clust_lst = read_clust_lst(f"{fileprefix}{fname}.gt")
    gt_clust_asn = clust_lst_to_asn(gt_clust_lst)

    # Columns shared by every row of this benchmark graph.
    common_stat = {"mu": mu, "n": n, "gamma": gamma, "beta": beta}

    for cons_method in consensus_methods:
        clust_file = f"{fileprefix}{fname}.{cons_method}"
        if not Path(clust_file).is_file():
            # Methods without a stored partition are skipped.
            continue

        clust_lst = read_clust_lst(clust_file)
        clust_asn = clust_lst_to_asn(clust_lst)
        F, precision, recall = fscore(gt_clust_lst, clust_lst)

        stat = {
            **common_stat,
            "cons_method": cons_method,
            "ncluster": len(clust_lst),
            "fscore": F,
            "precision": precision,
            "recall": recall,
            "nmi": normalized_mutual_info_score(gt_clust_asn, clust_asn),
        }
        stats.append(stat)
        print(stat)

df = pd.DataFrame(stats)
filename = "lf-cons-evaluation-quality-stats.csv"
df.to_csv(filename, index=False, mode='w', header=True)
#df.to_csv(filename, index=False, mode='a', header=not os.path.exists(filename))

LFR-lf-cons-evaluation/n5000/LFR_n5000_mu01_gamma30_beta11.mtx
{'mu': 1, 'n': 5000, 'gamma': 30, 'beta': 11, 'cons_method': 'lf-louvain', 'ncluster': 150, 'fscore': 0.8733361339494689, 'precision': 0.7751523415860571, 'recall': 1.0, 'nmi': 0.9729867621660679}
{'mu': 1, 'n': 5000, 'gamma': 30, 'beta': 11, 'cons_method': 'v5', 'ncluster': 181, 'fscore': 0.9489540403776932, 'precision': 0.902866360590644, 'recall': 1.0, 'nmi': 0.9899316224321142}
{'mu': 1, 'n': 5000, 'gamma': 30, 'beta': 11, 'cons_method': 'boem', 'ncluster': 4900, 'fscore': 0.029532183368011274, 'precision': 0.634737449509521, 'recall': 0.015117781259448614, 'nmi': 0.760795082926948}
LFR-lf-cons-evaluation/n5000/LFR_n5000_mu02_gamma30_beta11.mtx
{'mu': 2, 'n': 5000, 'gamma': 30, 'beta': 11, 'cons_method': 'lf-louvain', 'ncluster': 157, 'fscore': 0.860864626961028, 'precision': 0.7557175796098959, 'recall': 1.0, 'nmi': 0.973949704756225}
{'mu': 2, 'n': 5000, 'gamma': 30, 'beta': 11, 'cons_method': 'v5', 'ncluster': 186, '

In [17]:
import networkx as nx
import networkx.algorithms.community as nx_comm
from networkx.generators.community import LFR_benchmark_graph
from networkx.algorithms import bipartite
import numpy as np
import scipy as sp
from scipy import sparse
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib import rc
import matplotlib.colors as mcolors
# Reset every rcParam to matplotlib's defaults, then turn on LaTeX text rendering
# and request the Arial font family for the figures below.
matplotlib.rcParams.update(matplotlib.rcParamsDefault)
plt.rcParams.update({
    "text.usetex": True,
    #"font.family": "Helvetica"
    "font.family": "Arial"
})
import math

# Grouped bar charts of each quality metric against mu, one panel per metric and one
# bar per consensus method, saved as a PDF per graph size.
quality_metrics = ["precision", "recall", "fscore", "nmi"]
consensus_methods = ["lf-louvain", "v5"]
consensus_methods_colors = ["tab:gray", "tab:brown"]
mus = [1, 2, 3, 4]
ns = [5000]
for n in ns:
    df = pd.read_csv("lf-cons-evaluation-quality-stats.csv")
    df = df[df["n"] == n]

    fig = plt.figure(figsize=(9, 6))
    naxr = 2
    naxc = 2
    gs = GridSpec(nrows=naxr, ncols=naxc)
    axes = [[fig.add_subplot(gs[i, j]) for j in range(naxc)] for i in range(naxr)]

    # Bars of one mu value share a group of total width group_width, centred on mu.
    group_items = list(consensus_methods)
    group_width = 0.7
    bar_width = group_width/len(group_items)
    middle_bar = math.floor(len(group_items) / 2.0)
    even = len(group_items) % 2 == 0
    for i in range(naxr):
        for j in range(naxc):
            # Row-major panel index: the row stride is the number of COLUMNS
            # (the previous i * naxr only worked because the grid is square).
            idx = i * naxc + j
            quality_metric = quality_metrics[idx]
            for k in range(len(consensus_methods)):
                df_target = df[df["cons_method"] == consensus_methods[k]]
                if even:
                    offset = bar_width / 2.0 + (k - middle_bar) * bar_width
                else:
                    offset = (k - middle_bar) * bar_width
                axes[i][j].bar(df_target["mu"] + offset, df_target[quality_metric], color=consensus_methods_colors[k], width=bar_width, alpha=0.5, edgecolor='black', linewidth=bar_width/10.0, label=consensus_methods[k])
            # Raw string: "\m" is not a valid Python escape sequence.
            axes[i][j].set_xlabel(r"$\mu$")
            axes[i][j].set_ylabel(quality_metric)
            axes[i][j].set_xticks(np.array(mus))
            # mu is stored as an integer 1..4; the tick labels show it as 0.1..0.4.
            axes[i][j].set_xticklabels(np.array(mus) / 10.0)
            axes[i][j].grid(axis='y')
            axes[i][j].legend(loc = "lower right")

    plt.tight_layout()
    plt.savefig("lf-cons-evaluation-quality-"+ "n"+str(n)+".pdf")

### Calculate distance distribution for all benchmark graphs, all consensus methods and all distance metrics

In [15]:
import os

# Collect, for every benchmark graph, the distance between each stored input
# clustering and each stored consensus clustering under every distance metric,
# then write all rows to one CSV file.
stats = []

distance_metrics = ["split_joint_distance", "mirkin_distance", "variation_of_info_distance"]
consensus_methods = ["lf-louvain", "v5"]
ns = [5000]
mus = [1, 2, 3, 4]
gammas = [30]
betas = [11]

# Resolve every metric name to its function once instead of eval()-ing the name
# for each comparison. The functions are expected to be available at notebook
# level under exactly these names.
distance_functions = {name: globals()[name] for name in distance_metrics}

for n, mu, gamma, beta in itertools.product(ns, mus, gammas, betas):
    fileprefix = "LFR-lf-cons-evaluation/" + "n" + str(n) + "/"
    fname = "LFR_n" + str(n) + "_mu0" + str(mu) + "_gamma" + str(gamma) + "_beta" + str(beta)
    graph_file = fileprefix + fname + ".mtx"
    # Only the path is printed as a progress marker; the graph itself is not loaded.
    print(graph_file)

    # A consensus partition depends only on the graph and the consensus method,
    # so it is read once per graph rather than once per (clustering, metric) pair.
    # NOTE(review): this assumes the distance functions do not modify their arguments.
    cons_partitions = {}
    for cons_method in consensus_methods:
        if cons_method == "best_candidate":
            # "best_candidate" results are not evaluated by this cell.
            continue
        cons_clust_file = fileprefix + fname + "." + cons_method
        if Path(cons_clust_file).is_file():
            cons_partitions[cons_method] = read_clust_lst(cons_clust_file)

    for _expr, alg_id in clustering_enumeration:
        alg_clust_file = fileprefix + fname + "." + str(alg_id)
        if not Path(alg_clust_file).is_file():
            # Missing input clusterings are skipped silently.
            continue
        alg_partition = read_clust_lst(alg_clust_file)

        for cons_method in consensus_methods:
            cons_partition = cons_partitions.get(cons_method)
            if cons_partition is None:
                # No consensus file for this method and graph (or method skipped above).
                continue
            for distance_metric in distance_metrics:
                distance = distance_functions[distance_metric](alg_partition, cons_partition)
                # Key order determines the column order of the CSV written below.
                stats.append({
                    "mu": mu,
                    "n": n,
                    "gamma": gamma,
                    "beta": beta,
                    "alg": alg_id,
                    "cons_method": cons_method,
                    "distance_metric": distance_metric,
                    "optimized_distance": "none",
                    "distance": distance,
                })

df = pd.DataFrame(stats)
filename = "lf-cons-evaluation-distance-stats.csv"
# The file is overwritten on every run, header included.
df.to_csv(filename, index=False, mode='w', header=True)

LFR-lf-cons-evaluation/n5000/LFR_n5000_mu01_gamma30_beta11.mtx
LFR-lf-cons-evaluation/n5000/LFR_n5000_mu02_gamma30_beta11.mtx
LFR-lf-cons-evaluation/n5000/LFR_n5000_mu03_gamma30_beta11.mtx
LFR-lf-cons-evaluation/n5000/LFR_n5000_mu04_gamma30_beta11.mtx


In [16]:
import networkx as nx
import networkx.algorithms.community as nx_comm
from networkx.generators.community import LFR_benchmark_graph
from networkx.algorithms import bipartite
import numpy as np
import scipy as sp
from scipy import sparse
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib import rc
import matplotlib.colors as mcolors
# Reset rcParams to matplotlib's defaults before applying the overrides below.
matplotlib.rcParams.update(matplotlib.rcParamsDefault)
# Render text through LaTeX and use Arial ("Helvetica" is the alternative tried earlier).
plt.rcParams["text.usetex"] = True
plt.rcParams["font.family"] = "Arial"

def plot_distance_distribution(data, ax, colors, distribution_labels, ylabel, title):
    """Draw one violin per dataset in `data` on `ax` and style it.

    Parameters:
        data: sequence of datasets; one violin is drawn per entry.
        ax: axes-like object to draw on.
        colors: face colour for each violin body, indexed like `data`.
        distribution_labels: x tick labels; used only when not None and of the
            same length as `data`, otherwise the tick labels are left empty.
        ylabel: y-axis label, or None to leave it unset.
        title: panel title, or None to leave it unset.
    """
    # Colouring approach taken from:
    # https://stackoverflow.com/questions/26291479/changing-the-color-of-matplotlibs-violin-plots
    violin_parts = ax.violinplot(data, showmeans=True, showmedians=True, points=20)

    n_series = len(data)
    for position in range(n_series):
        body = violin_parts["bodies"][position]
        body.set_facecolor(colors[position])
        body.set_edgecolor("black")

    # Fixed colours for the mean, median, bar and extrema line collections.
    line_colors = (
        ("cmeans", "magenta"),
        ("cmedians", "aqua"),
        ("cbars", "gray"),
        ("cmaxes", "lightgray"),
        ("cmins", "lightgray"),
    )
    for part_name, line_color in line_colors:
        violin_parts[part_name].set_color(line_color)

    # One tick per violin, at positions 1..n_series.
    ax.xaxis.set_ticks(range(1, n_series + 1))
    labels_usable = distribution_labels is not None and len(distribution_labels) == n_series
    ax.xaxis.set_ticklabels(distribution_labels if labels_usable else [])

    if ylabel is not None:
        ax.set_ylabel(ylabel)

    if title is not None:
        ax.set_title(title)

# For every graph size n and every distance metric, draw one figure holding a
# naxr x naxc grid of violin-plot panels. Each panel covers one mu value and
# shows one violin per consensus method.
distance_metrics = ["split_joint_distance", "mirkin_distance", "variation_of_info_distance"]

# Full method list and palette, kept for reference:
#   ["mcla", "hbgf", "nmf", "boem", "v3", "v4", "v5"]
#   ["tab:blue", "tab:orange", "tab:green", "tab:red", "tab:purple", "tab:brown", "tab:pink"]
consensus_methods = ["lf-louvain", "v5"]
consensus_methods_colors = ["tab:gray", "tab:pink"]

mus = [1, 2, 3, 4]
ns = [5000]
for n in ns:
    df = pd.read_csv("lf-cons-evaluation-distance-stats.csv")
    df = df[df["n"] == n]
    for distance_metric in distance_metrics:
        naxr = 2
        naxc = 2

        fig = plt.figure(figsize=(6, 6))
        gs = GridSpec(nrows=naxr, ncols=naxc)
        # Panels are created row by row so that axes[i][j] is the panel at row i, column j.
        axes = [[fig.add_subplot(gs[i, j]) for j in range(naxc)] for i in range(naxr)]

        for i in range(naxr):
            for j in range(naxc):
                # Row-major panel index: rows are naxc panels wide.
                idx = i * naxc + j
                mu = mus[idx]
                data = []

                for cons_method in consensus_methods:
                    # "best_candidate" rows are selected by the distance they were
                    # optimised for; every other method is stored with "none".
                    optimized_distance = distance_metric if cons_method == "best_candidate" else "none"
                    mask = (
                        (df["mu"] == mu)
                        & (df["distance_metric"] == distance_metric)
                        & (df["cons_method"] == cons_method)
                        & (df["optimized_distance"] == optimized_distance)
                    )
                    data.append(df.loc[mask, "distance"])

                # Raw string: "\m" is not a valid Python escape sequence.
                panel_title = r"$\mu:" + str(mu / 10.0) + "$"
                plot_distance_distribution(data, axes[i][j], consensus_methods_colors, consensus_methods, distance_metric, panel_title)

        plt.tight_layout()
        plt.savefig("lf-cons-evaluation-distribution-" + "n" + str(n) + "-" + distance_metric + ".pdf")