In [1]:
import networkx as nx
import networkx.algorithms.community as nx_comm
from networkx.generators.community import LFR_benchmark_graph
from networkx.algorithms import bipartite
import numpy as np
import scipy as sp
from scipy.sparse import coo_array
from scipy import sparse
from cdlib import algorithms
from cdlib import evaluation
import sklearn
from utils import *
from distances import *
from consensus import *
import math
import itertools
import random
import time
from pathlib import Path

Note: to be able to use all crisp methods, you need to install some additional packages:  {'ASLPAw'}


In [2]:
# Label of this consensus variant; it is appended as the file extension of the
# consensus partition files written by the driver cells further down.
cons_name = "v8"

In [42]:
from scipy.sparse import coo_array
from scipy.sparse import csr_array

# DeltaSOD is calculated following the paper "Integrating Microarray Data by Consensus Clustering"
# by Filkov and Skiena.
# Assumes the elements of the clusters are named as 0-based indices.
# Same as the v5 consensus, except for the following improvements:
#    - Mua is calculated only once, when a particular vertex u starts being considered.
#      In v5 it was recalculated at every (u,v) pair consideration.
#    - deltaS is calculated only if u and v belong to different clusters in the current solution,
#      i.e. only when a and b are different.
def v8_consensus(P_list, niter=10, starting_partition=None, verbose=False):
    """Greedy local-search consensus clustering over a list of input partitions.

    Starting from `starting_partition` (or from singleton clusters), every
    vertex u that has a graph neighbour v in a different cluster is considered
    for a move into v's cluster. The move is scored with

        deltaS = Mub - Mua

    where Mx is the sum, over the other members e of cluster x, of
    (k - 2 * number of input partitions that put u and e together).
    Per vertex the most negative deltaS is kept; when two vertices want to
    move towards each other only one of the two moves is retained. All
    retained moves of an iteration are applied together. The search stops
    after `niter` iterations, or earlier when an iteration proposes the same
    set of moving vertices as the previous one without a lower total deltaS.

    Parameters
    ----------
    P_list : list of dict
        Input clusterings. Each entry has a "graph" and a "partition" key;
        only the graph of the first entry is used. Partitions are passed to
        `clust_lst_to_asn`, which must yield one label per vertex
        (presumably indexed by the 0-based vertex id - confirm in utils).
    niter : int, optional
        Maximum number of move iterations.
    starting_partition : list of lists of str, optional
        Initial clusters given as lists of vertex names (string form of the
        0-based vertex index). It is copied, so the caller's lists are not
        modified. A falsy value starts from singleton clusters.
    verbose : bool, optional
        When true, every applied move is printed.

    Returns
    -------
    dict
        {"graph": weighted consensus graph, "partition": list of clusters}.
        Edge weights of the graph count the input partitions in which both
        endpoints share a cluster; empty clusters are removed from the
        partition.

    Notes
    -----
    Timing and size diagnostics are printed unconditionally.
    """
    G = nx.Graph(P_list[0]["graph"])
    n = G.number_of_nodes()
    k = len(P_list)
    print("Number of edges in G:", G.number_of_edges())

    t1 = time.time()
    A = nx.to_scipy_sparse_array(G, format="coo")
    t2 = time.time()
    print("Time to get sparse matrix of the graph:", t2-t1)

    nz_rows = A.row
    nz_cols = A.col

    # c[x, i] is the cluster label of vertex x in the i-th input partition.
    t1 = time.time()
    c = np.zeros((n, k))
    for i in range(k):
        c[:, i] = np.array(clust_lst_to_asn(P_list[i]["partition"]))
    t2 = time.time()
    print("Time to generate cluster assignment matrix:", t2-t1)

    # Weight of every stored edge = number of input partitions that agree on
    # its two endpoints. Computed for all edges at once; the cast keeps the
    # dtype of the original adjacency data.
    t1 = time.time()
    Aw_vals = np.sum(c[nz_rows] == c[nz_cols], axis=1).astype(A.data.dtype)
    Aw = csr_array((Aw_vals, (nz_rows, nz_cols)), shape=(n, n))
    t2 = time.time()
    print("Time to generate weighted consensus graph:", t2-t1)
    print("Number of non-zeroes in Aw:", Aw.count_nonzero())

    t1 = time.time()
    if starting_partition:
        # Copy the inner lists too: clusters are edited in place below and the
        # caller's partition must stay untouched.
        refined_partition = [list(cluster) for cluster in starting_partition]
    else:
        refined_partition = [[str(i)] for i in range(n)]

    # Used below as: vertex name (str) -> index of its cluster in refined_partition.
    refined_partition_map = clust_lst_to_map(refined_partition)
    t2 = time.time()
    print("Time to initialize:", t2-t1)

    def _membership_cost(u, members):
        """Sum of (k - 2 * agreements(u, e)) over all members e other than u."""
        others = [int(e) for e in members if str(e) != str(u)]
        if not others:
            return 0
        agreements = np.sum(c[u] == c[others])
        return k * len(others) - 2 * agreements

    tSearch = 0
    tUpdate = 0
    count = 0
    last_valid = np.zeros(n)
    last_deltaS = np.zeros(n)
    for it in range(1, niter + 1):
        # Best move found for every vertex in this iteration.
        move_from = np.arange(n)
        move_to = np.arange(n)
        attractor = np.arange(n)
        best_deltaS = np.zeros(n)
        valid = np.zeros(n)

        # For every vertex, probe the clusters of its neighbours that differ
        # from its own cluster and keep the move with the lowest deltaS.
        t1 = time.time()
        for u in range(n):
            a = refined_partition_map[str(u)]
            Mua = None      # computed lazily, at most once per vertex
            Mub_cache = {}  # target cluster -> Mub, shared by all neighbours in it
            for j in range(Aw.indptr[u], Aw.indptr[u + 1]):
                v = Aw.indices[j]
                b = refined_partition_map[str(v)]
                if a == b:
                    continue
                if Mua is None:
                    Mua = _membership_cost(u, refined_partition[a])
                if b not in Mub_cache:
                    Mub_cache[b] = _membership_cost(u, refined_partition[b])

                deltaS = Mub_cache[b] - Mua
                if (deltaS < 0) and (deltaS < best_deltaS[u]):
                    move_from[u] = a
                    move_to[u] = b
                    attractor[u] = v
                    best_deltaS[u] = deltaS
                    valid[u] = 1
        t2 = time.time()
        tSearch = tSearch + t2-t1

        # Two vertices attracted by each other would just swap places, so only
        # one of the two moves is kept.
        # NOTE: the original author marked this tie-breaking rule with
        # "Question mark" - it keeps the move with the lower deltaS and, on a
        # tie, the move of the vertex visited later.
        t1 = time.time()
        for u in range(n):
            v = attractor[u]
            if (valid[u] == 1) and (attractor[v] == u) and (valid[v] == 1):
                if best_deltaS[u] < best_deltaS[v]:
                    valid[v] = 0
                    best_deltaS[v] = 0
                else:
                    valid[u] = 0
                    best_deltaS[u] = 0
        t2 = time.time()
        tSearch = tSearch + (t2 - t1)

        # Stop when the same set of vertices is proposed again without a
        # lower total deltaS than in the previous iteration.
        same_movers = not np.any(valid != last_valid)
        if same_movers and not (np.sum(best_deltaS) < np.sum(last_deltaS)):
            break
        last_valid = valid
        last_deltaS = best_deltaS

        for u in range(n):
            if valid[u] != 1:
                continue
            a = int(move_from[u])
            b = int(move_to[u])

            t1 = time.time()
            if verbose:
                print("---")
                print("Iteration:", it, "Move Count:", count+1, ">> results in deltaS", best_deltaS[u])
                print("Move:", u)
                print("From partition", a, ":", refined_partition[a])
                print("To partition", b, ":", refined_partition[b])
                print("---")
            refined_partition[a].remove(str(u))
            refined_partition[b].append(str(u))
            refined_partition_map[str(u)] = b
            t2 = time.time()
            tUpdate = tUpdate + (t2-t1)

            count = count + 1
    print("Time to search moves:", tSearch)
    print("Time to update M:", tUpdate)

    # Clusters emptied by the moves are dropped from the result.
    t1 = time.time()
    refined_partition = [cluster for cluster in refined_partition if len(cluster) > 0]
    t2 = time.time()
    print("Time to delete empty partitions:", t2-t1)

    Gw = nx.from_scipy_sparse_array(Aw)
    return {"graph": nx.Graph(Gw), "partition": list(refined_partition)}

# n=200

In [43]:
n = 200

# Four random guesses for the number of clusters, each drawn between the
# (truncated) cube root of n and three times the (truncated) square root of n.
expected_clusters = [
    random.randint(int(n ** (1. / 3)), 3*int(n ** (1. / 2)))
    for _ in range(4)
]

# Parameter grid per algorithm. None means the algorithm is called with the
# graph only. The insertion order fixes the numbering produced below.
alg_params = {
    "label_propagation": None,
    "leiden": None,
    "significance_communities": None,
    "surprise_communities": None,
    "greedy_modularity": None,
    "paris": None,
    "louvain": {"resolution": [0.75, 1.0, 1.25, 1.5], "randomize": [314159, 2718]},
    "infomap": None,
    "walktrap": None,
    "markov_clustering": {
        "inflation": [1.2, 1.5, 2, 2.5],
        "pruning_threshold": [0.01, 0.001],
        "convergence_check_frequency": [100],
    },
    "em": {"k": list(expected_clusters)},
    "sbm_dl": None,
    "spinglass": {"spins": list(expected_clusters)},
    "ricci_community": {"alpha": [0.3, 0.5, 0.6, 0.75]},
}

# Build one (call expression, running index) pair per algorithm and parameter
# combination, e.g. ("algorithms.louvain(G,resolution=0.75,randomize=2718)", 7).
clustering_enumeration = []
count = 0
for alg, params in alg_params.items():
    if params is None:
        param_names = []
        param_combinations = []
    else:
        param_names = list(params)
        param_combinations = list(
            itertools.product(*(list(params[name]) for name in param_names))
        )

    # An algorithm without any parameter combination gets a single bare call.
    for param_combination in (param_combinations or [()]):
        keyword_part = "".join(
            "," + name + "=" + str(value)
            for name, value in zip(param_names, param_combination)
        )
        expr = "algorithms." + alg + "(G" + keyword_part + ")"
        clustering_enumeration.append((expr, count))
        count = count + 1

print(clustering_enumeration)

[('algorithms.label_propagation(G)', 0), ('algorithms.leiden(G)', 1), ('algorithms.significance_communities(G)', 2), ('algorithms.surprise_communities(G)', 3), ('algorithms.greedy_modularity(G)', 4), ('algorithms.paris(G)', 5), ('algorithms.louvain(G,resolution=0.75,randomize=314159)', 6), ('algorithms.louvain(G,resolution=0.75,randomize=2718)', 7), ('algorithms.louvain(G,resolution=1.0,randomize=314159)', 8), ('algorithms.louvain(G,resolution=1.0,randomize=2718)', 9), ('algorithms.louvain(G,resolution=1.25,randomize=314159)', 10), ('algorithms.louvain(G,resolution=1.25,randomize=2718)', 11), ('algorithms.louvain(G,resolution=1.5,randomize=314159)', 12), ('algorithms.louvain(G,resolution=1.5,randomize=2718)', 13), ('algorithms.infomap(G)', 14), ('algorithms.walktrap(G)', 15), ('algorithms.markov_clustering(G,inflation=1.2,pruning_threshold=0.01,convergence_check_frequency=100)', 16), ('algorithms.markov_clustering(G,inflation=1.2,pruning_threshold=0.001,convergence_check_frequency=100)

In [44]:
import random

# Run the v8 consensus on every LFR benchmark graph of size n and write the
# resulting partition next to the input files.
n = 200
fileprefix = "LFR/" + "n" + str(n) + "/"
mus = [1, 2, 3, 4]
#mus = [4]
gammas = [30]
betas = [11]
for mu in mus:
    for gamma in gammas:
        for beta in betas:
            fname = "LFR_n" + str(n) + "_mu0" + str(mu) + "_gamma" + str(gamma) + "_beta" + str(beta)
            graph_file = fileprefix + fname + ".mtx"
            print(graph_file)

            # Only the read needs the file handle; it is closed before the
            # consensus computation starts.
            # NOTE(review): `spio` is not imported in the visible cells -
            # presumably it is scipy.io provided by one of the star imports; confirm.
            with open(graph_file) as f:
                G = nx.from_scipy_sparse_array(spio.mmread(f), create_using=nx.Graph)

            # Collect every input clustering that exists on disk; the file
            # extension is the index from clustering_enumeration.
            P_list = []
            for _, clustering_id in clustering_enumeration:
                clust_file = fileprefix + fname + "." + str(clustering_id)
                if Path(clust_file).is_file():
                    partition = read_clust_lst(clust_file)
                    P_list.append({"graph": nx.Graph(G), "partition": list(partition)})

            # v8_consensus reads P_list[0], so it cannot run without at least
            # one input clustering.
            if not P_list:
                print("No input clusterings found for", graph_file, "- skipping")
                continue

            t1 = time.time()
            P_star = v8_consensus(P_list, niter=100, starting_partition=None, verbose=False)
            t2 = time.time()
            print("mu", mu, ", number of clusters", len(P_star["partition"]))
            print("Time:", t2-t1)
            write_clust_lst(P_star["partition"], fileprefix + fname + "." + cons_name)

LFR/n200/LFR_n200_mu01_gamma30_beta11.mtx
Number of edges in G: 993
Time to get sparse matrix of the graph: 0.002145528793334961
Time to generate cluster assignment matrix: 0.0028738975524902344
Time to generate weighted consensus graph: 0.009280920028686523
Number of non-zeroes in Aw: 1924
Time to initialize: 5.745887756347656e-05
Time to search moves: 0.19180965423583984
Time to update M: 0.0001499652862548828
Time to delete empty partitions: 2.9802322387695312e-05
mu 1 , number of clusters 16
Time: 0.23626208305358887
LFR/n200/LFR_n200_mu02_gamma30_beta11.mtx
Number of edges in G: 1008
Time to get sparse matrix of the graph: 0.002307891845703125
Time to generate cluster assignment matrix: 0.002768278121948242
Time to generate weighted consensus graph: 0.008890390396118164
Number of non-zeroes in Aw: 1957
Time to initialize: 5.6743621826171875e-05
Time to search moves: 0.23517847061157227
Time to update M: 0.00017714500427246094
Time to delete empty partitions: 4.792213439941406e-05


# n=1000

In [45]:
n = 1000

# Four random guesses for the number of clusters, each drawn between the
# (truncated) cube root of n and three times the (truncated) square root of n.
expected_clusters = [
    random.randint(int(n ** (1. / 3)), 3*int(n ** (1. / 2)))
    for _ in range(4)
]

# Parameter grid per algorithm. None means the algorithm is called with the
# graph only. The insertion order fixes the numbering produced below.
alg_params = {
    "label_propagation": None,
    "leiden": None,
    "significance_communities": None,
    "surprise_communities": None,
    "greedy_modularity": None,
    "paris": None,
    "louvain": {"resolution": [0.75, 1.0, 1.25, 1.5], "randomize": [314159, 2718]},
    "infomap": None,
    "walktrap": None,
    "markov_clustering": {
        "inflation": [1.2, 1.5, 2, 2.5],
        "pruning_threshold": [0.01, 0.001],
        "convergence_check_frequency": [100],
    },
    "em": {"k": list(expected_clusters)},
    "sbm_dl": None,
    "spinglass": {"spins": list(expected_clusters)},
    "ricci_community": {"alpha": [0.3, 0.5, 0.6, 0.75]},
}

# Build one (call expression, running index) pair per algorithm and parameter
# combination, e.g. ("algorithms.louvain(G,resolution=0.75,randomize=2718)", 7).
clustering_enumeration = []
count = 0
for alg, params in alg_params.items():
    if params is None:
        param_names = []
        param_combinations = []
    else:
        param_names = list(params)
        param_combinations = list(
            itertools.product(*(list(params[name]) for name in param_names))
        )

    # An algorithm without any parameter combination gets a single bare call.
    for param_combination in (param_combinations or [()]):
        keyword_part = "".join(
            "," + name + "=" + str(value)
            for name, value in zip(param_names, param_combination)
        )
        expr = "algorithms." + alg + "(G" + keyword_part + ")"
        clustering_enumeration.append((expr, count))
        count = count + 1

print(clustering_enumeration)

[('algorithms.label_propagation(G)', 0), ('algorithms.leiden(G)', 1), ('algorithms.significance_communities(G)', 2), ('algorithms.surprise_communities(G)', 3), ('algorithms.greedy_modularity(G)', 4), ('algorithms.paris(G)', 5), ('algorithms.louvain(G,resolution=0.75,randomize=314159)', 6), ('algorithms.louvain(G,resolution=0.75,randomize=2718)', 7), ('algorithms.louvain(G,resolution=1.0,randomize=314159)', 8), ('algorithms.louvain(G,resolution=1.0,randomize=2718)', 9), ('algorithms.louvain(G,resolution=1.25,randomize=314159)', 10), ('algorithms.louvain(G,resolution=1.25,randomize=2718)', 11), ('algorithms.louvain(G,resolution=1.5,randomize=314159)', 12), ('algorithms.louvain(G,resolution=1.5,randomize=2718)', 13), ('algorithms.infomap(G)', 14), ('algorithms.walktrap(G)', 15), ('algorithms.markov_clustering(G,inflation=1.2,pruning_threshold=0.01,convergence_check_frequency=100)', 16), ('algorithms.markov_clustering(G,inflation=1.2,pruning_threshold=0.001,convergence_check_frequency=100)

In [46]:
import random

# Run the v8 consensus on every LFR benchmark graph of size n and write the
# resulting partition next to the input files.
n = 1000
fileprefix = "LFR/" + "n" + str(n) + "/"
mus = [1, 2, 3, 4]
#mus = [4]
gammas = [30]
betas = [11]
for mu in mus:
    for gamma in gammas:
        for beta in betas:
            fname = "LFR_n" + str(n) + "_mu0" + str(mu) + "_gamma" + str(gamma) + "_beta" + str(beta)
            graph_file = fileprefix + fname + ".mtx"
            print(graph_file)

            # Only the read needs the file handle; it is closed before the
            # consensus computation starts.
            # NOTE(review): `spio` is not imported in the visible cells -
            # presumably it is scipy.io provided by one of the star imports; confirm.
            with open(graph_file) as f:
                G = nx.from_scipy_sparse_array(spio.mmread(f), create_using=nx.Graph)

            # Collect every input clustering that exists on disk; the file
            # extension is the index from clustering_enumeration.
            P_list = []
            for _, clustering_id in clustering_enumeration:
                clust_file = fileprefix + fname + "." + str(clustering_id)
                if Path(clust_file).is_file():
                    partition = read_clust_lst(clust_file)
                    P_list.append({"graph": nx.Graph(G), "partition": list(partition)})

            # v8_consensus reads P_list[0], so it cannot run without at least
            # one input clustering.
            if not P_list:
                print("No input clusterings found for", graph_file, "- skipping")
                continue

            t1 = time.time()
            P_star = v8_consensus(P_list, niter=1000, starting_partition=None, verbose=False)
            t2 = time.time()
            print("mu", mu, ", number of clusters", len(P_star["partition"]))
            print("Time:", t2-t1)
            write_clust_lst(P_star["partition"], fileprefix + fname + "." + cons_name)
                

LFR/n1000/LFR_n1000_mu01_gamma30_beta11.mtx
Number of edges in G: 9609
Time to get sparse matrix of the graph: 0.020252227783203125
Time to generate cluster assignment matrix: 0.012450218200683594
Time to generate weighted consensus graph: 0.08150577545166016
Number of non-zeroes in Aw: 17503
Time to initialize: 0.00027489662170410156
Time to search moves: 2.9038853645324707
Time to update M: 0.0009188652038574219
Time to delete empty partitions: 0.0001926422119140625
mu 1 , number of clusters 38
Time: 3.214815378189087
LFR/n1000/LFR_n1000_mu02_gamma30_beta11.mtx
Number of edges in G: 9791
Time to get sparse matrix of the graph: 0.020755290985107422
Time to generate cluster assignment matrix: 0.012275218963623047
Time to generate weighted consensus graph: 0.0812520980834961
Number of non-zeroes in Aw: 17779
Time to initialize: 0.0002827644348144531
Time to search moves: 3.5000622272491455
Time to update M: 0.0009756088256835938
Time to delete empty partitions: 0.00017189979553222656
mu

# n=5000

In [47]:
n = 5000

# Four random guesses for the number of clusters, each drawn between the
# (truncated) cube root of n and three times the (truncated) square root of n.
expected_clusters = [
    random.randint(int(n ** (1. / 3)), 3*int(n ** (1. / 2)))
    for _ in range(4)
]

# Parameter grid per algorithm. None means the algorithm is called with the
# graph only. The insertion order fixes the numbering produced below.
alg_params = {
    "label_propagation": None,
    "leiden": None,
    "significance_communities": None,
    "surprise_communities": None,
    "greedy_modularity": None,
    "paris": None,
    "louvain": {"resolution": [0.75, 1.0, 1.25, 1.5], "randomize": [314159, 2718]},
    "infomap": None,
    "walktrap": None,
    "markov_clustering": {
        "inflation": [1.2, 1.5, 2, 2.5],
        "pruning_threshold": [0.01, 0.001],
        "convergence_check_frequency": [100],
    },
    "em": {"k": list(expected_clusters)},
    "sbm_dl": None,
    "spinglass": {"spins": list(expected_clusters)},
    "ricci_community": {"alpha": [0.3, 0.5, 0.6, 0.75]},
}

# Build one (call expression, running index) pair per algorithm and parameter
# combination, e.g. ("algorithms.louvain(G,resolution=0.75,randomize=2718)", 7).
clustering_enumeration = []
count = 0
for alg, params in alg_params.items():
    if params is None:
        param_names = []
        param_combinations = []
    else:
        param_names = list(params)
        param_combinations = list(
            itertools.product(*(list(params[name]) for name in param_names))
        )

    # An algorithm without any parameter combination gets a single bare call.
    for param_combination in (param_combinations or [()]):
        keyword_part = "".join(
            "," + name + "=" + str(value)
            for name, value in zip(param_names, param_combination)
        )
        expr = "algorithms." + alg + "(G" + keyword_part + ")"
        clustering_enumeration.append((expr, count))
        count = count + 1

print(clustering_enumeration)

[('algorithms.label_propagation(G)', 0), ('algorithms.leiden(G)', 1), ('algorithms.significance_communities(G)', 2), ('algorithms.surprise_communities(G)', 3), ('algorithms.greedy_modularity(G)', 4), ('algorithms.paris(G)', 5), ('algorithms.louvain(G,resolution=0.75,randomize=314159)', 6), ('algorithms.louvain(G,resolution=0.75,randomize=2718)', 7), ('algorithms.louvain(G,resolution=1.0,randomize=314159)', 8), ('algorithms.louvain(G,resolution=1.0,randomize=2718)', 9), ('algorithms.louvain(G,resolution=1.25,randomize=314159)', 10), ('algorithms.louvain(G,resolution=1.25,randomize=2718)', 11), ('algorithms.louvain(G,resolution=1.5,randomize=314159)', 12), ('algorithms.louvain(G,resolution=1.5,randomize=2718)', 13), ('algorithms.infomap(G)', 14), ('algorithms.walktrap(G)', 15), ('algorithms.markov_clustering(G,inflation=1.2,pruning_threshold=0.01,convergence_check_frequency=100)', 16), ('algorithms.markov_clustering(G,inflation=1.2,pruning_threshold=0.001,convergence_check_frequency=100)

In [48]:
import random

# Run the v8 consensus on every LFR benchmark graph of size n and write the
# resulting partition next to the input files.
n = 5000
fileprefix = "LFR/" + "n" + str(n) + "/"
mus = [1, 2, 3, 4]
#mus = [4]
gammas = [30]
betas = [11]
for mu in mus:
    for gamma in gammas:
        for beta in betas:
            fname = "LFR_n" + str(n) + "_mu0" + str(mu) + "_gamma" + str(gamma) + "_beta" + str(beta)
            graph_file = fileprefix + fname + ".mtx"
            print(graph_file)

            # Only the read needs the file handle; it is closed before the
            # consensus computation starts.
            # NOTE(review): `spio` is not imported in the visible cells -
            # presumably it is scipy.io provided by one of the star imports; confirm.
            with open(graph_file) as f:
                G = nx.from_scipy_sparse_array(spio.mmread(f), create_using=nx.Graph)

            # Collect every input clustering that exists on disk; the file
            # extension is the index from clustering_enumeration.
            P_list = []
            for _, clustering_id in clustering_enumeration:
                clust_file = fileprefix + fname + "." + str(clustering_id)
                if Path(clust_file).is_file():
                    partition = read_clust_lst(clust_file)
                    P_list.append({"graph": nx.Graph(G), "partition": list(partition)})

            # v8_consensus reads P_list[0], so it cannot run without at least
            # one input clustering.
            if not P_list:
                print("No input clusterings found for", graph_file, "- skipping")
                continue

            t1 = time.time()
            P_star = v8_consensus(P_list, niter=1000, starting_partition = None, verbose=False)
            t2 = time.time()
            print("mu", mu, ", number of clusters", len(P_star["partition"]))
            print("Time:", t2-t1)
            write_clust_lst(P_star["partition"], fileprefix + fname + "." + cons_name)
                

LFR/n5000/LFR_n5000_mu01_gamma30_beta11.mtx
Number of edges in G: 48950
Time to get sparse matrix of the graph: 0.11660981178283691
Time to generate cluster assignment matrix: 0.06601881980895996
Time to generate weighted consensus graph: 0.42052221298217773
Number of non-zeroes in Aw: 88672
Time to initialize: 0.0014939308166503906
Time to search moves: 14.39803171157837
Time to update M: 0.004987001419067383
Time to delete empty partitions: 0.0006864070892333984
mu 1 , number of clusters 201
Time: 16.092422008514404
LFR/n5000/LFR_n5000_mu02_gamma30_beta11.mtx
Number of edges in G: 50068
Time to get sparse matrix of the graph: 0.12063765525817871
Time to generate cluster assignment matrix: 0.06563997268676758
Time to generate weighted consensus graph: 0.42278361320495605
Number of non-zeroes in Aw: 84929
Time to initialize: 0.0014562606811523438
Time to search moves: 23.398688554763794
Time to update M: 0.005131244659423828
Time to delete empty partitions: 0.0007371902465820312
mu 2 ,