In [1]:
"""
Given a filename, reads list of clusters from the file.
Assumes that each line of the file represents a cluster, separating each 
individual element of the cluster with a blank space.
Returned data structure is a Python list representing list of clusters in a 
partition. 
Each cluster contains a list of elements.
Each element of a cluster is represented with a string by default but integer if 
`integer_node_id` is True.
"""
def read_clust_lst(filename, integer_node_id=False):
    lst = []
    with open(filename, mode="r") as infile:
        lines = infile.readlines()
        for l in lines:
            cl = []
            toks = l.split(" ")
            for t in toks:
                if t.isspace() == False:
                    if integer_node_id:
                        cl.append(int(t))
                    else:
                        cl.append(t)
            lst.append(cl)
    return lst

def write_clust_lst(clust_lst, filename):
    with open(filename, mode="w") as outfile:
        for lst in clust_lst:
            if len(lst) > 0:
                for e in lst:
                    outfile.write(str(e))
                    outfile.write(" ")
                outfile.write("\n")

def write_clust_asn(clust_asn, filename):
    with open(filename, mode="w") as outfile:
        for s in clust_asn:
            outfile.write("%s\n" % s)

def write_label_map(label_map, filename):
    with open(filename, mode="w") as outfile:
        for k in label_map.keys():
            line = str(k) + " " + str(label_map[k]) + "\n"
            outfile.write(line)

def read_label_map(filename, reverse=False):
    label_map = {}
    with open(filename, mode="r") as infile:
        lines = infile.readlines()
        for l in lines:
            l = l.strip()
            toks = l.split(" ")
            # print(toks)
            if reverse:
                label_map[toks[1]] = toks[0]
            else:
                label_map[toks[0]] = toks[1]
    return label_map

def clust_asn_to_lst(clust_asn):
    clust_lst = {}
    for i in range(len(clust_asn)):
        if clust_asn[i] not in clust_lst.keys():
            clust_lst[clust_asn[i]] = []
        clust_lst[clust_asn[i]].append(i)
    clust_lst = list(clust_lst.values())
    for i in range(len(clust_lst)):
        clust_lst[i] = set(clust_lst[i])
    return clust_lst

# print(clust_asn_to_lst([0, 0, 1, 0, 2, 1]))

def clust_lst_to_asn(clust_lst, nelem=None):
    
    if nelem == None:
        nelem = 0
        for l in clust_lst:
            nelem = nelem + len(l)   
    
    clust_map = clust_lst_to_map(clust_lst)
    keys = list(clust_map.keys())
    keys.sort(key=int)
    
    clust_asn = [-1] * nelem
    i = 0
    while i < nelem:
        clust_asn[i] = clust_map[keys[i]]
        i = i + 1
        
    return clust_asn

# print(clust_lst_to_asn([[0,1,3], [2,5], [4]]))

def clust_lst_to_map(clust_lst, nelem=None):
    clust_map = {}
    for l in range(len(clust_lst)):
        for e in clust_lst[l]:
            clust_map[e] = l
    return clust_map

# print(clust_lst_to_map([[0,1,3], [2,5], [4]]))

def read_matrix_market(filename):
	G = None
	with open(filename) as f:
		G = nx.from_scipy_sparse_matrix(spio.mmread(f))
	return G

def write_matrix_market(filename, G):
	A = nx.to_numpy_array(G)
	SA = csr_matrix(A)
	spio.mmwrite(filename, SA)
	return


In [None]:
from pathlib import Path

def plot_distance_distribution(data, ax, colors, distribution_labels, ylabel, title):
    # https://stackoverflow.com/questions/26291479/changing-the-color-of-matplotlibs-violin-plots
    violin_parts = ax.violinplot(data, showmeans=True, showmedians=True, points=20)

    for k in range(len(data)):
        violin_parts["bodies"][k].set_facecolor(colors[k])
        violin_parts["bodies"][k].set_edgecolor("black")
    violin_parts["cmeans"].set_color("magenta")
    violin_parts["cmedians"].set_color("aqua")
    violin_parts["cbars"].set_color("gray")
    violin_parts["cmaxes"].set_color("lightgray")
    violin_parts["cmins"].set_color("lightgray")

    ax.xaxis.set_ticks(range(1, len(data)+1))
    if (distribution_labels is not None) and (len(distribution_labels) == len(data)):
        ax.xaxis.set_ticklabels(distribution_labels)
    else:
        ax.xaxis.set_ticklabels([])
    
    if ylabel is not None:
        ax.set_ylabel(ylabel)
    
    if title is not None:
        ax.set_title(title)
        
mus = [1, 2, 3, 4]
ns = [200, 1000, 5000]
algs = ["v8", "v8-parallel", "boem", "saoem"]
distance_metrics = ["rand", "split-join", "vi"]

for n in ns:
    for mu in mus:
        directory = "data/LFR/" + "n"+str(n) + "/"
        fileprefix = "LFR_" + "n" + str(n) + "_mu0" + str(mu) + "_gamma30_beta11"
        for alg in algs:
            for distance_metric in distance_metrics:
                filename = directory + fileprefix + "." + alg + "." + distance_metric
                p = Path(filename)
                print(p.exists(), filename)

# Preprocessing threshold parameter study

In [22]:
import numpy as np
import scipy as sp
from scipy import sparse
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib import rc
import matplotlib.colors as mcolors
matplotlib.rcParams.update(matplotlib.rcParamsDefault)
plt.rcParams.update({
    "text.usetex": True,
    #"font.family": "Helvetica"
    "font.family": "Arial"
})
from pathlib import Path
from matplotlib.ticker import MaxNLocator

RESULT_DIR="experiment-results"
OUTPUT_DIR=RESULT_DIR+"/"+"preprocess-study"
DATASET_NAME="LFR-louvain-1"

mus = [1, 2, 3, 4, 5, 6, 7]
#ns = [1000, 5000]
ns = [5000]

fig = plt.figure(figsize=(4, 3))
naxr = 1
naxc = 1
gs = GridSpec(nrows=naxr, ncols=naxc)
axes = []
for i in range(naxr):
    axr = []
    for j in range(naxc):
        axr.append(fig.add_subplot(gs[i,j]))
    axes.append(axr)
#print(type(axes), type(axes[0]))

for i in range(naxr):
    for j in range(naxc):
        idx = (i * naxr + j)
        n = ns[idx]
        for mu in mus:
            csv_file = OUTPUT_DIR+"/"+DATASET_NAME+".n"+str(n)+".mu0"+str(mu)+".csv"
            filepath = Path(csv_file)
            #print(csv_file, filepath.exists())
            if filepath.exists():
                df = pd.read_csv(csv_file)
                #print(df)
                #print(type(axes[idx]))
                axes[i][j].plot(df["threshold"], df["n_components"], label=str(float(mu)/10.0), marker=".")
                
        #axes[i][j].set_xlim(right=0.6)
        axes[i][j].minorticks_on()
        axes[i][j].grid(which="major", axis="both")
        #axes[i][j].legend(title="$\mu$")
        axes[i][j].set_xlabel("Threshold")
        axes[i][j].set_ylabel("Number of groups")
        #axes[i][j].set_title("n="+str(n))
        axes[i][j].set_title("(b) Slightly different partitions")
        axes[i][j].yaxis.set_major_locator(MaxNLocator(integer=True))

#plt.suptitle("Effect of preprocessing threshold on LFR")
plt.tight_layout()
plt.savefig("preprocess-study-louvain.pdf")
plt.close()

# Ground truth quality evaluation on benchmark graphs

In [2]:
import numpy as np
import scipy as sp
from scipy import sparse
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
from matplotlib.gridspec import GridSpec
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib import rc
import matplotlib.colors as mcolors
matplotlib.rcParams.update(matplotlib.rcParamsDefault)
plt.rcParams.update({
    "text.usetex": True,
    #"font.family": "Helvetica"
    "font.family": "Arial"
})
from pathlib import Path
from matplotlib.transforms import Bbox

mus = [1, 2, 3, 4, 5, 6, 7]
#mus = [3]
algs = ["v8-parallel", "kirkley-newman", "lancichinetti-fortunato"]
ns = [5000]
metrics = ["vi", "split-join", "rand"]

HOME = "/nfs/nfs2/home/mth"
PROJECT_DIR = HOME + "/Codes/graph-consensus-clustering"
DATA_DIR = PROJECT_DIR + "/test/data"
RESULT_DIR = PROJECT_DIR + "/test/experiment-results/benchmark-study"
DATASET_NAME="LFR-louvain-10"
N = str(ns[0])

fig = plt.figure(figsize=(9, 2.5))
naxr = 1
naxc = 3
gs = GridSpec(nrows=naxr, ncols=naxc)
axes = []
for i in range(naxr):
    axr = []
    for j in range(naxc):
        axr.append(fig.add_subplot(gs[i,j]))
    axes.append(axr)
#print(type(axes), type(axes[0]))

for i in range(naxr):
    for j in range(naxc):
        idx = (i * naxr + j)
        METRIC = metrics[j]
        #N = str(ns[idx])
        input_x = []
        input_y = []
        consensus_stats = []
        for mu in mus:
            MU = "0"+str(mu)
            FILE_PREFIX = "LFR" + "_n" + N +"_mu" + MU + "_gamma30_beta11"
            INPUT_DIR = DATA_DIR + "/" + DATASET_NAME + "/" + "n" + N
            INPUT_CLUSTERING_PREFIX = INPUT_DIR + "/" + FILE_PREFIX
            
            inputs = []
            NUMBER_OF_INPUT_CLUSTERING=0
            for NUMBER_OF_INPUT_CLUSTERING in range(0,100):
                if(Path(INPUT_CLUSTERING_PREFIX+"."+str(NUMBER_OF_INPUT_CLUSTERING)).is_file()):
                    METRIC_FILE = INPUT_CLUSTERING_PREFIX+"."+str(NUMBER_OF_INPUT_CLUSTERING) + "." + METRIC+".gt"
                    if(Path(METRIC_FILE).is_file()):
                        with open(METRIC_FILE) as metric_f:
                            metric_v = float(metric_f.readline())
                            input_x.append(float(mu)/10.0)
                            input_y.append(metric_v)
                            inputs.append( (metric_v, NUMBER_OF_INPUT_CLUSTERING) )
                            if metric_v < 1e-9:
                                print(METRIC_FILE)
                else:
                    break
                    
            #inputs = sorted(inputs, key=lambda x: x[0])
            #for i in range(37):
            #    print(inputs[i][0], inputs[i][1])
        
            
            for ALG in algs:
                OUTPUT_DIR_NAME=ALG + "." + DATASET_NAME + ".n" + N + ".mu" + MU
                OUTPUT_DIR = RESULT_DIR + "/" + OUTPUT_DIR_NAME
                OUTPUT_PREFIX = OUTPUT_DIR + "/" + FILE_PREFIX
                SOLN=0
                METRIC_FILE=OUTPUT_PREFIX+".soln-"+str(SOLN)+"."+METRIC+".gt"
                stat = {"DATASET_NAME": DATASET_NAME, "ALG": ALG, "N": int(N), "MU": float(MU)/10.0}
                if(Path(METRIC_FILE).is_file()):
                    with open(METRIC_FILE) as metric_f:
                        metric_v = float(metric_f.readline())
                        stat["METRIC"] = metric_v
                        consensus_stats.append(stat)
                else:
                    print(METRIC_FILE, "file does not exist")
            
            
        df = pd.DataFrame(consensus_stats)
        for ALG in algs:
            target_df = df[df["ALG"]==ALG]
            lbl = ALG
            clr = None
            if ALG=="v8-parallel":
                lbl = "fmccg"
                clr = "#990000" # IU Crimson
            elif ALG=="lancichinetti-fortunato":
                clr = "tab:blue"
                lbl = "lf-consensus"
            elif ALG=="kirkley-newman":
                clr = "tab:orange"

            if not target_df.empty:
                if j == 0:
                    axes[i][j].plot(target_df["MU"], target_df["METRIC"], label=lbl, marker="^", color=clr)
                else:
                    axes[i][j].plot(target_df["MU"], target_df["METRIC"], marker="^", color=clr)
        
            
        #axes[i][j].scatter(input_x, input_y, marker="o", facecolor="none", color="#990000", linewidth=0.25)
        axes[i][j].scatter(input_x, input_y, marker="o", facecolor="none", color="#243142", linewidth=0.25)
        axes[i][j].minorticks_on()
        #axes[i][j].tick_params(axis='y', which='minor')
        #axes[i][j].yaxis.set_minor_formatter(FormatStrFormatter("%.3f"))
        axes[i][j].grid(which="major", axis="both")
        #axes[i][j].legend(title="consensus:")
        axes[i][j].set_xlabel("$\mu$")
        if METRIC == "vi":
            axes[i][j].set_ylabel("variation of information distance")
        elif METRIC == "rand":
            axes[i][j].set_ylabel("rand distance")
            axes[i][j].set_yscale("log", base=2)
        elif METRIC == "split-join":
            axes[i][j].set_ylabel("split-join distance")
        elif METRIC == "ami":
            axes[i][j].set_ylabel("adjusted mutual info")
        elif METRIC == "nmi":
            axes[i][j].set_ylabel("normalized mutual info")
        #axes[i][j].set_title("n="+N)

fig.legend(loc='upper center', bbox_to_anchor=(0.5, 1.1), ncol=3, fancybox=True, shadow=True)        
        
plt.tight_layout()
plot_fname = DATASET_NAME+"_n"+str(N)+".pdf"
plt.savefig(plot_fname, bbox_inches= Bbox([[0.14, 0.15], [8.87, 2.75]]))

plt.close()

/nfs/nfs2/home/mth/Codes/graph-consensus-clustering/test/experiment-results/benchmark-study/kirkley-newman.LFR-louvain-10.n5000.mu07/LFR_n5000_mu07_gamma30_beta11.soln-0.vi.gt file does not exist
/nfs/nfs2/home/mth/Codes/graph-consensus-clustering/test/experiment-results/benchmark-study/kirkley-newman.LFR-louvain-10.n5000.mu07/LFR_n5000_mu07_gamma30_beta11.soln-0.split-join.gt file does not exist
/nfs/nfs2/home/mth/Codes/graph-consensus-clustering/test/experiment-results/benchmark-study/kirkley-newman.LFR-louvain-10.n5000.mu07/LFR_n5000_mu07_gamma30_beta11.soln-0.rand.gt file does not exist


## Collect quality stats from benchmark experiments

In [32]:
import numpy as np
import scipy as sp
from scipy import sparse
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
from matplotlib.gridspec import GridSpec
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib import rc
import matplotlib.colors as mcolors
matplotlib.rcParams.update(matplotlib.rcParamsDefault)
plt.rcParams.update({
    "text.usetex": True,
    #"font.family": "Helvetica"
    "font.family": "Arial"
})
from pathlib import Path
from matplotlib.transforms import Bbox

mus = [1, 2, 3, 4, 5, 6, 7]
#mus = [3]
algs = ["v8-parallel", "kirkley-newman", "lancichinetti-fortunato"]
ns = [5000]
metrics = ["vi", "split-join", "rand"]

HOME = "/nfs/nfs2/home/mth"
PROJECT_DIR = HOME + "/Codes/graph-consensus-clustering"
DATA_DIR = PROJECT_DIR + "/test/data"
RESULT_DIR = PROJECT_DIR + "/test/experiment-results/benchmark-study"
DATASET_NAME="LFR-louvain-10"
N = str(ns[0])

data_points = []

for DATA in ["LFR", "LFR-louvain"]:
    for it in range(1,11):
        DATASET_NAME=DATA + "-" + str(it)
        #print(DATASET_NAME)
        for mu in mus:
            MU = "0"+str(mu)
            FILE_PREFIX = "LFR" + "_n" + N +"_mu" + MU + "_gamma30_beta11"
            INPUT_DIR = DATA_DIR + "/" + DATASET_NAME + "/" + "n" + N
            INPUT_CLUSTERING_PREFIX = INPUT_DIR + "/" + FILE_PREFIX
            
            NUMBER_OF_INPUT_CLUSTERING=0
            for NUMBER_OF_INPUT_CLUSTERING in range(0,100):
                if(Path(INPUT_CLUSTERING_PREFIX+"."+str(NUMBER_OF_INPUT_CLUSTERING)).is_file()):
                    data_point = {
                        "data": DATA,
                        "run": it,
                        "mu": float(mu)/10.0,
                        "n": int(N),
                        "input": NUMBER_OF_INPUT_CLUSTERING,
                        "cons": None,
                        "vi": None,
                        "rand": None,
                        "split-join": None
                    }

                    #data_point["input"] = NUMBER_OF_INPUT_CLUSTERING
                    f = False
                    for METRIC in metrics:
                        METRIC_FILE = INPUT_CLUSTERING_PREFIX+"."+str(NUMBER_OF_INPUT_CLUSTERING) + "." + METRIC+".gt"
                        if(Path(METRIC_FILE).is_file()):
                            with open(METRIC_FILE) as metric_f:
                                metric_v = float(metric_f.readline())
                                data_point[METRIC] = metric_v
                                f = True
                    if f:
                        #print(NUMBER_OF_INPUT_CLUSTERING, data_point)
                        data_points.append(data_point)
                else:
                    break
            
            for ALG in algs:
                if (DATA == "LFR") and (ALG == "lancichinetti-fortunato"):
                    continue
                else:
                    OUTPUT_DIR_NAME=ALG + "." + DATASET_NAME + ".n" + N + ".mu" + MU
                    OUTPUT_DIR = RESULT_DIR + "/" + OUTPUT_DIR_NAME
                    OUTPUT_PREFIX = OUTPUT_DIR + "/" + FILE_PREFIX
                    SOLN=0
                    data_point = {
                        "data": DATA,
                        "run": it,
                        "mu": float(mu)/10.0,
                        "n": int(N),
                        "input": None,
                        "cons": ALG,
                        "vi": None,
                        "rand": None,
                        "split-join": None
                    }
                    #data_point["cons"] = ALG
                    f = False
                    for METRIC in metrics:
                        METRIC_FILE=OUTPUT_PREFIX+".soln-"+str(SOLN)+"."+METRIC+".gt"
                        if(Path(METRIC_FILE).is_file()):
                            with open(METRIC_FILE) as metric_f:
                                metric_v = float(metric_f.readline())
                                data_point[METRIC] = metric_v
                                f = True
                        else:
                            #print(METRIC_FILE, "file does not exist")
                            pass
                    if f:
                        data_points.append(data_point)
df = pd.DataFrame(data_points)
df.to_csv(RESULT_DIR + '/' + 'data.csv',  index=False)
#print(df)

## Group appropriate columns

In [33]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
from matplotlib.gridspec import GridSpec
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib import rc
import matplotlib.colors as mcolors
matplotlib.rcParams.update(matplotlib.rcParamsDefault)
plt.rcParams.update({
    "text.usetex": True,
    #"font.family": "Helvetica"
    "font.family": "Arial"
})
from pathlib import Path
from matplotlib.transforms import Bbox

mus = [1, 2, 3, 4, 5, 6, 7]
#mus = [3]
algs = ["v8-parallel", "kirkley-newman", "lancichinetti-fortunato"]
ns = [5000]
metrics = ["vi", "split-join", "rand"]

HOME = "/nfs/nfs2/home/mth"
PROJECT_DIR = HOME + "/Codes/graph-consensus-clustering"
DATA_DIR = PROJECT_DIR + "/test/data"
RESULT_DIR = PROJECT_DIR + "/test/experiment-results/benchmark-study"
DATASET_NAME="LFR-louvain-10"
N = str(ns[0])

data_points = []

for DATA in ["LFR-louvain", "LFR"]:
    for it in range(1,11):
        for n in ns:
            for mu in mus:
                mask = (df["data"] == DATA) & (df["run"] == it) & (df["n"] == n) & (df["mu"] == float(mu)/10.0) & (df["run"] == it) & (df["cons"].isnull())
                target_df = df[ mask ]
                if (target_df.empty == False):
                    avg_vi = target_df["vi"].mean()
                    avg_rand = target_df["rand"].mean()
                    avg_split_join = target_df["split-join"].mean()
                    data_point = {
                        "data": DATA,
                        "run": it,
                        "mu": float(mu)/10.0,
                        "n": int(N),
                        "method": "input",
                        "vi": avg_vi,
                        "rand": avg_rand,
                        "split-join": avg_split_join
                    }
                    #print(data_point)
                    data_points.append(data_point)

                for alg in algs:
                    mask = (df["data"] == DATA) & (df["run"] == it) & (df["n"] == n) & (df["mu"] == float(mu)/10.0) & (df["run"] == it) & (df["cons"] == alg)
                    target_df = df[ mask ]
                    if (target_df.empty == False):
                        #print(target_df)
                        avg_vi = target_df["vi"].mean()
                        avg_rand = target_df["rand"].mean()
                        avg_split_join = target_df["split-join"].mean()
                        data_point = {
                            "data": DATA,
                            "run": it,
                            "mu": float(mu)/10.0,
                            "n": int(N),
                            "method": alg,
                            "vi": avg_vi,
                            "rand": avg_rand,
                            "split-join": avg_split_join
                        }
                        #print(data_point)
                        data_points.append(data_point)

summarized_df = pd.DataFrame(data_points)
summarized_df.to_csv(RESULT_DIR + '/' + 'summarized-data.csv',  index=False)

In [18]:
import numpy as np
import scipy as sp
from scipy import sparse
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
from matplotlib.gridspec import GridSpec
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib import rc
import matplotlib.colors as mcolors
matplotlib.rcParams.update(matplotlib.rcParamsDefault)
plt.rcParams.update({
    "text.usetex": True,
    #"font.family": "Helvetica"
    "font.family": "Arial"
})
from pathlib import Path
from matplotlib.transforms import Bbox

mus = [1, 2, 3, 4, 5, 6, 7]
#mus = [3]
algs = ["input", "v8-parallel", "kirkley-newman", "lancichinetti-fortunato"]
ns = [5000]
metrics = ["vi", "split-join", "rand"]

HOME = "/nfs/nfs2/home/mth"
PROJECT_DIR = HOME + "/Codes/graph-consensus-clustering"
DATA_DIR = PROJECT_DIR + "/test/data"
RESULT_DIR = PROJECT_DIR + "/test/experiment-results/benchmark-study"
DATASET_NAME="LFR-louvain"
N = str(ns[0])

fig = plt.figure(figsize=(9, 2.5))
naxr = 1
naxc = 3
gs = GridSpec(nrows=naxr, ncols=naxc)
axes = []
for i in range(naxr):
    axr = []
    for j in range(naxc):
        axr.append(fig.add_subplot(gs[i,j]))
    axes.append(axr)
#print(type(axes), type(axes[0]))

#df = pd.read_csv(RESULT_DIR + '/' + 'summarized-data.csv')

#print(df)

for i in range(naxr):
    for j in range(naxc):
        idx = (i * naxr + j)
        METRIC = metrics[j]
        #N = str(ns[idx])
        for method in algs:
            x = []
            y_avg = []
            y_max = []
            y_min = []
            df = None
            mask = None
            for mu in mus:
                if method == "input":
                    df = pd.read_csv(RESULT_DIR + '/' + 'data.csv')
                    if DATASET_NAME == "LFR":
                        mask = (df["data"] == DATASET_NAME) & (df["mu"] == float(mu)/10.0) & (df["input"] != 14.0)
                    else: 
                        mask = (df["data"] == DATASET_NAME) & (df["mu"] == float(mu)/10.0)
                else:
                    df = pd.read_csv(RESULT_DIR + '/' + 'summarized-data.csv')
                    mask = (df["data"] == DATASET_NAME) & (df["mu"] == float(mu)/10.0) & (df["method"] == method)
                target_df = df[mask]
                x.append(float(mu)/10.0)
                y_avg.append( target_df[METRIC].mean() )
                y_min.append( target_df[METRIC].min() )
                y_max.append( target_df[METRIC].max() )
                
                if(METRIC == "rand") and (mu == 1) and (method == "input"):
                    print(target_df[target_df[METRIC]==target_df[METRIC].min()])
            y_avg = np.array(y_avg)
            y_min = np.array(y_min)
            y_max = np.array(y_max)
            x = np.array(x)
            
            err_lo = np.absolute(y_avg - y_min)
            err_hi = np.absolute(y_max - y_avg)
            #print("y_avg", y_avg)
            #print("y_min", y_min)
            #print("y_max", y_max)
            #print("err_lo", err_lo)
            #print("err_hi", err_hi)
            #print("---")
            lbl = method
            clr = None
            if lbl=="v8-parallel":
                lbl = "median-consensus"
                clr = "#990000" # IU Crimson
            elif lbl=="lancichinetti-fortunato":
                clr = "tab:blue"
                lbl = "lf-consensus"
            elif lbl=="kirkley-newman":
                clr = "tab:orange"
            elif lbl == "input":
                clr = "#243142"
            
            if( DATASET_NAME == "LFR" and method == "lancichinetti-fortunato"):
                pass
            else:
                if method == "input":
                    if j == 0:
                        plot, caps, bars = axes[i][j].errorbar(x,y_avg, yerr=[err_lo, err_hi], label=lbl, marker=".", color=clr, capsize=8)
                    else:
                        plot, caps, bars = axes[i][j].errorbar(x,y_avg, yerr=[err_lo, err_hi], marker=".", color=clr, capsize=8)
                else:
                    if j == 0:
                        plot, caps, bars = axes[i][j].errorbar(x,y_avg, yerr=[err_lo, err_hi], label=lbl, marker=".", color=clr, capsize=5)
                        #caps[0].set_marker('_')
                        #caps[1].set_marker('_')
                        #axes[i][j].plot(x,y_avg, label=lbl, marker="^", color=clr)
                        #axes[i][j].fill_between(x, y_avg-err_lo, y_avg+err_hi, facecolor=clr)
                    else:
                        plot, caps, bars = axes[i][j].errorbar(x,y_avg, yerr=[err_lo, err_hi], marker=".", color=clr, capsize=5)
                        #caps[0].set_marker('_')
                        #caps[1].set_marker('_')
                        #axes[i][j].plot(x,y_avg, label=lbl, marker="^", color=clr)
                        #axes[i][j].fill_between(x, y_avg-err_lo, y_avg+err_hi, facecolor=clr)
        
#         df = pd.read_csv(RESULT_DIR + '/' + 'data.csv')
#         df = df[df["data"] == DATASET_NAME]
#         stat = {}
#         for mu in mus:
            
#         axes[i][j].scatter(df["mu"], df[METRIC], marker="o", facecolor="none", color="#243142", linewidth=0.25)
                
        axes[i][j].minorticks_on()
        #axes[i][j].tick_params(axis='y', which='minor')
        #axes[i][j].yaxis.set_minor_formatter(FormatStrFormatter("%.3f"))
        axes[i][j].grid(which="major", axis="both")
        #axes[i][j].legend(title="consensus:")
        axes[i][j].set_xlabel("$\mu$")
        if METRIC == "vi":
            axes[i][j].set_ylabel("variation of information distance")
        elif METRIC == "rand":
            axes[i][j].set_ylabel("rand distance")
            axes[i][j].set_yscale("log", base=2)
        elif METRIC == "split-join":
            axes[i][j].set_ylabel("split-join distance")
        
fig.legend(loc='upper center', bbox_to_anchor=(0.5, 1.1), ncol=4, fancybox=True, shadow=True)        
        
plt.tight_layout()
plot_fname = DATASET_NAME+"_n"+str(N)+".pdf"
plt.savefig(plot_fname, bbox_inches= Bbox([[0.14, 0.15], [8.87, 2.75]]))

plt.close()

             data  run   mu     n  input         cons        vi      rand  \
2888  LFR-louvain    3  0.1  5000    NaN  v8-parallel  0.105303  0.000927   

      split-join  
2888      0.0206  


# Cluster size distributions

In [7]:
import numpy as np
import scipy as sp
from scipy import sparse
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
from matplotlib.gridspec import GridSpec
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib import rc
import matplotlib.colors as mcolors
matplotlib.rcParams.update(matplotlib.rcParamsDefault)
plt.rcParams.update({
    "text.usetex": True,
    #"font.family": "Helvetica"
    "font.family": "Arial"
})
from pathlib import Path

mus = [1, 2, 3, 4, 5, 6, 7]
algs = ["v8-parallel"]
ns = [5000]

HOME = "/nfs/nfs2/home/mth"
PROJECT_DIR = HOME + "/Codes/graph-consensus-clustering"
DATA_DIR = PROJECT_DIR + "/test/data"
RESULT_DIR = PROJECT_DIR + "/test/experiment-results/benchmark-study"
DATASET_NAME="LFR-1"
N = str(ns[0])

fig = plt.figure(figsize=(14, 2.3))
naxr = 1
naxc = len(mus)
gs = GridSpec(nrows=naxr, ncols=naxc)
axes = []
for i in range(naxr):
    axr = []
    for j in range(naxc):
        axr.append(fig.add_subplot(gs[i,j]))
    axes.append(axr)

#print(type(axes), type(axes[0]))

for i in range(naxr):
    for j in range(naxc):
        idx = (i * naxr + j)
        #N = str(ns[idx])
        input_x = []
        input_y = []
        consensus_stats = []
        
        MU = "0"+str(mus[j])
        FILE_PREFIX = "LFR" + "_n" + N +"_mu" + MU + "_gamma30_beta11"
        
        INPUT_DIR = DATA_DIR + "/" + DATASET_NAME + "/" + "n" + N
        INPUT_CLUSTERING_PREFIX = INPUT_DIR + "/" + FILE_PREFIX
        GT_FILE = INPUT_CLUSTERING_PREFIX + ".gt"
        
        gt_clust_lst = read_clust_lst(GT_FILE)
        size_count = {}
        for lst in gt_clust_lst:
            if len(lst) in size_count.keys():
                size_count[len(lst)] = size_count[len(lst)] + 1
            else:
                size_count[len(lst)] = 1
        axes[i][j].scatter(size_count.keys(), size_count.values(),marker="^", facecolor="tab:gray", color="tab:gray", linewidth=0.25, label=str(len(gt_clust_lst)) + " clusters" )

        ALG = "v8-parallel"
        OUTPUT_DIR_NAME=ALG + "." + DATASET_NAME + ".n" + N + ".mu" + MU
        OUTPUT_DIR = RESULT_DIR + "/" + OUTPUT_DIR_NAME
        OUTPUT_PREFIX = OUTPUT_DIR + "/" + FILE_PREFIX
        SOLN=0
        CONS_FILE=OUTPUT_PREFIX+".soln-"+str(SOLN)
        
        cons_clust_lst = read_clust_lst(CONS_FILE)
        size_count = {}
        for lst in cons_clust_lst:
            if len(lst) in size_count.keys():
                size_count[len(lst)] = size_count[len(lst)] + 1
            else:
                size_count[len(lst)] = 1
        axes[i][j].scatter(size_count.keys(), size_count.values(),marker="o", facecolor="none", color="#990000", linewidth=0.5, label=str(len(cons_clust_lst)) + " clusters"  )
        
        axes[i][j].set_yscale("log", base=2)
        axes[i][j].set_xscale("log", base=2)
        axes[i][j].minorticks_on()
        axes[i][j].grid(which="major", axis="both")
        axes[i][j].set_title("$\mu="+str(mus[j]/10.0)+"$")
        axes[i][j].legend()

plt.tight_layout()
plot_fname = DATASET_NAME+"_n"+str(N)+"_sizes.pdf"
print(plot_fname)
plt.savefig(plot_fname)

plt.close()

LFR-1_n5000_sizes.pdf


# Median quality evaluation on benchmark graphs

In [3]:
import numpy as np
import scipy as sp
from scipy import sparse
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib import rc
import matplotlib.colors as mcolors
matplotlib.rcParams.update(matplotlib.rcParamsDefault)
plt.rcParams.update({
    "text.usetex": True,
    #"font.family": "Helvetica"
    "font.family": "Arial"
})
from pathlib import Path
import seaborn as sns
sns.color_palette("tab10")
from matplotlib.transforms import Bbox

#plt.rc('font', size=15)          # controls default text sizes

mus = [1, 2, 3, 4, 5, 6, 7]
algs = ["v8-parallel", "kirkley-newman", "boem", "lancichinetti-fortunato"]
#algs = ["v8-parallel", "kirkley-newman", "boem",]
#ns = [1000, 5000]
ns = [5000]
metrics = ["vi", "split-join", "rand"]
#metrics = ["rand"]

HOME = "/nfs/nfs2/home/mth"
PROJECT_DIR = HOME + "/Codes/graph-consensus-clustering"
DATA_DIR = PROJECT_DIR + "/test/data"
RESULT_DIR = PROJECT_DIR + "/test/experiment-results/benchmark-study"
DATASET_NAME="LFR-louvain"


fig = plt.figure(figsize=(9, 3))
naxr = 1
naxc = 3
gs = GridSpec(nrows=naxr, ncols=naxc)
axes = []
for i in range(naxr):
    axr = []
    for j in range(naxc):
        axr.append(fig.add_subplot(gs[i,j]))
    axes.append(axr)
#print(type(axes), type(axes[0]))

for i in range(naxr):
    N = str(ns[i])
    for j in range(naxc):
        METRIC = metrics[j]
        idx = (i * naxr + j)
        input_x = []
        input_y = []
        consensus_stats = []
        for mu in mus:
            MU = "0"+str(mu)
            FILE_PREFIX = "LFR" + "_n" + N +"_mu" + MU + "_gamma30_beta11"
            INPUT_DIR = DATA_DIR + "/" + DATASET_NAME + "/" + "n" + N
            INPUT_CLUSTERING_PREFIX = INPUT_DIR + "/" + FILE_PREFIX
            
            for ALG in algs:
                OUTPUT_DIR_NAME=ALG + "." + DATASET_NAME + ".n" + N + ".mu" + MU
                OUTPUT_DIR = RESULT_DIR + "/" + OUTPUT_DIR_NAME
                OUTPUT_PREFIX = OUTPUT_DIR + "/" + FILE_PREFIX
                SOLN=0 # Consider the 0th solution to be the best consensus solution
                METRIC_FILE=OUTPUT_PREFIX+".soln-"+str(SOLN)+"."+METRIC
                #stat = {"DATASET_NAME": DATASET_NAME, "ALG": ALG, "N": int(N), "MU": float(MU)/10.0}
                
                # Iterate over all possible input partition numbers
                for K in range(100):
                    METRIC_FILE=OUTPUT_PREFIX+".soln-"+str(SOLN)+"."+METRIC+"."+str(K)
                    if(Path(METRIC_FILE).is_file()):
                        with open(METRIC_FILE) as metricf:
                            lines = metricf.readlines()
                            #metric_v = float(metric_f.readline())
                            #stat["METRIC"] = metric_v
                            #consensus_stats.append(stat)
                            if len(lines) > 0:
                                values = []
                                for line in lines:
                                    stat = {"DATASET_NAME": DATASET_NAME, "ALG": ALG, "N": int(N), "MU": float(MU)/10.0}
                                    if ALG == "v8-parallel":
                                        stat["ALG"] = "median-consensus"
                                    if ALG == "lancichinetti-fortunato":
                                        stat["ALG"] = "lf-consensus"
                                    stat["METRIC"] = METRIC
                                    stat["DIST"] = float(line)
                                    consensus_stats.append(stat)
                    else:
                        #print(METRIC_FILE, "file does not exist")
                        break
            
            
        df = pd.DataFrame(consensus_stats)
        #print(df)
        legendyes = False
        if j == 0:
            legendyes = True
        sns.boxplot(x="MU", y="DIST", hue="ALG", data=df, ax=axes[i][j],
                    linewidth=0.5,
                    showfliers=False, 
                    showmeans=True,
                    meanprops={'marker':'o',
                       'markerfacecolor':'cyan', 
                       'markeredgecolor':'black',
                       'markeredgewidth': 0.5,
                       'markersize':'3'},
                    legend = legendyes
                   )
            
        for m in range(len(mus)):
            x = mus[m]
            axes[i][j].axvline(x-0.5, color = 'red', linestyle='-', linewidth=0.6)
        
        #for i, line in enumerate(median_lines):
        #    line.set_color(median_colors[i % len(median_colors)])

        #axes[i][j].minorticks_on()
#         if j == 0:
#             axes[i][j].legend(title="consensus:")
        axes[i][j].set_xlabel("$\mu$")
        axes[i][j].grid(which="major", axis="y")
        if METRIC == "vi":
            axes[i][j].set_ylabel("variation of information distance")
        elif METRIC == "split-join":
            axes[i][j].set_ylabel("split-join distance")
        elif METRIC == "rand":
            axes[i][j].set_ylabel("rand distance")
            #axes[i][j].set_yscale("log", base=2)
            #axes[i][j].set_xscale("log", base=2)
        #axes[i][j].set_title("n="+N)

fig.legend(loc='upper center', bbox_to_anchor=(0.5, 1.07), ncol=4, fancybox=True, shadow=True)

for i in range(naxr):
    for j in range(naxc):
        lg = axes[i][j].get_legend()
        if lg is not None:
            lg.remove()
        
plt.tight_layout()
plot_fname = DATASET_NAME+"_n"+str(N)+"_median.pdf"
plt.savefig(plot_fname, bbox_inches= Bbox([[0.14, 0.15], [8.87, 3.17]]))
#plt.savefig(plot_fname)

plt.close()

# Runtime evaluation

In [57]:
import numpy as np
import scipy as sp
from scipy import sparse
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib import rc
import matplotlib.colors as mcolors
matplotlib.rcParams.update(matplotlib.rcParamsDefault)
plt.rcParams.update({
    "text.usetex": True,
    #"font.family": "Helvetica"
    "font.family": "Arial"
})
from pathlib import Path
import math
import seaborn as sns

mus = [4]
algs = ["v8", "v8-parallel", "boem", "lancichinetti-fortunato", "kirkley-newman"]

runtime_data = [
    {"n": 200, "mu": 0.4, "alg": "v8", "runtime": 0.009201},
    {"n": 1000, "mu": 0.4, "alg": "v8", "runtime": 0.071906},
    {"n": 5000, "mu": 0.4, "alg": "v8", "runtime": 0.809469},
    {"n": 25000, "mu": 0.4, "alg": "v8", "runtime": 5.245938},
    {"n": 125000, "mu": 0.4, "alg": "v8", "runtime": 26.215312},
    {"n": 200, "mu": 0.4, "alg": "v8-parallel", "runtime": 0.005011},
    {"n": 1000, "mu": 0.4, "alg": "v8-parallel", "runtime": 0.014268},
    {"n": 5000, "mu": 0.4, "alg": "v8-parallel", "runtime": 0.108672},
    {"n": 25000, "mu": 0.4, "alg": "v8-parallel", "runtime": 0.430918},
    {"n": 125000, "mu": 0.4, "alg": "v8-parallel", "runtime": 2.034328},
    {"n": 200, "mu": 0.4, "alg": "boem", "runtime": 0.004191},
    {"n": 1000, "mu": 0.4, "alg": "boem", "runtime": 0.055053},
    {"n": 5000, "mu": 0.4, "alg": "boem", "runtime": 1.236402},
    {"n": 25000, "mu": 0.4, "alg": "boem", "runtime": 28.491077},
    {"n": 200, "mu": 0.4, "alg": "lancichinetti-fortunato", "runtime": 1.6746854782104492},
    {"n": 1000, "mu": 0.4, "alg": "lancichinetti-fortunato", "runtime": 10.075507640838623},
    {"n": 5000, "mu": 0.4, "alg": "lancichinetti-fortunato", "runtime": 133.313547372818},
    {"n": 25000, "mu": 0.4, "alg": "lancichinetti-fortunato", "runtime": 982.6499273777008},
    {"n": 200, "mu": 0.4, "alg": "kirkley-newman", "runtime": 1.0153427124023438},
    {"n": 1000, "mu": 0.4, "alg": "kirkley-newman", "runtime": 0.8788154125213623},
    {"n": 5000, "mu": 0.4, "alg": "kirkley-newman", "runtime": 0.9371051788330078},
    {"n": 25000, "mu": 0.4, "alg": "kirkley-newman", "runtime": 1.7962682247161865},
    {"n": 125000, "mu": 0.4, "alg": "kirkley-newman", "runtime": 6.556192636489868}
]

df = pd.DataFrame(runtime_data)

fig = plt.figure(figsize=(3, 2.5))
naxr = 1
naxc = 1
gs = GridSpec(nrows=naxr, ncols=naxc)
axes = []
for i in range(naxr):
    axr = []
    for j in range(naxc):
        axr.append(fig.add_subplot(gs[i,j]))
    axes.append(axr)

for i in range(naxr):
    for j in range(naxc):
        idx = (i * naxr + j)
        mu = mus[idx]
        for alg in algs:
            target_df = df[df["alg"] == alg]
            lbl = alg
            if alg == "v8":
                lbl = "median-consensus\n(sequential)"
            elif alg == "v8-parallel":
                lbl = "median-consensus\n(parallel, 64 cores)"
            elif alg == "lancichinetti-fortunato":
                lbl = "lf-consensus"
            axes[i][j].plot(target_df["n"], target_df["runtime"], label=lbl, marker="o")
        
#         for cplx in ["n", "nlgn", "n2", "n3"]:
#             xs = []
#             ys = []
#             #for x in [1, 5, 25, 125, 625, 1000, 5000, 25000, 125000]:
#             for x in target_df["n"]:
#                 xs.append(x)
#                 if cplx == "n":
#                     ys.append(x)
#                 elif cplx == "nlgn":
#                     ys.append(math.log(x, 2))
#                 elif cplx == "n2":
#                     ys.append(x*x)
#                 elif cplx == "n3":
#                     ys.append(x*x*x)
#             axes[i][j].plot(xs, ys, color="black", linestyle="dashed")
                
        
        axes[i][j].set_xscale("log", base=2)
        axes[i][j].set_yscale("log", base=2)
        axes[i][j].minorticks_on()
        axes[i][j].grid(which="major", axis="both")
        #axes[i][j].legend()
        
        axes[i][j].set_xlabel("n")
        axes[i][j].set_ylabel("runtime (s)")

fig.legend(loc='right', bbox_to_anchor=(1.55, 0.50), ncol=1, fancybox=True, shadow=True)
        
plt.tight_layout()
#plt.savefig("benchmark-runtime.pdf")
plt.savefig("benchmark-runtime.pdf", bbox_inches= Bbox([[0.12, 0.15], [4.7, 2.4]]))

plt.close()

## Strong scaling of Samusik_all

In [4]:
import numpy as np
import scipy as sp
from scipy import sparse
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib import rc
import matplotlib.colors as mcolors
matplotlib.rcParams.update(matplotlib.rcParamsDefault)
plt.rcParams.update({
    "text.usetex": True,
    #"font.family": "Helvetica"
    "font.family": "Arial"
})
from pathlib import Path
import seaborn as sns

mus = [4]
algs = ["v8-parallel"]

runtime_data = [
    {"p": 1, "alg": "v8-parallel", "runtime": 7985.998796, "speedup": 1},
    {"p": 4, "alg": "v8-parallel", "runtime": 2220.056281, "speedup": 3.59},
    {"p": 16, "alg": "v8-parallel", "runtime": 672.626491, "speedup": 11.872},
    {"p": 64, "alg": "v8-parallel", "runtime": 225.039627, "speedup": 35.487},
#     {"p": 128, "alg": "v8-parallel", "runtime": 261.293097}
]

df = pd.DataFrame(runtime_data)

fig = plt.figure(figsize=(3, 2.5))
naxr = 1
naxc = 1
gs = GridSpec(nrows=naxr, ncols=naxc)
axes = []
for i in range(naxr):
    axr = []
    for j in range(naxc):
        axr.append(fig.add_subplot(gs[i,j]))
    axes.append(axr)

for i in range(naxr):
    for j in range(naxc):
        idx = (i * naxr + j)
        #mu = mus[idx]
        for alg in algs:
            target_df = df[df["alg"] == alg]
            axes[i][j].plot(target_df["p"], target_df["runtime"], label="median-consensus", marker="o", color="tab:orange")
        
        axes[i][j].set_xscale("log", base=2)
        axes[i][j].set_yscale("log", base=2)
        axes[i][j].minorticks_on()
        axes[i][j].grid(which="major", axis="both")
        axes[i][j].legend()
        
        axes[i][j].set_xlabel("number of cores")
        axes[i][j].set_ylabel("runtime (s)", color="tab:orange")
        
        ax2 = axes[i][j].twinx()
        ax2.set_yscale("log", base=2)
        ax2.plot(target_df["p"], target_df["speedup"], marker="o", color="tab:blue")
        ax2.set_ylabel('speedup', color="tab:blue")  # we already handled the x-label with ax1
        #ax2.plot(t, data2, color=color)
        
plt.tight_layout()
plt.savefig("samusik_all-scaling.pdf")
plt.close()