### Distance distribution of candidate methods from experiments on benchmark networks

In [None]:
# NOTE: this whole cell is disabled by wrapping it in a string literal; nothing
# inside the literal is executed. The literal is raw (r-prefix) because the
# body contains "\mu", which is an invalid escape sequence in a normal string
# literal and makes recent Python versions emit a SyntaxWarning. The string
# value itself is unchanged by the prefix.
#
# Issues visible in the disabled code, to fix before re-enabling it:
#   * the frame is always filtered with df["n"] == 200, ignoring the loop
#     variable n, so every per-n PDF would show the same data;
#   * the CSV file is re-read on every iteration of the n loop;
#   * idx = i * naxr + j is only a valid flat index because naxr == naxc;
#   * the "best_candidate" branch is never taken because that name is not in
#     consensus_methods.
r"""
import networkx as nx
import networkx.algorithms.community as nx_comm
from networkx.generators.community import LFR_benchmark_graph
from networkx.algorithms import bipartite
import numpy as np
import scipy as sp
from scipy import sparse
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib import rc
import matplotlib.colors as mcolors
matplotlib.rcParams.update(matplotlib.rcParamsDefault)
plt.rcParams.update({
    "text.usetex": True,
    #"font.family": "Helvetica"
    "font.family": "Arial"
})

def plot_distance_distribution(data, ax, colors, distribution_labels, ylabel, title):
    # https://stackoverflow.com/questions/26291479/changing-the-color-of-matplotlibs-violin-plots
    violin_parts = ax.violinplot(data, showmeans=True, showmedians=True, points=20)

    for k in range(len(data)):
        violin_parts["bodies"][k].set_facecolor(colors[k])
        violin_parts["bodies"][k].set_edgecolor("black")
    violin_parts["cmeans"].set_color("magenta")
    violin_parts["cmedians"].set_color("aqua")
    violin_parts["cbars"].set_color("gray")
    violin_parts["cmaxes"].set_color("lightgray")
    violin_parts["cmins"].set_color("lightgray")

    ax.xaxis.set_ticks(range(1, len(data)+1))
    if (distribution_labels is not None) and (len(distribution_labels) == len(data)):
        ax.xaxis.set_ticklabels(distribution_labels)
    else:
        ax.xaxis.set_ticklabels([])
    
    if ylabel is not None:
        ax.set_ylabel(ylabel)
    
    if title is not None:
        ax.set_title(title)

distance_metrics = ["split_joint_distance", "mirkin_distance", "variation_of_info_distance"]

consensus_methods = ["mcla", "hbgf", "nmf", "boem", "v8"]
consensus_methods_colors = ["tab:blue", "tab:orange", "tab:green", "tab:red", "tab:brown"]

mus = [1, 2, 3, 4]
ns = [200, 1000, 5000]
for n in ns:
    df = pd.read_csv("benchmark-consensus-distance-stats.csv")
    df = df[df["n"] == 200]
    for distance_metric in distance_metrics:
        naxr = 2 
        naxc = 2

        fig = plt.figure(figsize=(6, 6))
        gs = GridSpec(nrows=naxr, ncols=naxc)

        axes = []
        for i in range(naxr):
            axr = []
            for j in range(naxc):
                axr.append(fig.add_subplot(gs[i,j]))
            axes.append(axr)
        
        for i in range(naxr):
            for j in range(naxc):
                idx = (i * naxr + j)
                mu = mus[idx]
                data = []
                
                for k in range(len(consensus_methods)):
                    cons_method = consensus_methods[k]
                    mask = None
                    if cons_method == "best_candidate":
                        mask = (df["mu"] == mu) & (df["distance_metric"] == distance_metric) & (df["cons_method"] == cons_method) & (df["optimized_distance"] == distance_metric)
                    else:
                        mask = (df["mu"] == mu) & (df["distance_metric"] == distance_metric) & (df["cons_method"] == cons_method) & (df["optimized_distance"] == "none")
                    df_target = df[mask]
                    data.append(df_target["distance"])
                
                plot_distance_distribution(data, axes[i][j], consensus_methods_colors, consensus_methods, distance_metric, "$\mu:" + str(mu*1.0/10.0)+"$")
        
        plt.tight_layout()
        plt.savefig("benchmark-distance-distribution-"+ "n"+str(n)+ "-" + distance_metric +".pdf")
        """

In [None]:
from pathlib import Path

def plot_distance_distribution(data, ax, colors, distribution_labels, ylabel, title):
    """Draw a violin plot of ``data`` on ``ax`` and apply the fixed colour scheme.

    Args:
        data: Datasets handed unchanged to ``ax.violinplot``; ``len(data)``
            decides how many violin bodies are recoloured and how many x
            ticks are placed (at 1..len(data)).
        ax: Axes-like object providing ``violinplot``, ``xaxis``,
            ``set_ylabel`` and ``set_title``.
        colors: Indexable of face colours; ``colors[k]`` is used for the
            k-th violin body.
        distribution_labels: X tick labels. They are applied only when not
            ``None`` and of the same length as ``data``; otherwise the tick
            labels are cleared.
        ylabel: Y axis label, skipped when ``None``.
        title: Axes title, skipped when ``None``.
    """
    # Colouring recipe taken from:
    # https://stackoverflow.com/questions/26291479/changing-the-color-of-matplotlibs-violin-plots
    parts = ax.violinplot(data, showmeans=True, showmedians=True, points=20)
    n_series = len(data)

    # Each violin body gets its own face colour and a black outline.
    for position in range(n_series):
        body = parts["bodies"][position]
        body.set_facecolor(colors[position])
        body.set_edgecolor("black")

    # Summary-statistic line collections share a fixed palette.
    line_palette = {
        "cmeans": "magenta",
        "cmedians": "aqua",
        "cbars": "gray",
        "cmaxes": "lightgray",
        "cmins": "lightgray",
    }
    for part_name, shade in line_palette.items():
        parts[part_name].set_color(shade)

    ax.xaxis.set_ticks(range(1, n_series + 1))
    labels_usable = (
        distribution_labels is not None
        and len(distribution_labels) == n_series
    )
    ax.xaxis.set_ticklabels(distribution_labels if labels_usable else [])

    if ylabel is not None:
        ax.set_ylabel(ylabel)

    if title is not None:
        ax.set_title(title)
        
# Experiment grid: mu indices (file names encode them as "mu0<k>"), graph
# sizes, algorithm names and distance-metric file extensions.
mus = [1, 2, 3, 4]
ns = [200, 1000, 5000]
algs = ["v8", "v8-parallel", "boem", "saoem"]
distance_metrics = ["rand", "split-join", "vi"]

# For every (n, mu, algorithm, metric) combination, print whether the
# corresponding result file exists, followed by its path.
for n in ns:
    directory = f"data/LFR/n{n}/"
    for mu in mus:
        fileprefix = f"LFR_n{n}_mu0{mu}_gamma30_beta11"
        for alg in algs:
            for distance_metric in distance_metrics:
                filename = f"{directory}{fileprefix}.{alg}.{distance_metric}"
                p = Path(filename)
                print(p.exists(), filename)

# Preprocessing threshold parameter study

In [None]:
import numpy as np
import scipy as sp
from scipy import sparse
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib import rc
import matplotlib.colors as mcolors
# Reset every rcParam to matplotlib's built-in default before applying the
# overrides below.
matplotlib.rcParams.update(matplotlib.rcParamsDefault)
# Enable LaTeX text rendering and request "Arial" as the font family.
# NOTE(review): with text.usetex enabled matplotlib may not honour "Arial" and
# fall back to another family - confirm against the rendered PDF.
plt.rcParams.update({
    "text.usetex": True,
    #"font.family": "Helvetica"
    "font.family": "Arial"
})
from pathlib import Path

# Plot, for each graph size in `ns`, the number of components ("groups") as a
# function of the preprocessing threshold, with one curve per mu value. The
# input is one CSV per (n, mu) with "threshold" and "n_components" columns.
RESULT_DIR = "experiment-results"
OUTPUT_DIR = RESULT_DIR + "/" + "preprocess-study"
DATASET_NAME = "LFR-louvain"

mus = [1, 2, 3, 4, 5, 6, 7]
ns = [1000, 5000]

# One panel per entry of `ns`, laid out on a naxr x naxc grid.
fig = plt.figure(figsize=(10, 4))
naxr = 1
naxc = 2
gs = GridSpec(nrows=naxr, ncols=naxc)
axes = [[fig.add_subplot(gs[i, j]) for j in range(naxc)] for i in range(naxr)]

for i in range(naxr):
    for j in range(naxc):
        # Row-major flat index of the panel: the row stride is the number of
        # columns (the original used naxr, which only worked for naxr == 1).
        idx = i * naxc + j
        n = ns[idx]
        ax = axes[i][j]
        for mu in mus:
            csv_file = f"{OUTPUT_DIR}/{DATASET_NAME}.n{n}.mu0{mu}.csv"
            filepath = Path(csv_file)
            if not filepath.exists():
                # Report the gap instead of silently dropping the curve.
                print(csv_file, "file does not exist")
                continue
            df = pd.read_csv(csv_file)
            # The legend shows mu as a fraction (index / 10).
            ax.plot(df["threshold"], df["n_components"], label=str(float(mu)/10.0), marker=".")

        #ax.set_xlim(right=0.6)
        ax.minorticks_on()
        ax.grid(which="major", axis="both")
        # Raw string: "\m" is not a valid escape sequence in a normal literal.
        ax.legend(title=r"$\mu$")
        ax.set_xlabel("Threshold")
        ax.set_ylabel("Number of groups")
        ax.set_title("n="+str(n))

#plt.suptitle("Effect of preprocessing threshold on LFR")
plt.tight_layout()
plt.savefig("preprocess-study-louvain.pdf")
plt.close()

# Ground truth quality evaluation on benchmark graphs

In [None]:
import numpy as np
import scipy as sp
from scipy import sparse
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib import rc
import matplotlib.colors as mcolors
# Reset every rcParam to matplotlib's built-in default before applying the
# overrides below.
matplotlib.rcParams.update(matplotlib.rcParamsDefault)
# Enable LaTeX text rendering and request "Arial" as the font family.
# NOTE(review): with text.usetex enabled matplotlib may not honour "Arial" and
# fall back to another family - confirm against the rendered PDF.
plt.rcParams.update({
    "text.usetex": True,
    #"font.family": "Helvetica"
    "font.family": "Arial"
})
from pathlib import Path

# For each graph size in `ns`, plot the score of every consensus algorithm
# against mu (one line per algorithm) and overlay the scores of the individual
# input clusterings as hollow circles. All scores are read from ".ami" files,
# one float on the first line; the "NMI" names below are kept as they were even
# though the files and the axis label say AMI.
mus = [1, 2, 3, 4, 5, 6, 7]
algs = ["v8-parallel", "kirkley-newman", "lancichinetti-fortunato"]
ns = [1000, 5000]

HOME = "/nfs/nfs2/home/mth"
PROJECT_DIR = HOME + "/Codes/graph-consensus-clustering"
DATA_DIR = PROJECT_DIR + "/test/data"
RESULT_DIR = PROJECT_DIR + "/test/experiment-results/benchmark-study"
DATASET_NAME = "LFR"

# Fixed column set, so the frame can still be filtered by "ALG" when no result
# file was found (a frame built from an empty list has no columns at all).
STAT_COLUMNS = ["DATASET_NAME", "ALG", "N", "MU", "NMI"]

fig = plt.figure(figsize=(10, 4))
naxr = 1
naxc = 2
gs = GridSpec(nrows=naxr, ncols=naxc)
axes = [[fig.add_subplot(gs[i, j]) for j in range(naxc)] for i in range(naxr)]

for i in range(naxr):
    for j in range(naxc):
        # Row-major flat index of the panel: the row stride is the number of
        # columns (the original used naxr, which only worked for naxr == 1).
        idx = i * naxc + j
        N = str(ns[idx])
        ax = axes[i][j]
        input_x = []
        input_y = []
        consensus_stats = []
        for mu in mus:
            MU = "0"+str(mu)
            FILE_PREFIX = "LFR" + "_n" + N + "_mu" + MU + "_gamma30_beta11"
            INPUT_DIR = DATA_DIR + "/" + DATASET_NAME + "/" + "n" + N
            INPUT_CLUSTERING_PREFIX = INPUT_DIR + "/" + FILE_PREFIX

            # Input clusterings are numbered from 0; stop at the first missing
            # index (at most 100 are scanned).
            for NUMBER_OF_INPUT_CLUSTERING in range(0, 100):
                clustering_file = INPUT_CLUSTERING_PREFIX + "." + str(NUMBER_OF_INPUT_CLUSTERING)
                if not Path(clustering_file).is_file():
                    break
                NMI_FILE = clustering_file + ".ami"
                # A clustering without a score file is skipped, not an error.
                if Path(NMI_FILE).is_file():
                    with open(NMI_FILE) as nmif:
                        nmi = float(nmif.readline())
                    input_x.append(float(mu)/10.0)
                    input_y.append(nmi)

            # Score of solution 0 of every consensus algorithm for this mu.
            for ALG in algs:
                OUTPUT_DIR_NAME = ALG + "." + DATASET_NAME + ".n" + N + ".mu" + MU
                OUTPUT_DIR = RESULT_DIR + "/" + OUTPUT_DIR_NAME
                OUTPUT_PREFIX = OUTPUT_DIR + "/" + FILE_PREFIX
                SOLN = 0
                NMI_FILE = OUTPUT_PREFIX + ".soln-" + str(SOLN) + ".ami"
                if not Path(NMI_FILE).is_file():
                    print(NMI_FILE, "file does not exist")
                    continue
                with open(NMI_FILE) as nmif:
                    nmi = float(nmif.readline())
                consensus_stats.append({
                    "DATASET_NAME": DATASET_NAME,
                    "ALG": ALG,
                    "N": int(N),
                    "MU": float(MU)/10.0,
                    "NMI": nmi,
                })

        df = pd.DataFrame(consensus_stats, columns=STAT_COLUMNS)
        for ALG in algs:
            target_df = df[df["ALG"] == ALG]
            if target_df.empty:
                continue
            lbl = ALG
            clr = None  # None lets matplotlib pick the next colour in its cycle
            if ALG == "v8-parallel":
                lbl = "our-method"
                clr = "#990000" # IU Crimson
            ax.plot(target_df["MU"], target_df["NMI"], label=lbl, marker="x", color=clr)

        # Input clusterings: hollow markers, intentionally without a legend entry.
        ax.scatter(input_x, input_y, marker="o", facecolor="none", color="#243142", linewidth=0.25)
        ax.minorticks_on()
        ax.grid(which="major", axis="both")
        ax.legend(title="consensus:")
        # Raw string: "\m" is not a valid escape sequence in a normal literal.
        ax.set_xlabel(r"$\mu$")
        ax.set_ylabel("AMI")
        ax.set_title("n="+N)

plt.savefig("benchmark-quality-all.pdf")

plt.close()

# Median quality evaluation on benchmark graphs

In [None]:
import numpy as np
import scipy as sp
from scipy import sparse
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib import rc
import matplotlib.colors as mcolors
# Reset every rcParam to matplotlib's built-in default before applying the
# overrides below.
matplotlib.rcParams.update(matplotlib.rcParamsDefault)
# Enable LaTeX text rendering and request "Arial" as the font family.
# NOTE(review): with text.usetex enabled matplotlib may not honour "Arial" and
# fall back to another family - confirm against the rendered PDF.
plt.rcParams.update({
    "text.usetex": True,
    #"font.family": "Helvetica"
    "font.family": "Arial"
})
from pathlib import Path
import seaborn as sns

# Grid of box plots: one row per graph size in `ns`, one column per distance
# metric in `metrics`. Each panel shows, per mu, the distribution of the values
# stored (one float per line) in the metric file of solution 0 of every
# consensus algorithm.
plt.rc('font', size=15)          # controls default text sizes

mus = [1, 2, 3, 4, 5, 6, 7]
#algs = ["v8-parallel", "kirkley-newman", "boem", "lancichinetti-fortunato"]
algs = ["v8-parallel", "kirkley-newman", "boem",]
ns = [1000, 5000]
metrics = ["vi", "split-join", "rand"]
#metrics = ["rand"]

HOME = "/nfs/nfs2/home/mth"
PROJECT_DIR = HOME + "/Codes/graph-consensus-clustering"
DATA_DIR = PROJECT_DIR + "/test/data"
RESULT_DIR = PROJECT_DIR + "/test/experiment-results/benchmark-study"
DATASET_NAME = "LFR"

# Fixed column set, so an empty result list still yields the expected columns.
STAT_COLUMNS = ["DATASET_NAME", "ALG", "N", "MU", "METRIC", "DIST"]
# Y-axis label per metric file extension; other metrics get no label.
YLABELS = {
    "vi": "variation of information distance",
    "split-join": "split-join distance",
    "rand": "rand distance",
}

fig = plt.figure(figsize=(16, 10))
naxr = 2
naxc = 3
gs = GridSpec(nrows=naxr, ncols=naxc)
axes = [[fig.add_subplot(gs[i, j]) for j in range(naxc)] for i in range(naxr)]

for i in range(naxr):
    N = str(ns[i])
    for j in range(naxc):
        METRIC = metrics[j]
        ax = axes[i][j]
        consensus_stats = []
        for mu in mus:
            MU = "0"+str(mu)
            FILE_PREFIX = "LFR" + "_n" + N + "_mu" + MU + "_gamma30_beta11"

            for ALG in algs:
                OUTPUT_DIR_NAME = ALG + "." + DATASET_NAME + ".n" + N + ".mu" + MU
                OUTPUT_DIR = RESULT_DIR + "/" + OUTPUT_DIR_NAME
                OUTPUT_PREFIX = OUTPUT_DIR + "/" + FILE_PREFIX
                SOLN = 0
                METRIC_FILE = OUTPUT_PREFIX + ".soln-" + str(SOLN) + "." + METRIC
                if not Path(METRIC_FILE).is_file():
                    print(METRIC_FILE, "file does not exist")
                    continue
                with open(METRIC_FILE) as metricf:
                    for line in metricf:
                        # Skip blank lines (e.g. a trailing newline) instead
                        # of failing in float().
                        if not line.strip():
                            continue
                        consensus_stats.append({
                            "DATASET_NAME": DATASET_NAME,
                            # Displayed under a different name in the legend.
                            "ALG": "our method" if ALG == "v8-parallel" else ALG,
                            "N": int(N),
                            "MU": float(MU)/10.0,
                            "METRIC": METRIC,
                            "DIST": float(line),
                        })

        df = pd.DataFrame(consensus_stats, columns=STAT_COLUMNS)
        if df.empty:
            # Nothing to draw; leave the panel empty rather than crashing.
            print("no", METRIC, "values found for n=" + N)
        else:
            sns.boxplot(x="MU", y="DIST", hue="ALG", data=df, ax=ax,
                        linewidth=0.5,
                        showfliers=False,
                        showmeans=True,
                        meanprops={'marker':'o',
                           'markerfacecolor':'cyan',
                           'markeredgecolor':'black',
                           'markeredgewidth': 0.5,
                           'markersize':'5'})
            ax.legend(title="consensus:")

        # Vertical separators after each mu group. For mus = 1..7 these are the
        # same x positions the original computed as mu - 0.5.
        # NOTE(review): assumes the box groups sit at 0, 1, 2, ... - confirm.
        for position in range(len(mus)):
            ax.axvline(position + 0.5, color='red', linestyle='-', linewidth=0.6)

        #ax.minorticks_on()
        ax.grid(which="major", axis="y")
        # Raw string: "\m" is not a valid escape sequence in a normal literal.
        ax.set_xlabel(r"$\mu$")
        if METRIC in YLABELS:
            ax.set_ylabel(YLABELS[METRIC])
        ax.set_title("n="+N)

plt.tight_layout()
plt.savefig("benchmark-median-all.pdf")
plt.close()

# Runtime evaluation

In [None]:
import numpy as np
import scipy as sp
from scipy import sparse
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib import rc
import matplotlib.colors as mcolors
# Reset every rcParam to matplotlib's built-in default before applying the
# overrides below.
matplotlib.rcParams.update(matplotlib.rcParamsDefault)
# Enable LaTeX text rendering and request "Arial" as the font family.
# NOTE(review): with text.usetex enabled matplotlib may not honour "Arial" and
# fall back to another family - confirm against the rendered PDF.
plt.rcParams.update({
    "text.usetex": True,
    #"font.family": "Helvetica"
    "font.family": "Arial"
})
from pathlib import Path
import seaborn as sns

# Log-log plot (base 2 on both axes) of runtime against graph size n, with one
# line per algorithm, built from the hard-coded measurements below.
mus = [4]
algs = ["v8", "v8-parallel", "boem"]

# (n, runtime) pairs per algorithm; every record carries mu = 0.4.
# "boem" has no entry for n = 125000.
_runs_by_alg = {
    "v8": [
        (200, 0.005842),
        (1000, 0.070531),
        (5000, 0.500963),
        (25000, 3.306961),
        (125000, 44.137992),
    ],
    "v8-parallel": [
        (200, 0.045735),
        (1000, 0.034897),
        (5000, 0.165114),
        (25000, 0.912563),
        (125000, 8.836543),
    ],
    "boem": [
        (200, 0.003432),
        (1000, 0.064385),
        (5000, 1.490204),
        (25000, 41.420836),
    ],
}

# Flatten the table into one record per measurement, grouped by algorithm.
runtime_data = [
    {"n": size, "mu": 0.4, "alg": name, "runtime": elapsed}
    for name, runs in _runs_by_alg.items()
    for size, elapsed in runs
]

df = pd.DataFrame(runtime_data)

fig = plt.figure(figsize=(6, 4))
naxr = 1
naxc = 1
gs = GridSpec(nrows=naxr, ncols=naxc)
axes = [[fig.add_subplot(gs[r, c]) for c in range(naxc)] for r in range(naxr)]

for i, axes_row in enumerate(axes):
    for j, panel in enumerate(axes_row):
        idx = (i * naxr + j)
        mu = mus[idx]
        for alg in algs:
            target_df = df.loc[df["alg"] == alg]
            panel.plot(target_df["n"], target_df["runtime"], label=alg, marker="o")

        panel.set_xscale("log", base=2)
        panel.set_yscale("log", base=2)
        panel.minorticks_on()
        panel.grid(which="major", axis="both")
        panel.legend(title="consensus:")
        panel.set_xlabel("n")
        panel.set_ylabel("runtime")

plt.savefig("benchmark-runtime.pdf")
plt.close()