### Distance distribution of candidate methods from experiments on benchmark networks

In [None]:
# Disabled cell: the code below is kept inside a bare string literal, so none of
# it executes. The literal is raw (r-prefix) because the text contains "\mu",
# which is an invalid escape sequence in a normal string.
# Fixes applied to the disabled code so it is correct if it is ever re-enabled:
#   * the CSV is read once and filtered by the current n (it was hard-coded to 200,
#     so every per-n PDF would have shown the same data);
#   * the flat subplot index is row-major (i * naxc + j);
#   * the LaTeX title uses a raw string;
#   * each figure is closed after it has been saved.
r"""
import networkx as nx
import networkx.algorithms.community as nx_comm
from networkx.generators.community import LFR_benchmark_graph
from networkx.algorithms import bipartite
import numpy as np
import scipy as sp
from scipy import sparse
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib import rc
import matplotlib.colors as mcolors
matplotlib.rcParams.update(matplotlib.rcParamsDefault)
plt.rcParams.update({
    "text.usetex": True,
    #"font.family": "Helvetica"
    "font.family": "Arial"
})

def plot_distance_distribution(data, ax, colors, distribution_labels, ylabel, title):
    # https://stackoverflow.com/questions/26291479/changing-the-color-of-matplotlibs-violin-plots
    violin_parts = ax.violinplot(data, showmeans=True, showmedians=True, points=20)

    for k in range(len(data)):
        violin_parts["bodies"][k].set_facecolor(colors[k])
        violin_parts["bodies"][k].set_edgecolor("black")
    violin_parts["cmeans"].set_color("magenta")
    violin_parts["cmedians"].set_color("aqua")
    violin_parts["cbars"].set_color("gray")
    violin_parts["cmaxes"].set_color("lightgray")
    violin_parts["cmins"].set_color("lightgray")

    ax.xaxis.set_ticks(range(1, len(data)+1))
    if (distribution_labels is not None) and (len(distribution_labels) == len(data)):
        ax.xaxis.set_ticklabels(distribution_labels)
    else:
        ax.xaxis.set_ticklabels([])

    if ylabel is not None:
        ax.set_ylabel(ylabel)

    if title is not None:
        ax.set_title(title)

distance_metrics = ["split_joint_distance", "mirkin_distance", "variation_of_info_distance"]

consensus_methods = ["mcla", "hbgf", "nmf", "boem", "v8"]
consensus_methods_colors = ["tab:blue", "tab:orange", "tab:green", "tab:red", "tab:brown"]

mus = [1, 2, 3, 4]
ns = [200, 1000, 5000]
df_all = pd.read_csv("benchmark-consensus-distance-stats.csv")
for n in ns:
    df = df_all[df_all["n"] == n]
    for distance_metric in distance_metrics:
        naxr = 2
        naxc = 2

        fig = plt.figure(figsize=(6, 6))
        gs = GridSpec(nrows=naxr, ncols=naxc)

        axes = []
        for i in range(naxr):
            axr = []
            for j in range(naxc):
                axr.append(fig.add_subplot(gs[i,j]))
            axes.append(axr)

        for i in range(naxr):
            for j in range(naxc):
                idx = (i * naxc + j)
                mu = mus[idx]
                data = []

                for k in range(len(consensus_methods)):
                    cons_method = consensus_methods[k]
                    mask = None
                    if cons_method == "best_candidate":
                        mask = (df["mu"] == mu) & (df["distance_metric"] == distance_metric) & (df["cons_method"] == cons_method) & (df["optimized_distance"] == distance_metric)
                    else:
                        mask = (df["mu"] == mu) & (df["distance_metric"] == distance_metric) & (df["cons_method"] == cons_method) & (df["optimized_distance"] == "none")
                    df_target = df[mask]
                    data.append(df_target["distance"])

                plot_distance_distribution(data, axes[i][j], consensus_methods_colors, consensus_methods, distance_metric, r"$\mu:" + str(mu*1.0/10.0)+"$")

        plt.tight_layout()
        plt.savefig("benchmark-distance-distribution-"+ "n"+str(n)+ "-" + distance_metric +".pdf")
        plt.close(fig)
"""

In [None]:
from pathlib import Path

def plot_distance_distribution(data, ax, colors, distribution_labels, ylabel, title):
    """Draw one violin per distribution in ``data`` on ``ax`` and style the result.

    Args:
        data: Sequence of distributions; passed unchanged to ``ax.violinplot``.
        ax: Axes-like object providing ``violinplot``, ``xaxis``, ``set_ylabel``
            and ``set_title``.
        colors: Face colors for the violin bodies. When fewer colors than
            violins are given, the colors are reused cyclically; ``None`` or an
            empty sequence leaves the face colors untouched.
        distribution_labels: X tick labels. They are applied only when their
            count equals ``len(data)``; otherwise the tick labels are blanked.
        ylabel: Y-axis label, or ``None`` to leave the axis label unchanged.
        title: Axes title, or ``None`` to leave the title unchanged.
    """
    # https://stackoverflow.com/questions/26291479/changing-the-color-of-matplotlibs-violin-plots
    violin_parts = ax.violinplot(data, showmeans=True, showmedians=True, points=20)

    has_colors = colors is not None and len(colors) > 0
    for k, body in enumerate(violin_parts["bodies"]):
        if has_colors:
            # Wrap around instead of raising IndexError on a short color list.
            body.set_facecolor(colors[k % len(colors)])
        body.set_edgecolor("black")

    # Fixed colors for the summary lines (mean, median, whisker bar, extrema).
    summary_line_colors = {
        "cmeans": "magenta",
        "cmedians": "aqua",
        "cbars": "gray",
        "cmaxes": "lightgray",
        "cmins": "lightgray",
    }
    for part_name, part_color in summary_line_colors.items():
        violin_parts[part_name].set_color(part_color)

    # Ticks are placed at 1..len(data), one per distribution.
    ax.xaxis.set_ticks(range(1, len(data) + 1))
    if distribution_labels is not None and len(distribution_labels) == len(data):
        ax.xaxis.set_ticklabels(distribution_labels)
    else:
        ax.xaxis.set_ticklabels([])

    if ylabel is not None:
        ax.set_ylabel(ylabel)

    if title is not None:
        ax.set_title(title)
        
# Parameter grid of the LFR runs whose per-metric result files are looked up below.
mus = [1, 2, 3, 4]
ns = [200, 1000, 5000]
algs = ["v8", "v8-parallel", "boem", "saoem"]
distance_metrics = ["rand", "split-join", "vi"]

# For every (n, mu, algorithm, metric) combination, print whether the file
# "<directory><fileprefix>.<alg>.<metric>" exists, followed by its path.
for n in ns:
    for mu in mus:
        directory = f"data/LFR/n{n}/"
        fileprefix = f"LFR_n{n}_mu0{mu}_gamma30_beta11"
        for alg in algs:
            for distance_metric in distance_metrics:
                filename = f"{directory}{fileprefix}.{alg}.{distance_metric}"
                p = Path(filename)
                print(p.exists(), filename)

In [None]:
import pandas as pd
from pathlib import Path
import os
import shutil

HOME = "/nfs/nfs2/home/mth"
PROJECT_DIR = HOME + "/Codes/graph-consensus-clustering"
DATA_DIR = PROJECT_DIR + "/test/data"
RESULT_DIR = PROJECT_DIR + "/test/experiment-results"
#print(HOME)
#print(PROJECT_DIR)
#print(DATA_DIR)
#print(RESULT_DIR)

# Upper bound on how many numbered input clusterings are probed per configuration.
MAX_INPUT_CLUSTERINGS = 100


def count_input_clusterings(prefix, limit=MAX_INPUT_CLUSTERINGS):
    """Return how many consecutive files ``<prefix>.0``, ``<prefix>.1``, ... exist.

    Probing stops at the first missing index or once ``limit`` files were found,
    so the result is always in ``0..limit``. (A for/break counter would report
    ``limit - 1`` when all ``limit`` files are present.)
    """
    count = 0
    while count < limit and Path(prefix + "." + str(count)).is_file():
        count += 1
    return count


def read_nmi(nmi_file):
    """Return the first line of ``nmi_file`` parsed as a float.

    Returns ``None`` when the first line is empty or not a valid number, so a
    single bad result file does not abort the whole sweep. Errors raised while
    opening the file are not caught here.
    """
    with open(nmi_file) as nmif:
        first_line = nmif.readline()
    try:
        return float(first_line)
    except ValueError:
        return None


# One dict per configuration for which an NMI value could be read.
experiment_stats = []

#for DATASET_NAME in ["LFR-louvain", "LFR-mcl", "LFR", "LFR-preprocessed"]:
for DATASET_NAME in ["LFR"]:
    #print(DATASET_NAME)
    for ALG in ["kirkley-newman", "v8-parallel"]:
        #for N in ["200", "1000", "5000"]:
        for N in ["1000"]:
            #print(N)
            for MU in ["01", "02", "03", "04", "05", "06", "07"]:
            #for MU in ["04"]:
                FILE_PREFIX = "LFR" + "_n" + N + "_mu" + MU + "_gamma30_beta11"

                INPUT_DIR = DATA_DIR + "/" + DATASET_NAME + "/" + "n" + N
                INPUT_CLUSTERING_PREFIX = INPUT_DIR + "/" + FILE_PREFIX

                # Computed for inspection only; nothing below in this cell reads it.
                NUMBER_OF_INPUT_CLUSTERING = count_input_clusterings(INPUT_CLUSTERING_PREFIX)
                #print(NUMBER_OF_INPUT_CLUSTERING)

                OUTPUT_DIR_NAME = ALG + "." + DATASET_NAME + ".n" + N + ".mu" + MU
                OUTPUT_DIR = RESULT_DIR + "/" + OUTPUT_DIR_NAME
                OUTPUT_PREFIX = OUTPUT_DIR + "/" + FILE_PREFIX

                # Disabled on purpose: wiping and recreating OUTPUT_DIR.
                #if(Path(OUTPUT_DIR).is_dir()):
                #    print(OUTPUT_DIR, "exists")
                #    try:
                #        shutil.rmtree(OUTPUT_DIR)
                #    except OSError as e:
                #        print("Error: %s - %s." % (e.filename, e.strerror))
                #os.makedirs(OUTPUT_DIR)

                # MU is a zero-padded tenths string, e.g. "04" -> 0.4.
                stat = {"DATASET_NAME": DATASET_NAME, "ALG": ALG, "N": int(N), "MU": float(MU)/10.0}
                SOLN = 0
                NMI_FILE = OUTPUT_PREFIX + ".soln-" + str(SOLN) + ".nmi"
                print(NMI_FILE)
                if not Path(NMI_FILE).is_file():
                    print("NMI_FILE does not exist")
                    continue

                nmi = read_nmi(NMI_FILE)
                if nmi is None:
                    print("NMI_FILE is empty or malformed")
                    continue

                stat["NMI"] = nmi
                #print(stat, nmi)
                experiment_stats.append(stat)

df = pd.DataFrame(experiment_stats)
print(df)

In [None]:
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib import rc
import matplotlib.colors as mcolors
# Overwrite the current rc settings with Matplotlib's built-in defaults.
matplotlib.rcParams.update(matplotlib.rcParamsDefault)
# Enable LaTeX text rendering and request the Arial font family
# ("Helvetica" was the previously tried alternative).
plt.rcParams.update({"text.usetex": True, "font.family": "Arial"})
from pathlib import Path

#df_plot = df[(df["N"]==5000) & (df["DATASET_NAME"] == "LFR")]
#print(df_plot)

# Network size shown in this figure. It drives the row filter, the title and the
# output file name, so the three cannot drift apart.
PLOT_N = 5000

# One entry per curve: (ALG, DATASET_NAME, legend label, extra plot keyword arguments).
# The first two curves take their colors from the default color cycle.
series_specs = [
    ("v8-parallel", "LFR", "our consensus", {"marker": "o"}),
    ("kirkley-newman", "LFR", "kirkley-newman", {"marker": "o"}),
    (
        "v8-parallel",
        "LFR-preprocessed",
        "kirkley-newman preprocess + our consensus",
        {"marker": "x", "linestyle": ":", "color": "tab:red"},
    ),
]

fig = plt.figure(figsize=(6, 4))
naxr = 1
naxc = 1
gs = GridSpec(nrows=naxr, ncols=naxc)
# axes[i][j] is the subplot in grid row i, column j.
axes = [[fig.add_subplot(gs[i, j]) for j in range(naxc)] for i in range(naxr)]

# `df` is the experiment-statistics frame built by the collection cell.
df_plot = df[df["N"] == PLOT_N]
if df_plot.empty:
    # NOTE(review): the collection cell currently sweeps only N=1000 and the
    # "LFR" dataset, so this filter may select nothing - confirm the intended sweep.
    print(f"Warning: df has no rows with N == {PLOT_N}; the figure will be empty")

for axes_row in axes:
    for ax in axes_row:
        for alg_name, dataset_name, curve_label, plot_style in series_specs:
            df_target = df_plot[(df_plot["ALG"] == alg_name) & (df_plot["DATASET_NAME"] == dataset_name)]
            ax.plot(df_target["MU"], df_target["NMI"], label=curve_label, **plot_style)
        # Raw string: "\m" is not a valid escape sequence in a normal string.
        ax.set_xlabel(r"$\mu$")
        ax.set_ylabel("$NMI$")
        ax.legend()
        ax.set_title(f"LFR($n={PLOT_N}$) - 37 different clusterings")
#plt.show()
plt.savefig("benchmark-quality-" + "n" + str(PLOT_N) + ".pdf")

# Preprocessing threshold parameter study

In [None]:
import numpy as np
import scipy as sp
from scipy import sparse
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib import rc
import matplotlib.colors as mcolors
# Overwrite the current rc settings with Matplotlib's built-in defaults.
matplotlib.rcParams.update(matplotlib.rcParamsDefault)
# Enable LaTeX text rendering and request the Arial font family
# ("Helvetica" was the previously tried alternative).
plt.rcParams.update({"text.usetex": True, "font.family": "Arial"})
from pathlib import Path

In [None]:
# Location and naming of the per-(n, mu) threshold-study CSV files.
RESULT_DIR="experiment-results"
OUTPUT_DIR=RESULT_DIR+"/"+"preprocess-study"
DATASET_NAME="LFR"

# mu values are stored as tenths (1 -> 0.1); one subplot is drawn per entry of ns.
mus = [1, 2, 3, 4, 5, 6, 7]
ns = [1000, 5000]

fig = plt.figure(figsize=(10, 4))
naxr = 1
naxc = 2
gs = GridSpec(nrows=naxr, ncols=naxc)
# axes[i][j] is the subplot in grid row i, column j.
axes = [[fig.add_subplot(gs[i, j]) for j in range(naxc)] for i in range(naxr)]
#print(type(axes), type(axes[0]))

for i in range(naxr):
    for j in range(naxc):
        # Row-major flat index of subplot (i, j). The stride must be the number
        # of columns; using the row count only worked because naxr == 1.
        idx = i * naxc + j
        n = ns[idx]
        ax = axes[i][j]
        for mu in mus:
            csv_file = OUTPUT_DIR+"/"+DATASET_NAME+".n"+str(n)+".mu0"+str(mu)+".csv"
            filepath = Path(csv_file)
            #print(csv_file, filepath.exists())
            if not filepath.exists():
                # Missing runs are skipped; that mu simply gets no curve.
                continue
            # NOTE: rebinds the notebook-level name `df` on every file that is read.
            df = pd.read_csv(csv_file)
            #print(df)
            ax.plot(df["threshold"], df["n_components"], label=str(float(mu)/10.0), marker=".")

        ax.set_xlim(right=0.6)
        ax.minorticks_on()
        ax.grid(which="major", axis="both")
        # Raw string: "\m" is not a valid escape sequence in a normal string.
        ax.legend(title=r"$\mu$")
        ax.set_xlabel("Threshold")
        ax.set_ylabel("Number of groups")
        ax.set_title("n="+str(n))

#plt.suptitle("Effect of preprocessing threshold on LFR")
plt.tight_layout()
plt.savefig("preprocess-study.pdf")
plt.close()