In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from IPython.display import display
from collections import defaultdict
import matplotlib
import matplotlib.pyplot as plt
import os
plt.rcParams.update({'font.size': 13})

### Data Analysis

In [None]:
# Header names for property.txt: hyperedge/node counts ("E", "V"), then a
# regression "score"/"coef" pair per distribution, plus the scalars "lsv" and "eff".
# read_properties writes whatever the module-level `columns` holds when it is
# called; note that `columns` is reassigned in a later cell.
columns = ["E", "V", 
 "deg score", "deg coef", "sz score", "sz coef",
 "pd score", "pd coef", "its score", "its coef", 
 "cc score", "cc coef", "cch score", "cch coef", 
 "dst score", "dst coef", "ov score", "ov coef",
 "lsv", "sv score", "sv coef", "eff"]

# Property names, in the order read_properties iterates over them when saving.
# Reassigned to a shorter list in a later cell.
property_list = ["NumHedge", "NumNode", "degree", "size", "pairdeg", "intersection",
             "clusteringcoef", "clusteringcoef_hedge", "density_dist", "overlapness_dist",
             "LargestSV", "sv", "effdiam"]

# Short column name (or column-name prefix) for each property name.
column_mapping = {
    "NumHedge": "E", "NumNode": "V",
    "degree": "deg", "pairdeg": "pd",
    "intersection": "its", "size": "sz",
    "clusteringcoef": "cc", "clusteringcoef_hedge": "cch",
    "density_dist": "dst", "overlapness_dist": "ov",
    "LargestSV": "lsv", "sv": "sv", 
    "effdiam": "eff"
}

In [None]:
from collections import defaultdict
from sklearn.linear_model import LinearRegression
import numpy as np
import os

def linearregression(X, Y, nolog=False):
    """Fit a least-squares line to (X, Y), by default in log2-log2 space.

    Args:
        X: sequence of x values.
        Y: sequence of y values, one per entry of X.
        nolog: the log2 transform of X and Y is applied only when this is
            exactly ``False`` (the default); any other value fits the raw data.

    Returns:
        Tuple ``(score, coef, pred, intercept)``:
        ``score`` is ``LinearRegression.score`` on the fitted data, ``coef`` the
        slope, ``intercept`` the intercept, and ``pred`` the fitted values
        expressed on the original scale of Y.
        For empty input the placeholder ``(0, 0, [0], 0)`` is returned.
    """
    if len(X) == 0:
        return 0, 0, [0], 0
    X = np.array(X).reshape(-1, 1)
    Y = np.array(Y).reshape(-1, 1)
    log_space = nolog is False
    if log_space:
        X = np.log2(X)
        Y = np.log2(Y)
    reg = LinearRegression().fit(X, Y)
    score = reg.score(X, Y)
    coef = reg.coef_
    # Y is a single column, so exactly one row of coefficients is expected.
    assert len(coef) == 1
    coef = coef[0][0]
    intercept = reg.intercept_[0]
    pred = reg.predict(X).flatten()
    if log_space:
        # Undo the log2 transform only when it was applied; exponentiating a
        # raw-space prediction would return values on the wrong scale.
        pred = np.exp2(pred)

    return score, coef, pred, intercept

# Generators whose hypergraph is stored as ../dataset/<dataname><suffix>.txt and
# whose results directory has no model-index level.
_DATASET_FILE_SUFFIX = {
    "answer": "",
    "hypercl": "_cl",
    "hyperunif": "_unif",
    "hyperlap": "_lap",
    "hyperpa": "_pa",
}

# Datasets whose singular-value fit uses a fixed number of leading values.
_SV_LIMIT_1000 = ("tags-ask-ubuntu", "tags-math-sx", "threads-ask-ubuntu", "threads-math-sx")
_SV_LIMIT_500 = ("coauth-MAG-Geology-full", "coauth-MAG-History-full")


def _resolve_paths(dataname, algoname, modelindex):
    """Return (graphpath, outputdir) for one dataset / generator / model index."""
    if algoname in _DATASET_FILE_SUFFIX:
        graphpath = "../dataset/" + dataname + _DATASET_FILE_SUFFIX[algoname] + ".txt"
        outputdir = "../results/{}/{}/".format(algoname, dataname)
    elif algoname in ("thera", "hyperff"):
        graphpath = "../dataset/{}_{}_{}.txt".format(dataname, algoname, modelindex)
        outputdir = "../results/{}/{}/{}/".format(algoname, dataname, modelindex)
    else:
        graphpath = "../results/{}/{}/{}/hypergraph.txt".format(algoname, dataname, modelindex)
        outputdir = "../results/{}/{}/{}/".format(algoname, dataname, modelindex)
    return graphpath, outputdir


def _iter_nonzero_pairs(path):
    """Yield (val, pdf) floats from a 'val,pdf'-per-line file, skipping pairs containing a zero."""
    with open(path, "r") as f:
        for line in f:
            val, pdf = line.rstrip().split(",")
            val, pdf = float(val), float(pdf)
            if val == 0.0 or pdf == 0.0:
                continue
            yield val, pdf


def _sv_fit_length(dataname, num_values):
    """Return how many leading singular values are used for the fit."""
    # The dataset-specific limits must be tested before the generic fallback;
    # otherwise the fallback matches first and the specific limit is never used.
    if dataname in _SV_LIMIT_500:
        return 500
    if dataname in _SV_LIMIT_1000:
        return 1000
    return min(1000, int(num_values * 0.5))


def read_properties(dataname, algoname, modelindex=-1):
    """Load the structural properties of one hypergraph and summarise them.

    Reads the hypergraph file (one hyperedge per line, comma-separated integer
    node ids) and the per-property files in the results directory, fits each
    distribution with `linearregression`, and writes a one-row summary to
    ``<outputdir>/property.txt``.

    Args:
        dataname: dataset name used to build the file paths.
        algoname: generator name; "answer" selects ``../dataset/<dataname>.txt``.
        modelindex: model identifier used in the paths of generators that have
            a model-index level (ignored for the others).

    Returns:
        Tuple ``(return_dict, dist)``. ``return_dict`` maps each distribution
        name to ``(score, coef)`` and each scalar property to its value.
        ``dist`` maps each property name to the loaded distribution
        (``{value: pdf}``, or ``{rank: singular value}`` for "sv") or scalar.

    Notes:
        The header line of property.txt comes from the module-level `columns`
        and the row from the module-level `property_list`; the caller is
        responsible for keeping the two consistent. A property listed in
        `property_list` but missing from ``return_dict`` raises KeyError.
    """
    graphpath, outputdir = _resolve_paths(dataname, algoname, modelindex)

    return_dict = {}
    dist = {}
    print(graphpath)

    # Number of hyperedges (lines) and of distinct node ids.
    numhedge = 0
    nodeset = set()
    with open(graphpath, "r") as f:
        for line in f:
            nodeset.update(int(v) for v in line.rstrip().split(","))
            numhedge += 1
    numnode = len(nodeset)
    return_dict["NumHedge"] = dist["NumHedge"] = numhedge
    return_dict["NumNode"] = dist["NumNode"] = numnode

    # Distributions stored with one pdf entry per value.
    for distname in ["degree", "pairdeg", "intersection", "size"]:
        dist[distname] = {}
        X = []
        for val, pdf in _iter_nonzero_pairs(outputdir + distname + ".txt"):
            dist[distname][val] = pdf
            X.append(val)
        X = sorted(X)
        Y = [dist[distname][x] for x in X]
        score, coef, _, _ = linearregression(X, Y)
        return_dict[distname] = (score, coef)

    # Distributions that may hold several entries per value; they are averaged.
    # These are best-effort: a failure skips the property instead of aborting.
    for distname in ["clusteringcoef_hedge", "density_dist", "overlapness_dist"]:
        dist[distname] = defaultdict(list)
        X = []
        try:
            for val, pdf in _iter_nonzero_pairs(outputdir + distname + ".txt"):
                dist[distname][val].append(pdf)
                X.append(val)
            X = sorted(X)
            Y = []
            for x in X:
                y = np.mean(dist[distname][x])
                dist[distname][x] = y
                # A non-positive mean cannot be log-transformed; use 1 instead.
                Y.append(y if y > 0 else 1)
            score, coef, _, _ = linearregression(X, Y)
            return_dict[distname] = (score, coef)
        except Exception as exc:
            # Report the skip instead of hiding it; KeyboardInterrupt and
            # SystemExit are deliberately not caught.
            print("[read_properties] skipping {}: {!r}".format(distname, exc))

    # Singular values: one float per line, keyed by 1-based line number.
    with open(outputdir + "sv.txt", "r") as f:
        singular_values = [float(line.rstrip()) for line in f]
    lsv = singular_values[0] if singular_values else 0
    Y = singular_values[:_sv_fit_length(dataname, len(singular_values))]
    X = list(range(1, len(Y) + 1))
    dist["sv"] = dict(zip(X, Y))
    dist["LargestSV"] = lsv
    # Also expose the scalar in return_dict, like the other scalar properties,
    # so the SAVE step can find it when "LargestSV" is in `property_list`.
    return_dict["LargestSV"] = lsv
    score, coef, _, _ = linearregression(X, Y)
    return_dict["sv"] = (score, coef)

    # Effective diameter: the value on the last line of the file wins.
    effdiam = 0
    with open(outputdir + "effdiameter.txt", "r") as f:
        for line in f:
            effdiam = float(line.rstrip())
    return_dict["effdiam"] = effdiam
    dist["effdiam"] = effdiam

    # SAVE: header line, then one row of scalar values / (score, coef) pairs.
    with open(outputdir + "property.txt", "w") as f:
        f.write(",".join(columns) + "\n")
        fields = []
        for name in property_list:
            if name in ["LargestSV", "effdiam", "NumHedge", "NumNode"]:
                fields.append(str(return_dict[name]))
            else:
                fit_score, fit_coef = return_dict[name]
                fields.append(str(fit_score))
                fields.append(str(fit_coef))
        f.write(",".join(fields) + "\n")

    return return_dict, dist

In [None]:
def get_cdf(_dict):
    """Turn a {value: probability} mapping into {value: cumulative probability}.

    Values are visited in ascending order. An AssertionError is raised as soon
    as the running total reaches 1.1 or more.
    """
    running_total = 0
    cumulative = {}
    for value in sorted(_dict):
        running_total = running_total + _dict[value]
        cumulative[value] = running_total
        assert running_total < 1.1

    return cumulative

def get_cumul_dist(dict_x1, dict_x2):
    """Return the largest absolute gap between the CDFs of two distributions.

    Both arguments map value -> probability. Each CDF is evaluated on the union
    of the two supports; at a value missing from one distribution, that
    distribution's CDF keeps its previous level.

    Returns:
        The maximum of ``|CDF1(x) - CDF2(x)|`` over all x, or 0 when both
        inputs are empty.
    """
    cdf1 = get_cdf(dict_x1)
    cdf2 = get_cdf(dict_x2)

    cum1, cum2 = 0, 0
    maxdiff = 0
    # Membership is tested on the dicts themselves, which keeps the sweep linear.
    for x in sorted(set(cdf1) | set(cdf2)):
        cum1 = cdf1.get(x, cum1)
        cum2 = cdf2.get(x, cum2)
        maxdiff = max(maxdiff, abs(cum1 - cum2))

    return maxdiff

    
def get_rmse_dist(dict_x1, dict_x2, set_length=False, normalize=False):
    """Root-mean-square difference between two {x: y} mappings.

    Args:
        dict_x1: reference mapping.
        dict_x2: mapping compared against the reference.
        set_length: when true, only the keys of ``dict_x1`` are compared;
            otherwise the union of both key sets is used.
        normalize: when true, the result is divided by the largest positive
            value of ``dict_x1`` among the compared keys.

    A key missing from one mapping contributes a y of 0 for that mapping.

    Returns:
        The RMSE as a float, or 0.0 when there are no keys to compare.
        If ``normalize`` is true and ``dict_x1`` has no positive value, the
        division by zero is not guarded.
    """
    if set_length:
        keys = list(dict_x1.keys())
    else:
        keys = set(list(dict_x1.keys()) + list(dict_x2.keys()))

    # Nothing to compare: the mean below would otherwise divide by zero.
    if len(keys) == 0:
        return 0.0

    total = 0
    maxy1 = 0
    for x in keys:
        y1, y2 = 0, 0
        # Dict membership is O(1); testing against key lists made this quadratic.
        if x in dict_x1:
            y1 = dict_x1[x]
            if maxy1 < y1:
                maxy1 = y1
        if x in dict_x2:
            y2 = dict_x2[x]

        total += (y1 - y2) ** 2

    total /= len(keys)
    total = total ** 0.5

    if normalize:
        total /= maxy1

    return total

In [None]:
# Dataset evaluated by the single-dataset cells below.
# Leave one assignment active and keep the others commented out.
# dataname = "email-Enron-full"
# dataname = "email-Eu-full" 
# dataname = "NDC-classes-full"
# dataname = "NDC-substances-full"
# dataname = "contact-primary-school"
# dataname = "contact-high-school"
# dataname = "tags-ask-ubuntu"
dataname = "tags-math-sx"
# dataname = "threads-ask-ubuntu"
# dataname = "threads-math-sx"

In [None]:
# Per-dataset model identifier(s) for the "hyperff" generator; the first entry
# is passed to read_properties as `modelindex`.
fflist = {
    "email-Enron-full": ["0.51_0.2"],
    "email-Eu-full": ["0.51_0.3"],
    "contact-high-school": ["0.51_0.3"],
    "contact-primary-school": ["0.51_0.3"],
    "NDC-classes-full": ["0.45_0.3"],
    "NDC-substances-full": ["0.45_0.3"],
    "tags-ask-ubuntu": ["0.51_0.3"],
    "tags-math-sx": ["0.51_0.3"],
    "threads-ask-ubuntu": ["0.45_0.2"],
    "threads-math-sx": ["0.45_0.2"]
}

# Per-dataset model identifier(s) for the "thera" generator, used the same way.
trlist = {
    "email-Enron-full": ["12_0.7_6.0"], 
    "email-Eu-full": ["15_0.5_6.0"], 
    "contact-high-school": ["15_0.7_2.0"], 
    "contact-primary-school": ["15_0.5_2.0"], 
    "NDC-classes-full": ["15_0.9_6.0"],
    "NDC-substances-full": ["15_0.5_6.0"], 
    "tags-ask-ubuntu": ["8_0.5_2.0"], 
    "tags-math-sx": ["8_0.9_2.0"],
    "threads-ask-ubuntu": ["8_0.5_6.0"], 
    "threads-math-sx": ["8_0.5_6.0"], 
}

# (algoname, modelindex) pairs to evaluate for `dataname`. "answer" must stay
# first: the comparison loop stores its result as the reference before it
# scores any other entry.
namelist = [("answer", -1)]
if len(trlist[dataname]) > 0:
    namelist.append(("thera", trlist[dataname][0]))
namelist.append(("hyperlap", -1))    
# "hyperpa" is not evaluated for email-Enron-full.
if dataname != "email-Enron-full":
    namelist.append(("hyperpa", -1))    
namelist.append(("hypercl", -1))
if len(fflist[dataname]) > 0:
    namelist.append(("hyperff", fflist[dataname][0]))
# NOTE(review): `kronlist` is not defined in any cell above this one; on a fresh
# kernel this line raises NameError. It presumably maps each dataset name to a
# list like `fflist` / `trlist` -- confirm and define it next to them.
if len(kronlist[dataname]) > 0:
    namelist.append(("hyperk", 0))

# WRITE OUTPUT

In [None]:
# Properties compared against the reference hypergraph in this evaluation.
# This replaces the longer `property_list` assigned near the top of the notebook.
property_list = ["degree", "size", "pairdeg", "intersection","sv", 
             "clusteringcoef_hedge", "density_dist", "overlapness_dist", "effdiam"]

# Short column names of those properties (see `column_mapping`); these are the
# columns averaged, normalised and ranked in the cells that follow.
evallist = ['deg', 'sz', 'pd', 'its', 'cch', 'dst', 'ov', 'sv', 'eff'] 

In [None]:
# Directory and file that receive the per-dataset comparison table.
outputdir = "csv/sv_fit/"
outputpath = "{}{}.txt".format(outputdir, dataname)
if not os.path.isdir(outputdir):
    os.makedirs(outputdir)

In [None]:
# One output column per evaluated property. `columns` stays a module-level name
# because read_properties also writes it as the header of property.txt.
columns = [column_mapping[prop] for prop in property_list]
header_line = ",".join(["AlgoName", "AlgoOpt"] + columns)
with open(outputpath, "w") as f:
    f.write(header_line + "\n")

# Which distance is used for which kind of property.
cdf_props = ("degree", "size", "pairdeg", "intersection")
rmse_props = ("clusteringcoef_hedge", "density_dist", "overlapness_dist")
scalar_props = ("NumHedge", "NumNode", "LargestSV", "effdiam")

for name, modelindex in namelist:
    ret, dist = read_properties(dataname, name, modelindex)
    if name == "answer":
        # The reference hypergraph is only stored, never scored against itself.
        ret_answer, dist_answer = ret, dist
        continue

    difflist = []
    for prop in property_list:
        if prop in cdf_props:
            # largest gap between the two cumulative distributions
            diff = get_cumul_dist(dist_answer[prop], dist[prop])
        elif prop == "sv":
            # RMSE restricted to the ranks present in the reference
            diff = get_rmse_dist(dist_answer[prop], dist[prop], set_length=True, normalize=False)
        elif prop in rmse_props:
            diff = get_rmse_dist(dist_answer[prop], dist[prop], normalize=False)
        elif prop in scalar_props:
            # relative error of a scalar property
            diff = abs(dist[prop] - dist_answer[prop]) / dist_answer[prop]
        difflist.append(str(diff))

    algoname, algoopt = name, str(modelindex)
    with open(outputpath, "a") as f:
        f.write(",".join([algoname, algoopt] + difflist) + "\n")

# RESULT

In [None]:
# Comparison table for `dataname`, smallest singular-value difference first.
d = pd.read_csv("{}{}.txt".format(outputdir, dataname)).sort_values(by="sv")
d

In [None]:
# Add the mean over all evaluated columns as 'avg' and store it back in the file.
result_path = outputdir + dataname + ".txt"
d = pd.read_csv(result_path)
target = d[evallist]
d['avg'] = target.mean(axis=1)
d.to_csv(result_path, index=False)

# Reload the stored table and show it ordered by the new average.
d = pd.read_csv(result_path).sort_values(by="avg")
d

In [None]:
# Inspect the column names of the result table `d`.
d.columns

# RANK & NORM

In [None]:
base_path = outputdir + dataname

# Normalised table: z-score every evaluated column across the generators.
d = pd.read_csv(base_path + ".txt")
for col in evallist:
    spread = d[col].std()
    # A constant column has zero spread and is left untouched.
    if spread != 0:
        d[col] = (d[col] - d[col].mean()) / spread
norms = d[evallist]
d['avg'] = norms.mean(axis=1)
d.to_csv(base_path + "_norm.txt", index=False)

# Ranking table: rank generators per column by absolute value (ties share the lowest rank).
d = pd.read_csv(base_path + ".txt")
for ename in evallist:
    d[ename] = d[ename].abs().rank(method='min')
ranks = d[evallist]
d['avg'] = ranks.mean(axis=1)
d.to_csv(base_path + "_rank.txt", index=False)

In [None]:
# Normalised table, lowest average first.
nd = pd.read_csv("{}{}_norm.txt".format(outputdir, dataname)).sort_values(by=["avg"], ascending=True)
nd

In [None]:
# Ranking table, lowest average first.
rd = pd.read_csv("{}{}_rank.txt".format(outputdir, dataname)).sort_values(by=["avg"], ascending=True)
rd

# GET FINAL

In [None]:
# Final score per generator: half its position in the normalised table plus
# half its position in the ranking table (both sorted by ascending 'avg').
algoname2agg = defaultdict(float)

nd = pd.read_csv(outputdir + dataname + "_norm.txt").sort_values(by=["avg"], ascending=True)
for norder, algoname in enumerate(nd["AlgoName"], start=1):
    algoname2agg[algoname] += (norder * 0.5)

rd = pd.read_csv(outputdir + dataname + "_rank.txt").sort_values(by=["avg"], ascending=True)
for rorder, algoname in enumerate(rd["AlgoName"], start=1):
    algoname2agg[algoname] += (rorder * 0.5)

In [None]:
# Store the final scores as tab-separated "name<TAB>score" lines.
final_path = outputdir + dataname + "_final.txt"
with open(final_path, "w") as f:
    for algoname, agg_score in algoname2agg.items():
        f.write(algoname + "\t" + str(agg_score) + "\n")

# AGGREGATE DATA

In [None]:
# Datasets whose per-dataset *_norm.txt / *_rank.txt / *_final.txt files are
# aggregated in the cells below.
dataset = ["email-Enron-full", "email-Eu-full",
           "contact-high-school", "contact-primary-school",
          "NDC-classes-full", "NDC-substances-full"
           ,"tags-ask-ubuntu", "tags-math-sx",
          "threads-ask-ubuntu", "threads-math-sx"]

# Generator names reported in the aggregated tables and figures, in output order.
algolist = ["thera", "hyperlap", "hypercl", "hyperpa", "hyperff", "hyperk"]

In [None]:
# Collect, per generator and metric, the values from every dataset's
# normalised and ranking tables, then reduce each list to its mean.
table_aggdata2norm = {}
table_aggdata2rank = {}
metric_names = evallist + ["avg"]

for dataname in dataset:
    for suffix, table in (("_norm.txt", table_aggdata2norm), ("_rank.txt", table_aggdata2rank)):
        d = pd.read_csv(outputdir + dataname + suffix)
        for irow, row in d.iterrows():
            algoname = row["AlgoName"]
            if algoname not in table:
                table[algoname] = defaultdict(list)
            for evalname in metric_names:
                table[algoname][evalname].append(row[evalname])

for algoname in algolist:
    for evalname in metric_names:
        for table in (table_aggdata2norm, table_aggdata2rank):
            table[algoname][evalname] = np.mean(table[algoname][evalname])

In [None]:
# Write the aggregated normalised and ranking tables as CSV, one row per generator.
metric_names = evallist + ["avg"]
for filename, table in (("agg_norm_table", table_aggdata2norm), ("agg_rank_table", table_aggdata2rank)):
    with open(outputdir + filename, "w") as f:
        f.write(",".join(["AlgoName"] + metric_names) + "\n")
        for algoname in algolist:
            cells = [str(table[algoname][evalname]) for evalname in metric_names]
            f.write(",".join([algoname] + cells) + "\n")

In [None]:
# Load the aggregated normalised (z-score) table for display.
nd = pd.read_csv(outputdir + "agg_norm_table")
nd

In [None]:
# Load the aggregated ranking table for display.
rd = pd.read_csv(outputdir + "agg_rank_table")
rd

In [None]:
# NUMBER OF PARAMETERS
# For every dataset, count the parameters attributed to each generator and
# store them as dataset2numparam[dataname][algoname].

dataset2numparam = defaultdict(dict)

for dataname in dataset:
    # Count hyperedges (lines) and distinct node ids of the reference hypergraph.
    answer_numhedge = 0
    nodeset = set()
    with open("../dataset/" + dataname + ".txt", "r") as f:
        for line in f.readlines():
            hedge = line.rstrip().split(",")
            for v in hedge:
                nodeset.add(int(v))
            answer_numhedge += 1
    answer_numnode = len(nodeset)
    print(dataname, answer_numhedge, answer_numnode)


    number_of_parameter = {}
    if len(trlist[dataname]) > 0:
        # thera: one value per hyperedge (|E|) plus 4; the extra 4 presumably
        # counts scalar settings -- TODO confirm
        number_of_parameter["thera"] = answer_numhedge + 4
    # lap: size & degree = |V| + |E|
    number_of_parameter["hyperlap"] = answer_numhedge + answer_numnode
    # cl: size & degree = |V| + |E|
    number_of_parameter["hypercl"] = answer_numhedge + answer_numnode
    # pa: size & degree = |V| + |E|
    number_of_parameter["hyperpa"] = answer_numhedge + answer_numnode
    if len(fflist[dataname]) > 0:
        # hyperff: counted as a constant 2 parameters
        number_of_parameter["hyperff"] = 2
    # NOTE(review): `kronlist` is not defined in any visible cell; this raises
    # NameError on a fresh kernel. Its first entry is used below as a row
    # position in output_list.txt.
    if len(kronlist[dataname]) > 0:
        # Look up the selected model's path in output_list.txt; the second-to-last
        # path component starts with "<rows>_<cols>", and the parameter count is
        # rows * cols (presumably the shape of the initial matrix -- confirm).
        d = pd.read_csv("../results/hyperk/{}/output_list.txt".format(dataname))
        path = d.iloc[kronlist[dataname][0]]["modelpath"]
        print(path)
        tmp = path.split("/")[-2].split("_")
        initrow, initcol = int(tmp[0]), int(tmp[1])
        number_of_parameter["hyperk"] = initrow * initcol
    
    dataset2numparam[dataname] = number_of_parameter

In [None]:
# Load every dataset's final scores: dataset2result[dataname][algoname] -> score.
dataset2result = defaultdict(dict)
for dataname in dataset:
    final_path = outputdir + dataname + "_final.txt"
    with open(final_path, "r") as f:
        for line in f:
            # each line is "<algoname>\t<score>"
            fields = line.rstrip().split("\t")
            dataset2result[dataname][fields[0]] = float(fields[1])

In [None]:
# Scatter colour for each generator name.
colordict = {
    "answer": "black",

    "hyperk": "#4daf4a",
    "hypercl": "#e6ab02",
    "hyperlap": "#377eb8",
    "hyperpa": "#984ea3",
    # The plotting cells look colours up by the names in `algolist`, which uses
    # "thera"; without this key the lookup raises KeyError. "hypertr" is kept
    # as an alias so existing lookups by that name still work.
    "thera": "#ff7f00",
    "hypertr": "#ff7f00",
    "hyperff": "#e41a1c",
}

# Scatter marker for each dataset name.
markerdict = {
    "email-Enron-full": "o",
    "email-Eu-full": "o",
    "contact-high-school": "^",
    "contact-primary-school": "^",
    "NDC-classes-full": "D",
    "NDC-substances-full": "D",
    "tags-ask-ubuntu": "P",
    "tags-math-sx": "P",
    "threads-ask-ubuntu": "s",
    "threads-math-sx": "s",
    "coauth-MAG-Geology-full": "<",
    "coauth-MAG-History-full": "<"

}

In [None]:
# Gather, per generator and metric, the values from every dataset's normalised
# and ranking tables (they are reduced to means in the next cell).
aggdata2norm = defaultdict(list)
aggdata2rank = defaultdict(list)
metric_names = evallist + ["avg"]

for dataname in dataset:
    for suffix, agg in (("_norm.txt", aggdata2norm), ("_rank.txt", aggdata2rank)):
        d = pd.read_csv(outputdir + dataname + suffix)
        for irow, row in d.iterrows():
            algoname = row["AlgoName"]
            if algoname not in agg:
                agg[algoname] = defaultdict(list)
            for evalname in metric_names:
                agg[algoname][evalname].append(row[evalname])

In [None]:
# Replace each collected list of per-dataset values by its mean.
metric_names = evallist + ["avg"]
for algoname in algolist:
    for evalname in metric_names:
        for agg in (aggdata2norm, aggdata2rank):
            agg[algoname][evalname] = np.mean(agg[algoname][evalname])

In [None]:
# Display names for the short metric keys in `evallist`, plus "avg".
# NOTE(review): not referenced by any other visible cell.
evallist2rename = {
    'deg': "Degree", 
    'sz': "Size", 
    'pd': "Pair Degree", 
    'its': "Intersection", 
    'cch': "Clustering Coef.", 
    'dst': "Density", 
    'ov': "Overlapness", 
    'sv': "Singular value", 
    'eff': "Effective Diam.", 
    "avg": "Average"
}

from matplotlib.ticker import FormatStrFormatter

In [None]:
# One scatter plot per metric: aggregated z-score of each generator against
# its number of parameters (log-scaled x axis).
outputpath = "figure/fit/"
# savefig does not create missing directories, so make sure the target exists.
os.makedirs(outputpath, exist_ok=True)

# NOTE(review): `aggdata2numparam` is not defined in any visible cell (only
# `dataset2numparam` is); on a fresh kernel the lookup below raises NameError.
# NOTE(review): `dataname` is whatever the last `for dataname in dataset` loop
# left behind, so the membership filter below only consults that one dataset.
for evalname in evallist + ["avg"]:
    plt.figure(figsize=(4,3.3), dpi=100)

    for algoname in algolist:
        numparam = aggdata2numparam[algoname]
        if algoname not in dataset2result[dataname]:
            continue
        result = aggdata2norm[algoname][evalname]
        if algoname == "hyperk":
            # hyperk is drawn as a larger star
            plt.scatter(numparam, result, c=colordict[algoname], marker="*", s=700, alpha=1.0)
        else:
            plt.scatter(numparam, result, c=colordict[algoname], s=400, alpha=1.0)

    ax = plt.gca()
    _, xmax = ax.get_xlim()
    ymin, ymax = ax.get_ylim()
    plt.xlim(1, max(1, xmax*10))
    if evalname == "avg":
        plt.ylim(-1, 1)
    else:
        # pad the automatic limits by one unit, never narrower than [-1, 1]
        plt.ylim((min(-1, ymin-1), max(1, ymax+1)))
    plt.ylabel("Z-score", fontsize=24)
    plt.xlabel("Number of Parameters", fontsize=24)
    plt.xscale("log")
    ax.tick_params(labelcolor='#4B4B4B', labelsize=22)
    ax.yaxis.set_major_formatter(FormatStrFormatter('%.1f'))

    plt.savefig(outputpath + "agg_{}_norm.jpg".format(evalname), bbox_inches='tight')
    plt.show()
    plt.close()

In [None]:
# One scatter plot per metric: aggregated ranking of each generator against
# its number of parameters (log-scaled x axis).
outputpath = "figure/fit/"
# savefig does not create missing directories, so make sure the target exists.
os.makedirs(outputpath, exist_ok=True)

# NOTE(review): `aggdata2numparam` is not defined in any visible cell (only
# `dataset2numparam` is); on a fresh kernel the lookup below raises NameError.
# NOTE(review): `dataname` is whatever the last `for dataname in dataset` loop
# left behind, so the membership filter below only consults that one dataset.
for evalname in evallist + ["avg"]:
    plt.figure(figsize=(4,3.3), dpi=100)

    for algoname in algolist:
        numparam = aggdata2numparam[algoname]
        if algoname not in dataset2result[dataname]:
            continue
        result = aggdata2rank[algoname][evalname]
        if algoname == "hyperk":
            # hyperk is drawn as a larger star
            plt.scatter(numparam, result, c=colordict[algoname], marker="*", s=700, alpha=1.0)
        else:
            plt.scatter(numparam, result, c=colordict[algoname], s=400, alpha=1.0)

    ax = plt.gca()
    _, xmax = ax.get_xlim()
    plt.xlim(1, max(1, xmax*10))
    # fixed y range for the ranking axis
    plt.ylim((1, 6))
    plt.ylabel("Ranking", fontsize=24)
    plt.xlabel("Number of Parameters", fontsize=24)
    plt.xscale("log")
    ax.tick_params(labelcolor='#4B4B4B', labelsize=22)
    ax.yaxis.set_major_formatter(FormatStrFormatter('%.1f'))

    plt.savefig(outputpath + "agg_{}_rank.jpg".format(evalname), bbox_inches='tight')
    plt.show()
    plt.close()