In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from IPython.display import display
from collections import defaultdict
import matplotlib
import matplotlib.pyplot as plt
import os
# Set matplotlib's default font size for every figure drawn in this notebook.
plt.rcParams.update({'font.size': 13})

### Data Analysis

In [None]:
# Names of the hypergraph properties handled in this notebook.
property_list = [
    "NumHedge", "NumNode",
    "degree", "size", "pairdeg", "intersection",
    "clusteringcoef", "clusteringcoef_hedge",
    "density_dist", "overlapness_dist",
    "LargestSV", "sv", "effdiam",
]

# Short column prefix associated with each property name.
column_mapping = dict(
    NumHedge="E", NumNode="V",
    degree="deg", pairdeg="pd",
    intersection="its", size="sz",
    clusteringcoef="cc", clusteringcoef_hedge="cch",
    density_dist="dst", overlapness_dist="ov",
    LargestSV="lsv", sv="sv",
    effdiam="eff",
)

# Table header: the two size columns, one "<prefix> score" / "<prefix> coef"
# pair per fitted distribution, then the singular-value and diameter columns.
columns = (
    ["E", "V"]
    + [prefix + suffix
       for prefix in ("deg", "sz", "pd", "its", "cc", "cch", "dst", "ov")
       for suffix in (" score", " coef")]
    + ["lsv", "sv score", "sv coef", "eff"]
)

In [None]:
from collections import defaultdict
from sklearn.linear_model import LinearRegression
import numpy as np
import os

def linearregression(X, Y, nolog=False):
    """Fit a straight line to (X, Y), by default in log2-log2 space.

    Parameters
    ----------
    X, Y : sequence of numbers
        Sample coordinates of equal length. Unless ``nolog`` is True both are
        passed through ``np.log2`` before fitting, so they must be strictly
        positive in that mode.
    nolog : bool, default False
        Pass True to fit the raw values instead of their base-2 logarithms.

    Returns
    -------
    (score, coef, pred, intercept)
        score     -- value of ``LinearRegression.score`` on the fitted data
        coef      -- slope of the fitted line
        pred      -- fitted Y values, on the same scale as the Y passed in
        intercept -- intercept of the fitted line
        For empty input the placeholder ``(0, 0, [0], 0)`` is returned.
    """
    if len(X) == 0:
        return 0, 0, [0], 0

    X = np.array(X).reshape(-1, 1)
    Y = np.array(Y).reshape(-1, 1)
    use_log = nolog is False
    if use_log:
        X = np.log2(X)
        Y = np.log2(Y)

    reg = LinearRegression().fit(X, Y)
    score = reg.score(X, Y)
    coef = reg.coef_
    assert len(coef) == 1  # single target column -> exactly one coefficient row
    coef = coef[0][0]
    intercept = reg.intercept_[0]

    pred = reg.predict(X).flatten()
    if use_log:
        # Undo the log2 transform; a raw-space fit is already on the input
        # scale and must not be exponentiated.
        pred = np.exp2(pred)

    return score, coef, pred, intercept

def _iter_nonzero_pairs(path):
    """Yield (value, weight) float pairs from a two-column CSV file, skipping rows where either is zero."""
    with open(path, "r") as f:
        for line in f:
            val, pdf = line.rstrip().split(",")
            val, pdf = float(val), float(pdf)
            if val == 0.0 or pdf == 0.0:
                continue
            yield val, pdf


def read_properties(dataname, algoname, modelindex=-1):
    """Load the precomputed statistics of one hypergraph and fit each distribution.

    Parameters
    ----------
    dataname : str
        Dataset name; used to build the input paths.
    algoname : str
        "answer" for the real dataset, otherwise the generator name. It selects
        where the hypergraph file and the statistics directory are looked up.
    modelindex : any, default -1
        Formatted into the paths for every generator except "answer",
        "hypercl", "hyperlap" and "hyperpa".

    Returns
    -------
    (return_dict, dist)
        return_dict -- "NumHedge", "NumNode" and "effdiam" map to numbers; each
                       fitted distribution maps to ``(score, coef)`` as
                       returned by ``linearregression``.
        dist        -- the values behind those summaries: ``{x: y}`` per
                       distribution plus "NumHedge", "NumNode", "LargestSV"
                       and "effdiam".

    Raises
    ------
    OSError / ValueError
        If the hypergraph file or a mandatory statistics file (degree, pairdeg,
        intersection, size, sv, effdiameter) is missing or malformed. The three
        optional distributions are skipped with a printed notice instead.
    """
    # Baselines stored next to the real dataset under a fixed filename suffix.
    suffix_by_algo = {"answer": "", "hypercl": "_cl", "hyperlap": "_lap", "hyperpa": "_pa"}
    if algoname in suffix_by_algo:
        graphpath = "../dataset/" + dataname + suffix_by_algo[algoname] + ".txt"
        outputdir = "../results/{}/{}/".format(algoname, dataname)
    elif algoname in ("thera", "hyperff"):
        graphpath = "../dataset/{}_{}_{}.txt".format(dataname, algoname, modelindex)
        outputdir = "../results/{}/{}/{}/".format(algoname, dataname, modelindex)
    else:
        graphpath = "../results/{}/{}/{}/hypergraph.txt".format(algoname, dataname, modelindex)
        outputdir = "../results/{}/{}/{}/".format(algoname, dataname, modelindex)

    return_dict = {}
    dist = {}
    print(graphpath)

    # Number of hyperedges (one per line) and of distinct node ids.
    numhedge = 0
    nodeset = set()
    with open(graphpath, "r") as f:
        for line in f:
            nodeset.update(int(v) for v in line.rstrip().split(","))
            numhedge += 1
    numnode = len(nodeset)
    return_dict["NumHedge"] = numhedge
    return_dict["NumNode"] = numnode
    dist["NumHedge"] = numhedge
    dist["NumNode"] = numnode

    # Mandatory distributions: one (value, pdf) row per value.
    for distname in ["degree", "pairdeg", "intersection", "size"]:
        dist[distname] = {}
        X = []
        for val, pdf in _iter_nonzero_pairs(outputdir + distname + ".txt"):
            dist[distname][val] = pdf
            X.append(val)
        X = sorted(X)
        Y = [dist[distname][x] for x in X]
        score, coef, _, _ = linearregression(X, Y)
        return_dict[distname] = (score, coef)

    # Optional distributions: several rows may share a value; they are averaged.
    for distname in ["clusteringcoef_hedge", "density_dist", "overlapness_dist"]:
        dist[distname] = defaultdict(list)
        X = []
        try:
            for val, pdf in _iter_nonzero_pairs(outputdir + distname + ".txt"):
                dist[distname][val].append(pdf)
                X.append(val)
            X = sorted(X)
            Y = []
            for x in X:
                # After the first visit the entry is already a scalar; taking
                # the mean of a scalar leaves it unchanged.
                y = np.mean(dist[distname][x])
                dist[distname][x] = y
                Y.append(y if y > 0 else 1)
            score, coef, _, _ = linearregression(X, Y)
            return_dict[distname] = (score, coef)
        except (OSError, ValueError) as err:
            # Best effort: a missing or malformed optional file only omits this
            # property. Any other exception is unexpected and propagates.
            print("skipping {}: {}".format(distname, err))

    # Singular values, one per line, indexed by 1-based rank.
    with open(outputdir + "sv.txt", "r") as f:
        singular_values = [float(line.rstrip()) for line in f]
    lsv = singular_values[0] if singular_values else 0
    X = list(range(1, len(singular_values) + 1))
    if dataname in ["tags-ask-ubuntu", "tags-math-sx", "threads-ask-ubuntu", "threads-math-sx"]:
        X = X[:1000]
    else:
        # NOTE(review): the original if/elif chain had a third branch capping
        # "coauth-MAG-Geology-full" / "coauth-MAG-History-full" at 500 ranks,
        # but it was unreachable because the first two conditions were
        # exhaustive. Those datasets use this default rule, exactly as before;
        # confirm whether the 500 cap was intended.
        X = X[:min(1000, int(len(X) * 0.5))]
    Y = [singular_values[x - 1] for x in X]

    dist["sv"] = dict(zip(X, Y))
    dist["LargestSV"] = lsv
    score, coef, _, _ = linearregression(X, Y)
    return_dict["sv"] = (score, coef)

    # Effective diameter: the value on the last line of the file (0 if empty).
    effdiam = 0
    with open(outputdir + "effdiameter.txt", "r") as f:
        for line in f:
            effdiam = float(line.rstrip())
    return_dict["effdiam"] = effdiam
    dist["effdiam"] = effdiam

    return return_dict, dist

In [None]:
def get_cdf(_dict):
    """Return the running sum of `_dict`'s values, visiting keys in ascending order.

    The result maps each key to the sum of the values of all keys up to and
    including it. An AssertionError is raised as soon as the running sum
    reaches 1.1 or more.
    """
    running = 0
    cumulative = {}
    for key in sorted(_dict):
        running = running + _dict[key]
        cumulative[key] = running
        # Sanity check: the running total must stay below 1.1.
        assert running < 1.1
    return cumulative

def get_cumul_dist(dict_x1, dict_x2):
    """Return the largest absolute gap between the cumulative sums of two distributions.

    Parameters
    ----------
    dict_x1, dict_x2 : dict
        Map each x to its probability mass; both are accumulated with
        ``get_cdf``.

    Returns
    -------
    The maximum of ``|cdf1(x) - cdf2(x)|`` over every x present in either
    input. Where one distribution has no entry at x, its cumulative value from
    the previous x is carried forward (0 before its first entry). Returns 0
    when both inputs are empty.
    """
    cdf1 = get_cdf(dict_x1)
    cdf2 = get_cdf(dict_x2)

    cum1, cum2 = 0, 0
    maxdiff = 0
    for x in sorted(set(cdf1) | set(cdf2)):
        # Constant-time dict lookups instead of scanning key lists; a missing
        # key keeps the previously reached cumulative value.
        cum1 = cdf1.get(x, cum1)
        cum2 = cdf2.get(x, cum2)
        maxdiff = max(maxdiff, abs(cum1 - cum2))

    return maxdiff

    
def get_rmse_dist(dict_x1, dict_x2, normalize=False):
    """Root-mean-square difference between two ``{x: y}`` mappings.

    Parameters
    ----------
    dict_x1, dict_x2 : dict
        Values to compare; an x missing from one mapping counts as y = 0 there.
    normalize : bool, default False
        When True, divide the result by the largest y in ``dict_x1`` (floored
        at 0).

    Returns
    -------
    The square root of the mean squared difference over the union of keys, or
    0.0 when both mappings are empty.

    Raises
    ------
    ZeroDivisionError
        With ``normalize=True`` when ``dict_x1`` has no positive value (and the
        union of keys is non-empty), because the divisor is then zero.
    """
    x1s = list(dict_x1.keys())
    x2s = list(dict_x2.keys())
    support = set(x1s + x2s)
    if not support:
        # Nothing to compare; avoid dividing by an empty support.
        return 0.0

    total = 0
    for x in support:
        # dict.get is a constant-time lookup; missing keys contribute y = 0.
        total += (dict_x1.get(x, 0) - dict_x2.get(x, 0)) ** 2

    total /= len(support)
    total = total ** 0.5

    if normalize:
        maxy1 = max(0, max(dict_x1.values(), default=0))
        total /= maxy1
    return total

In [None]:
# Name of the generator whose results are the focus of this notebook.
# NOTE(review): no other cell shown here reads this variable -- confirm it is still needed.
target_result_dir = "hyperk"

In [None]:
# Axis captions for the per-distribution figures, keyed by distribution name.
xlabeldict = dict(
    degree="Node degree",
    size="Hyperedge size",
    pairdeg="Degree of node pairs",
    intersection="Intersection size",
    sv="Rank",
    clusteringcoef_hedge="Node degree",
    density_dist="# of nodes",
    overlapness_dist="# of nodes",
)

ylabeldict = dict(
    degree="OddsRatio",
    size="OddsRatio",
    pairdeg="PDF",
    intersection="PDF",
    sv="Singular value",
    clusteringcoef_hedge="# of inter- \n secting pairs",
    density_dist="# of hyperedges",
    overlapness_dist=r"$\sum$ hyperedge sizes",
)

In [None]:
# Marker colour per hypergraph source: the real data in black, one hex colour per generator.
color = dict(
    answer="black",
    hyperk="#4daf4a",
    hypercl="#e6ab02",
    hyperlap="#377eb8",
    hyperpa="#984ea3",
    thera="#ff7f00",
    hyperff="#e41a1c",
)

In [None]:
# Dataset analysed by the cells below. Keep exactly one assignment active;
# the commented lines are the other available datasets.
# dataname = "email-Enron-full"
# dataname = "email-Eu-full"
# dataname = "NDC-classes-full"
# dataname = "NDC-substances-full"
# dataname = "contact-primary-school"
# dataname = "contact-high-school"
# dataname = "tags-ask-ubuntu"
# dataname = "tags-math-sx"
# dataname = "threads-ask-ubuntu"
dataname = "threads-math-sx"

In [None]:
# Distributions to draw, one figure each, in plotting order.
distset = [
    "degree",
    "size",
    "pairdeg",
    "intersection",
    "sv",
    "clusteringcoef_hedge",
    "density_dist",
    "overlapness_dist",
]

In [None]:
# Model index of the HyperFF run to load for each dataset (only the first entry is used).
# The string is formatted into the file paths built by read_properties.
fflist = {
    "email-Enron-full": ["0.51_0.2"],
    "email-Eu-full": ["0.51_0.3"],
    "contact-high-school": ["0.51_0.3"],
    "contact-primary-school": ["0.51_0.3"],
    "NDC-classes-full": ["0.45_0.3"],
    "NDC-substances-full": ["0.45_0.3"],
    "tags-ask-ubuntu": ["0.51_0.3"],
    "tags-math-sx": ["0.51_0.3"],
    "threads-ask-ubuntu": ["0.45_0.2"],
    "threads-math-sx": ["0.45_0.2"]
}

# Model index of the THera run to load for each dataset (only the first entry is used).
trlist = {
    "email-Enron-full": ["12_0.7_6.0"],
    "email-Eu-full": ["15_0.5_6.0"],
    "contact-high-school": ["15_0.7_2.0"],
    "contact-primary-school": ["15_0.5_2.0"],
    "NDC-classes-full": ["15_0.9_6.0"],
    "NDC-substances-full": ["15_0.5_6.0"],
    "tags-ask-ubuntu": ["8_0.5_2.0"],
    "tags-math-sx": ["8_0.9_2.0"],
    "threads-ask-ubuntu": ["8_0.5_6.0"],
    "threads-math-sx": ["8_0.5_6.0"],
}

# `kronlist` is not defined in any cell of this notebook, so on a fresh kernel
# the original `kronlist[dataname]` lookup raised NameError. Use it when an
# earlier session provides it; otherwise include HyperK only when its generated
# hypergraph exists at the path read_properties would open for ("hyperk", 0).
try:
    has_hyperk = len(kronlist.get(dataname, [])) > 0
except NameError:
    has_hyperk = os.path.isfile(
        "../results/{}/{}/{}/hypergraph.txt".format("hyperk", dataname, 0))

# (generator name, model index) pairs in plotting order; -1 means the
# generator has no model index in its paths.
namelist = [("answer", -1)]
if len(trlist.get(dataname, [])) > 0:
    namelist.append(("thera", trlist[dataname][0]))
namelist.append(("hyperlap", -1))
namelist.append(("hypercl", -1))
if dataname != "email-Enron-full":
    namelist.append(("hyperpa", -1))
if len(fflist.get(dataname, [])) > 0:
    namelist.append(("hyperff", fflist[dataname][0]))
if has_hyperk:
    namelist.append(("hyperk", 0)) # assume there exist just one generated hypergraphs

In [None]:
# Display name used in figure legends for each generator key.
rename = dict(
    answer="ANSWER",
    hypercl="HyperCL",
    hyperlap="HyperLAP",
    hyperpa="HyperPA",
    hyperff="HyperFF",
    thera="THera",
    hyperk="HyperK",
)

In [None]:
# Load every hypergraph's statistics once. read_properties does not depend on
# the distribution being plotted, so calling it inside the per-distribution
# loop re-read the same files for every figure.
loaded = []
for (name, idx) in namelist:
    print(dataname, name, idx)
    ret, dist = read_properties(dataname, name, idx)
    if name == "answer":
        ret_answer = ret
        dist_answer = dist
    loaded.append((name, dist))

outputdir = "figure/fit/" + dataname + "/"
os.makedirs(outputdir, exist_ok=True)

# One log2-log2 scatter figure per distribution, overlaying all sources.
for distname in distset:
    outputpath = outputdir + distname + ".jpg"

    fig, ax = plt.subplots(figsize=(5, 4), dpi=100)
    for name, dist in loaded:
        algoname = rename[name]

        target_dist = dist[distname]
        x = list(target_dist.keys())
        y = [target_dist[_x] for _x in x]

        # The real data gets large, more transparent markers so the generated
        # hypergraphs drawn on top of it stay visible.
        if name == "answer":
            marker_kwargs = dict(alpha=0.6, s=130)
        elif name in ("hypercl", "hyperlap", "hyperff", "thera"):
            marker_kwargs = dict(alpha=0.7, s=40)
        else:
            marker_kwargs = dict(alpha=0.9, s=40)
        ax.scatter(x, y, label=algoname, c=color[name], **marker_kwargs)

    ax.set_xscale("log", base=2)
    ax.set_yscale("log", base=2)

    ax.tick_params(labelcolor='#4B4B4B', labelsize=22)
    ax.set_xlabel(xlabeldict[distname], fontsize=24)
    ax.set_ylabel(ylabeldict[distname], fontsize=24)

    # Optional extras, disabled by default:
#     plt.legend(bbox_to_anchor=(1,1.05))
#     plt.title("%s" % (dataname))
#     plt.savefig(outputpath, bbox_inches='tight')
    plt.show()
    plt.close(fig)

print()
print()