In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from IPython.display import display
from collections import defaultdict
import matplotlib
import matplotlib.pyplot as plt
import os
plt.rcParams.update({'font.size': 13})

### Data Analysis

In [None]:
# Property names handled by this notebook (re-assigned with a shorter list in a later cell).
property_list = [
    "NumHedge",
    "NumNode",
    "degree",
    "size",
    "pairdeg",
    "intersection",
    "clusteringcoef_hedge",
    "density_dist",
    "overlapness_dist",
    "sv",
    "effdiam",
]

# Short column label for each property name.
column_mapping = dict(
    NumHedge="E",
    NumNode="V",
    degree="deg",
    pairdeg="pd",
    intersection="its",
    size="sz",
    clusteringcoef_hedge="cch",
    density_dist="dst",
    overlapness_dist="ov",
    sv="sv",
    effdiam="eff",
)

In [None]:
from collections import defaultdict
from sklearn.linear_model import LinearRegression
import numpy as np
import os

def linearregression(X, Y, nolog=False):
    """Fit a least-squares line to (X, Y), by default in log2-log2 space.

    Args:
        X: sequence of x values.
        Y: sequence of y values, one per element of X.
        nolog: when exactly ``False`` (the default) both X and Y are
            transformed with ``np.log2`` before fitting; otherwise the raw
            values are fitted.

    Returns:
        Tuple ``(score, coef, pred, intercept)``:
        ``score`` is ``LinearRegression.score`` on the fitted data, ``coef``
        the slope, ``intercept`` the intercept, and ``pred`` the fitted values
        expressed in the same scale as the input ``Y``.
        For empty ``X`` the placeholder ``(0, 0, [0], 0)`` is returned.
    """
    if len(X) == 0:
        return 0, 0, [0], 0
    X = np.array(X).reshape(-1, 1)
    Y = np.array(Y).reshape(-1, 1)
    # Keep the original identity test so that only a literal False enables the log transform.
    use_log = nolog is False
    if use_log:
        X = np.log2(X)
        Y = np.log2(Y)
    reg = LinearRegression().fit(X, Y)
    score = reg.score(X, Y)
    coef = reg.coef_
    assert len(coef) == 1
    coef = coef[0][0]
    intercept = reg.intercept_[0]
    pred = reg.predict(X).flatten()
    if use_log:
        # Undo the log2 transform; without it the predictions are already in
        # the original scale and must not be exponentiated.
        pred = np.exp2(pred)

    return score, coef, pred, intercept

def read_properties(dataname, algoname, modelindex=-1):
    """Load the precomputed statistics of one hypergraph and summarise them.

    The hypergraph file is read to count hyperedges and distinct nodes. The
    per-property text files in the results directory are then read and, for
    every distribution, a log2-log2 line is fitted with ``linearregression``.
    The summary is also written to ``property.txt`` in the results directory.

    Relies on the notebook-level names ``linearregression``, ``property_list``
    and ``columns`` being defined before the call.

    Args:
        dataname: dataset name used to build the file paths.
        algoname: name selecting the path layout. "answer", "hypercl",
            "hyperlap" and "hyperpa" read ``../dataset/<dataname><suffix>.txt``;
            "thera" and "hyperff" read
            ``../dataset/<dataname>_<algoname>_<modelindex>.txt``; any other
            name reads ``hypergraph.txt`` inside the results directory.
        modelindex: model identifier that is part of the path for every
            layout except the first four.

    Returns:
        Tuple ``(return_dict, dist)``. ``return_dict`` maps a property name to
        a scalar ("NumHedge", "NumNode", "effdiam") or to the ``(score, coef)``
        pair of the fitted line. ``dist`` maps a property name to the scalar
        or to a ``{value: pdf}`` dict, and additionally holds "LargestSV".
    """
    # Algorithms whose hypergraph lives directly under ../dataset with a fixed suffix.
    dataset_suffix = {"answer": "", "hypercl": "_cl", "hyperlap": "_lap", "hyperpa": "_pa"}
    if algoname in dataset_suffix:
        graphpath = "../dataset/" + dataname + dataset_suffix[algoname] + ".txt"
        outputdir = "../results/{}/{}/".format(algoname, dataname)
    else:
        outputdir = "../results/{}/{}/{}/".format(algoname, dataname, modelindex)
        if algoname in ("thera", "hyperff"):
            graphpath = "../dataset/{}_{}_{}.txt".format(dataname, algoname, modelindex)
        else:
            graphpath = outputdir + "hypergraph.txt"

    return_dict = {}
    dist = {}
    print(graphpath)

    # Num Nodes & Num Edges: one hyperedge per line, node ids separated by commas.
    numhedge = 0
    nodeset = set()
    with open(graphpath, "r") as f:
        for line in f:
            nodeset.update(int(v) for v in line.rstrip().split(","))
            numhedge += 1
    numnode = len(nodeset)
    return_dict["NumHedge"] = numhedge
    return_dict["NumNode"] = numnode
    dist["NumHedge"] = numhedge
    dist["NumNode"] = numnode

    # Distributions stored as one "value,pdf" pair per line.
    for distname in ["degree", "pairdeg", "intersection", "size"]:
        dist[distname] = {}
        X = []
        with open(outputdir + distname + ".txt", "r") as f:
            for line in f:
                val, pdf = line.rstrip().split(",")
                val, pdf = float(val), float(pdf)
                # Zeros cannot be log-transformed for the fit.
                if pdf == 0.0 or val == 0.0:
                    continue
                dist[distname][val] = pdf
                X.append(val)
        X = sorted(X)
        Y = [dist[distname][x] for x in X]
        score, coef, _, _ = linearregression(X, Y)
        return_dict[distname] = (score, coef)

    # Distributions where a value may appear on several lines; the pdfs are averaged.
    for distname in ["clusteringcoef_hedge", "density_dist", "overlapness_dist"]:
        dist[distname] = defaultdict(list)
        X = []
        try:
            with open(outputdir + distname + ".txt", "r") as f:
                for line in f:
                    val, pdf = line.rstrip().split(",")
                    val, pdf = float(val), float(pdf)
                    if val == 0.0 or pdf == 0.0:
                        continue
                    dist[distname][val].append(pdf)
                    X.append(val)
            X = sorted(X)
            Y = []
            for x in X:
                # A repeated x hits an already-averaged scalar; np.mean leaves it unchanged.
                y = np.mean(dist[distname][x])
                dist[distname][x] = y
                Y.append(y if y > 0 else 1)
            score, coef, _, _ = linearregression(X, Y)
            return_dict[distname] = (score, coef)
        except (OSError, ValueError) as err:
            # Best effort: these files are optional, but report what was skipped
            # instead of silently swallowing every exception.
            print("Skipping {} for {}: {}".format(distname, outputdir, err))

    # SV: one singular value per line; the x coordinate is the 1-based line number.
    with open(outputdir + "sv.txt", "r") as f:
        svs = [float(line.rstrip()) for line in f]
    lsv = svs[0] if svs else 0
    X = list(range(1, len(svs) + 1))
    # The coauth check must come first: in the original not-in / in chain its
    # branch could never be reached.
    if dataname in ["coauth-MAG-Geology-full", "coauth-MAG-History-full"]:
        X = X[:500]
    elif dataname in ["tags-ask-ubuntu", "tags-math-sx", "threads-ask-ubuntu", "threads-math-sx"]:
        X = X[:1000]
    else:
        X = X[:min(1000, int(len(X) * 0.5))]
    Y = [svs[x - 1] for x in X]

    dist["sv"] = dict(zip(X, Y))
    dist["LargestSV"] = lsv
    score, coef, _, _ = linearregression(X, Y)
    return_dict["sv"] = (score, coef)

    # EffDiam: the value on the last line of the file is kept.
    effdiam = 0
    with open(outputdir + "effdiameter.txt", "r") as f:
        for line in f:
            effdiam = float(line.rstrip())
    return_dict["effdiam"] = effdiam
    dist["effdiam"] = effdiam

    # SAVE
    # NOTE(review): the header has one label per property while fitted
    # properties contribute two values (score, coef) to the row; kept as-is
    # because readers of property.txt are not visible here.
    with open(outputdir + "property.txt", "w") as f:
        f.write(",".join(columns) + "\n")
        row = []
        for name in property_list:
            if name in ["LargestSV", "effdiam", "NumHedge", "NumNode"]:
                row.append(str(return_dict[name]))
            else:
                fit_score, fit_coef = return_dict[name]
                row.append(str(fit_score))
                row.append(str(fit_coef))
        f.write(",".join(row))
        f.write("\n")

    return return_dict, dist

In [None]:
def get_cdf(_dict):
    """Return the running sum of ``_dict``'s values taken in ascending key order.

    Args:
        _dict: mapping from value to probability mass.

    Returns:
        Dict with the same keys, inserted in ascending order, whose values are
        the cumulative sums.

    Raises:
        AssertionError: as soon as the running sum reaches 1.1 or more.
    """
    running_total = 0
    cumulative = {}
    for key in sorted(_dict):
        running_total += _dict[key]
        cumulative[key] = running_total
        assert running_total < 1.1
    return cumulative

def get_cumul_dist(dict_x1, dict_x2):
    """Return the largest absolute gap between the cumulative sums of two distributions.

    Both inputs are accumulated with ``get_cdf``; the two step functions are
    then compared at every key that occurs in either input, carrying the last
    seen cumulative value forward where a key is missing from one side.

    Args:
        dict_x1: mapping from value to probability mass.
        dict_x2: mapping from value to probability mass.

    Returns:
        The maximum of ``abs(cdf1 - cdf2)`` over the union of keys
        (``0`` when both inputs are empty).
    """
    cdf1 = get_cdf(dict_x1)
    cdf2 = get_cdf(dict_x2)

    cum1, cum2 = 0, 0
    maxdiff = 0
    for x in sorted(set(cdf1) | set(cdf2)):
        # Dict membership is O(1); scanning key lists made this loop quadratic.
        if x in cdf1:
            cum1 = cdf1[x]
        if x in cdf2:
            cum2 = cdf2[x]
        maxdiff = max(maxdiff, abs(cum1 - cum2))

    return maxdiff

    
def get_rmse_dist(dict_x1, dict_x2, set_length=False, normalize=False):
    """Root-mean-square difference between two ``{x: y}`` mappings.

    A key missing from one mapping contributes ``0`` for that side.

    Args:
        dict_x1: reference mapping.
        dict_x2: mapping compared against ``dict_x1``.
        set_length: when true only the keys of ``dict_x1`` are compared;
            otherwise the union of both key sets is used.
        normalize: when true the result is divided by the largest
            ``dict_x1`` value seen among the compared keys.

    Returns:
        The (optionally normalised) RMSE.

    Raises:
        ZeroDivisionError: when there are no keys to compare, or when
            ``normalize`` is true and no positive ``dict_x1`` value was seen.
    """
    total = 0
    maxy1 = 0

    if set_length:
        keys = list(dict_x1.keys())
    else:
        keys = set(list(dict_x1.keys()) + list(dict_x2.keys()))

    for x in keys:
        y1, y2 = 0, 0
        # Dict membership is O(1); scanning key lists made this loop quadratic.
        if x in dict_x1:
            y1 = dict_x1[x]
            if maxy1 < y1:
                maxy1 = y1
        if x in dict_x2:
            y2 = dict_x2[x]

        total += (y1 - y2) ** 2

    total /= len(keys)
    total = total ** 0.5

    if normalize:
        total /= maxy1

    return total

In [None]:
# Maps every "-half" dataset name to the name of the corresponding full dataset.
fulldatalist = dict([
    ("email-Enron-half", "email-Enron-full"),
    ("email-Eu-half", "email-Eu-full"),
    ("contact-primary-school-half", "contact-primary-school"),
    ("contact-high-school-half", "contact-high-school"),
    ("NDC-classes-half", "NDC-classes-full"),
    ("NDC-substances-half", "NDC-substances-full"),
    ("tags-ask-ubuntu-half", "tags-ask-ubuntu"),
    ("tags-math-sx-half", "tags-math-sx"),
    ("threads-ask-ubuntu-half", "threads-ask-ubuntu"),
    ("threads-math-sx-half", "threads-math-sx"),
])

# Inverse lookup (full name -> "-half" name), in the same order as above.
halfdatalist = {full_name: half_name for half_name, full_name in fulldatalist.items()}

# Full dataset names, addressed by position when a dataset is selected.
datalist = [
    "email-Enron-full",
    "email-Eu-full",
    "contact-primary-school",
    "contact-high-school",
    "NDC-classes-full",
    "NDC-substances-full",
    "tags-ask-ubuntu",
    "tags-math-sx",
    "threads-ask-ubuntu",
    "threads-math-sx",
]

# Select Dataset

In [None]:
# Dataset evaluated by the following cells; index 9 is the last entry of `datalist`.
# Change the index to evaluate a different dataset.
dataname = datalist[9]
print(dataname)

In [None]:
# HyperFF option string evaluated for each "-half" dataset.
fflist = {
    # When hyperparameter fitting half
    "email-Enron-half": ["0.45_0.3"],
    "email-Eu-half": ["0.51_0.3"],
    "contact-high-school-half": ["0.51_0.3"],
    "contact-primary-school-half": ["0.51_0.2"],
    "NDC-classes-half": ["0.45_0.3"],
    "NDC-substances-half": ["0.45_0.3"],
    "tags-ask-ubuntu-half": ["0.51_0.3"],
    "tags-math-sx-half": ["0.51_0.3"],
    "threads-ask-ubuntu-half": ["0.45_0.2"],
    "threads-math-sx-half": ["0.45_0.3"]
}

# (algorithm name, model index / option) pairs to evaluate; "answer" must come
# first because the evaluation loop compares every other entry against it.
namelist = [("answer", -1)]
if len(fflist[halfdatalist[dataname]]) > 0:
    namelist.append(("hyperff", fflist[halfdatalist[dataname]][0]))

# `kronlist` is not defined anywhere in this notebook, so referencing it
# unconditionally raised NameError on a fresh kernel.  Use it when an earlier
# session provides it; otherwise include hyperk only when its half-data output
# list (the file read by halfindex_2_fullindex) exists.
if "kronlist" in globals():
    has_kron_model = len(globals()["kronlist"][halfdatalist[dataname]]) > 0
else:
    has_kron_model = os.path.isfile(
        "../results/hyperk_half/{}/output_list.txt".format(halfdatalist[dataname]))
if has_kron_model:
    namelist.append(("hyperk", 0)) # assume there exist just one generated hypergraphs

In [None]:
def halfindex_2_fullindex(dataname, index):
    """Translate a model index of the hyperk_half run into the hyperk run's index.

    Both ``output_list.txt`` files are read with pandas and matched on their
    ``modelpath`` column.

    Args:
        dataname: dataset directory name under both results folders.
        index: ``modelIndex`` value in the hyperk_half output list.

    Returns:
        The ``modelIndex`` of the first hyperk row whose ``modelpath`` is
        registered under ``index`` in the hyperk_half list, or ``-1`` when no
        row matches.
    """
    half_runs = pd.read_csv("../results/hyperk_half/{}/output_list.txt".format(dataname))
    # For a repeated path the last row wins.
    half_index_by_path = {
        half_row["modelpath"]: half_row["modelIndex"]
        for _, half_row in half_runs.iterrows()
    }

    full_runs = pd.read_csv("../results/hyperk/{}/output_list.txt".format(dataname))
    for _, full_row in full_runs.iterrows():
        model_path = full_row["modelpath"]
        if model_path not in half_index_by_path:
            continue
        if half_index_by_path[model_path] == index:
            return full_row["modelIndex"]

    return -1

In [None]:
# Properties compared in the evaluation below (replaces the earlier, longer list).
property_list = [
    "degree",
    "size",
    "pairdeg",
    "intersection",
    "sv",
    "clusteringcoef_hedge",
    "density_dist",
    "overlapness_dist",
    "effdiam",
]

# Column labels of the metrics that are averaged, normalised and ranked.
evallist = [
    'deg',
    'sz',
    'pd',
    'its',
    'cch',
    'dst',
    'ov',
    'sv',
    'eff',
]

In [None]:
# Directory receiving one distance table per dataset.
outputdir = "csv/sv_forecast/"
os.makedirs(outputdir, exist_ok=True)

outputpath = outputdir + dataname + ".txt"
# `columns` is also read by read_properties when it writes property.txt.
columns = [column_mapping[prop] for prop in property_list]
with open(outputpath, "w") as f:
    f.write(",".join(["AlgoName", "AlgoOpt"] + columns) + "\n")

# Reference statistics; filled by the "answer" entry of namelist.
ret_answer = None
dist_answer = None

for name, modelindex in namelist:
    if name == "answer":
        ret, dist = read_properties(dataname, name, modelindex)
        ret_answer = ret
        dist_answer = dist
        continue
    elif name == "hyperk":
        # hyperk results are stored under the "-half" dataset name and a translated index.
        full_index = halfindex_2_fullindex(halfdatalist[dataname], modelindex)
        ret, dist = read_properties(halfdatalist[dataname], name, full_index)
    else:
        ret, dist = read_properties(dataname, name, modelindex)

    if dist_answer is None:
        # Previously this surfaced as a NameError or silently used stale state.
        raise RuntimeError('namelist must list ("answer", ...) before "{}"'.format(name))

    difflist = []
    for prop in property_list:
        if prop in ["degree", "size", "pairdeg", "intersection"]:
            diff = get_cumul_dist(dist_answer[prop], dist[prop])
        elif prop in ["sv"]:
            diff = get_rmse_dist(dist_answer[prop], dist[prop], set_length=True, normalize=False)
        elif prop in ["clusteringcoef_hedge", "density_dist", "overlapness_dist"]:
            diff = get_rmse_dist(dist_answer[prop], dist[prop], normalize=False)
        elif prop in ["effdiam"]:
            # Relative error with respect to the reference value.
            diff = abs(dist[prop] - dist_answer[prop]) / dist_answer[prop]
        else:
            # Without this branch the previous property's diff would be written again.
            raise ValueError("no distance defined for property '{}'".format(prop))
        difflist.append(str(diff))

    # Append one row per algorithm so finished rows survive a later failure.
    with open(outputpath, "a") as f:
        algoopt = str(modelindex)
        algoname = name
        f.write(",".join([algoname, algoopt] + difflist) + "\n")

In [None]:
# Metric columns used by the tables below (same labels as assigned in the earlier cell).
evallist = ['deg', 'sz', 'pd', 'its', 'cch', 'dst', 'ov', 'sv', 'eff'] 

In [None]:
# Load this dataset's distance table, add the row-wise mean over the metric
# columns and show the rows with the smallest mean first.
d = pd.read_csv(outputdir + dataname + ".txt")
target = d.loc[:, evallist]
d = d.assign(avg=target.mean(axis=1)).sort_values(by="avg")
d.head(20)

In [None]:
# Show the column labels of the table `d`.
d.columns

# Normalize & Rank

In [None]:
# Normalised table: z-score every metric column whose standard deviation is
# not zero, then add the row-wise mean and save it as "<dataname>_norm.txt".
d = pd.read_csv(outputdir + dataname + ".txt")
for col in evallist:
    if d[col].std() == 0:
        continue
    d[col] = d[col].sub(d[col].mean()).div(d[col].std())
norms = d.loc[:, evallist]
d = d.assign(avg=norms.mean(axis=1))
d.to_csv(outputdir + dataname + "_norm.txt", index=False)

# Rank table: rank the absolute value of every metric column (ties share the
# lowest rank), then add the row-wise mean and save it as "<dataname>_rank.txt".
d = pd.read_csv(outputdir + dataname + ".txt")
d[evallist] = d[evallist].abs().rank(method='min')
ranks = d.loc[:, evallist]
d = d.assign(avg=ranks.mean(axis=1))
d.to_csv(outputdir + dataname + "_rank.txt", index=False)

In [None]:
# Normalised table of the current dataset, smallest average first.
nd = pd.read_csv(outputdir + dataname + "_norm.txt").sort_values(by=["avg"], ascending=True)
nd.head(10)

In [None]:
# Rank table of the current dataset, smallest average first.
rd = pd.read_csv(outputdir + dataname + "_rank.txt").sort_values(by=["avg"], ascending=True)
rd.head(10)

# Aggregate

In [None]:
# Datasets whose per-dataset tables are aggregated below.
dataset = [
    "email-Enron-full",
    "email-Eu-full",
    "contact-high-school",
    "contact-primary-school",
    "NDC-classes-full",
    "NDC-substances-full",
    "tags-ask-ubuntu",
    "tags-math-sx",
    "threads-ask-ubuntu",
    "threads-math-sx",
]

# Algorithms reported in the aggregate tables.
algolist = [
    "hyperff",
    "hyperk",
]

In [None]:
# algorithm name -> metric label -> list with one value per dataset.
algo2result = defaultdict(dict)
algo2norm = defaultdict(dict)
algo2rank = defaultdict(dict)

# NOTE: the loop variable re-binds the notebook-level `dataname`.
for dataname in dataset:
    # Raw, rank and norm tables are read in this order for every dataset.
    for suffix, store in (("", algo2result), ("_rank", algo2rank), ("_norm", algo2norm)):
        d = pd.read_csv(outputdir + dataname + suffix + ".txt")
        if suffix == "":
            # Only the raw table lacks the 'avg' column; the other two were saved with it.
            target = d[evallist]
            d['avg'] = target.mean(axis=1)
        for irow, row in d.iterrows():
            algoname = row["AlgoName"]
            per_metric = store.setdefault(algoname, defaultdict(list))
            for evalname in evallist + ['avg']:
                per_metric[evalname].append(row[evalname])

In [None]:
# Collapse the per-dataset lists into their mean, in place.
for algoname in algolist:
    for evalname in evallist + ['avg']:
        for store in (algo2result, algo2rank, algo2norm):
            store[algoname][evalname] = np.mean(store[algoname][evalname])

# Write one aggregate table per kind: a header row, then one row per algorithm.
for filename, store in (("agg.txt", algo2result),
                        ("agg_rank.txt", algo2rank),
                        ("agg_norm.txt", algo2norm)):
    with open(outputdir + filename, "w") as f:
        f.write(",".join(["AlgoName"] + evallist + ["avg"]) + "\n")
        for algoname in algolist:
            res = [str(store[algoname][evalname]) for evalname in evallist + ['avg']]
            f.write(",".join([algoname] + res) + "\n")

In [None]:
# Load and display the aggregate table stored in agg.txt.
d = pd.read_csv(outputdir + "agg.txt")
d

In [None]:
# Load and display the aggregate table stored in agg_rank.txt.
rd = pd.read_csv(outputdir + "agg_rank.txt")
rd

In [None]:
# Load and display the aggregate table stored in agg_norm.txt.
nd = pd.read_csv(outputdir + "agg_norm.txt")
nd

In [None]:
# Rebuild the algorithm -> metric -> value lookups from the saved aggregate files.
rd = pd.read_csv(outputdir + "agg_rank.txt")
nd = pd.read_csv(outputdir + "agg_norm.txt")

algo2rank = defaultdict(dict)
for irow, row in rd.iterrows():
    algo2rank[row["AlgoName"]].update(
        {evalname: row[evalname] for evalname in evallist + ["avg"]})

algo2norm = defaultdict(dict)
for irow, row in nd.iterrows():
    algo2norm[row["AlgoName"]].update(
        {evalname: row[evalname] for evalname in evallist + ["avg"]})

In [None]:
# Smallest aggregated rank / normalised value per metric across the listed algorithms.
eval2rankbest = defaultdict(float)
eval2normbest = defaultdict(float)
for best, table in ((eval2rankbest, algo2rank), (eval2normbest, algo2norm)):
    for evalname in evallist + ["avg"]:
        candidates = [table[algoname][evalname] for algoname in algolist]
        if candidates:
            best[evalname] = min(candidates)

# One LaTeX table row per algorithm: "rank (norm)" per metric, with the best
# value of each metric set in bold.
for algoname in algolist:
    cells = []
    for evalname in evallist + ["avg"]:
        currank = algo2rank[algoname][evalname]
        curnorm = algo2norm[algoname][evalname]
        rank_fmt = "& \\textbf{%.2f} " if currank == eval2rankbest[evalname] else "& %.2f "
        norm_fmt = "(\\textbf{%.2f}) " if curnorm == eval2normbest[evalname] else "(%.2f) "
        cells.append(rank_fmt % (currank) + norm_fmt % (curnorm))
    print("".join(cells) + "\\\\")