In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import powerlaw
from IPython.display import display
from collections import defaultdict
import matplotlib
import matplotlib.pyplot as plt
import os
import math
from matplotlib.patches import Arc

# Default font size for every figure drawn below; individual plots override
# label and tick sizes explicitly where needed.
plt.rcParams['font.size'] = 15

### Data Analysis

In [None]:
from collections import defaultdict
from sklearn.linear_model import LinearRegression
import numpy as np
import os

# Benchmark hypergraphs, two per domain.
# NOTE(review): `dataset` is rebound by a later cell before it is first
# iterated, so this initial list appears to be superseded - confirm.
dataset = [
    # e-mail
    "email-Enron-full",
    "email-Eu-full",
    # drugs (NDC)
    "NDC-classes-full",
    "NDC-substances-full",
    # contact
    "contact-high-school",
    "contact-primary-school",
    # tags
    "tags-ask-ubuntu",
    "tags-math-sx",
    # threads
    "threads-ask-ubuntu",
    "threads-math-sx",
]

def linearregression(X, Y, nolog=False):
    """Fit a least-squares line to (X, Y), by default in log2-log2 space.

    Parameters
    ----------
    X, Y : sequence of numbers
        Paired observations of equal length.
    nolog : bool, default False
        When exactly ``False`` both X and Y are log2-transformed before the
        fit and the predictions are mapped back with ``exp2``. Any other
        value fits the raw data.

    Returns
    -------
    tuple
        ``(score, coef, pred, intercept, reg)`` where ``score`` is
        ``reg.score(X, Y)``, ``coef`` / ``intercept`` are the fitted slope
        and intercept (in the transformed space), ``pred`` is the flat
        array of fitted values in the original space, and ``reg`` is the
        fitted ``LinearRegression`` object.

        For empty input the placeholder ``(0, 0, [0], 0, None)`` is
        returned. It has the same arity as the normal result so callers
        that unpack five values do not fail on empty data.
    """
    if len(X) == 0:
        # Same arity as the regular return; there is no model to hand back.
        return 0, 0, [0], 0, None
    X = np.array(X).reshape(-1, 1)
    Y = np.array(Y).reshape(-1, 1)
    if nolog is False:
        X = np.log2(X)
        Y = np.log2(Y)
    reg = LinearRegression().fit(X, Y)
    score = reg.score(X, Y)
    coef = reg.coef_
    # Y is a single column, so exactly one row of coefficients is expected.
    assert len(coef) == 1
    coef = coef[0][0]
    intercept = reg.intercept_[0]
    pred = reg.predict(X).flatten()
    if nolog is False:
        # Undo the log2 transform so predictions live on the data's scale.
        pred = np.exp2(pred)

    return score, coef, pred, intercept, reg

def read_properties(dataname, inputpath):
    """Load the pre-computed statistics of one hypergraph and fit each one.

    Every distribution file is parsed as ``value,weight`` lines and fitted
    with ``linearregression`` (log2-log2 by default).

    Parameters
    ----------
    dataname : str
        Dataset name. Selects the result directory in "answer" mode and
        the truncation rule applied to the singular values.
    inputpath : str
        If it contains the substring ``"answer"``, statistics are read from
        ``"../results/answer/<dataname>/"``; otherwise from ``inputpath``
        itself. File names are appended by plain concatenation, so the
        path must end with a separator.

    Returns
    -------
    return_dict : dict
        property name -> ``(score, coef)``; the key ``"effdiam"`` maps to a
        single number instead.
    dist : dict
        property name -> ``{x: y}``; ``"effdiam"`` -> number.

    Notes
    -----
    The hypergraph file itself is no longer scanned: the node and
    hyperedge counts computed from it were never used. Its path is still
    printed so the notebook output is unchanged.
    """
    if "answer" in inputpath:
        graphpath = "../../dataset/" + dataname + ".txt"
        outputdir = "../results/answer/" + dataname + "/"
    else:
        graphpath = inputpath + "hypergraph.txt"
        outputdir = inputpath

    return_dict = {}  # property -> (score, coef)
    dist = {}  # property -> x : y
    print(graphpath)

    # Plain distributions: one weight per value.
    for distname in ["degree", "pairdeg", "intersection", "size"]:
        dist[distname] = {}
        X = []
        with open(outputdir + distname + ".txt", "r") as f:
            for line in f:
                if not line.strip():
                    continue  # tolerate blank / trailing empty lines
                val, pdf = line.rstrip().split(",")
                val, pdf = float(val), float(pdf)
                # Zeros cannot be placed on a log-log scale.
                if pdf == 0.0 or val == 0.0:
                    continue
                dist[distname][val] = pdf
                X.append(val)
        X = sorted(X)
        Y = [dist[distname][x] for x in X]
        # Index instead of unpacking so the result length does not matter.
        fit = linearregression(X, Y)
        return_dict[distname] = (fit[0], fit[1])

    # Distributions that may list several weights per value; the weights
    # are averaged. These files are optional: on any failure the property
    # falls back to (1e-6, 0).
    for distname in ["clusteringcoef_hedge", "density_dist", "overlapness_dist"]:
        dist[distname] = defaultdict(list)
        try:
            X = []
            with open(outputdir + distname + ".txt", "r") as f:
                for line in f:
                    if not line.strip():
                        continue
                    val, pdf = line.rstrip().split(",")
                    val, pdf = float(val), float(pdf)
                    if val == 0.0 or pdf == 0.0:
                        continue
                    dist[distname][val].append(pdf)
                    X.append(val)
            if not X:
                # Nothing to fit: route to the fallback below.
                raise ValueError("no usable rows in " + distname + ".txt")
            X = sorted(X)
            Y = []
            for x in X:
                # A value seen several times appears several times in X;
                # after the first visit the entry is already the mean.
                y = np.mean(dist[distname][x])
                dist[distname][x] = y
                Y.append(y if y > 0 else 1)
            fit = linearregression(X, Y)
            return_dict[distname] = (fit[0], fit[1])
        except Exception:
            # Deliberate best effort, but do not swallow KeyboardInterrupt
            # or SystemExit the way a bare `except:` would.
            return_dict[distname] = (1e-6, 0)

    # SV: one singular value per line, rank = 1-based line number.
    with open(outputdir + "sv.txt", "r") as f:
        singular_values = [float(line) for line in f if line.strip()]
    if dataname in ["tags-ask-ubuntu", "tags-math-sx",
                    "threads-ask-ubuntu", "threads-math-sx"]:
        keep = 1000
    else:
        keep = min(1000, int(len(singular_values) * 0.5))
        # NOTE(review): the original had a further branch limiting the
        # coauth-MAG-* datasets to 500 values, but it was unreachable
        # (those names were already handled by this rule). The effective
        # behaviour is kept; confirm whether 500 was intended.
    Y = singular_values[:keep]
    X = list(range(1, len(Y) + 1))
    dist["sv"] = dict(zip(X, Y))
    fit = linearregression(X, Y)
    return_dict["sv"] = (fit[0], fit[1])

    # EffDiam: the last non-blank line of the file wins.
    effdiam = 0
    with open(outputdir + "effdiameter.txt", "r") as f:
        for line in f:
            if line.strip():
                effdiam = float(line)
    return_dict["effdiam"] = effdiam
    dist["effdiam"] = effdiam

    return return_dict, dist

In [None]:
def get_cdf(x, y):
    """Accumulate the weights ``y`` in order of increasing ``x``.

    Returns ``(xs, running_sums)``: the x values in ascending order and,
    for each, the sum of all weights up to and including it. Every
    running sum is asserted to lie strictly between 0 and 1.1.
    """
    order = np.argsort(x)

    xs_sorted = []
    running_sums = []
    total = 0
    for idx in order:
        total = total + y[idx]
        assert 0 < total < 1.1
        xs_sorted.append(x[idx])
        running_sums.append(total)

    return xs_sorted, running_sums

def get_odds_ratio(x, y):
    """Convert a distribution into odds ratios ``cdf / (1 - cdf)``.

    The cumulative distribution is taken from ``get_cdf``. Points are
    emitted in ascending x order up to, but not including, the first one
    whose cumulative value reaches 1 (where the ratio is undefined).
    """
    xs, cdf = get_cdf(x, y)

    # Position of the first point at which the CDF saturates.
    cutoff = next((i for i, c in enumerate(cdf) if c >= 1), len(cdf))

    odds_x = list(xs[:cutoff])
    odds_y = [c / (1.0 - c) for c in cdf[:cutoff]]
    return odds_x, odds_y

In [None]:
# Datasets to analyse, two per domain.
dataset = [
    "email-Enron-full",
    "email-Eu-full",
    "contact-primary-school",
    "contact-high-school",
    "NDC-classes-full",
    "NDC-substances-full",
    "tags-ask-ubuntu",
    "tags-math-sx",
    "threads-ask-ubuntu",
    "threads-math-sx",
    "coauth-MAG-Geology-full",
    "coauth-MAG-History-full",
]

In [None]:
# Properties to plot, in plotting order.
distset = [
    "degree",
    "size",
    "pairdeg",
    "intersection",
    "sv",
    "clusteringcoef_hedge",
    "density_dist",
    "overlapness_dist",
]

In [None]:
# Axis captions for each property, keyed by property name.
xlabeldict = dict(
    degree="Node degree",
    size="Hyperedge size",
    pairdeg="Degree of node pairs",
    intersection="Intersection size",
    sv="Rank",
    clusteringcoef_hedge="Node degree",
    density_dist="# of nodes",
    overlapness_dist="# of nodes",
)

ylabeldict = dict(
    degree="OddsRatio",
    size="OddsRatio",
    pairdeg="PDF",
    intersection="PDF",
    sv="Singular value",
    clusteringcoef_hedge="# of inter- \n secting pairs",
    density_dist="# of hyperedges",
    overlapness_dist=r"$\sum$ hyperedge sizes",
)

In [None]:
# One log-log figure per (property, dataset): the observed points, the
# fitted line, its slope (rho) and the goodness of fit (R^2).
for distname in distset:
    for dataname in dataset:
        print(dataname)

        # Target file; only needed if the savefig line below is re-enabled.
        outputpath = "figure/" + dataname + "/"
        os.makedirs(outputpath, exist_ok=True)
        outputpath += distname + ".jpg"

        dist = read_properties(dataname, "../results/answer/" + dataname + "/")[1][distname]
        x = sorted(dist.keys())
        y = [dist[_x] for _x in x]

        if distname in ["degree", "size"]:
            # Degree and size are shown as odds ratios; the line is fitted
            # on the first 75% of the original support only.
            select_length = int(0.75 * len(x))
            x, y = get_odds_ratio(x, y)
            _x = x[:select_length]
            _y = y[:select_length]
            score, coef, pred, intercept, reg = linearregression(_x, _y)
        else:
            score, coef, pred, intercept, reg = linearregression(x, y)
        print("coef, score", coef, score)

        plt.figure(figsize=(6,4), dpi=100)
        plt.scatter(x, y, zorder=2)
        # pred may be shorter than x when only a prefix was fitted.
        plt.plot(x[:pred.shape[0]], pred, color="red", zorder=2)

        # Plot Angle & Score
        # Geometric midpoint of the x range and the fitted value there.
        x_mid = 2 ** ((math.log2(min(x)) + math.log2(max(x))) / 2)
        pred_x_mid = 2 ** reg.predict(np.array([[math.log2(x_mid)]]))[0][0]
        len_x_mid = math.log2(x_mid) - math.log2(min(x))
        # Multiplicative offsets, i.e. fixed lengths on the log axis.
        xlength = 2 ** (len_x_mid * 0.2)
        xlength2 = 2 ** (len_x_mid * 0.3)

        ax = plt.gca()
        ax.hlines(y=pred_x_mid, xmin=x_mid, xmax=x_mid * xlength, linewidth=2, color='red')

        plt.text(x_mid * xlength2, pred_x_mid, r"$\bf{\rho}$ = %.2f" % (coef), color="red", weight="bold", fontsize=24)

        # Put the R^2 caption in the corner the fitted line stays away from.
        if pred[0] > pred[-1]: # decreasing
            caption_y, caption_align = min(min(pred), min(y)), "bottom"
        else: # increasing
            caption_y, caption_align = max(max(pred), max(y)), "top"
        plt.text(min(x), caption_y, "Goodness of fit\n" + r"$\bf{R^{2}}$ = %.2f" % (score), color="red", weight="bold", verticalalignment=caption_align, fontsize=24)

        plt.xscale("log", base=2)
        plt.yscale("log", base=2)

        ax.tick_params(labelcolor='#4B4B4B', labelsize=22)
        plt.xlabel(xlabeldict[distname], fontsize=24)
        plt.ylabel(ylabeldict[distname], fontsize=24)

        # plt.savefig(outputpath, bbox_inches='tight')
        plt.show()
        plt.close()
    

In [None]:
# Properties redrawn below with a power-law caption instead of rho / R^2.
distset = [
    "clusteringcoef_hedge",
    "density_dist",
    "overlapness_dist",
]

In [None]:
# One log-log figure per (property, dataset) showing the observed points,
# the fitted line and the fitted exponent as "y ∝ x^coef".
for distname in distset:
    for dataname in dataset:
        print(dataname)

        # Target file; only needed if the savefig line below is re-enabled.
        outputpath = "figure/" + dataname + "/"
        os.makedirs(outputpath, exist_ok=True)
        outputpath += distname + "2.jpg"

        dist = read_properties(dataname, "../results/answer/" + dataname + "/")[1][distname]
        x = sorted(dist.keys())
        y = [dist[_x] for _x in x]

        if distname in ["degree", "size"]:
            # Degree and size are shown as odds ratios; the line is fitted
            # on the first 75% of the original support only.
            select_length = int(0.75 * len(x))
            x, y = get_odds_ratio(x, y)
            _x = x[:select_length]
            _y = y[:select_length]
            score, coef, pred, intercept, reg = linearregression(_x, _y)
        else:
            score, coef, pred, intercept, reg = linearregression(x, y)
        print("coef, score", coef, score)

        plt.figure(figsize=(6,4), dpi=100)
        plt.scatter(x, y, zorder=2)
        # pred may be shorter than x when only a prefix was fitted.
        plt.plot(x[:pred.shape[0]], pred, color="red", zorder=2)

        ax = plt.gca()

        # Put the caption in the corner the fitted line stays away from.
        if pred[0] > pred[-1]: # decreasing
            caption_y, caption_align = min(min(pred), min(y)), "bottom"
        else: # increasing
            caption_y, caption_align = max(max(pred), max(y)), "top"
        plt.text(min(x), caption_y, r"$y  \propto x^{%.2f}$" % (coef), color="black", weight="bold", verticalalignment=caption_align, fontsize=24)

        plt.xscale("log", base=2)
        plt.yscale("log", base=2)

        ax.tick_params(labelcolor='#4B4B4B', labelsize=22)
        plt.xlabel(xlabeldict[distname], fontsize=24)
        plt.ylabel(ylabeldict[distname], fontsize=24)

        # plt.savefig(outputpath, bbox_inches='tight')
        plt.show()
        plt.close()
    

# Slope

In [None]:
# Both datasets of a domain share one colour and one marker.
colordict = {
    **dict.fromkeys(("email-Enron-full", "email-Eu-full"), "#e41a1c"),
    **dict.fromkeys(("contact-high-school", "contact-primary-school"), "#377eb8"),
    **dict.fromkeys(("NDC-classes-full", "NDC-substances-full"), "#4daf4a"),
    **dict.fromkeys(("tags-ask-ubuntu", "tags-math-sx"), "#984ea3"),
    **dict.fromkeys(("threads-ask-ubuntu", "threads-math-sx"), "#ff7f00"),
    **dict.fromkeys(("coauth-MAG-Geology-full", "coauth-MAG-History-full"), "#e6ab02"),
}

markerdict = {
    **dict.fromkeys(("email-Enron-full", "email-Eu-full"), "o"),
    **dict.fromkeys(("contact-high-school", "contact-primary-school"), "^"),
    **dict.fromkeys(("NDC-classes-full", "NDC-substances-full"), "D"),
    **dict.fromkeys(("tags-ask-ubuntu", "tags-math-sx"), "P"),
    **dict.fromkeys(("threads-ask-ubuntu", "threads-math-sx"), "s"),
    **dict.fromkeys(("coauth-MAG-Geology-full", "coauth-MAG-History-full"), "<"),
}

# Left-to-right order of the datasets in the summary figures.
dataset = [
    "email-Enron-full",
    "email-Eu-full",
    "contact-primary-school",
    "contact-high-school",
    "NDC-classes-full",
    "NDC-substances-full",
    "tags-ask-ubuntu",
    "tags-math-sx",
    "threads-ask-ubuntu",
    "threads-math-sx",
    "coauth-MAG-Geology-full",
    "coauth-MAG-History-full",
]

# Properties summarised by slope and R^2.
distset = ["degree", "size", "pairdeg", "intersection", "sv"]

In [None]:
from matplotlib.ticker import FormatStrFormatter

# For every property, draw two summary figures across all datasets: the
# fitted slope and the R^2 score, one marker per dataset.
for distname in distset:
    aggregate_coef = {}
    aggregate_score = {}
    # Prefix for the output files; only needed if savefig is re-enabled.
    outputpath = "figure/" + distname

    for dataname in dataset:
        dist = read_properties(dataname, "../results/answer/" + dataname + "/")[1][distname]
        x = sorted(dist.keys())
        y = [dist[_x] for _x in x]

        if distname in ["degree", "size"]:
            # Degree and size are fitted as odds ratios, on the first 75%
            # of the original support only.
            select_length = int(0.75 * len(x))
            x, y = get_odds_ratio(x, y)
            fit = linearregression(x[:select_length], y[:select_length])
        else:
            fit = linearregression(x, y)

        # fit = (score, coef, ...); only these two are needed here.
        aggregate_score[dataname] = fit[0]
        aggregate_coef[dataname] = fit[1]

    # Slope
    plt.figure(figsize=(6,4), dpi=100)
    for i, dataname in enumerate(dataset):
        plt.scatter(i, aggregate_coef[dataname], c=colordict[dataname], marker=markerdict[dataname], s=300, zorder=2)

    # Datasets are identified by colour/marker, so hide the x ticks.
    plt.tick_params( axis='x', which='both', bottom=False, top=False, labelbottom=False)
    ax = plt.gca()
    ax.tick_params(labelcolor='#4B4B4B', labelsize=22)

    slopes = [aggregate_coef[dataname] for dataname in dataset]
    plt.ylim((min(slopes)-0.5, max(slopes)+0.5))
    plt.xlabel("Dataset", fontsize=24)
    ax.yaxis.set_major_formatter(FormatStrFormatter('%.1f'))
    plt.ylabel(r"Slope $\rho$" , fontsize=24)

    # plt.savefig(outputpath + "_coef.jpg", bbox_inches='tight')
    plt.show()
    plt.close()

    # R^2 Score
    plt.figure(figsize=(6,4), dpi=100)
    for i, dataname in enumerate(dataset):
        plt.scatter(i, aggregate_score[dataname], c=colordict[dataname], marker=markerdict[dataname],  s=300, zorder=2)
    # Reference line at the ideal score, spanning first to last dataset.
    plt.hlines(y=1, xmin=0, xmax=len(dataset) - 1, color="black", zorder=1)

    plt.tick_params( axis='x', which='both', bottom=False, top=False, labelbottom=False)

    ax = plt.gca()
    ax.tick_params(labelcolor='#4B4B4B', labelsize=22)

    plt.ylim((0.75, 1.15))

    plt.text(0, 1.1, "Ideal = 1.0", color="black", verticalalignment="top", fontsize=24)

    plt.xlabel("Dataset", fontsize=24)
    plt.ylabel(r"$R^{2}$ Score" , fontsize=24)

    # plt.savefig(outputpath + "_score.jpg", bbox_inches='tight')
    plt.show()
    plt.close()