In [None]:
import sys
sys.path.append("../src/")

import xgi
from utils import read_hyperedges, read_data, largest_connected_component
from layer_randomization import layer_randomization

import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Dataset selection: every path used by the single-dataset cells is built
# from these two names.
data_prefix = "../data/"
dataset_name = "coauth-MAG-History"
dataset_path = data_prefix + dataset_name + "/"

# Single-file form of the dataset (alternative input for read_hyperedges).
observed_filename = f"{dataset_path}{dataset_name}.txt"

In [None]:
# Load the observed hyperedges, restrict them to the largest connected
# component, and wrap that component in an xgi hypergraph.
# (Alternative loader: read_hyperedges(observed_filename).)
observed_hyperedges = read_data(f"{dataset_path}{dataset_name}-", multiedges=False)
observed_cc = largest_connected_component(
    observed_hyperedges,
    remove_single_nodes=False,
)
observed_hg = xgi.Hypergraph(incoming_data=observed_cc)

In [None]:
# Apply layer_randomization to the observed component, keep the largest
# connected component of the result, and build the matching xgi hypergraph.
# NOTE(review): no random seed is set anywhere in this notebook, so this
# sample presumably changes between runs -- confirm how layer_randomization
# draws its randomness.
layer_rnd_hyperedges = layer_randomization(observed_cc)
layer_rnd_cc = largest_connected_component(
    layer_rnd_hyperedges,
    remove_single_nodes=False,
)
layer_rnd_hg = xgi.Hypergraph(incoming_data=layer_rnd_cc)

In [None]:
# Size of each full hyperedge collection next to the size of its largest
# connected component, for the observed data and for the randomization.
for label, hyperedges, component in (
    ("observed: ", observed_hyperedges, observed_cc),
    ("layer randomization: ", layer_rnd_hyperedges, layer_rnd_cc),
):
    print(label, len(hyperedges), len(component))

# Hyperedge Size Distribution

In [None]:
# Side-by-side histograms (log-scaled counts, shared y axis) of hyperedge
# order for the observed hypergraph and for its layer randomization.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(6, 4), squeeze=False, sharey=True)

panels = (("Observed", observed_hg), ("Layer", layer_rnd_hg))
for col, (title, hypergraph) in enumerate(panels):
    edge_orders = hypergraph.edges.order.aslist()
    # Number of hyperedges, followed by the sum of their orders.
    print(len(edge_orders))
    print(sum(edge_orders))
    axs[0][col].hist(edge_orders)
    axs[0][col].set(title=title, yscale='log')

# Degree

In [None]:
# Node-degree histograms (log-scaled counts, shared axes) for the observed
# hypergraph and for its layer randomization.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(8, 4), squeeze=False, sharex=True, sharey=True)

panels = (("Observed", observed_hg), ("Layer", layer_rnd_hg))
for col, (title, hypergraph) in enumerate(panels):
    axs[0][col].hist(hypergraph.nodes.degree.aslist())
    axs[0][col].set(title=title, yscale='log')

In [None]:
# Take the first node whose degree equals the maximum degree of the observed
# hypergraph, then list that node's degree(order=k) for k = 0..9.
observed_max_degree = observed_hg.nodes.degree.max()
max_deg_node = list(observed_hg.nodes.filterby("degree", observed_max_degree))[0]
print(max_deg_node)
print(observed_max_degree)
deg_vect = [
    observed_hg.nodes.degree(order=order)[max_deg_node]
    for order in range(10)
]
print(deg_vect)

In [None]:
# Same inspection for the layer-randomized hypergraph: first node attaining
# the maximum degree, and its degree(order=k) for k = 0..9.
layer_rnd_max_degree = layer_rnd_hg.nodes.degree.max()
max_deg_node = list(layer_rnd_hg.nodes.filterby("degree", layer_rnd_max_degree))[0]
print(max_deg_node)
print(layer_rnd_max_degree)
deg_vect = [
    layer_rnd_hg.nodes.degree(order=order)[max_deg_node]
    for order in range(10)
]
print(deg_vect)

In [None]:
# One figure per order k = 1..10: histogram of the strictly positive values of
# degree(order=k), observed (left) versus layer randomization (right).
for k in range(1, 11):
    fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(8, 4), squeeze=False, sharex=True, sharey=True)
    fig.suptitle(k)

    panels = (("Observed", observed_hg), ("Layer", layer_rnd_hg))
    for col, (title, hypergraph) in enumerate(panels):
        deg = hypergraph.nodes.degree(order=k).asnumpy()
        # Nodes with zero degree at this order are left out of the histogram.
        axs[0][col].hist(deg[deg > 0])
        axs[0][col].set(title=title, yscale='log')

# Overlap DAG (cross-layer overlap relationships)

In [None]:
from encapsulation_dag import overlap_dag

In [None]:
# Histogram of len(dag[he]) over every entry `he` of the overlap DAG, for the
# observed component and for its layer randomization.
# NOTE(review): the DAG is presumably a networkx DiGraph, which would make
# len(dag[he]) the out-degree of `he` -- confirm against encapsulation_dag.py.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(8, 4), squeeze=False, sharex=True)

observed_ovdag, _, _ = overlap_dag(observed_cc)
observed_outdist = [len(observed_ovdag[hyperedge]) for hyperedge in observed_ovdag]
print(sum(observed_outdist))

layer_rnd_overdag, _, _ = overlap_dag(layer_rnd_cc)
layer_rnd_outdist = [len(layer_rnd_overdag[hyperedge]) for hyperedge in layer_rnd_overdag]
print(sum(layer_rnd_outdist))

for col, (title, outdist) in enumerate((("Observed", observed_outdist), ("Layer", layer_rnd_outdist))):
    axs[0][col].hist(outdist)
    axs[0][col].set(title=title, yscale='log')

# Encapsulation DAG (cross-layer subset relationships)

In [None]:
from encapsulation_dag import encapsulation_dag

In [None]:
# Histogram of len(dag[he]) over every entry `he` of the encapsulation DAG,
# for the observed component and for its layer randomization.
# NOTE: the result variables keep the "ovdag"/"overdag" names used by the
# overlap-DAG cell, so running this cell overwrites those overlap results.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(8, 4), squeeze=False, sharex=True)

observed_ovdag, _, _ = encapsulation_dag(observed_cc)
observed_outdist = [len(observed_ovdag[hyperedge]) for hyperedge in observed_ovdag]
print(sum(observed_outdist))

layer_rnd_overdag, _, _ = encapsulation_dag(layer_rnd_cc)
layer_rnd_outdist = [len(layer_rnd_overdag[hyperedge]) for hyperedge in layer_rnd_overdag]
print(sum(layer_rnd_outdist))

for col, (title, outdist) in enumerate((("Observed", observed_outdist), ("Layer", layer_rnd_outdist))):
    axs[0][col].hist(outdist)
    axs[0][col].set(title=title, yscale='log')

In [None]:
import numpy as np
import matplotlib.pyplot as plt 
from encapsulation_dag import encapsulation_dag, overlap_dag, overlap_graph
from utils import read_data, read_hyperedges, largest_connected_component
from layer_randomization import layer_randomization

def print_observed_stats(cc):
    """Build the three overlap structures for `cc` and report their sizes.

    Calls encapsulation_dag, overlap_dag and overlap_graph (the latter with
    normalize_weight=False) on `cc`, printing progress and edge counts.

    Returns:
        A 4-tuple: (encapsulation DAG edge count, overlap DAG edge count,
        overlap graph edge count, sum of the overlap graph's "weight" edge
        attributes).
    """
    print("Computing encapsulation DAG...")
    encap_graph, _, _ = encapsulation_dag(cc)
    print(f"Number of encapsulation DAG edges in observed data: {encap_graph.number_of_edges()}")

    print("Computing overlap DAG...")
    overdag_graph, _, _ = overlap_dag(cc)
    print(f"Number of overlap DAG in observed data: {overdag_graph.number_of_edges()}")

    print("Computing overlap graph...")
    overlap_net, _, _ = overlap_graph(cc, normalize_weight=False)
    total_weight = sum(attrs["weight"] for _src, _dst, attrs in overlap_net.edges(data=True))
    print(f"Number of overlap edges in observed data: {overlap_net.number_of_edges()}")

    return (
        encap_graph.number_of_edges(),
        overdag_graph.number_of_edges(),
        overlap_net.number_of_edges(),
        total_weight,
    )


def _fraction_of_observed(count, observed):
    """Return count / observed, or NaN when the observed count is zero."""
    return count / observed if observed else float("nan")


def print_random_stats(cc, obs_encap, obs_overdag, obs_overlap):
    """Build the three overlap structures for a randomized sample and report them.

    Calls encapsulation_dag, overlap_dag and overlap_graph (the latter with
    normalize_weight=False) on `cc`. For each structure it prints the edge
    count and that count divided by the matching observed count. The printed
    label is "%:", but the value is a plain fraction, not a percentage.

    Args:
        cc: hyperedges of the randomized sample (passed straight to the
            three builders).
        obs_encap: observed encapsulation DAG edge count.
        obs_overdag: observed overlap DAG edge count.
        obs_overlap: observed overlap graph edge count.

    Returns:
        A 4-tuple: (encapsulation DAG edge count, overlap DAG edge count,
        overlap graph edge count, sum of the overlap graph's "weight" edge
        attributes).
    """
    print("Computing encapsulation DAG...")
    rnd_encap, _, _ = encapsulation_dag(cc)
    n_encap = rnd_encap.number_of_edges()
    # An observed count of zero would otherwise abort the whole run with
    # ZeroDivisionError; the ratio is reported as nan instead.
    print(f"Number of encapsulation DAG edges in random data: {n_encap} %: {_fraction_of_observed(n_encap, obs_encap)}")

    print("Computing overlap DAG...")
    rnd_overlap_dag, _, _ = overlap_dag(cc)
    n_overdag = rnd_overlap_dag.number_of_edges()
    print(f"Number of overlap DAG edges in random data: {n_overdag} %: {_fraction_of_observed(n_overdag, obs_overdag)}")

    print("Computing overlap graph...")
    rnd_overlap, _, _ = overlap_graph(cc, normalize_weight=False)
    sum_of_weights = sum(data["weight"] for _, _, data in rnd_overlap.edges(data=True))
    n_overlap = rnd_overlap.number_of_edges()
    print(f"Number of overlap edges in random data: {n_overlap} %: {_fraction_of_observed(n_overlap, obs_overlap)}")
    return n_encap, n_overdag, n_overlap, sum_of_weights

# Compute/read changes in DAG stats after randomization

In [None]:
# Datasets included in the comparison; "coauth-DBLP" is currently disabled.
datasets = [
    # "coauth-DBLP",
    "email-Eu",
    "coauth-MAG-Geology",
    "coauth-MAG-History",
    "contact-high-school",
    "contact-primary-school",
    "email-Enron",
]

# Number of layer-randomized samples drawn for each dataset.
num_samples = 5

In [None]:
import pickle

In [None]:
from pathlib import Path

In [None]:
# Either reload previously pickled statistics (read_from_files=True) or
# recompute them from the raw data and write one pickle per dataset.
read_from_files = False

# Both branches fill these dicts (keyed by dataset name), so they have to
# exist before branching.
obs_data = dict()
layer_data = dict()

if read_from_files:
    for dataset_name in datasets:
        # NOTE: unpickling can execute arbitrary code -- only load result
        # files that were produced by this notebook.
        with open(f"../results/{dataset_name}/randomization_comparison.pickle", "rb") as fpickle:
            obs, layer = pickle.load(fpickle)

        obs_data[dataset_name] = obs
        layer_data[dataset_name] = layer
else:
    for dataset_name in datasets:
        print(dataset_name)
        filename = f"../data/{dataset_name}/{dataset_name}-"

        print("Reading hyperedges...")
        obs_hyperedges = read_data(filename, multiedges=False)
        print("Done.")

        print("Computing largest connected component...")
        obs_cc = largest_connected_component(obs_hyperedges, remove_single_nodes=False)
        print("Done.")

        obs_encap, obs_overdag, obs_overlap, obs_overlap_sum = print_observed_stats(obs_cc)
        obs_data[dataset_name] = {
            "encap": obs_encap,
            "overdag": obs_overdag,
            "overlap": obs_overlap,
            "overlap_sum": obs_overlap_sum,
        }

        # One list entry per randomized sample, for each statistic.
        layer_data[dataset_name] = {
            "encap": [],
            "overdag": [],
            "overlap": [],
            "overlap_sum": [],
        }

        for i in range(num_samples):
            print(f"Computing layer randomization {i}...")
            # NOTE(review): the single-dataset cells randomize the largest
            # component and keep single nodes, whereas here the full hyperedge
            # list is randomized and remove_single_nodes=True -- confirm that
            # this difference is intended.
            randomized_hyperedges = layer_randomization(obs_hyperedges)
            cc = largest_connected_component(randomized_hyperedges, remove_single_nodes=True)
            encap, overdag, overlap, overlap_sum = print_random_stats(cc, obs_encap, obs_overdag, obs_overlap)
            layer_data[dataset_name]["encap"].append(encap)
            layer_data[dataset_name]["overdag"].append(overdag)
            layer_data[dataset_name]["overlap"].append(overlap)
            layer_data[dataset_name]["overlap_sum"].append(overlap_sum)
            print()

        results_dir = Path(f"../results/{dataset_name}/")
        results_dir.mkdir(parents=True, exist_ok=True)
        # Binary write mode is required by pickle. Only this dataset's pair is
        # stored so that the read branch above can unpack it as (obs, layer).
        with open(results_dir / "randomization_comparison.pickle", "wb") as fpickle:
            pickle.dump((obs_data[dataset_name], layer_data[dataset_name]), fpickle)

In [None]:
# One panel per dataset: mean (bar) and standard deviation (error bar) of each
# randomized statistic divided by its observed value. The observed value is
# shown underneath each tick label.
fig, axs = plt.subplots(1, len(datasets), squeeze=False, figsize=(35, 4), sharey=True)
ratio_keys = ["encap", "overlap", "overlap_sum"]
for col, dataset_name in enumerate(datasets):
    panel = axs[0][col]
    observed = obs_data[dataset_name]
    ratios = [np.array(layer_data[dataset_name][key]) / observed[key] for key in ratio_keys]
    layer_bars = [np.mean(ratio) for ratio in ratios]
    layer_errs = [np.std(ratio) for ratio in ratios]

    x = np.arange(1, len(layer_bars) + 1)
    width = 0.1
    panel.bar(x, layer_bars, yerr=layer_errs, width=width, label="Layer Randomization")
    tick_labels = [
        f"Encap. Edges\n{observed['encap']}",
        f"Overlap Edges\n{observed['overlap']}",
        f"Total Overlap\n{observed['overlap_sum']}",
    ]
    panel.set_xticks(x, labels=tick_labels, size=14)
    panel.set_yticks([0.25, 0.5, 0.75, 1.0], labels=["0.25", "0.50", "0.75", "1.00"], size=20)
    panel.set_title(dataset_name, size=21)
    panel.set_ylim((0, 1.1))
    panel.set_xlim((0.5, len(layer_bars) + 0.25))
    for side in ('top', 'right'):
        panel.spines[side].set_visible(False)
#fig.savefig("../results/plots/layer_randomization_comparison.pdf", bbox_inches="tight", transparent=True)

# Read component size information

In [None]:
from collections import Counter
def read_component_data(dataset, num_samples, data_dir="../data/"):
    """Read observed and randomized component-size lists for one dataset.

    Two text files are read from ``data_dir + dataset + "/"``:

    * ``<dataset>_dag_components.txt``: the first line holds comma-separated
      integers (the observed values).
    * ``<dataset>_layer_randomization_dag_components.txt``: each non-blank
      line holds comma-separated integers (one randomized sample per line).

    Args:
        dataset: dataset name, used as both directory and file prefix.
        num_samples: divisor for the per-value mean counts. A sample in which
            a value does not occur therefore counts as zero, provided
            num_samples equals the number of samples in the file.
        data_dir: data root; it is concatenated as-is, so it must end with a
            path separator.

    Returns:
        dict with the keys
        "observed_components_dist" (np.ndarray of observed values),
        "random_comps" (list of np.ndarray, one per sample),
        "random_count_dists" (value -> list of per-sample occurrence counts,
        only for samples where the value occurs),
        "observed_counts" (value -> occurrence count in the observed data) and
        "random_means" (value -> summed sample counts / num_samples).
        The last two share the same key set; missing entries are filled with 0.
    """
    dataset_prefix = data_dir + dataset + "/" + dataset

    # Observed values: a single comma-separated line.
    with open(dataset_prefix + "_dag_components.txt", "r") as fin:
        obs_components = np.array(list(map(int, fin.readline().split(','))))

    # Randomized samples: one comma-separated line each.
    random_comps = []
    with open(dataset_prefix + "_layer_randomization_dag_components.txt", 'r') as fin:
        for line in fin:
            if not line.strip():
                # A blank or trailing empty line would make int('') raise.
                continue
            random_comps.append(np.array(list(map(int, line.split(',')))))

    # For every value, collect how often it occurs in each sample.
    random_count_dists = dict()
    for arr in random_comps:
        for value, count in Counter(arr).items():
            random_count_dists.setdefault(value, []).append(count)

    random_means = {
        value: sum(counts) / num_samples
        for value, counts in random_count_dists.items()
    }

    # Give both count tables the same key set, filling the gaps with zero.
    observed_counts = dict(Counter(obs_components))
    for value in set(observed_counts.keys()).union(set(random_means.keys())):
        random_means.setdefault(value, 0)
        observed_counts.setdefault(value, 0)

    return {
        "observed_components_dist": obs_components,
        "random_comps": random_comps,
        "random_count_dists": random_count_dists,
        "observed_counts": observed_counts,
        "random_means": random_means,
    }

In [None]:
def get_binning(values, num_bins = 15, log_binning = False, is_pmf = True, bins=None):
    """Histogram `values` and return bin centres, bin heights and bin edges.

    Args:
        values: sequence of numbers to bin.
        num_bins: number of bins to create when `bins` is not given.
        log_binning: when `bins` is not given, use logarithmically spaced
            edges from min(values) up to 10 * max(values) (the upper exponent
            is log10(max) + 1); otherwise linearly spaced edges from
            min(values) to max(values).
        is_pmf: if True, heights are counts divided by the total count;
            otherwise they are the raw counts.
        bins: optional precomputed bin edges; when given, `num_bins` and
            `log_binning` are ignored.

    Returns:
        (x, p, bins): `x` holds the arithmetic midpoint of every bin, `p` the
        height of every bin and `bins` the edges that were used. Empty bins
        are kept, so `x` and `p` always have len(bins) - 1 entries; this lets
        several samples binned on the same edges be compared index by index.
    """
    if bins is None:
        # Derive the support of the distribution from the data itself.
        lower_bound = min(values)
        upper_bound = max(values)

        if log_binning:
            bins = np.logspace(
                np.log10(lower_bound),
                np.log10(upper_bound) + 1,
                num_bins + 1,
                base=10,
            )
        else:
            bins = np.linspace(lower_bound, upper_bound, num_bins + 1)

    counts, _ = np.histogram(values, bins=bins, density=False)
    p = counts / float(counts.sum()) if is_pmf else counts

    # Midpoint of each bin: right edge minus half the bin width.
    x = bins[1:] - np.diff(bins) / 2.0

    return x, p, bins

def bin_distributions(dataset_info, log_binning=True, num_bins=50, is_pmf=True):
    """Bin the observed and the randomized samples on one shared set of bins.

    Reads "observed_components_dist" and "random_comps" from `dataset_info`
    and stores the results back into it; nothing is returned.

    * "obs_x" / "obs_y": bin centres and heights of the observed sample, as
      returned by get_binning.
    * "rnd_x" / "rnd_y": bin centres and, for every bin, the median height
      across the randomized samples. Each sample is binned with the edges
      derived from the observed sample.
    """
    obs_x, obs_y, shared_bins = get_binning(
        dataset_info["observed_components_dist"],
        num_bins=num_bins,
        log_binning=log_binning,
        is_pmf=is_pmf,
    )
    dataset_info["obs_x"] = obs_x
    dataset_info["obs_y"] = obs_y

    # per_bin_heights[i] gathers the height of bin i from every random sample.
    bin_centers = obs_x
    n_heights = obs_y.shape[0]
    per_bin_heights = [[] for _ in obs_x]
    for sample in dataset_info["random_comps"]:
        bin_centers, sample_heights, shared_bins = get_binning(
            sample,
            log_binning=log_binning,
            is_pmf=is_pmf,
            bins=shared_bins,
        )
        n_heights = sample_heights.shape[0]
        for bin_idx, height in enumerate(sample_heights):
            per_bin_heights[bin_idx].append(height)

    medians = np.zeros(n_heights)
    for bin_idx in range(len(bin_centers)):
        medians[bin_idx] = np.median(per_bin_heights[bin_idx])

    dataset_info["rnd_x"] = bin_centers
    dataset_info["rnd_y"] = medians

In [None]:
# Load and bin the component data of every dataset. Note that this rebinds
# `datasets`, which fixes the panel order of the figures that follow.
datasets = [
    "coauth-MAG-Geology",
    "coauth-MAG-History",
    "contact-high-school",
    "contact-primary-school",
    "email-Enron",
    "email-Eu",
]
dataset_info_dicts = dict()

for dataset in datasets:
    print(dataset)
    info = dataset_info_dicts[dataset] = read_component_data(dataset, num_samples)
    # Raw counts (not a pmf) on 10 logarithmic bins; results are written into `info`.
    bin_distributions(info, is_pmf=False, log_binning=True, num_bins=10)

In [None]:
# One log-log panel per dataset: binned observed values against the per-bin
# median of the randomized samples.
fig, axs = plt.subplots(1, len(datasets), squeeze=False, figsize=(35, 4))
for col, dataset_name in enumerate(datasets):
    panel = axs[0][col]
    info = dataset_info_dicts[dataset_name]

    panel.scatter(info["obs_x"], info["obs_y"], label="Observed")
    panel.scatter(info["rnd_x"], info["rnd_y"], label="Random", alpha=0.8, marker='^')
    #panel.set_title(dataset_name, size=21)
    panel.set(yscale='log', xscale='log', xlabel="Component Size")

    # Only the left-most panel carries the y label and the legend.
    if col == 0:
        panel.set_ylabel("Number of Components")
        panel.legend(frameon=False, fontsize=16)
    for side in ('top', 'right'):
        panel.spines[side].set_visible(False)

#fig.savefig("../results/plots/components.pdf", bbox_inches="tight", transparent=True)

# Plot both together

In [None]:
# Combined figure. Top row: randomized statistics as a fraction of the
# observed value. Bottom row: binned component data, observed vs. random.
#
# The font sizes are configured BEFORE the figure is created: rcParams are
# only consulted when the axes and their labels are built, so setting them
# after plt.subplots() would leave this figure with the previous defaults on a
# fresh kernel and only take effect when the cell is run a second time.
plt.rcParams["xtick.labelsize"] = 13
plt.rcParams["ytick.labelsize"] = 14
plt.rcParams['axes.labelsize'] = 20
fig, axs = plt.subplots(2, len(datasets), squeeze=False, figsize=(35, 8), sharey=False)

row_idx = 0
for col, dataset_name in enumerate(datasets):
    panel = axs[row_idx][col]
    layer_bars = []
    layer_errs = []
    for key in ["encap", "overlap", "overlap_sum"]:
        # Per-sample ratio of the randomized statistic to the observed one.
        layer = np.array(layer_data[dataset_name][key]) / obs_data[dataset_name][key]
        layer_bars.append(np.mean(layer))
        layer_errs.append(np.std(layer))

    x = np.arange(1, len(layer_bars) + 1)
    width = 0.1
    panel.bar(x, layer_bars, yerr=layer_errs, width=width, label="Layer Randomization")
    # The observed value is shown under each tick label.
    panel.set_xticks(x, labels=[f"Encap. Edges\n{obs_data[dataset_name]['encap']}",
                                f"Overlap Edges\n{obs_data[dataset_name]['overlap']}",
                                f"Total Overlap\n{obs_data[dataset_name]['overlap_sum']}"])
    panel.set_yticks([0.25, 0.5, 0.75, 1.0], labels=["0.25", "0.50", "0.75", "1.00"])
    panel.set_title(dataset_name, size=21)
    panel.set_ylim((0, 1.1))
    panel.set_xlim((0.5, len(layer_bars) + 0.25))
    panel.spines['top'].set_visible(False)
    panel.spines['right'].set_visible(False)
    if col == 0:
        panel.set(ylabel="Fraction of Observed")


row_idx = 1
for col, dataset_name in enumerate(datasets):
    panel = axs[row_idx][col]
    observed_x = dataset_info_dicts[dataset_name]["obs_x"]
    observed_y = dataset_info_dicts[dataset_name]["obs_y"]
    panel.scatter(observed_x, observed_y, label="Observed")

    random_x = dataset_info_dicts[dataset_name]["rnd_x"]
    random_y = dataset_info_dicts[dataset_name]["rnd_y"]
    panel.scatter(random_x, random_y, label="Random", alpha=0.8, marker='^')
    panel.set(yscale='log', xscale='log', xlabel="Component Size")
    # Only the left-most panel carries the y label and the legend.
    if col == 0:
        panel.set_ylabel("# Components")
        panel.legend(frameon=False, fontsize=16)
    panel.spines['top'].set_visible(False)
    panel.spines['right'].set_visible(False)

fig.subplots_adjust(hspace=0.3, wspace=0.3)
#fig.savefig("../results/plots/layer_randomization_comparison_combined.pdf", bbox_inches="tight", transparent=True)