In [None]:
import sys
sys.path.append("../src/")

import pickle
import numpy as np
import networkx as nx

from collections import Counter, defaultdict

from utils import read_data
from encapsulation_dag import encapsulation_dag
from layer_randomization import layer_randomization

import matplotlib.pyplot as plt
from matplotlib import gridspec
import matplotlib_defaults
%matplotlib inline

In [None]:
# Root directory holding one sub-directory per dataset.
data_dir = "../data/"

# Datasets to process. "coauth-DBLP" is intentionally left out of this run.
datasets = [
    "email-Enron",
    "email-Eu",
    "contact-primary-school",
    "contact-high-school",
    "coauth-MAG-History",
    "coauth-MAG-Geology",
]

# Number of layer-randomized replicates generated per dataset.
num_samples = 5

# When True, hyperedges containing a single node are discarded before analysis.
remove_single_nodes = True

In [None]:
# For every dataset: build the observed encapsulation DAG, record the sizes of
# its weakly connected components, then do the same for `num_samples`
# layer-randomized versions of the hyperedges. Results are written as text
# files inside the dataset's own directory.
for dataset in datasets:
    print(dataset)
    observed_path = f"{data_dir}{dataset}/{dataset}-"
    print("Reading hyperedges.")
    hyperedges = read_data(observed_path, multiedges=False)
    if remove_single_nodes:
        # Keep only hyperedges with at least two nodes.
        hyperedges = [edge for edge in hyperedges if len(edge) > 1]

    print("Computing observed dag.")
    obs_dag, obs_nth, obs_he_map = encapsulation_dag(hyperedges)

    print(f"Dag edges: {obs_dag.number_of_edges()}")

    # Observed: one comma-separated line of component sizes.
    dag = obs_dag  # alias of the observed DAG; not used further in this cell
    component_sizes = list(map(len, nx.weakly_connected_components(obs_dag)))
    components_output_file = f"{data_dir}{dataset}/{dataset}_dag_components.txt"
    with open(components_output_file, "w") as fout:
        fout.write(",".join(str(size) for size in component_sizes))

    # Random: one list of component sizes per randomized replicate.
    random_comps = []
    for _ in range(num_samples):
        print("Computing layer randomization.")
        random_hyperedges = layer_randomization(hyperedges)
        print("Computing random dag.")
        random_dag, _, _ = encapsulation_dag(random_hyperedges)
        print(f"Random dag has {random_dag.number_of_edges()} edges.")
        random_component_sizes = list(map(len, nx.weakly_connected_components(random_dag)))
        random_comps.append(random_component_sizes)

    # One line per replicate, each line a comma-separated list of sizes.
    components_output_file = f"{data_dir}{dataset}/{dataset}_layer_randomization_dag_components.txt"
    with open(components_output_file, "w") as fout:
        fout.writelines(
            ",".join(str(size) for size in sample_comp) + "\n"
            for sample_comp in random_comps
        )

# All datasets on one plot

In [None]:
def _parse_component_sizes(line):
    """Parse one comma-separated line of component sizes into an int array.

    Empty tokens (blank line, trailing comma, surrounding whitespace) are
    ignored, so an empty line yields an empty array instead of raising
    ``ValueError`` from ``int('')``.
    """
    tokens = (token.strip() for token in line.split(","))
    return np.array([int(token) for token in tokens if token], dtype=int)


def read_component_data(dataset, num_samples, data_dir="../data/"):
    """Load observed and layer-randomized component-size data for one dataset.

    Reads ``<data_dir><dataset>/<dataset>_dag_components.txt`` (a single
    comma-separated line of component sizes) and
    ``<data_dir><dataset>/<dataset>_layer_randomization_dag_components.txt``
    (one such line per randomized sample).

    Parameters
    ----------
    dataset : str
        Dataset name; used both as directory name and file-name prefix.
    num_samples : int
        Retained for backward compatibility only. The per-size mean is taken
        over the number of samples actually present in the randomization
        file, so a stale ``num_samples`` can no longer skew the averages.
    data_dir : str
        Data root, expected to end with a path separator.

    Returns
    -------
    dict
        ``observed_components_dist``: int array of observed component sizes.
        ``random_comps``: list of int arrays, one per randomized sample.
        ``random_count_dists``: size -> list of per-sample counts (only the
        samples in which that size occurs contribute an entry).
        ``observed_counts``: size -> number of observed components.
        ``random_means``: size -> mean count over all samples read (samples
        lacking a size count as zero for it).
        Both count dicts share the same key set; absent sizes are filled
        with 0.

    Raises
    ------
    FileNotFoundError
        If either input file does not exist (raised by ``open``).
    ValueError
        If a token in either file is not an integer.
    """
    dataset_dir = data_dir + dataset + "/"
    dataset_info = dict()

    # Observed component sizes: a single line.
    with open(dataset_dir + dataset + "_dag_components.txt", "r") as fin:
        obs_components = _parse_component_sizes(fin.readline())
    dataset_info["observed_components_dist"] = obs_components

    # Randomized component sizes: one line per sample. Blank lines carry no
    # sizes and are skipped.
    with open(dataset_dir + dataset + "_layer_randomization_dag_components.txt", "r") as fin:
        random_comps = [_parse_component_sizes(line) for line in fin if line.strip()]
    dataset_info["random_comps"] = random_comps

    # For every component size, collect how often it occurs in each sample.
    random_count_dists = dict()
    for sample in random_comps:
        for size, count in Counter(sample).items():
            random_count_dists.setdefault(size, []).append(count)
    dataset_info["random_count_dists"] = random_count_dists

    # Average over the samples actually read. Dividing by the sample count
    # (not by the length of each list) treats "size absent in a sample" as 0.
    # If no samples were read, random_count_dists is empty and no division
    # takes place.
    samples_read = len(random_comps)
    random_means = {
        size: sum(counts) / samples_read
        for size, counts in random_count_dists.items()
    }

    # Give both dictionaries the same key set so they can be compared directly.
    observed_counts = dict(Counter(obs_components))
    for size in set(observed_counts) | set(random_means):
        random_means.setdefault(size, 0)
        observed_counts.setdefault(size, 0)

    dataset_info["observed_counts"] = observed_counts
    dataset_info["random_means"] = random_means
    return dataset_info

# Order in which the datasets are loaded (and later laid out in the figures).
datasets = [
    "coauth-MAG-Geology",
    "coauth-MAG-History",
    "contact-high-school",
    "contact-primary-school",
    "email-Enron",
    "email-Eu",
]

# Dataset name -> dictionary returned by read_component_data.
dataset_info_dicts = {}
for dataset in datasets:
    print(dataset)
    dataset_info_dicts[dataset] = read_component_data(dataset, num_samples)

In [None]:
def get_binning(values, num_bins = 15, log_binning = False, is_pmf = True, bins=None):
    """Histogram ``values`` and return bin centres, bin heights and bin edges.

    Parameters
    ----------
    values : sequence of numbers
        Data to histogram. Must be non-empty when ``bins`` is None, and
        strictly positive when ``log_binning`` is True.
    num_bins : int
        Number of bins to create when ``bins`` is None.
    log_binning : bool
        Only used when ``bins`` is None. If True the edges are log-spaced from
        ``min(values)`` up to ten times ``max(values)`` (one extra decade);
        otherwise they are linearly spaced between the minimum and maximum.
    is_pmf : bool
        If True the counts are normalised to sum to 1; otherwise raw counts
        are returned.
    bins : array-like or None
        Pre-computed bin edges. When given, ``num_bins`` and ``log_binning``
        are ignored.

    Returns
    -------
    x : numpy.ndarray
        Arithmetic midpoint of every bin (also for log-spaced edges).
    p : numpy.ndarray
        One value per bin; empty bins are kept so that ``x`` and ``p`` always
        have ``len(bins) - 1`` entries and results computed on shared edges
        line up index by index.
    bins : array-like
        The edges that were used (the caller's object when it was supplied).

    Raises
    ------
    ValueError
        If ``values`` is empty and no ``bins`` were supplied.
    """
    if bins is None:
        if len(values) == 0:
            raise ValueError("get_binning() needs at least one value to derive bin edges")

        # The support of the distribution defines the outermost edges.
        lower_bound = min(values)
        upper_bound = max(values)

        if log_binning:
            bins = np.logspace(
                np.log10(lower_bound),
                np.log10(upper_bound) + 1,
                num_bins + 1,
                base=10,
            )
        else:
            bins = np.linspace(lower_bound, upper_bound, num_bins + 1)

    counts, _ = np.histogram(values, bins=bins, density=False)

    if is_pmf:
        total = counts.sum()
        if total > 0:
            p = counts / float(total)
        else:
            # No value fell inside the edges: report zero mass everywhere
            # instead of 0/0 = NaN (and a RuntimeWarning).
            p = np.zeros(len(counts), dtype=float)
    else:
        p = counts

    # Bin centre = right edge minus half the bin width.
    edges = np.asarray(bins)
    x = edges[1:] - np.diff(edges) / 2.0

    return x, p, bins

def bin_distributions(dataset_info, log_binning=True, num_bins=50, is_pmf=True):
    """Bin the observed and randomized component sizes on shared bin edges.

    The edges are derived from the observed sizes only and then reused for
    every randomized sample; ``np.histogram`` does not count values that fall
    outside those edges. The randomized curve is the per-bin median across
    samples.

    Reads ``dataset_info["observed_components_dist"]`` and
    ``dataset_info["random_comps"]``; stores the results in place under the
    keys ``obs_x``, ``obs_y``, ``rnd_x`` and ``rnd_y``. Returns None.
    """
    # Observed distribution: this call also fixes the bin edges.
    obs_x, obs_y, bins = get_binning(
        dataset_info["observed_components_dist"],
        num_bins=num_bins,
        log_binning=log_binning,
        is_pmf=is_pmf,
    )
    dataset_info["obs_x"] = obs_x
    dataset_info["obs_y"] = obs_y

    # Histogram every randomized sample on the observed edges.
    rnd_x = obs_x
    sample_histograms = []
    for sample in dataset_info["random_comps"]:
        rnd_x, sample_y, bins = get_binning(
            sample, log_binning=log_binning, is_pmf=is_pmf, bins=bins
        )
        sample_histograms.append(sample_y)

    # Per-bin median over the samples.
    num_centres = len(rnd_x)
    rnd_median = np.zeros(num_centres)
    for idx in range(num_centres):
        rnd_median[idx] = np.median([hist[idx] for hist in sample_histograms])

    dataset_info["rnd_x"] = rnd_x
    dataset_info["rnd_y"] = rnd_median

In [None]:
# Log-bin raw counts (10 bins) for every loaded dataset; results are stored
# inside each dataset's info dictionary.
for dataset_name, dataset_info in dataset_info_dicts.items():
    bin_distributions(dataset_info, is_pmf=False, log_binning=True, num_bins=10)

In [None]:
# One log-log panel per dataset: binned observed component counts against the
# per-bin median of the randomized samples.
fig, axs = plt.subplots(1, len(datasets), squeeze=False, figsize=(35, 4))
for col, dataset_name in enumerate(datasets):
    ax = axs[0][col]
    binned = dataset_info_dicts[dataset_name]

    ax.scatter(binned["obs_x"], binned["obs_y"], label="Observed")
    ax.scatter(binned["rnd_x"], binned["rnd_y"], label="Random", alpha=0.8, marker='^')
    #ax.set_title(dataset_name, size=21)
    ax.set(yscale='log', xscale='log', xlabel="Component Size")

    # Only the left-most panel carries the y label and the legend.
    if col == 0:
        ax.set_ylabel("Number of Components")
        ax.legend(frameon=False, fontsize=16)

    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)

#fig.savefig("../results/plots/components.pdf", bbox_inches="tight", transparent=True)

In [None]:
# One log-log panel per dataset: exact (unbinned) component-size counts for the
# observed DAG against the mean counts of the randomized samples.
fig, axs = plt.subplots(1, len(datasets), squeeze=False, figsize=(35, 4))
for col, dataset_name in enumerate(datasets):
    ax = axs[0][col]
    info = dataset_info_dicts[dataset_name]

    obs_counts = info["observed_counts"]
    sizes = sorted(obs_counts)
    ax.scatter(sizes, [obs_counts[size] for size in sizes], label="Observed")

    random_means = info["random_means"]
    sizes = sorted(random_means)
    ax.scatter(sizes, [random_means[size] for size in sizes], label="Random", alpha=0.3)

    # Legend on the left-most panel only.
    if col == 0:
        ax.legend()

    ax.set_title(dataset_name, size=21)
    ax.set(yscale='log', xscale='log', xlabel="Component Size", ylabel="Count")
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)

#fig.savefig("../results/plots/layer_randomization_comparison.pdf", bbox_inches="tight", transparent=True)