In [None]:
import sys
import pickle
import networkx as nx
import numpy as np
import pickle
from matplotlib import pyplot as plt
from collections import defaultdict, Counter

In [None]:
sys.path.append("../src/")
import matplotlib_defaults
from encapsulation_dag import *
from utils import read_hyperedges, read_data
from layer_randomization import layer_randomization

Note: The first time this notebook is run, `read_dists` must be set to `False` (in both data-loading cells) to generate the cached data. This could take some time, especially for the larger datasets.

In [None]:
# Root directory of the data; the cells below look for each dataset under
# data_dir + <dataset name> + "/".
data_dir = "../data/"
# Names of the datasets to process. Each name is used both as the
# sub-directory name and as the file-name prefix when reading/writing files.
# Entries that are commented out are skipped by every loop in this notebook.
datasets = [
    #"coauth-DBLP",
    "coauth-MAG-Geology",
    "coauth-MAG-History",
    "contact-high-school",
    "contact-primary-school",
    "email-Enron",
    "email-Eu"
]

In [None]:
# Set to False to recompute everything from the raw data and refresh the
# pickled caches; leave True to simply load the caches from disk.
read_dists = True

# Per-dataset results, keyed by dataset name.
obs_overlap_dists = dict()
random_overlap_dists = dict()
obs_hyperedge_size_counts = dict()

for dataset in datasets:
    # Every file of a dataset lives in its own directory and starts with its name.
    dataset_prefix = data_dir + dataset + "/" + dataset
    obs_pickle_path = dataset_prefix + "_overlap_dists.pickle"
    random_pickle_path = dataset_prefix + "layer_randomization_overlap_dists.pickle"
    counts_pickle_path = dataset_prefix + "hyperedge_size_counts.pickle"

    if read_dists:
        # NOTE: pickle.load executes arbitrary code; only load caches that
        # were produced by this notebook.
        with open(obs_pickle_path, "rb") as fpickle:
            obs_overlap_dists[dataset] = pickle.load(fpickle)
        with open(random_pickle_path, "rb") as fpickle:
            random_overlap_dists[dataset] = pickle.load(fpickle)
        with open(counts_pickle_path, "rb") as fpickle:
            obs_hyperedge_size_counts[dataset] = pickle.load(fpickle)
        continue

    print(dataset)
    observed_data_path = dataset_prefix + "-"
    obs_hyperedges = read_data(observed_data_path, multiedges=False)

    # Observed data: build the DAG and collect the raw (un-normalised) overlaps.
    obs_dag, nth, he_map = encapsulation_dag(obs_hyperedges)
    obs_overlap_dists[dataset] = get_overlap_dists(obs_dag, binomial_norm=False, in_neighbors=False)

    # Count how many DAG nodes there are of each length.
    hyperedge_size_counts = defaultdict(int)
    for node in obs_dag.nodes():
        hyperedge_size_counts[len(node)] += 1
    obs_hyperedge_size_counts[dataset] = hyperedge_size_counts.copy()

    # Same measurement on the layer-randomized version of the hyperedges.
    random_dag, _, _ = encapsulation_dag(layer_randomization(obs_hyperedges))
    random_overlap_dists[dataset] = get_overlap_dists(random_dag, binomial_norm=False, in_neighbors=False)

    # Cache all three results so later runs can use read_dists = True.
    with open(obs_pickle_path, "wb") as fpickle:
        pickle.dump(obs_overlap_dists[dataset], fpickle)
    with open(random_pickle_path, "wb") as fpickle:
        pickle.dump(random_overlap_dists[dataset], fpickle)
    with open(counts_pickle_path, "wb") as fpickle:
        pickle.dump(obs_hyperedge_size_counts[dataset], fpickle)

In [None]:
# Largest value of m plotted: the plotting cell draws one series per m in 1..max_m.
max_m = 5
# Upper cap on n: the plotted n values run from m+1 up to
# min(largest observed size, max_n).
max_n = 25

In [None]:
# Make the Dark2 palette the default colour cycle for all subsequent axes.
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=plt.cm.Dark2.colors)

In [None]:
# Dark2 colours as an indexable sequence; the plotting cell picks c[m-1]
# so that each m gets a fixed colour.
c = plt.cm.Dark2.colors

In [None]:
# Two-row figure with one column per dataset.
#   Row 0: summed overlaps per (n, m) for the observed data (circles) and the
#          layer-randomized data (triangles), plus the number of hyperedges
#          of each size as a dashed line.
#   Row 1: the observed values divided by the number of hyperedges of size n.
fig = plt.figure(figsize=(15,5))
gridsize = (2, len(datasets))
for col_idx, dataset in enumerate(datasets):
    # Going to plot this as a line for each dataset
    hyperedge_size_counts = obs_hyperedge_size_counts[dataset]
    # Largest hyperedge size present; hoisted because several values derive from it
    max_size = max(hyperedge_size_counts.keys())

    # x values 1..max_size; integers so the lookup below uses the stored keys
    size_x = np.arange(1, max_size + 1)
    # Number of hyperedges of each size. `.get` is used (rather than indexing)
    # so that absent sizes are not inserted into a defaultdict as a side effect.
    size_y = np.array([hyperedge_size_counts.get(int(size), 0) for size in size_x])

    # n values are capped at max_n; the tick step must be at least 1, otherwise
    # range() raises ValueError for datasets whose largest size is below 4.
    max_x = min(max_size + 1, max_n + 1)
    xtick_step = max(1, int(max_x * 0.2))
    xticks = list(range(0, max_x, xtick_step))

    row_idx = 0
    # Initialize the non-normalized axis
    ax = plt.subplot2grid(gridsize, (row_idx,col_idx))
    ax.plot(size_x, size_y, label="# Edges", color='black', ls='--', alpha=0.7)
    ax.set_title(dataset, fontsize=11, y=1.1)
    ax.set(xlabel=r"$n$", yscale='log', ylim=(10**-1, 10**7))

    for normalized in [False, True]:
        if normalized:
            # Add the normalized axis
            row_idx += 1
            ax = plt.subplot2grid(gridsize, (row_idx,col_idx))

        ax.set_xticks(xticks, xticks)
        # For each smaller size
        for m in range(1, max_m+1):
            # Get the relevant n values (m+1 .. min(max_size, max_n))
            xvals = list(range(m+1, max_x))

            # Arrays that will be plotted for both observed and random;
            # entries stay 0.0 where no (n, m) data exists.
            yvals = np.zeros(len(xvals))
            random_yvals = np.zeros(len(xvals))

            # Loop over the values of n
            for i,n in enumerate(xvals):
                if n in obs_overlap_dists[dataset] and m in obs_overlap_dists[dataset][n]:
                    yvals[i] = float(sum(obs_overlap_dists[dataset][n][m]))

                if n in random_overlap_dists[dataset] and m in random_overlap_dists[dataset][n]:
                    random_yvals[i] = float(sum(random_overlap_dists[dataset][n][m]))

            if normalized:
                # size_y[k] is the count for size k+1, so sizes m+1.. start at
                # index m. Slice to len(xvals) so the shapes still agree when
                # the largest size exceeds max_n.
                size_denominators = size_y[m:m + len(xvals)]
                # Only divide where there is at least one hyperedge of that size.
                nonzero_indices = size_denominators > 0
                yvals[nonzero_indices] /= size_denominators[nonzero_indices]
                random_yvals[nonzero_indices] /= size_denominators[nonzero_indices]

            ax.scatter(xvals, yvals, label=f"m={m}", facecolors='none', edgecolor=c[m-1], s=8, linewidths=0.6)
            if not normalized:
                # Leading underscore in the label keeps the random series out of the legend.
                ax.scatter(xvals, random_yvals, marker="^", label=f"_m={m} rnd", facecolors='none', color = c[m-1], s=8, linewidths=0.6)

        if not normalized:
            # symlog so that zero-valued points remain visible
            ax.set(yscale='symlog')
        ax.set(xlabel=r"$n$")

        if col_idx == 0 and not normalized:
            ax.set(ylabel=r"$Encapsulation(n,m)$")
            ax.legend(ncols=6, bbox_to_anchor=(1.5,-1.8), loc="upper left", frameon=False)
        elif col_idx == 0 and normalized:
            ax.set(ylabel=r"$\frac{Encapsulation(n,m)}{n}$")

        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
fig.subplots_adjust(wspace=0.5, hspace=0.45)
#plt.savefig("../results/plots/encapsulation-m-n-mpl.pdf", bbox_inches="tight")

# Within $(n,m)$ distributions

In [None]:
from math import comb

In [None]:
def plot_n_m_dist(overlap_dists, n_range, ax, plot_one=True, legend=True):
    """Draw one frequency line per (n, m) pair on ``ax``.

    For every ``n`` in ``n_range`` and every ``m`` from ``n - 1`` downwards
    (stopping at 1 when ``plot_one`` is true, at 2 otherwise) the values in
    ``overlap_dists[n][m]`` are tallied and plotted as value -> count. When a
    pair has no entry, a flat line of zeros over ``k / comb(n, m)`` for
    ``k = 0 .. comb(n, m) - 1`` is plotted instead.

    Parameters
    ----------
    overlap_dists : mapping of n -> mapping of m -> iterable of hashable values
    n_range : iterable of int
    ax : object exposing ``plot``, ``legend`` and ``set_xticks``
    plot_one : bool, include the m = 1 series
    legend : bool, call ``ax.legend()`` after plotting
    """
    smallest_m = 1 if plot_one else 2
    for n in n_range:
        # Largest m first, down to smallest_m.
        for m in reversed(range(smallest_m, n)):
            has_data = n in overlap_dists and m in overlap_dists[n]
            if has_data:
                frequencies = dict(Counter(overlap_dists[n][m]))
            else:
                n_subsets = comb(n, m)
                frequencies = {k / n_subsets: 0.0 for k in range(n_subsets)}

            xs = sorted(frequencies.keys())
            ys = np.array([float(frequencies[x]) for x in xs])
            ax.plot(xs, ys, label=f"n={n}, m={m}", marker='o')

    if legend:
        ax.legend()
    ax.set_xticks([0.0, 0.25, 0.5, 0.75, 1.0], [0.0, 0.25, 0.5, 0.75, 1.0], size=10)

In [None]:
# Same cache-or-compute step as the earlier data-loading cell, but with
# binomial_norm=True and separate "_normed" cache files.
# Set read_dists to False to recompute from the raw data and refresh the caches.
#
# NOTE: this rebinds obs_overlap_dists / random_overlap_dists to the normalised
# distributions; re-run the earlier loading cell before re-running the first plot.
# obs_hyperedge_size_counts is deliberately left untouched: this cell never
# computes it, and resetting it to an empty dict would discard the counts
# loaded earlier.
read_dists = True

# Per-dataset results, keyed by dataset name.
obs_overlap_dists = dict()
random_overlap_dists = dict()

for dataset in datasets:
    # Every file of a dataset lives in its own directory and starts with its name.
    dataset_prefix = data_dir + dataset + "/" + dataset
    obs_pickle_path = dataset_prefix + "_overlap_dists_normed.pickle"
    random_pickle_path = dataset_prefix + "layer_randomization_overlap_dists_normed.pickle"

    if read_dists:
        # NOTE: pickle.load executes arbitrary code; only load caches that
        # were produced by this notebook.
        with open(obs_pickle_path, "rb") as fpickle:
            obs_overlap_dists[dataset] = pickle.load(fpickle)
        with open(random_pickle_path, "rb") as fpickle:
            random_overlap_dists[dataset] = pickle.load(fpickle)
        continue

    print(dataset)
    observed_data_path = dataset_prefix + "-"
    obs_hyperedges = read_data(observed_data_path, multiedges=False)

    # Observed data.
    obs_dag, nth, he_map = encapsulation_dag(obs_hyperedges)
    obs_overlap_dists[dataset] = get_overlap_dists(obs_dag, binomial_norm=True, in_neighbors=False)

    # Layer-randomized counterpart.
    random_dag, _, _ = encapsulation_dag(layer_randomization(obs_hyperedges))
    random_overlap_dists[dataset] = get_overlap_dists(random_dag, binomial_norm=True, in_neighbors=False)

    # Cache both results so later runs can use read_dists = True.
    with open(obs_pickle_path, "wb") as fpickle:
        pickle.dump(obs_overlap_dists[dataset], fpickle)
    with open(random_pickle_path, "wb") as fpickle:
        pickle.dump(random_overlap_dists[dataset], fpickle)

In [None]:
# Two-row figure with one column per dataset: observed distributions on top,
# layer-randomized distributions below, sharing the same y-limits per column.
n_range = [5, 4, 3, 2]
fig = plt.figure(figsize=(15,5))
gridsize = (2, len(datasets))
# True only until the first column has been drawn; that column carries the
# y-labels and the shared legend.
plot_legend = True
for col_idx, dataset in enumerate(datasets):
    # Top row: observed data
    ax_obs = plt.subplot2grid(gridsize, (0, col_idx))
    plot_n_m_dist(obs_overlap_dists[dataset], n_range, ax_obs, plot_one=False, legend=False)
    ax_obs.set_title(dataset, fontsize=11, y=1.)

    # Bottom row: randomized data
    ax_rnd = plt.subplot2grid(gridsize, (1, col_idx))
    plot_n_m_dist(random_overlap_dists[dataset], n_range, ax_rnd, plot_one=False, legend=False)
    ax_rnd.set_xlabel(r"$\frac{Encapsulation(n,m)}{\binom{n}{m}}$")

    # Common y-range for the column: the top comes from the observed panel,
    # the bottom is the random panel's lower limit lowered by 10% of that top.
    obs_top = ax_obs.get_ylim()[1]
    ylims = ax_rnd.get_ylim()[0] - 0.1 * obs_top, obs_top
    for panel in (ax_rnd, ax_obs):
        panel.set_ylim(ylims)
        for side in ('top', 'right'):
            panel.spines[side].set_visible(False)

    if plot_legend:
        ax_obs.set_ylabel("Observed Frequency")
        ax_rnd.set_ylabel("Random Frequency")
        ax_obs.legend(ncols=6, bbox_to_anchor=(0.55,-1.75), loc="upper left", frameon=False)
        plot_legend = False


plt.subplots_adjust(hspace=0.2, wspace=0.7)
#plt.savefig("../results/plots/overlap-hists.pdf", bbox_inches="tight")