# Dataset Email EU Core

Import libs

In [None]:
from matplotlib import pyplot as plt
import networkx as nx
import contextlib
import numpy as np
import pandas as pd
import itertools
import torch
import os

import scipy.stats

import lib
from lib.preprocessing import Dataset
from lib.models import WoldModelVariational
from gb import GrangerBusca

# Set cells width for nicer output.
# `display` and `HTML` come from the public `IPython.display` module: importing
# `display` from `IPython.core.display` is deprecated since IPython 7.14.
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

---

## Load the dataset

The dataset must first be downloaded from the SNAP dataset repository: <https://snap.stanford.edu/data/email-Eu-core-temporal.html>

    wget https://snap.stanford.edu/data/email-Eu-core-temporal.txt.gz

Set the input path to the location of the downloaded dataset file

In [None]:
# Path to the gzipped dataset file; relative, so it is resolved against the
# current working directory of the notebook.
INPUT_PATH = "email-Eu-core-temporal.txt.gz"

Load the dataset

In [None]:
# Load and preprocess the dataset file with the project's `Dataset` helper.
# NOTE(review): `top=100` presumably keeps only the 100 most active nodes —
# confirm against `lib.preprocessing.Dataset`.
dataset = Dataset(INPUT_PATH, top=100)

Quick exploration 

In [None]:
# Three side-by-side histograms summarising the per-dimension event sequences
fig, axs = plt.subplots(1, 3, figsize=(18, 3.5))
ax_count, ax_first, ax_last = axs

# The two time-based panels share the same 20 bin edges spanning [0, end_time]
end_time = dataset.end_time
time_bins = np.linspace(0, end_time, 20)

# Left panel: how many events each dimension has
ax_count.hist([len(events) for events in dataset.timestamps])
ax_count.set_xlabel('Number of events \n in each dim')

# Middle panel: smallest timestamp of each dimension
ax_first.hist([min(events) for events in dataset.timestamps], bins=time_bins)
ax_first.set_xlabel('First of event \n in each dim')

# Right panel: largest timestamp of each dimension
ax_last.hist([max(events) for events in dataset.timestamps], bins=time_bins)
ax_last.set_xlabel('Last of events \n in each dim')

# Title with the dataset file name; the trailing `;` hides the cell's text output
fig.suptitle(f"Dataset {os.path.basename(INPUT_PATH)}");

Print statistics of the dataset

In [None]:
# Per-dimension event counts, summarised in the table printed at the end
num_jumps_per_dim = np.array([len(events) for events in dataset.timestamps])

num_dims = len(dataset.timestamps)
num_events = sum(len(events) for events in dataset.timestamps)

# Percentage of non-zero entries: number of edges over the squared number of nodes
num_nodes = dataset.graph.number_of_nodes()
pct_nonzero = 100 * dataset.graph.number_of_edges() / (num_nodes ** 2)

print(f"Num. of dimensions: {num_dims:,d}")
print(f"    Num. of events: {num_events:,d}")
print(f"               %NZ: {pct_nonzero:.2f}%")
print()
print("Stats. of num. of events per dim:")
events_summary = pd.Series(num_jumps_per_dim).describe()
print(events_summary)

Build ground truth adjacency matrix

In [None]:
# Dense adjacency matrix of the ground-truth graph, with rows and columns
# ordered by dimension index 0 .. dataset.dim - 1
node_order = range(dataset.dim)
sparse_adjacency = nx.adjacency_matrix(dataset.graph, nodelist=node_order)
adjacency_true = sparse_adjacency.toarray()

# Rescale all entries by the largest one
largest_entry = adjacency_true.max()
adjacency_true = adjacency_true / largest_entry