In [1]:
%matplotlib inline
import numpy as np
import pandas as pd

from scipy.io import loadmat 

import matplotlib.pyplot as plt
import seaborn as sns
sns.reset_orig()

In [2]:
import networkx as nx

from scipy.stats import bernoulli
from scipy.special import loggamma

from sklearn.preprocessing import OneHotEncoder

import tqdm

from rpy2.robjects import numpy2ri
from rpy2.robjects.packages import importr

In [3]:
import os
import glob
import pickle

In [4]:
blockmodels = importr("blockmodels")

In [5]:
base = importr("base")

In [6]:
dollar = base.__dict__["$"]

In [7]:
pd.options.display.max_rows = 1000

In [8]:
outdir = './output'
if not os.path.exists(outdir):
    os.makedirs(outdir)

In [9]:
from functools import lru_cache

@lru_cache(maxsize = 10000)
def normterm_discrete(n, k):
    if n == 1:
        return np.log(k)
    if k == 1:
        return 1.0
    elif k == 2:
        return np.sum(sorted([ np.exp(loggamma(n+1) - loggamma(t+1) - loggamma(n-t+1) + 
                               t*(np.log(t) - np.log(n)) + (n-t)*(np.log(n-t) - np.log(n))
                        )
                        for t in range(1, n)]))
    else:
        return normterm_discrete(n, k-1) + n/(k-2) * normterm_discrete(n, k-2)

In [10]:
def calc_dnml(X, Z1, Z2, K=3, L=3):    
    N = X.shape[0]
    
    codelen_x_z = 0.0
    codelen_z = 0.0

    for k in range(K):
        for l in range(L):
            n_pos = np.sum(X[Z1 == k, :][:, Z2 == l] == 1)
            n_neg = np.sum(X[Z1 == k, :][:, Z2 == l] == 0)
            n_all = n_pos + n_neg

            if n_all >=2:
                codelen_x_z += n_all * np.log(n_all)
                codelen_x_z += np.log(normterm_discrete(n_all, 2))
            
            if n_pos >=2:
                codelen_x_z -= n_pos * np.log(n_pos)
            if n_neg >=2:
                codelen_x_z -= n_neg * np.log(n_neg)
            
        n_k = np.sum(Z1 == k)
        if n_k >= 1:
            codelen_z += n_k * (np.log(N) - np.log(n_k))

    codelen_z += np.log(normterm_discrete(N, K))
    
    codelen = codelen_x_z + codelen_z

    return codelen, codelen_x_z, codelen_z

In [11]:
def calc_lsc(X, Z1, Z2, K=3, L=3):    
    codelen = 0.0
    N = X.shape[0]

    for k in range(K):
        for l in range(L):
            n_pos = np.sum(X[Z1 == k, :][:, Z2 == l] == 1)
            n_neg = np.sum(X[Z1 == k, :][:, Z2 == l] == 0)
            n_all = n_pos + n_neg

            if n_all >=2:
                codelen += n_all * np.log(n_all)
            
            if n_pos >=2:
                codelen -= n_pos * np.log(n_pos)
            if n_neg >=2:
                codelen -= n_neg * np.log(n_neg)

        n_k = np.sum(Z1 == k)
        if n_k >= 1:
            codelen += n_k * (np.log(N) - np.log(n_k))

        codelen += (k + (k+1)*(k+2))/2 * np.log(N/(2.0*np.pi)) -(k+1)/2 * np.log(2.0) + \
           (k+1) * loggamma((k+3)/2) - loggamma((k+1)*(k+3)/2) + (k+1)*(k+2)/2 * np.log(np.pi)

    return codelen

In [12]:
def estimate_sbm(X, cluster_range_max=6):
    numpy2ri.activate()
    sbm = blockmodels.BM_bernoulli(membership_type="SBM", adj=np.array(X),
                                       verbosity=0,
                                       exploration_factor=1.5,
                                       explore_min=cluster_range_max,
                                       explore_max=cluster_range_max)

    estimate = dollar(sbm, "estimate")
    estimate()

    theta_list = []
    pi_list = []
    z_posterior_list = []
    for k in range(cluster_range_max):
        n_clusters = k + 1
        theta = np.array(dollar(dollar(sbm, "model_parameters")[k], "pi"))
        z_posterior = np.array(dollar(dollar(sbm, "memberships")[k], "Z"))
        pi = np.sum(z_posterior, axis=0) + 10 * EPS
        pi /= np.sum(pi)

        theta_list.append(theta)
        z_posterior_list.append(z_posterior)
        pi_list.append(pi)

    return pi_list, theta_list, z_posterior_list

In [13]:
cluster_range_max = 7

In [14]:
files_pairs = sorted(glob.glob('./data/enron/*.pairs'))

In [15]:
with open('addresses.txt', 'r') as f:
    addresses = np.array(f.read().splitlines())

In [16]:
n_term = len(files_pairs)
n_employee = len(addresses)
X = np.zeros((n_employee, n_employee, n_term), dtype=np.int)

for k, file in enumerate(files_pairs):
    print(file)
    try:
        sender_receiver = np.loadtxt(file, delimiter='\t', dtype=np.int)
        if len(sender_receiver.shape) == 1:
            sender_receiver = sender_receiver.reshape(1, -1)
        id_sender = sender_receiver[:, 0] - 1
        id_receiver = sender_receiver[:, 1] - 1
        X[id_sender, id_receiver, k] += 1
    except:
        continue

./data/enron/19990510.pairs
./data/enron/19990517.pairs
./data/enron/19990524.pairs
./data/enron/19990531.pairs
./data/enron/19990607.pairs
./data/enron/19990614.pairs
./data/enron/19990621.pairs
./data/enron/19990628.pairs
./data/enron/19990705.pairs
./data/enron/19990712.pairs
./data/enron/19990719.pairs
./data/enron/19990726.pairs
./data/enron/19990802.pairs
./data/enron/19990809.pairs
./data/enron/19990816.pairs
./data/enron/19990823.pairs
./data/enron/19990830.pairs
./data/enron/19990906.pairs
./data/enron/19990913.pairs
./data/enron/19990920.pairs
./data/enron/19990927.pairs
./data/enron/19991004.pairs
./data/enron/19991011.pairs
./data/enron/19991018.pairs
./data/enron/19991025.pairs
./data/enron/19991101.pairs
./data/enron/19991108.pairs
./data/enron/19991115.pairs
./data/enron/19991122.pairs
./data/enron/19991129.pairs
./data/enron/19991206.pairs
./data/enron/19991213.pairs
./data/enron/19991220.pairs
./data/enron/19991227.pairs
./data/enron/20000103.pairs
./data/enron/2000011

  
  
  


In [17]:
X.shape

(151, 151, 162)

In [18]:
with open('n_emails.pkl', 'wb') as f:
    pickle.dump(X, f)

In [19]:
X = (X > 0).astype(np.int32)

In [20]:
np.sum(X)

7444

In [21]:
EPS = np.finfo(np.float).eps

#T = X_all.shape[1]
T = 157

h = 2

pi_all_total = []
theta_all_total = []
z_all_total = []

pi_all_former = []
theta_all_former = []
z_all_former = []

pi_all_latter = []
theta_all_latter = []
z_all_latter = []
        
for t in tqdm.tqdm(range(h, T-h)):
    seed = t

    # total
    X_total = np.any(X[:, :, (t-h):(t+h)] == 1, axis=2).astype(np.int)
    print(X_total.shape)
    pi_list, theta_list, z_posterior_list = estimate_sbm(X_total, cluster_range_max)            

    pi_all_total.append(pi_list)
    theta_all_total.append(theta_list)
    z_all_total.append(z_posterior_list)

    # former
    X_former = np.any(X[:, :, (t-h):t] == 1, axis=2).astype(np.int)
    pi_list, theta_list, z_posterior_list = estimate_sbm(X_former, cluster_range_max)

    pi_all_former.append(pi_list)
    theta_all_former.append(theta_list)
    z_all_former.append(z_posterior_list)

    # latter
    X_latter = np.any(X[:, :, t:(t+h)] == 1, axis=2).astype(np.int)
    pi_list, theta_list, z_posterior_list = estimate_sbm(X_latter, cluster_range_max)

    pi_all_latter.append(pi_list)
    theta_all_latter.append(theta_list)
    z_all_latter.append(z_posterior_list)

with open(os.path.join(outdir, 'pi_total_enron.pkl'), 'wb') as f:
    pickle.dump(pi_all_total, f)
with open(os.path.join(outdir, 'theta_total_enron.pkl'), 'wb') as f:
    pickle.dump(theta_all_total, f)
with open(os.path.join(outdir, 'z_total_enron.pkl'), 'wb') as f:
    pickle.dump(z_all_total, f)

with open(os.path.join(outdir, 'pi_former_enron.pkl'), 'wb') as f:
    pickle.dump(pi_all_former, f)
with open(os.path.join(outdir, 'theta_former_enron.pkl'), 'wb') as f:
    pickle.dump(theta_all_former, f)
with open(os.path.join(outdir, 'z_former_enron.pkl'), 'wb') as f:
    pickle.dump(z_all_former, f)

with open(os.path.join(outdir, 'pi_latter_enron.pkl'), 'wb') as f:
    pickle.dump(pi_all_latter, f)
with open(os.path.join(outdir, 'theta_latter_enron.pkl'), 'wb') as f:
    pickle.dump(theta_all_latter, f)
with open(os.path.join(outdir, 'z_latter_enron.pkl'), 'wb') as f:
    pickle.dump(z_all_latter, f)

  0%|          | 0/153 [00:00<?, ?it/s]

(151, 151)





  1%|          | 1/153 [00:07<19:52,  7.85s/it]

(151, 151)





  1%|▏         | 2/153 [00:14<18:37,  7.40s/it]

(151, 151)





  2%|▏         | 3/153 [00:22<18:51,  7.54s/it]

(151, 151)





  3%|▎         | 4/153 [00:27<17:20,  6.98s/it]

(151, 151)





  3%|▎         | 5/153 [00:33<16:03,  6.51s/it]

(151, 151)





  4%|▍         | 6/153 [00:39<15:44,  6.43s/it]

(151, 151)





  5%|▍         | 7/153 [00:44<14:30,  5.97s/it]

(151, 151)





  5%|▌         | 8/153 [00:51<15:39,  6.48s/it]

(151, 151)





  6%|▌         | 9/153 [00:59<16:26,  6.85s/it]

(151, 151)





  7%|▋         | 10/153 [01:07<16:48,  7.05s/it]

(151, 151)





  7%|▋         | 11/153 [01:15<17:19,  7.32s/it]

(151, 151)





  8%|▊         | 12/153 [01:23<17:50,  7.59s/it]

(151, 151)





  8%|▊         | 13/153 [01:33<19:37,  8.41s/it]

(151, 151)





  9%|▉         | 14/153 [01:41<19:10,  8.28s/it]

(151, 151)





 10%|▉         | 15/153 [01:49<18:49,  8.19s/it]

(151, 151)





 10%|█         | 16/153 [01:56<17:49,  7.81s/it]

(151, 151)





 11%|█         | 17/153 [02:04<17:44,  7.82s/it]

(151, 151)





 12%|█▏        | 18/153 [02:13<18:34,  8.26s/it]

(151, 151)





 12%|█▏        | 19/153 [02:21<18:02,  8.07s/it]

(151, 151)





 13%|█▎        | 20/153 [02:31<19:31,  8.81s/it]

(151, 151)





 14%|█▎        | 21/153 [02:40<18:59,  8.63s/it]

(151, 151)





 14%|█▍        | 22/153 [02:47<18:05,  8.29s/it]

(151, 151)





 15%|█▌        | 23/153 [02:56<18:07,  8.37s/it]

(151, 151)





 16%|█▌        | 24/153 [03:02<16:57,  7.89s/it]

(151, 151)





 16%|█▋        | 25/153 [03:11<17:01,  7.98s/it]

(151, 151)





 17%|█▋        | 26/153 [03:16<15:03,  7.11s/it]

(151, 151)





 18%|█▊        | 27/153 [03:22<14:31,  6.92s/it]

(151, 151)





 18%|█▊        | 28/153 [03:28<13:50,  6.64s/it]

(151, 151)





 19%|█▉        | 29/153 [03:39<16:35,  8.03s/it]

(151, 151)





 20%|█▉        | 30/153 [03:48<16:54,  8.25s/it]

(151, 151)





 20%|██        | 31/153 [03:59<18:34,  9.13s/it]

(151, 151)





 21%|██        | 32/153 [04:08<18:15,  9.05s/it]

(151, 151)





 22%|██▏       | 33/153 [04:19<19:02,  9.52s/it]

(151, 151)





 22%|██▏       | 34/153 [04:26<17:40,  8.91s/it]

(151, 151)





 23%|██▎       | 35/153 [04:36<17:55,  9.12s/it]

(151, 151)





 24%|██▎       | 36/153 [04:43<16:41,  8.56s/it]

(151, 151)





 24%|██▍       | 37/153 [04:52<16:31,  8.54s/it]

(151, 151)





 25%|██▍       | 38/153 [05:00<16:18,  8.51s/it]

(151, 151)





 25%|██▌       | 39/153 [05:08<15:33,  8.19s/it]

(151, 151)





 26%|██▌       | 40/153 [05:13<13:55,  7.39s/it]

(151, 151)





 27%|██▋       | 41/153 [05:20<13:18,  7.13s/it]

(151, 151)





 27%|██▋       | 42/153 [05:25<12:23,  6.70s/it]

(151, 151)





 28%|██▊       | 43/153 [05:34<13:13,  7.22s/it]

(151, 151)





 29%|██▉       | 44/153 [09:46<2:26:49, 80.82s/it]

(151, 151)





 29%|██▉       | 45/153 [09:57<1:47:26, 59.69s/it]

(151, 151)





 30%|███       | 46/153 [10:05<1:18:51, 44.22s/it]

(151, 151)





 31%|███       | 47/153 [10:12<58:39, 33.20s/it]  

(151, 151)





 31%|███▏      | 48/153 [10:18<43:38, 24.94s/it]

(151, 151)





 32%|███▏      | 49/153 [10:27<35:01, 20.21s/it]

(151, 151)





 33%|███▎      | 50/153 [10:34<27:59, 16.31s/it]

(151, 151)





 33%|███▎      | 51/153 [10:43<23:40, 13.92s/it]

(151, 151)





 34%|███▍      | 52/153 [10:50<19:51, 11.79s/it]

(151, 151)





 35%|███▍      | 53/153 [10:59<18:31, 11.11s/it]

(151, 151)





 35%|███▌      | 54/153 [11:06<16:04,  9.74s/it]

(151, 151)





 36%|███▌      | 55/153 [11:14<15:01,  9.20s/it]

(151, 151)





 37%|███▋      | 56/153 [11:21<13:47,  8.53s/it]

(151, 151)





 37%|███▋      | 57/153 [11:31<14:38,  9.15s/it]

(151, 151)





 38%|███▊      | 58/153 [11:40<14:32,  9.18s/it]

(151, 151)





 39%|███▊      | 59/153 [11:51<15:06,  9.64s/it]

(151, 151)





 39%|███▉      | 60/153 [12:01<15:15,  9.85s/it]

(151, 151)





 40%|███▉      | 61/153 [12:09<14:12,  9.26s/it]

(151, 151)





 41%|████      | 62/153 [12:17<13:32,  8.93s/it]

(151, 151)





 41%|████      | 63/153 [12:25<12:36,  8.41s/it]

(151, 151)





 42%|████▏     | 64/153 [12:32<11:56,  8.05s/it]

(151, 151)





 42%|████▏     | 65/153 [12:41<12:22,  8.44s/it]

(151, 151)





 43%|████▎     | 66/153 [12:49<12:00,  8.28s/it]

(151, 151)





 44%|████▍     | 67/153 [12:56<11:27,  7.99s/it]

(151, 151)





 44%|████▍     | 68/153 [13:06<12:07,  8.55s/it]

(151, 151)





 45%|████▌     | 69/153 [13:15<12:12,  8.72s/it]

(151, 151)





 46%|████▌     | 70/153 [13:22<11:11,  8.09s/it]

(151, 151)





 46%|████▋     | 71/153 [13:32<11:57,  8.75s/it]

(151, 151)





 47%|████▋     | 72/153 [13:40<11:19,  8.39s/it]

(151, 151)





 48%|████▊     | 73/153 [13:48<11:00,  8.26s/it]

(151, 151)





 48%|████▊     | 74/153 [13:55<10:18,  7.83s/it]

(151, 151)





 49%|████▉     | 75/153 [14:03<10:22,  7.98s/it]

(151, 151)





 50%|████▉     | 76/153 [14:13<10:55,  8.51s/it]

(151, 151)





 50%|█████     | 77/153 [14:20<10:13,  8.07s/it]

(151, 151)





 51%|█████     | 78/153 [14:28<10:15,  8.20s/it]

(151, 151)





 52%|█████▏    | 79/153 [14:35<09:27,  7.67s/it]

(151, 151)





 52%|█████▏    | 80/153 [14:42<09:05,  7.47s/it]

(151, 151)





 53%|█████▎    | 81/153 [14:50<09:10,  7.65s/it]

(151, 151)





 54%|█████▎    | 82/153 [14:56<08:37,  7.29s/it]

(151, 151)





 54%|█████▍    | 83/153 [15:05<08:52,  7.60s/it]

(151, 151)





 55%|█████▍    | 84/153 [15:14<09:26,  8.21s/it]

(151, 151)





 56%|█████▌    | 85/153 [15:21<08:58,  7.93s/it]

(151, 151)





 56%|█████▌    | 86/153 [15:33<09:56,  8.90s/it]

(151, 151)





 57%|█████▋    | 87/153 [15:40<09:15,  8.42s/it]

(151, 151)





 58%|█████▊    | 88/153 [15:48<09:01,  8.33s/it]

(151, 151)





 58%|█████▊    | 89/153 [15:57<09:05,  8.52s/it]

(151, 151)





 59%|█████▉    | 90/153 [16:03<08:10,  7.78s/it]

(151, 151)





 59%|█████▉    | 91/153 [16:10<07:56,  7.68s/it]

(151, 151)





 60%|██████    | 92/153 [16:19<08:10,  8.05s/it]

(151, 151)





 61%|██████    | 93/153 [16:25<07:21,  7.36s/it]

(151, 151)





 61%|██████▏   | 94/153 [16:33<07:20,  7.47s/it]

(151, 151)





 62%|██████▏   | 95/153 [16:40<07:06,  7.35s/it]

(151, 151)





 63%|██████▎   | 96/153 [16:47<06:51,  7.21s/it]

(151, 151)





 63%|██████▎   | 97/153 [16:54<06:37,  7.09s/it]

(151, 151)





 64%|██████▍   | 98/153 [17:00<06:18,  6.89s/it]

(151, 151)





 65%|██████▍   | 99/153 [17:06<06:01,  6.70s/it]

(151, 151)





 65%|██████▌   | 100/153 [17:13<05:58,  6.76s/it]

(151, 151)





 66%|██████▌   | 101/153 [17:20<05:52,  6.79s/it]

(151, 151)





 67%|██████▋   | 102/153 [17:27<05:50,  6.86s/it]

(151, 151)





 67%|██████▋   | 103/153 [17:35<05:55,  7.11s/it]

(151, 151)





 68%|██████▊   | 104/153 [17:40<05:23,  6.61s/it]

(151, 151)





 69%|██████▊   | 105/153 [17:46<05:06,  6.39s/it]

(151, 151)





 69%|██████▉   | 106/153 [17:52<04:54,  6.26s/it]

(151, 151)





 70%|██████▉   | 107/153 [17:59<04:55,  6.41s/it]

(151, 151)





 71%|███████   | 108/153 [18:09<05:32,  7.38s/it]

(151, 151)





 71%|███████   | 109/153 [18:16<05:26,  7.41s/it]

(151, 151)





 72%|███████▏  | 110/153 [18:22<05:01,  7.01s/it]

(151, 151)





 73%|███████▎  | 111/153 [18:28<04:42,  6.72s/it]

(151, 151)





 73%|███████▎  | 112/153 [18:36<04:51,  7.12s/it]

(151, 151)





 74%|███████▍  | 113/153 [18:44<04:58,  7.46s/it]

(151, 151)





 75%|███████▍  | 114/153 [18:52<04:57,  7.62s/it]

(151, 151)





 75%|███████▌  | 115/153 [19:02<05:09,  8.14s/it]

(151, 151)





 76%|███████▌  | 116/153 [19:08<04:40,  7.57s/it]

(151, 151)





 76%|███████▋  | 117/153 [19:15<04:28,  7.45s/it]

(151, 151)





 77%|███████▋  | 118/153 [19:22<04:14,  7.28s/it]

(151, 151)





 78%|███████▊  | 119/153 [19:29<04:01,  7.10s/it]

(151, 151)





 78%|███████▊  | 120/153 [19:36<03:59,  7.24s/it]

(151, 151)





 79%|███████▉  | 121/153 [19:45<04:01,  7.55s/it]

(151, 151)





 80%|███████▉  | 122/153 [19:52<03:48,  7.38s/it]

(151, 151)





 80%|████████  | 123/153 [19:58<03:29,  6.99s/it]

(151, 151)





 81%|████████  | 124/153 [20:05<03:24,  7.04s/it]

(151, 151)





 82%|████████▏ | 125/153 [20:10<03:03,  6.54s/it]

(151, 151)





 82%|████████▏ | 126/153 [20:16<02:48,  6.24s/it]

(151, 151)





 83%|████████▎ | 127/153 [20:21<02:31,  5.82s/it]

(151, 151)





 84%|████████▎ | 128/153 [20:27<02:27,  5.88s/it]

(151, 151)





 84%|████████▍ | 129/153 [20:33<02:21,  5.90s/it]

(151, 151)





 85%|████████▍ | 130/153 [20:39<02:21,  6.15s/it]

(151, 151)





 86%|████████▌ | 131/153 [20:46<02:16,  6.22s/it]

(151, 151)





 86%|████████▋ | 132/153 [20:51<02:04,  5.91s/it]

(151, 151)





 87%|████████▋ | 133/153 [20:56<01:56,  5.81s/it]

(151, 151)





 88%|████████▊ | 134/153 [21:03<01:52,  5.90s/it]

(151, 151)





 88%|████████▊ | 135/153 [21:09<01:50,  6.14s/it]

(151, 151)





 89%|████████▉ | 136/153 [21:15<01:45,  6.18s/it]

(151, 151)





 90%|████████▉ | 137/153 [21:24<01:49,  6.87s/it]

(151, 151)





 90%|█████████ | 138/153 [21:30<01:41,  6.76s/it]

(151, 151)





 91%|█████████ | 139/153 [21:37<01:35,  6.80s/it]

(151, 151)





 92%|█████████▏| 140/153 [21:45<01:29,  6.91s/it]

(151, 151)





 92%|█████████▏| 141/153 [21:51<01:19,  6.65s/it]

(151, 151)





 93%|█████████▎| 142/153 [21:56<01:10,  6.41s/it]

(151, 151)





 93%|█████████▎| 143/153 [22:01<00:58,  5.90s/it]

(151, 151)





 94%|█████████▍| 144/153 [22:06<00:51,  5.70s/it]

(151, 151)





 95%|█████████▍| 145/153 [22:11<00:42,  5.32s/it]

(151, 151)





 95%|█████████▌| 146/153 [22:16<00:36,  5.22s/it]

(151, 151)





 96%|█████████▌| 147/153 [22:21<00:30,  5.07s/it]

(151, 151)





 97%|█████████▋| 148/153 [22:25<00:24,  4.91s/it]

(151, 151)





 97%|█████████▋| 149/153 [22:30<00:19,  4.86s/it]

(151, 151)





 98%|█████████▊| 150/153 [22:34<00:13,  4.60s/it]

(151, 151)





 99%|█████████▊| 151/153 [22:38<00:09,  4.56s/it]

(151, 151)





 99%|█████████▉| 152/153 [22:43<00:04,  4.60s/it]

(151, 151)





100%|██████████| 153/153 [22:48<00:00,  4.62s/it]
