In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tqdm
import pickle
import os

from rpy2.robjects import numpy2ri
from rpy2.robjects.packages import importr

In [2]:
def calc_dnml(X, Z1, Z2, K=3, L=3):    
    """Accumulate a two-part code length from a matrix and two cluster labelings.

    The result is the sum of a data-given-assignment term (``codelen_x_z``)
    and an assignment term (``codelen_z``).  The name suggests a DNML
    (presumably "decomposed normalized maximum likelihood") criterion --
    TODO confirm against the reference this was taken from.

    NOTE(review): as written this function cannot run on its own:
      * ``n_all`` is read but never assigned inside the function, and no
        definition is visible in this notebook;
      * ``normterm_discrete`` is called but not defined or imported in the
        visible cells.
    Unless both names exist as globals, the first inner-loop iteration raises
    ``NameError``.  ``lam`` is computed but never used.

    Parameters
    ----------
    X : 2-D array
        Indexed as ``X[Z1 == k, :][:, Z2 == l]``, so ``Z1`` masks rows and
        ``Z2`` masks columns.
    Z1 : array
        Cluster label per row of ``X``; compared against ``0 .. K-1``.
    Z2 : array
        Cluster label per column of ``X``; compared against ``0 .. L-1``.
    K : int, default 3
        Number of row clusters iterated over.
    L : int, default 3
        Number of column clusters iterated over.

    Returns
    -------
    tuple
        ``(codelen, codelen_x_z, codelen_z)`` where
        ``codelen = codelen_x_z + codelen_z``.
    """
    # Number of rows of X; used as the total count in the assignment term.
    N = X.shape[0]
    
    codelen_x_z = 0.0
    codelen_z = 0.0

    for k in range(K):
        for l in range(L):
            # Mean of the (k, l) block.  NOTE(review): never used afterwards.
            lam = np.mean(X[Z1 == k, :][:, Z2 == l])

            # NOTE(review): `n_all` is undefined here -- presumably a per-block
            # count that was dropped when this function was adapted; verify.
            codelen_x_z += n_all * np.log(n_all)
            codelen_x_z += np.log(normterm_discrete(n_all, 2))

        # Size of row cluster k; empty clusters are skipped to avoid log(0).
        n_k = np.sum(Z1 == k)
        if n_k >= 1:
            codelen_z += n_k * (np.log(N) - np.log(n_k))

    # Log of the normalizing term for N items over K categories, as returned
    # by the external helper `normterm_discrete` (not visible here).
    codelen_z += np.log(normterm_discrete(N, K))
    
    codelen = codelen_x_z + codelen_z

    return codelen, codelen_x_z, codelen_z

In [3]:
# Directory that receives the pickled results written by later cells.
outdir = 'output/'
# exist_ok=True keeps the cell idempotent on re-run and removes the
# check-then-create race of the `os.path.exists` + `os.makedirs` pair.
os.makedirs(outdir, exist_ok=True)

In [4]:
# Handles to the R packages used through rpy2.
base = importr("base")
blockmodels = importr("blockmodels")
# R's `$` accessor: `$` is not a valid Python attribute name, so it is looked
# up by key in the package object's namespace dictionary.
dollar = vars(base)["$"]

In [5]:
# Machine epsilon of the default float type.  The `np.float` alias was
# deprecated in NumPy 1.20 and removed in 1.24; the builtin `float` is the
# type it pointed to, so the value is unchanged.
EPS = np.finfo(float).eps

In [6]:
# read data
# Load the processed CSV into the DataFrame `d`; later cells select its
# prefecture columns by label and group its rows by the 'Unnamed: 0' column.
d = pd.read_csv('data_processed/FEH_200504_201503.csv')

In [7]:
# Preview the first three rows of the raw table.
d.head(3)

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,北海道,青森県,岩手県,宮城県,秋田県,山形県,...,愛媛県,高知県,福岡県,佐賀県,長崎県,熊本県,大分県,宮崎県,鹿児島県,沖縄県
0,2015000303,2015年3月,1000,北海道,-,400,146,472,128,102,...,49,31,304,24,48,70,30,29,39,129
1,2015000303,2015年3月,2000,青森県,535,-,536,942,222,95,...,2,1,41,7,27,17,12,3,15,139
2,2015000303,2015年3月,3000,岩手県,231,421,-,1309,215,122,...,2,8,33,2,4,6,4,4,9,8


In [8]:
# Labels of the columns from '北海道' through '沖縄県' (a `.loc` label slice
# includes both endpoints).  Reused below when cleaning the count columns.
columns = d.loc[:, '北海道':'沖縄県'].columns

In [9]:
# Display the selected column labels.
columns

Index(['北海道', '青森県', '岩手県', '宮城県', '秋田県', '山形県', '福島県', '茨城県', '栃木県', '群馬県',
       '埼玉県', '千葉県', '東京都', '神奈川県', '新潟県', '富山県', '石川県', '福井県', '山梨県', '長野県',
       '岐阜県', '静岡県', '愛知県', '三重県', '滋賀県', '京都府', '大阪府', '兵庫県', '奈良県', '和歌山県',
       '鳥取県', '島根県', '岡山県', '広島県', '山口県', '徳島県', '香川県', '愛媛県', '高知県', '福岡県',
       '佐賀県', '長崎県', '熊本県', '大分県', '宮崎県', '鹿児島県', '沖縄県'],
      dtype='object')

In [10]:
# Convert every selected prefecture column from text to integer counts.
#  * '-' appears in the preview where origin and destination coincide and is
#    mapped to 0;
#  * ',' is stripped (presumably a thousands separator -- verify against the
#    raw file).
# `astype(str)` first makes the cell safe to re-run after the columns are
# already integers; `regex=False` makes the literal replacement explicit
# instead of relying on the pandas-version-dependent default.  `np.int` was
# removed in NumPy 1.24; the builtin `int` is the type it aliased.
for cn in columns:
    d[cn] = (
        d[cn]
        .astype(str)
        .str.replace('-', '0', regex=False)
        .str.replace(',', '', regex=False)
        .astype(int)
    )

In [11]:
# Distinct values of the 'Unnamed: 0' column (np.unique returns them sorted).
# Each value keys one snapshot matrix built below; presumably a year-month
# code such as 2015000303 seen in the preview -- TODO confirm.
YM_list = np.unique(d['Unnamed: 0'].values)

In [12]:
# Persist the snapshot keys next to the other results in `outdir`.
with open(os.path.join(outdir, 'ym_list.pkl'), mode='wb') as ym_file:
    pickle.dump(YM_list, ym_file)

In [13]:
# Build one integer count matrix per distinct 'Unnamed: 0' value.
d_list = []
for ym in YM_list:
    # Select the prefecture columns by label (`columns`) rather than by the
    # position `iloc[:, 4:]`: judging by the preview header, position 4 is the
    # text column 'Unnamed: 3', which would break the float conversion.  When
    # positions 4.. are exactly the prefecture columns the result is the same.
    d_ym = d.loc[d['Unnamed: 0'] == ym, columns]

    # `np.float` / `np.int` were removed in NumPy 1.24; the builtins are the
    # types they aliased.
    d_ym_values = d_ym.values.astype(float)
    # Treat missing entries as zero counts before the integer cast.
    d_ym_values[np.isnan(d_ym_values)] = 0.0
    d_ym_values = d_ym_values.astype(int)

    # NOTE(review): rows are taken in file order; this assumes each snapshot
    # lists origins in the same order as the columns -- verify.
    d_list.append(d_ym_values)

In [14]:
# Stack the per-snapshot matrices into one array; the fitting loop below
# indexes it as d_list_array[t, :, :].
d_list_array = np.array(d_list)

In [15]:
# Save the stacked snapshot array.  Note this goes beside the input data in
# 'data_processed/', not into `outdir`.
with open('data_processed/X.pkl', mode='wb') as x_file:
    pickle.dump(d_list_array, x_file)

In [16]:
def estimate_poisson(X, K):
    """Fit Poisson stochastic block models to one matrix via R's ``blockmodels``.

    Builds a ``BM_poisson`` object with ``membership_type="SBM"``, runs its
    ``estimate`` method, and reads back the first ``K`` fitted models.

    Parameters
    ----------
    X : array-like
        Matrix passed as ``adj`` after conversion with ``np.array``.
    K : int
        Used as both ``explore_min`` and ``explore_max`` of the fit, and as
        the number of models read back (list positions ``0 .. K-1``).

    Returns
    -------
    pi_list : list of numpy.ndarray
        For each model, the column sums of its membership matrix plus
        ``10 * EPS``, normalized to sum to one.
    theta_list : list of numpy.ndarray
        For each model, the ``lambda`` entry of its ``model_parameters``.
    z_posterior_list : list of numpy.ndarray
        For each model, the ``Z`` entry of its ``memberships``.

    Notes
    -----
    Position ``k`` in each list is read from the ``k``-th element of the R
    result; presumably that is the model with ``k + 1`` groups -- TODO confirm
    against the blockmodels documentation.

    The global numpy <-> R conversion is switched on for the duration of the
    call and is always switched off again, even when the R fit raises.
    """
    numpy2ri.activate()
    try:
        sbm = blockmodels.BM_poisson(membership_type="SBM", adj=np.array(X),
                                     verbosity=0,
                                     exploration_factor=1.5,
                                     explore_min=K,
                                     explore_max=K)

        estimate = dollar(sbm, "estimate")
        estimate()

        # Read the per-model result lists once instead of on every iteration.
        model_parameters = dollar(sbm, "model_parameters")
        memberships = dollar(sbm, "memberships")

        pi_list = []
        theta_list = []
        z_posterior_list = []

        for k in range(K):
            theta = np.array(dollar(model_parameters[k], "lambda"))
            z_posterior = np.array(dollar(memberships[k], "Z"))

            # Small offset keeps every proportion strictly positive even when
            # a group receives no mass.
            pi = np.sum(z_posterior, axis=0) + 10 * EPS
            pi /= np.sum(pi)

            theta_list.append(theta)
            z_posterior_list.append(z_posterior)
            pi_list.append(pi)
    finally:
        # Always restore the conversion state; otherwise a failed fit would
        # leave numpy2ri active for the rest of the session.
        numpy2ri.deactivate()

    return pi_list, theta_list, z_posterior_list

In [17]:
# Number of snapshots to fit (one per entry of YM_list).
T = len(YM_list)
# NOTE(review): `h` is not used in any visible cell; kept for compatibility.
h = 2
# Upper bound on the number of groups explored per snapshot.
K = 10

theta_list = []
pi_list = []
z_list = []

for t in tqdm.tqdm(range(T)):
    # Use a dedicated name for the snapshot matrix.  The original bound it to
    # `d`, which overwrote the DataFrame loaded earlier and made the
    # preprocessing cells impossible to re-run without reloading the CSV.
    X_t = d_list_array[t, :, :]

    pi, theta, z = estimate_poisson(X_t, K=K)
    pi_list.append(pi)
    theta_list.append(theta)
    z_list.append(z)

  0%|          | 0/120 [00:00<?, ?it/s]






  1%|          | 1/120 [00:02<05:19,  2.69s/it]






  2%|▏         | 2/120 [00:06<05:42,  2.90s/it]






  2%|▎         | 3/120 [00:08<05:38,  2.89s/it]






  3%|▎         | 4/120 [00:11<05:18,  2.75s/it]






  4%|▍         | 5/120 [00:14<05:39,  2.95s/it]






  5%|▌         | 6/120 [00:18<05:58,  3.15s/it]






  6%|▌         | 7/120 [00:21<05:55,  3.15s/it]






  7%|▋         | 8/120 [00:24<05:35,  2.99s/it]






  8%|▊         | 9/120 [00:26<05:19,  2.87s/it]






  8%|▊         | 10/120 [00:30<05:58,  3.26s/it]






  9%|▉         | 11/120 [00:33<05:34,  3.07s/it]






 10%|█         | 12/120 [00:36<05:17,  2.94s/it]






 11%|█         | 13/120 [00:38<05:02,  2.83s/it]






 12%|█▏        | 14/120 [00:42<05:28,  3.10s/it]






 12%|█▎        | 15/120 [00:45<05:15,  3.01s/it]






 13%|█▎        | 16/120 [00:48<05:14,  3.02s/it]






 14%|█▍        | 17/120 [00:51<05:24,  3.15s/it]






 15%|█▌        | 18/120 [00:55<05:32,  3.26s/it]






 16%|█▌        | 19/120 [00:58<05:31,  3.29s/it]






 17%|█▋        | 20/120 [01:00<04:55,  2.96s/it]






 18%|█▊        | 21/120 [01:02<04:26,  2.69s/it]






 18%|█▊        | 22/120 [01:04<04:04,  2.49s/it]






 19%|█▉        | 23/120 [01:07<03:56,  2.44s/it]






 20%|██        | 24/120 [01:09<04:00,  2.51s/it]






 21%|██        | 25/120 [01:13<04:16,  2.69s/it]






 22%|██▏       | 26/120 [01:17<04:52,  3.11s/it]






 22%|██▎       | 27/120 [01:20<04:57,  3.20s/it]






 23%|██▎       | 28/120 [01:24<05:02,  3.29s/it]






 24%|██▍       | 29/120 [01:27<05:10,  3.41s/it]






 25%|██▌       | 30/120 [01:30<04:48,  3.20s/it]






 26%|██▌       | 31/120 [01:33<04:30,  3.04s/it]






 27%|██▋       | 32/120 [01:35<04:05,  2.79s/it]






 28%|██▊       | 33/120 [01:37<03:46,  2.60s/it]






 28%|██▊       | 34/120 [01:40<03:50,  2.68s/it]






 29%|██▉       | 35/120 [01:42<03:31,  2.49s/it]






 30%|███       | 36/120 [01:44<03:28,  2.48s/it]






 31%|███       | 37/120 [01:49<04:18,  3.12s/it]






 32%|███▏      | 38/120 [01:52<04:09,  3.05s/it]






 32%|███▎      | 39/120 [01:55<04:18,  3.19s/it]






 33%|███▎      | 40/120 [01:59<04:17,  3.21s/it]






 34%|███▍      | 41/120 [02:01<03:53,  2.96s/it]






 35%|███▌      | 42/120 [02:05<04:11,  3.22s/it]






 36%|███▌      | 43/120 [02:09<04:18,  3.36s/it]






 37%|███▋      | 44/120 [02:12<04:07,  3.26s/it]






 38%|███▊      | 45/120 [02:14<03:43,  2.98s/it]






 38%|███▊      | 46/120 [02:16<03:32,  2.87s/it]






 39%|███▉      | 47/120 [02:19<03:29,  2.87s/it]






 40%|████      | 48/120 [02:22<03:21,  2.80s/it]






 41%|████      | 49/120 [02:25<03:15,  2.75s/it]






 42%|████▏     | 50/120 [02:30<04:00,  3.44s/it]






 42%|████▎     | 51/120 [02:33<03:47,  3.30s/it]






 43%|████▎     | 52/120 [02:35<03:27,  3.05s/it]






 44%|████▍     | 53/120 [02:37<03:03,  2.73s/it]






 45%|████▌     | 54/120 [02:41<03:15,  2.96s/it]






 46%|████▌     | 55/120 [02:43<03:09,  2.92s/it]






 47%|████▋     | 56/120 [02:46<03:00,  2.81s/it]






 48%|████▊     | 57/120 [02:50<03:23,  3.24s/it]






 48%|████▊     | 58/120 [02:53<03:06,  3.02s/it]






 49%|████▉     | 59/120 [02:55<02:46,  2.72s/it]






 50%|█████     | 60/120 [02:58<02:50,  2.85s/it]






 51%|█████     | 61/120 [03:01<02:55,  2.97s/it]






 52%|█████▏    | 62/120 [03:04<02:53,  2.98s/it]






 52%|█████▎    | 63/120 [03:07<02:47,  2.94s/it]






 53%|█████▎    | 64/120 [03:11<03:08,  3.36s/it]






 54%|█████▍    | 65/120 [03:15<03:06,  3.38s/it]






 55%|█████▌    | 66/120 [03:18<03:02,  3.37s/it]






 56%|█████▌    | 67/120 [03:22<03:09,  3.58s/it]






 57%|█████▋    | 68/120 [03:25<02:57,  3.41s/it]






 57%|█████▊    | 69/120 [03:28<02:49,  3.32s/it]






 58%|█████▊    | 70/120 [03:31<02:38,  3.18s/it]






 59%|█████▉    | 71/120 [03:34<02:25,  2.98s/it]






 60%|██████    | 72/120 [03:36<02:18,  2.88s/it]






 61%|██████    | 73/120 [03:39<02:13,  2.84s/it]






 62%|██████▏   | 74/120 [03:43<02:26,  3.18s/it]






 62%|██████▎   | 75/120 [03:46<02:18,  3.08s/it]






 63%|██████▎   | 76/120 [03:49<02:10,  2.96s/it]






 64%|██████▍   | 77/120 [03:53<02:29,  3.47s/it]






 65%|██████▌   | 78/120 [03:56<02:21,  3.37s/it]






 66%|██████▌   | 79/120 [04:00<02:20,  3.42s/it]






 67%|██████▋   | 80/120 [04:03<02:11,  3.28s/it]






 68%|██████▊   | 81/120 [04:06<02:04,  3.19s/it]






 68%|██████▊   | 82/120 [04:09<02:03,  3.24s/it]






 69%|██████▉   | 83/120 [04:12<01:59,  3.22s/it]






 70%|███████   | 84/120 [04:16<02:02,  3.40s/it]






 71%|███████   | 85/120 [04:19<01:51,  3.18s/it]






 72%|███████▏  | 86/120 [04:23<02:00,  3.55s/it]






 72%|███████▎  | 87/120 [04:27<01:58,  3.58s/it]






 73%|███████▎  | 88/120 [04:30<01:45,  3.31s/it]






 74%|███████▍  | 89/120 [04:34<01:48,  3.51s/it]






 75%|███████▌  | 90/120 [04:38<01:57,  3.91s/it]






 76%|███████▌  | 91/120 [04:42<01:47,  3.71s/it]






 77%|███████▋  | 92/120 [04:44<01:34,  3.37s/it]






 78%|███████▊  | 93/120 [04:48<01:37,  3.61s/it]






 78%|███████▊  | 94/120 [04:52<01:30,  3.46s/it]






 79%|███████▉  | 95/120 [04:54<01:22,  3.30s/it]






 80%|████████  | 96/120 [04:57<01:11,  2.99s/it]






 81%|████████  | 97/120 [04:59<01:04,  2.81s/it]






 82%|████████▏ | 98/120 [05:03<01:06,  3.01s/it]






 82%|████████▎ | 99/120 [05:05<01:01,  2.91s/it]






 83%|████████▎ | 100/120 [05:10<01:06,  3.33s/it]






 84%|████████▍ | 101/120 [05:12<00:59,  3.14s/it]






 85%|████████▌ | 102/120 [05:15<00:56,  3.16s/it]






 86%|████████▌ | 103/120 [05:18<00:50,  2.97s/it]






 87%|████████▋ | 104/120 [05:21<00:46,  2.89s/it]






 88%|████████▊ | 105/120 [05:23<00:42,  2.80s/it]






 88%|████████▊ | 106/120 [05:27<00:42,  3.04s/it]






 89%|████████▉ | 107/120 [05:30<00:38,  2.94s/it]






 90%|█████████ | 108/120 [05:33<00:35,  2.99s/it]






 91%|█████████ | 109/120 [05:36<00:34,  3.15s/it]






 92%|█████████▏| 110/120 [05:40<00:32,  3.26s/it]






 92%|█████████▎| 111/120 [05:44<00:31,  3.48s/it]






 93%|█████████▎| 112/120 [05:47<00:27,  3.41s/it]






 94%|█████████▍| 113/120 [05:49<00:21,  3.09s/it]






 95%|█████████▌| 114/120 [05:54<00:20,  3.46s/it]






 96%|█████████▌| 115/120 [05:58<00:18,  3.67s/it]






 97%|█████████▋| 116/120 [06:01<00:14,  3.51s/it]






 98%|█████████▊| 117/120 [06:04<00:09,  3.28s/it]






 98%|█████████▊| 118/120 [06:07<00:06,  3.21s/it]






 99%|█████████▉| 119/120 [06:09<00:03,  3.07s/it]






100%|██████████| 120/120 [06:12<00:00,  3.02s/it]


In [18]:
# Write each per-snapshot result list to its own pickle inside `outdir`.
for filename, payload in (('pi_list.pkl', pi_list),
                          ('theta_list.pkl', theta_list),
                          ('z_list.pkl', z_list)):
    with open(os.path.join(outdir, filename), 'wb') as f:
        pickle.dump(payload, f)