In [1]:
import numpy as np
import pandas as pd
import networkx as nx

import matplotlib.pyplot as plt
import seaborn as sns
sns.reset_orig()  # seaborn docs: restores matplotlib rc params to their original settings

from scipy.stats import bernoulli
from scipy.special import loggamma

import tqdm

In [2]:
import os
import pickle

In [3]:
# Let pandas render up to 200 rows and 200 columns before truncating output.
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_columns", 200)

In [4]:
# Directory that receives the pickled datasets written further down.
outdir = './data/'
# exist_ok=True makes creation idempotent without a separate (racy) existence check.
os.makedirs(outdir, exist_ok=True)

In [5]:
def generate_data_random(N=100, K=2, alpha1=1.0, alpha2=1.0, a0=0.5, b0=0.5, 
                         pi1=None, pi2=None, theta=None, seed=0):
    """Draw an initial N x N binary matrix from a K-block model.

    Parameters
    ----------
    N : int
        Number of nodes (rows/columns of ``X``).
    K : int
        Number of clusters.
    alpha1, alpha2 : float
        Symmetric Dirichlet concentrations used when ``pi1`` / ``pi2`` have
        to be drawn.
    a0, b0 : float
        Beta parameters used to draw ``theta`` when it is not supplied.
    pi1, pi2 : array-like of length K, optional
        Mixing proportions.  When omitted they are drawn from the Dirichlet
        and sorted in ascending order.  A supplied vector is used for the
        first assignment draw; it is replaced by a fresh Dirichlet draw only
        if that assignment leaves some cluster empty.
    theta : (K, K) array, optional
        Block connection probabilities; drawn from Beta(a0, b0) if omitted.
    seed : int
        Seed passed to ``np.random.seed`` (global NumPy RNG).

    Returns
    -------
    X : (N, N) int ndarray
        ``X[i, j]`` is a Bernoulli draw with probability ``theta[k, l]``
        where ``k`` / ``l`` are the ``Z1_true`` clusters of ``i`` / ``j``.
    Z1_true, Z2_true : (N, K) int ndarray
        One-hot assignments with every cluster non-empty.  Only ``Z1_true``
        is used to build ``X``; ``Z2_true`` is just returned.
    pi1, pi2, theta
        The parameters actually used.
    """
    np.random.seed(seed)
    # Symmetric Dirichlet concentration vectors of length K.
    alpha1 = alpha1 * np.ones(K)
    alpha2 = alpha2 * np.ones(K)

    if pi1 is None:
        pi1 = np.sort(np.random.dirichlet(alpha1, 1).ravel())

    # Redraw proportions and assignments until no cluster is empty.
    Z1_true = np.random.multinomial(1, pi1, N)
    while not np.all(np.sum(Z1_true, axis=0) >= 1):
        pi1 = np.sort(np.random.dirichlet(alpha1, 1).ravel())
        Z1_true = np.random.multinomial(1, pi1, N)

    if pi2 is None:
        pi2 = np.sort(np.random.dirichlet(alpha2, 1).ravel())
        # The all-zero placeholder makes the loop below run at least once, so
        # for pi2=None the RNG call order stays: one Dirichlet draw here, then
        # Dirichlet + multinomial per loop pass.
        Z2_true = np.zeros((N, K))
    else:
        # Honour a caller-supplied pi2 instead of unconditionally redrawing it.
        Z2_true = np.random.multinomial(1, pi2, N)
    while not np.all(np.sum(Z2_true, axis=0) >= 1):
        pi2 = np.sort(np.random.dirichlet(alpha2, 1).ravel())
        Z2_true = np.random.multinomial(1, pi2, N)

    if theta is None:
        theta = np.random.beta(a0, b0, (K, K))

    # Builtin int replaces the removed `np.int` alias (same dtype).
    X = np.zeros((N, N), dtype=int)
    for k in range(K):
        for l in range(K):
            # Mask of (i, j) pairs with i in cluster k and j in cluster l.
            point = (Z1_true[:, [k]] * Z1_true[:, l]).astype(bool)
            X[point] = bernoulli(p=theta[k, l]).rvs(size=np.sum(point))

    return X, Z1_true, Z2_true, pi1, pi2, theta

In [6]:
def generate_data_sequential(X, z, theta=None, n_trans=None, idx_before=None, ratio=0.1, seed=0):
    """Move some nodes to another cluster and resample their rows/columns.

    Two modes, selected by the arguments:

    * ``n_trans`` and ``idx_before`` both ``None``: for every ordered pair of
      distinct clusters ``(i, j)``, ``n_change = int(ratio * smallest cluster
      size)`` nodes of cluster ``i`` (disjoint across ``j``) are moved to
      ``j``.  ``n_change`` can be 0, in which case nothing changes.
    * both given: ``n_trans`` randomly chosen nodes of cluster ``idx_before``
      are moved to the last cluster.

    Parameters
    ----------
    X : (N, N) ndarray
        Current binary matrix (not modified).
    z : (N, K) ndarray
        Current assignments (not modified); rows are assumed to be one-hot.
    theta : (K, K) array
        Block connection probabilities; required whenever any node moves.
    n_trans, idx_before : int, optional
        See above; must be given together.
    ratio : float
        Fraction of the smallest cluster moved per ordered cluster pair.
    seed : int
        Seed passed to ``np.random.seed`` (global NumPy RNG).

    Returns
    -------
    X_new, z_new : ndarray
        Updated copies of ``X`` and ``z``.

    Raises
    ------
    ValueError
        If exactly one of ``n_trans`` / ``idx_before`` is given.
    """
    np.random.seed(seed)

    N = X.shape[0]
    K = z.shape[1]

    z_new = z.copy()
    if n_trans is None and idx_before is None:
        n_change = int(ratio * np.min(np.sum(z, axis=0)))
        idx_change = []
        for i in range(K):
            # Membership is taken from the original z, so nodes that arrived
            # in cluster i during this call are never moved again.
            idxes_i = np.where(z[:, i] == 1)[0]
            idxes_used = []
            for j in range(K):
                if i == j:
                    continue
                i_change = np.random.choice(np.setdiff1d(idxes_i, idxes_used), n_change, replace=False)
                idxes_used += list(i_change)
                # Builtin int replaces the removed `np.int` alias.
                z_replace = np.zeros(K, dtype=int)
                z_replace[j] = 1
                z_new[i_change, :] = z_replace
            idx_change += idxes_used
    elif n_trans is not None and idx_before is not None:
        idx_change = np.random.choice(np.where(z[:, idx_before] == 1)[0], n_trans, replace=False)
        z_replace = np.zeros(K)
        z_replace[-1] = 1
        z_new[idx_change, :] = z_replace
    else:
        # Without this the function would fail later on an undefined idx_change.
        raise ValueError("n_trans and idx_before must be given together or both omitted")

    X_new = X.copy()

    # Cluster label of every node after the move (rows assumed one-hot).
    labels = np.argmax(z_new, axis=1)
    for i in idx_change:
        k_i = labels[i]
        for j in range(N):
            k_j = labels[j]
            # [0] extracts the scalar: assigning a size-1 array to a single
            # element is deprecated in recent NumPy.
            X_new[i, j] = bernoulli(p=theta[k_i, k_j]).rvs(size=1)[0]
            X_new[j, i] = bernoulli(p=theta[k_j, k_i]).rvs(size=1)[0]

    return X_new, z_new

In [7]:
def generate_data_sequential_randomtrans(X, z, theta=None, ratio=0.1, seed=0):
    """Exchange a few nodes between every pair of clusters and resample their edges.

    For every ordered pair of distinct clusters ``(i, j)``,
    ``n_change = int(ratio * smallest cluster size)`` nodes of cluster ``i``
    (disjoint across ``j``) are reassigned to ``j``.  ``n_change`` can be 0,
    in which case ``X`` and ``z`` are returned as unchanged copies.

    Parameters
    ----------
    X : (N, N) ndarray
        Current binary matrix (not modified).
    z : (N, K) ndarray
        Current assignments (not modified); rows are assumed to be one-hot.
    theta : (K, K) array
        Block connection probabilities; required whenever any node moves.
    ratio : float
        Fraction of the smallest cluster moved per ordered cluster pair.
    seed : int
        Seed passed to ``np.random.seed`` (global NumPy RNG).

    Returns
    -------
    X_new, z_new : ndarray
        Updated copies of ``X`` and ``z``.
    """
    np.random.seed(seed)

    N = X.shape[0]
    K = z.shape[1]

    n_change = int(ratio * np.min(np.sum(z, axis=0)))
    z_new = z.copy()
    idx_change = []
    for i in range(K):
        # Membership is taken from the original z, so nodes that arrived in
        # cluster i during this call are never moved again.
        idxes_i = np.where(z[:, i] == 1)[0]
        idxes_used = []
        for j in range(K):
            if i == j:
                continue
            i_change = np.random.choice(np.setdiff1d(idxes_i, idxes_used), n_change, replace=False)
            idxes_used += list(i_change)
            # Builtin int replaces the removed `np.int` alias.
            z_replace = np.zeros(K, dtype=int)
            z_replace[j] = 1
            z_new[i_change, :] = z_replace
        idx_change += idxes_used

    X_new = X.copy()
    # Cluster label of every node after the move (rows assumed one-hot).
    labels = np.argmax(z_new, axis=1)
    for i in idx_change:
        k_i = labels[i]
        for j in range(N):
            k_j = labels[j]
            # [0] extracts the scalar: assigning a size-1 array to a single
            # element is deprecated in recent NumPy.
            X_new[i, j] = bernoulli(p=theta[k_i, k_j]).rvs(size=1)[0]
            X_new[j, i] = bernoulli(p=theta[k_j, k_i]).rvs(size=1)[0]

    return X_new, z_new

In [8]:
def generate_data_abrupt_pi(X, z, theta=None, n_trans=None, idx_before=None, idx_after=-1, seed=0):
    """Move ``n_trans`` nodes from one cluster to another in a single step.

    ``n_trans`` nodes are drawn without replacement from cluster
    ``idx_before`` and reassigned to cluster ``idx_after``; the rows and
    columns of ``X`` belonging to the moved nodes are then resampled from
    ``theta`` using the new assignments.

    Parameters
    ----------
    X : (N, N) ndarray
        Current binary matrix (not modified).
    z : (N, K) ndarray
        Current assignments (not modified); rows are assumed to be one-hot.
    theta : (K, K) array
        Block connection probabilities; required whenever any node moves.
    n_trans : int
        Number of nodes to move (0 is allowed and changes nothing).
    idx_before, idx_after : int
        Source and destination cluster indices (``idx_after`` defaults to the
        last cluster).
    seed : int
        Seed passed to ``np.random.seed`` (global NumPy RNG).

    Returns
    -------
    X_new, z_new : ndarray
        Updated copies of ``X`` and ``z``.

    Raises
    ------
    ValueError
        If ``n_trans`` or ``idx_before`` is left at ``None``.
    """
    if n_trans is None or idx_before is None:
        # The None defaults only exist for signature symmetry with the other
        # generators; fail clearly instead of deep inside NumPy indexing.
        raise ValueError("n_trans and idx_before are required")

    np.random.seed(seed)

    N = X.shape[0]

    z_new = z.copy()
    idx_change = np.random.choice(np.where(z[:, idx_before] == 1)[0], n_trans, replace=False)
    z_replace = np.zeros(z_new.shape[1], dtype=z_new.dtype)
    z_replace[idx_after] = 1
    z_new[idx_change, :] = z_replace

    X_new = X.copy()

    # Cluster label of every node after the move (rows assumed one-hot).
    labels = np.argmax(z_new, axis=1)
    for i in idx_change:
        k_i = labels[i]
        for j in range(N):
            k_j = labels[j]
            # [0] extracts the scalar: assigning a size-1 array to a single
            # element is deprecated in recent NumPy.
            X_new[i, j] = bernoulli(p=theta[k_i, k_j]).rvs(size=1)[0]
            X_new[j, i] = bernoulli(p=theta[k_j, k_i]).rvs(size=1)[0]

    return X_new, z_new

In [9]:
def generate_data_abrupt_theta(X, z, theta2, seed=0):
    """Resample the whole matrix from new block probabilities ``theta2``.

    Parameters
    ----------
    X : (N, N) ndarray
        Only its first dimension is read (to size the output); its values
        are not used.
    z : (N, K) ndarray
        One-hot cluster assignments.
    theta2 : (K, K) array
        New block connection probabilities.
    seed : int
        Seed passed to ``np.random.seed`` (global NumPy RNG).

    Returns
    -------
    X_new : (N, N) int ndarray
        ``X_new[i, j]`` is a Bernoulli draw with probability
        ``theta2[k, l]`` for ``i`` in cluster ``k`` and ``j`` in cluster ``l``.
    """
    np.random.seed(seed)

    N = X.shape[0]
    K = z.shape[1]

    # Builtin int replaces the removed `np.int` alias (same dtype).
    X_new = np.zeros((N, N), dtype=int)

    for k in range(K):
        for l in range(K):
            # Mask of (i, j) pairs with i in cluster k and j in cluster l.
            point = (z[:, [k]] * z[:, l]).astype(bool)
            X_new[point] = bernoulli(p=theta2[k, l]).rvs(size=np.sum(point))

    return X_new

In [10]:
def generate_data_split(X, z, 
                        K_former, K_latter, 
                        pi_former=None, theta_latter=None, 
                        a0=None, b0=None, seed=0, theta_former=None):
    """Split the cluster with the largest ``pi_former`` entry into two.

    Half of the members (``len // 2``, chosen at random) of cluster
    ``argmax(pi_former)`` are moved to a new last cluster, and the rows and
    columns of ``X`` belonging to the moved nodes are resampled from
    ``theta_latter``.

    Parameters
    ----------
    X : (N, N) ndarray
        Current binary matrix (not modified).
    z : (N, K_former) ndarray
        Current one-hot assignments (not modified).
    K_former, K_latter : int
        Number of clusters before / after the split.  The new cluster is the
        last column, so presumably ``K_latter == K_former + 1`` (TODO confirm;
        this is not checked).
    pi_former : array-like of length K_former
        Mixing proportions; only its argmax is used.
    theta_latter : (K_latter, K_latter) array, optional
        Block probabilities after the split.  When omitted it is built from
        ``theta_former`` in the top-left block, with the last row and last
        column drawn from Beta(a0, b0).
    a0, b0 : float
        Beta parameters; required when ``theta_latter`` is omitted.
    seed : int
        Seed passed to ``np.random.seed`` (global NumPy RNG).
    theta_former : (K_former, K_former) array, optional
        Block probabilities before the split.  When omitted, a module-level
        variable named ``theta_former`` is used if one exists, so existing
        calls that relied on that variable keep working.

    Returns
    -------
    X_new : (N, N) ndarray
    z_new : (N, K_latter) float ndarray
    theta_latter : (K_latter, K_latter) ndarray

    Raises
    ------
    ValueError
        If ``pi_former`` is missing, or ``theta_latter`` has to be built and
        no ``theta_former`` is available.
    """
    if pi_former is None:
        raise ValueError("pi_former is required to choose the cluster to split")

    np.random.seed(seed)

    N = X.shape[0]

    i_max = int(np.argmax(pi_former))

    if theta_latter is None:
        if theta_former is None:
            # Backward compatibility with callers that relied on a
            # notebook-level `theta_former` variable.
            theta_former = globals().get('theta_former')
        if theta_former is None:
            raise ValueError("theta_former is required when theta_latter is not given")

        theta_latter = np.zeros((K_latter, K_latter))
        theta_latter[:K_former, :K_former] = theta_former
        # New cluster: its whole row first, then the rest of its column.
        theta_latter[-1, :] = np.random.beta(a0, b0, K_latter)
        theta_latter[:-1, -1] = np.random.beta(a0, b0, K_latter-1)

    idx_change = np.where(z[:, i_max] == 1)[0]

    z_new = np.zeros((z.shape[0], K_latter))
    z_new[:, :K_former] = z
    idx_new = np.random.choice(idx_change, len(idx_change) // 2, replace=False)
    z_new[idx_new, i_max] = 0
    z_new[idx_new, -1] = 1

    X_new = X.copy()

    # Cluster label of every node after the split (rows assumed one-hot).
    labels = np.argmax(z_new, axis=1)
    for i in idx_new:
        k_i = labels[i]
        for j in range(N):
            k_j = labels[j]
            # [0] extracts the scalar: assigning a size-1 array to a single
            # element is deprecated in recent NumPy.
            X_new[i, j] = bernoulli(theta_latter[k_i, k_j]).rvs(size=1)[0]
            X_new[j, i] = bernoulli(theta_latter[k_j, k_i]).rvs(size=1)[0]

    return X_new, z_new, theta_latter

In [11]:
# Not referenced by any cell visible here; presumably the largest number of
# clusters a later model-selection step tries -- TODO confirm.
cluster_range_max = 6

In [12]:
# Machine epsilon of the builtin float type.  The `np.float` alias no longer
# exists in current NumPy; `float` is the type it pointed to.
# (EPS is not used by the cells visible here.)
EPS = np.finfo(float).eps

# Number of time steps per trial.
T = 80

# Beta prior parameters for the block connection probabilities.
a0 = 1.0
b0 = 1.0
# Fraction of the smallest cluster exchanged per cluster pair in a gradual step.
# NOTE(review): with N = 100 and K = 3 the smallest cluster has at most 33
# nodes, so int(ratio * size) = int(0.66) = 0 and the gradual steps below move
# no nodes at all -- confirm this is intended.
ratio = 0.02

X_all = []
Z_all = []

# Timeline of one trial:
#   t = 1        initial random block model
#   t = 2..19    gradual membership changes
#   t = 20       abrupt change of cluster sizes
#   t = 21..39   gradual membership changes
#   t = 40       abrupt change of theta
#   t = 41..59   gradual membership changes (new theta)
#   t = 60       largest cluster split in two
#   t = 61..80   gradual membership changes (4 clusters)
for trial in range(10):
    print('trial =', trial)
    
    X_trial = []
    Z_trial = []
    pi1 = None
    pi2 = None
    theta_former = None   
    
    for t in tqdm.tqdm(range(1, T+1)):
        # Distinct seed for every (trial, t) pair.
        seed = trial*T + t
        if t == 1:
            X, Z1_true, Z2_true, pi1, pi2, theta_former = generate_data_random(
                N=100, K=3, alpha1=1.0, alpha2=1.0, a0=a0, b0=b0, 
                pi1=pi1, pi2=pi2, theta=theta_former, seed=seed)
        elif t <= 19:
            X, Z1_true = generate_data_sequential_randomtrans(
                X, Z1_true, theta=theta_former, ratio=ratio, seed=seed)
        elif t == 20:
            # change of pi: move half of the size gap from the largest
            # cluster to the second largest, making the two (nearly) equal.
            n_z = np.sum(Z1_true, axis=0)
            
            idx_max_first, idx_max_second = n_z.argsort()[::-1][:2]
            n_z_max_first, n_z_max_second = np.sort(n_z)[::-1][:2]
            n_trans = int((n_z_max_first - n_z_max_second)/2)
            
            X, Z1_true = generate_data_abrupt_pi(
                X, Z1_true, theta=theta_former, n_trans=n_trans, 
                idx_before=idx_max_first, idx_after=idx_max_second, seed=seed)            
            
        elif 21 <= t < 40:
            X, Z1_true = generate_data_sequential_randomtrans(
                X, Z1_true, theta=theta_former, ratio=ratio, seed=seed)
        elif t == 40:
            # change of theta: add a small positive offset to every entry and
            # cap at 1.  The offset is drawn before this step reseeds, i.e.
            # from the RNG state left by the previous step.
            u = np.random.uniform(0.01, 0.05, (3, 3))
            theta_former2 = theta_former + u
            theta_former2[theta_former2 > 1.0] = 1.0
            X = generate_data_abrupt_theta(
                X, Z1_true, theta_former2, seed=seed)
        elif 41 <= t < 60:
            X, Z1_true = generate_data_sequential_randomtrans(
                X, Z1_true, theta=theta_former2, ratio=ratio, seed=seed)
        elif t == 60:
            # split of the largest cluster (3 -> 4 clusters).
            # NOTE(review): this call passes pi1 from t == 1 and does not pass
            # theta_former2, so the split is based on the pre-t=40
            # `theta_former` and the original mixing proportions -- confirm
            # that is intended.
            X, Z1_true, theta_latter = generate_data_split(
                X, Z1_true, K_former=3, K_latter=4, 
                pi_former=pi1, a0=a0, b0=b0, seed=seed) 
        else:
            X, Z1_true = generate_data_sequential_randomtrans(
                X, Z1_true, theta=theta_latter, ratio=ratio, seed=seed)
        
        X_trial.append(X)
        Z_trial.append(Z1_true)
    
    X_all.append(X_trial)
    Z_all.append(Z_trial)
            
    # Both files are rewritten after every trial, so a partial run still
    # leaves the trials completed so far on disk.
    X_all_array = np.array(X_all)
    with open(os.path.join(outdir, 'X_abrupt.pkl'), 'wb') as f:
        pickle.dump(X_all_array, f)
    with open(os.path.join(outdir, 'Z_abrupt.pkl'), 'wb') as f:
        pickle.dump(Z_all, f)

  0%|          | 0/80 [00:00<?, ?it/s]

trial = 0


100%|██████████| 80/80 [00:01<00:00, 47.33it/s]
  0%|          | 0/80 [00:00<?, ?it/s]

trial = 1


100%|██████████| 80/80 [00:04<00:00, 17.08it/s]
  0%|          | 0/80 [00:00<?, ?it/s]

trial = 2


100%|██████████| 80/80 [00:04<00:00, 17.09it/s]
 25%|██▌       | 20/80 [00:00<00:00, 103.33it/s]

trial = 3


100%|██████████| 80/80 [00:02<00:00, 37.47it/s]
  0%|          | 0/80 [00:00<?, ?it/s]

trial = 4


100%|██████████| 80/80 [00:03<00:00, 25.25it/s]
  0%|          | 0/80 [00:00<?, ?it/s]

trial = 5


100%|██████████| 80/80 [00:03<00:00, 24.39it/s]
  0%|          | 0/80 [00:00<?, ?it/s]

trial = 6


100%|██████████| 80/80 [00:07<00:00, 11.05it/s]
 25%|██▌       | 20/80 [00:00<00:00, 145.28it/s]

trial = 7


100%|██████████| 80/80 [00:03<00:00, 25.30it/s]
  0%|          | 0/80 [00:00<?, ?it/s]

trial = 8


100%|██████████| 80/80 [00:04<00:00, 16.92it/s]
  0%|          | 0/80 [00:00<?, ?it/s]

trial = 9


100%|██████████| 80/80 [00:05<00:00, 14.90it/s]
