In [1]:
import numpy as np
import pandas as pd
import networkx as nx

import matplotlib.pyplot as plt
import seaborn as sns
sns.reset_orig()

from scipy.stats import bernoulli
from scipy.special import loggamma

import tqdm

In [2]:
import os
import pickle

In [3]:
# Let pandas show up to 200 rows / 200 columns before truncating a displayed frame.
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_columns", 200)

In [4]:
# Directory that receives the pickled datasets written at the end of the notebook.
outdir = './data'
# exist_ok=True makes re-running the cell safe and avoids the race between
# checking for the directory and creating it.
os.makedirs(outdir, exist_ok=True)

In [5]:
def generate_data_random(N=100, K=2, alpha1=1.0, alpha2=1.0, a0=0.5, b0=0.5,
                         pi1=None, pi2=None, theta=None, seed=0):
    """Draw a binary N x N relation matrix from a K-group block model.

    Parameters
    ----------
    N : int
        Number of nodes (rows and columns of ``X``).
    K : int
        Number of groups.
    alpha1, alpha2 : float
        Symmetric Dirichlet concentrations used to draw ``pi1`` / ``pi2``
        when they are not supplied.
    a0, b0 : float
        Beta parameters used to draw ``theta`` when it is not supplied.
    pi1, pi2 : array-like of length K, optional
        Group proportions. When supplied they are used as given and returned
        unchanged; when omitted they are drawn from the Dirichlet and sorted
        in ascending order.
    theta : (K, K) array, optional
        Bernoulli success probability for every pair of groups.
    seed : int
        Seed passed to ``np.random.seed``; the global NumPy RNG is reseeded.

    Returns
    -------
    X : (N, N) int ndarray
        Binary relation matrix. Both its rows and its columns are grouped by
        ``Z1_true``; ``Z2_true`` is returned but does not influence ``X``.
    Z1_true, Z2_true : (N, K) int ndarray
        One-hot group memberships, each group having at least one member.
    pi1, pi2, theta
        The parameters that were used.

    Raises
    ------
    ValueError
        If ``N < K`` or a supplied ``pi1`` / ``pi2`` has a non-positive entry;
        in both cases "every group is non-empty" could never be satisfied.
    """
    if N < K:
        raise ValueError("N must be at least K so that every group can be non-empty")
    for name, pi in (("pi1", pi1), ("pi2", pi2)):
        if pi is not None and np.any(np.asarray(pi, dtype=float) <= 0):
            raise ValueError(
                "%s must be strictly positive so that every group can be non-empty" % name)

    np.random.seed(seed)
    # Symmetric Dirichlet concentration vectors.
    alpha1 = alpha1 * np.ones(K)
    alpha2 = alpha2 * np.ones(K)

    def draw_pi(alpha):
        return np.sort(np.random.dirichlet(alpha, 1).ravel())

    def every_group_used(Z):
        return bool(np.all(np.sum(Z, axis=0) >= 1))

    # First partition. A caller-supplied pi1 is never replaced: only the
    # memberships are redrawn until every group has at least one member.
    pi1_given = pi1 is not None
    if not pi1_given:
        pi1 = draw_pi(alpha1)
    Z1_true = np.random.multinomial(1, pi1, N)
    while not every_group_used(Z1_true):
        if not pi1_given:
            pi1 = draw_pi(alpha1)
        Z1_true = np.random.multinomial(1, pi1, N)

    # Second partition, same rule for a supplied pi2.
    pi2_given = pi2 is not None
    if not pi2_given:
        # This draw is superseded by the first loop iteration below. It is
        # kept on purpose: removing it would shift the random stream and
        # change theta / X for every existing seed.
        pi2 = draw_pi(alpha2)
    Z2_true = None
    while Z2_true is None or not every_group_used(Z2_true):
        if not pi2_given:
            pi2 = draw_pi(alpha2)
        Z2_true = np.random.multinomial(1, pi2, N)

    if theta is None:
        theta = np.random.beta(a0, b0, (K, K))

    # The builtin int replaces the np.int alias, which NumPy 1.24 removed.
    X = np.zeros((N, N), dtype=int)
    for k in range(K):
        for l in range(K):
            # Mask of the (k, l) block: row node in group k, column node in group l.
            point = np.outer(Z1_true[:, k], Z1_true[:, l]).astype(bool)
            X[point] = bernoulli(p=theta[k, l]).rvs(size=np.sum(point))

    return X, Z1_true, Z2_true, pi1, pi2, theta

In [6]:
def generate_data_sequential_randomtrans(X, z, theta=None, ratio=0.1, n_max=10, seed=0):
    """Return a copy of ``X`` in which a few cells of every block are redrawn.

    For each ordered pair of groups ``(k, l)`` the cells whose row node is in
    group ``k`` and whose column node is in group ``l`` form a block. From each
    block ``min(int(ratio * block_size), n_max)`` positions are picked (with
    replacement, so a cell may be picked more than once) and overwritten with
    fresh Bernoulli(``theta[k, l]``) draws.

    The global NumPy RNG is reseeded with ``seed``. ``X`` and ``z`` are not
    modified; the second return value is a copy of ``z``.
    """
    np.random.seed(seed)

    n_groups = z.shape[1]
    perturbed = X.copy()

    for k, l in np.ndindex(n_groups, n_groups):
        # Boolean mask of the (k, l) block.
        in_block = np.outer(z[:, k], z[:, l]).astype(bool)
        rows, cols = np.nonzero(in_block)
        block_size = len(rows)
        # Cap the number of redrawn cells both relatively and absolutely.
        n_redraw = np.min([int(ratio * block_size), n_max])
        picks = np.random.choice(np.arange(block_size), n_redraw)
        perturbed[rows[picks], cols[picks]] = bernoulli(p=theta[k, l]).rvs(size=n_redraw)

    return perturbed, z.copy()

In [7]:
def generate_data_abrupt_pi(X, z, theta=None, n_trans=None, idx_before=None, idx_after=-1, seed=0):
    """Move ``n_trans`` nodes from one group to another and redraw their edges.

    Parameters
    ----------
    X : (N, N) ndarray
        Current binary relation matrix (not modified).
    z : (N, K) ndarray
        One-hot group memberships (not modified).
    theta : (K, K) array
        Bernoulli success probability for every pair of groups. Required.
    n_trans : int
        Number of nodes to move. Required.
    idx_before, idx_after : int
        Source and destination group (column of ``z``). ``idx_before`` is
        required; ``idx_after`` defaults to the last group.
    seed : int
        Seed passed to ``np.random.seed``; the global NumPy RNG is reseeded.

    Returns
    -------
    X_new : ndarray
        Copy of ``X`` in which the full row and column of every moved node
        have been redrawn using the new memberships.
    z_new : ndarray
        Copy of ``z`` with the moved nodes reassigned.

    Raises
    ------
    ValueError
        If ``theta``, ``n_trans`` or ``idx_before`` is missing. NumPy also
        raises ``ValueError`` when ``n_trans`` exceeds the size of the source
        group, because the nodes are sampled without replacement.
    """
    if theta is None or n_trans is None or idx_before is None:
        raise ValueError("theta, n_trans and idx_before must all be provided")

    np.random.seed(seed)

    N = X.shape[0]

    z_new = z.copy()
    members = np.where(z[:, idx_before] == 1)[0]
    idx_change = np.random.choice(members, n_trans, replace=False)
    z_new[idx_change, idx_before] = 0
    z_new[idx_change, idx_after] = 1

    X_new = X.copy()

    # Scalar group label per node (rows of z_new are one-hot). Indexing theta
    # with scalars yields scalar draws; assigning a size-1 array to a single
    # cell, as the array-indexed form did, is deprecated in recent NumPy.
    labels = np.argmax(z_new, axis=1)

    for i in idx_change:
        k_i = labels[i]
        for j in range(N):
            k_j = labels[j]
            X_new[i, j] = bernoulli(p=theta[k_i, k_j]).rvs(size=1)[0]
            X_new[j, i] = bernoulli(p=theta[k_j, k_i]).rvs(size=1)[0]

    return X_new, z_new

In [8]:
def generate_data_abrupt_theta(X, z, theta2, seed=0):
    """Redraw the whole relation matrix under new block probabilities.

    Parameters
    ----------
    X : (N, N) ndarray
        Previous relation matrix; only its size is used.
    z : (N, K) ndarray
        One-hot group memberships, used for both rows and columns.
    theta2 : (K, K) array
        New Bernoulli success probability for every pair of groups.
    seed : int
        Seed passed to ``np.random.seed``; the global NumPy RNG is reseeded.

    Returns
    -------
    X_new : (N, N) int ndarray
        Freshly drawn binary matrix; cell (i, j) is Bernoulli(theta2[k, l])
        where k and l are the groups of nodes i and j.
    """
    np.random.seed(seed)

    N = X.shape[0]
    K = z.shape[1]

    # The builtin int replaces the np.int alias, which NumPy 1.24 removed.
    X_new = np.zeros((N, N), dtype=int)

    for k in range(K):
        for l in range(K):
            # Mask of the (k, l) block: row node in group k, column node in group l.
            point = np.outer(z[:, k], z[:, l]).astype(bool)
            n_point = np.sum(point)
            if n_point > 0:
                X_new[point] = bernoulli(p=theta2[k, l]).rvs(size=n_point)

    return X_new

In [9]:
def generate_data_split(X, z,
                        K_former, K_latter,
                        pi_former=None, theta_latter=None,
                        a0=None, b0=None, seed=0, theta_former=None):
    """Split the group with the largest proportion into two groups.

    Half of the members (rounded down) of the group ``argmax(pi_former)`` are
    moved to a new last group, and the rows and columns of the moved nodes are
    redrawn from the extended probability matrix.

    Parameters
    ----------
    X : (N, N) ndarray
        Current binary relation matrix (not modified).
    z : (N, K_former) ndarray
        One-hot group memberships (not modified).
    K_former, K_latter : int
        Number of groups before and after the split.
    pi_former : array-like
        Group proportions before the split; only its argmax is used. Required.
    theta_latter : ignored
        Kept for signature compatibility; the returned matrix is always
        rebuilt from ``theta_former``.
    a0, b0 : float
        Beta parameters for the new group's probabilities. Required.
    seed : int
        Seed passed to ``np.random.seed``; the global NumPy RNG is reseeded.
    theta_former : (K_former, K_former) array, optional
        Block probabilities before the split. When omitted, a module-level
        variable named ``theta_former`` is used, which is what this function
        implicitly relied on before the parameter existed.

    Returns
    -------
    X_new : ndarray
        Copy of ``X`` with the moved nodes' rows and columns redrawn.
    z_new : (N, K_latter) float ndarray
        One-hot memberships after the split.
    theta_latter : (K_latter, K_latter) ndarray
        ``theta_former`` in the top-left block; the last row and last column
        are fresh Beta(a0, b0) draws.

    Raises
    ------
    ValueError
        If ``pi_former``, ``a0`` or ``b0`` is missing, or if ``theta_former``
        is neither passed nor available at module level.
    """
    if pi_former is None or a0 is None or b0 is None:
        raise ValueError("pi_former, a0 and b0 must all be provided")
    if theta_former is None:
        # Backward compatibility with callers that do not pass theta_former.
        theta_former = globals().get("theta_former")
        if theta_former is None:
            raise ValueError(
                "theta_former must be passed (or defined at module level)")

    np.random.seed(seed)

    N = X.shape[0]

    i_max = np.argmax(pi_former)

    theta_latter = np.zeros((K_latter, K_latter))
    theta_latter[:K_former, :K_former] = np.array(theta_former, dtype=float)
    theta_latter[-1, :] = np.random.beta(a0, b0, K_latter)
    theta_latter[:-1, -1] = np.random.beta(a0, b0, K_latter-1)

    idx_change = np.where(z[:, i_max] == 1)[0]

    z_new = np.zeros((z.shape[0], K_latter))
    z_new[:, :K_former] = z.copy()
    idx_new = np.random.choice(idx_change, int(len(idx_change)/2), replace=False)
    z_new[idx_new, i_max] = 0
    z_new[idx_new, -1] = 1

    X_new = X.copy()

    # Scalar group label per node (rows of z_new are one-hot). Indexing with
    # scalars yields scalar draws; assigning a size-1 array to a single cell,
    # as the array-indexed form did, is deprecated in recent NumPy.
    labels = np.argmax(z_new, axis=1)

    for i in idx_new:
        k_i = labels[i]
        for j in range(N):
            k_j = labels[j]
            X_new[i, j] = bernoulli(theta_latter[k_i, k_j]).rvs(size=1)[0]
            X_new[j, i] = bernoulli(theta_latter[k_j, k_i]).rvs(size=1)[0]

    return X_new, z_new, theta_latter

In [10]:
# Builds 20 synthetic series of T = 80 relation matrices each, with three
# abrupt changes per series:
#   t = 20  block probabilities (theta) change
#   t = 40  nodes move from the largest group to the second largest
#   t = 60  one group splits into two
# Between change points only a small share of cells is redrawn at each step.

# The builtin float replaces the np.float alias, which NumPy 1.24 removed.
EPS = np.finfo(float).eps

T = 80
N = 100

a0 = 1.0
b0 = 1.0
alpha1 = 1.0
alpha2 = 1.0
ratio = 0.005

tol = 0.01

X_all = []
Z_all = []

n_success = 1
trial = 1

while n_success <= 20:
    X_trial = []
    Z_trial = []
    pi1 = None
    pi2 = None
    theta_former = None
    theta_latter = None

    # Each trial owns the seeds trial*T ... trial*T + T - 1; time step t uses
    # seed + t - 1. Reusing one seed for every step would make each sequential
    # perturbation redraw exactly the same cells with exactly the same values,
    # so the series would stop changing after the first step of each segment.
    seed = trial*T

    # t = 1
    X, Z1_true, Z2_true, pi1, pi2, theta_former = generate_data_random(
            N=N, K=3, alpha1=alpha1, alpha2=alpha2, a0=a0, b0=b0,
            pi1=pi1, pi2=pi2, theta=theta_former, seed=seed)

    trial += 1
    # Reject draws with a group of fewer than 10 nodes or a very sparse block.
    if np.any(np.sum(Z1_true, axis=0) < 10):
        continue
    if np.any(theta_former < 0.1):
        continue
    X_trial.append(X)
    Z_trial.append(Z1_true)

    n_success += 1

    # t = 2 - 19
    for t in range(2, 20):
        X, Z1_true = generate_data_sequential_randomtrans(
                X, Z1_true, theta=theta_former, ratio=ratio, seed=seed + t - 1)
        X_trial.append(X)
        Z_trial.append(Z1_true)

    # t = 20
    # change of theta
    # u continues the random stream left by the previous generator call.
    u = np.random.uniform(-0.1, 0.1, (3, 3))
    theta_former2 = theta_former + u
    # Keep the shifted probabilities strictly inside (0, 1).
    theta_former2[theta_former2 > 1.0] = 1.0 - tol
    theta_former2[theta_former2 < 0.0] = tol
    X = generate_data_abrupt_theta(
            X, Z1_true, theta_former2, seed=seed + 19)

    X_trial.append(X)
    Z_trial.append(Z1_true)

    # t = 21 - 39
    for t in range(21, 40):
        X, Z1_true = generate_data_sequential_randomtrans(
            X, Z1_true, theta=theta_former2, ratio=ratio, seed=seed + t - 1)
        X_trial.append(X)
        Z_trial.append(Z1_true)

    # t = 40
    # change of pi: move a third of the size gap from the largest group to
    # the second largest one.
    n_z = np.sum(Z1_true, axis=0)

    idx_max_first, idx_max_second = n_z.argsort()[::-1][:2]
    n_z_max_first, n_z_max_second = np.sort(n_z)[::-1][:2]
    n_trans = int((n_z_max_first - n_z_max_second)/3)

    X, Z1_true = generate_data_abrupt_pi(
        X, Z1_true, theta=theta_former2, n_trans=n_trans,
        idx_before=idx_max_first, idx_after=idx_max_second, seed=seed + 39)

    X_trial.append(X)
    Z_trial.append(Z1_true)

    # t = 41 - 59
    for t in range(41, 60):
        X, Z1_true = generate_data_sequential_randomtrans(
            X, Z1_true, theta=theta_former2, ratio=ratio, seed=seed + t - 1)

        X_trial.append(X)
        Z_trial.append(Z1_true)

    # t = 60
    # split of one group into two
    X, Z1_true, theta_latter = generate_data_split(
        X, Z1_true, K_former=3, K_latter=4,
        pi_former=pi1, a0=a0, b0=b0, seed=seed + 59)
    # generate_data_split builds the top-left 3x3 block from the notebook-level
    # theta_former, i.e. the probabilities from before t = 20. The data have
    # followed theta_former2 since then, so restore that block here. The cells
    # redrawn at t = 60 only involve the new group's row and column of
    # theta_latter and are therefore unaffected by this correction.
    theta_latter[:3, :3] = theta_former2

    X_trial.append(X)
    Z_trial.append(Z1_true)

    # t = 61 - 80
    for t in range(61, T+1):
        X, Z1_true = generate_data_sequential_randomtrans(
            X, Z1_true, theta=theta_latter, ratio=ratio, seed=seed + t - 1)

        X_trial.append(X)
        Z_trial.append(Z1_true)

    X_all.append(X_trial)
    Z_all.append(Z_trial)

# Written once after all series are generated; the final file contents are the
# same as when the growing array was re-pickled after every successful trial.
X_all_array = np.array(X_all)
with open(os.path.join(outdir, 'X_abrupt.pkl'), 'wb') as f:
    pickle.dump(X_all_array, f)
with open(os.path.join(outdir, 'Z_abrupt.pkl'), 'wb') as f:
    pickle.dump(Z_all, f)