In [1]:
import numpy as np
import pandas as pd

import tqdm

In [2]:
import pickle

In [3]:
import os

In [4]:
from scipy.stats import bernoulli
from scipy.special import loggamma

In [5]:
# Print arrays in full unless they hold more than 10**5 elements.
np.set_printoptions(threshold=10**5)

In [6]:
# Directory that receives the generated pickle files.
outdir = './data'
# exist_ok=True creates the directory when missing and is a no-op otherwise,
# without the race of a separate os.path.exists() check.
os.makedirs(outdir, exist_ok=True)

In [7]:
def generate_data_random(N=100, K=2, alpha1=1.0, alpha2=1.0, a0=0.5, b0=0.5, 
                         pi1=None, pi2=None, theta=None, seed=0):
    """Draw an initial block-structured binary matrix and its latent assignments.

    Parameters
    ----------
    N : int
        Number of nodes; the returned matrix is ``N x N``.
    K : int
        Number of clusters.
    alpha1, alpha2 : float
        Concentration of the symmetric Dirichlet priors for ``pi1`` / ``pi2``.
    a0, b0 : float
        Beta parameters used to draw ``theta`` when it is not supplied.
    pi1 : array-like of length K, optional
        Mixing proportions for ``Z1_true``.  Drawn (and sorted ascending) when
        ``None``.  Note that it is *replaced* by a fresh draw whenever the
        sampled assignment leaves a cluster empty.
    pi2 : array-like of length K, optional
        Mixing proportions for ``Z2_true``.  Note that the rejection loop below
        always redraws ``pi2`` at least once, so a supplied value is not used.
    theta : ndarray of shape (K, K), optional
        Link probabilities between clusters; drawn from Beta(a0, b0) if ``None``.
    seed : int
        Seed for the global NumPy random state.

    Returns
    -------
    X : ndarray of int, shape (N, N)
        ``X[i, j]`` ~ Bernoulli(theta[k, l]) for node i in cluster k and node j
        in cluster l (clusters taken from ``Z1_true``).
    Z1_true, Z2_true : ndarray, shape (N, K)
        One-hot assignments in which every cluster has at least one member.
    pi1, pi2, theta
        The parameters actually used.
    """
    np.random.seed(seed)
    # Symmetric Dirichlet concentration vectors.
    alpha1 = alpha1 * np.ones(K)
    alpha2 = alpha2 * np.ones(K)
    if pi1 is None:
        pi1 = np.sort(np.random.dirichlet(alpha1, 1).ravel())

    # Rejection-sample until every cluster has at least one node.
    Z1_true = np.random.multinomial(1, pi1, N)
    while not np.all(np.sum(Z1_true, axis=0) >= 1):
        pi1 = np.sort(np.random.dirichlet(alpha1, 1).ravel())
        Z1_true = np.random.multinomial(1, pi1, N)

    if pi2 is None:
        pi2 = np.sort(np.random.dirichlet(alpha2, 1).ravel())

    # Starting from all zeros forces at least one pass through the loop; the
    # sequence of random draws is kept exactly as before so seeded outputs
    # do not change.
    Z2_true = np.zeros((N, K))
    while not np.all(np.sum(Z2_true, axis=0) >= 1):
        pi2 = np.sort(np.random.dirichlet(alpha2, 1).ravel())
        Z2_true = np.random.multinomial(1, pi2, N)

    if theta is None:
        theta = np.random.beta(a0, b0, (K, K))

    # The builtin int replaces the np.int alias, which NumPy 1.24 removed.
    X = np.zeros((N, N), dtype=int)
    for k in range(K):
        for l in range(K):
            # Boolean mask of all (i, j) with i in cluster k and j in cluster l.
            point = np.outer(Z1_true[:, k], Z1_true[:, l]).astype(bool)
            X[point] = bernoulli(p=theta[k, l]).rvs(size=np.sum(point))

    return X, Z1_true, Z2_true, pi1, pi2, theta

In [8]:
def generate_data_sequential_randomtrans(X, z, theta=None, ratio=0.1, n_max=10, seed=0):
    """Resample a few cells of every cluster block without changing the model.

    For each ordered cluster pair (k, l), up to ``min(int(ratio * n), n_max)``
    cell positions are drawn (with replacement) from the ``n`` cells of the
    block and overwritten with fresh Bernoulli(theta[k, l]) draws.

    ``theta`` must be supplied despite its ``None`` default.  Returns the new
    matrix and a copy of ``z``; the inputs are not modified.
    """
    np.random.seed(seed)

    n_clusters = z.shape[1]
    labels_out = z.copy()
    X_out = X.copy()

    for row_c in range(n_clusters):
        for col_c in range(n_clusters):
            # Coordinates of every cell whose row node is in row_c and whose
            # column node is in col_c.
            block_mask = np.outer(z[:, row_c], z[:, col_c]).astype(bool)
            rows, cols = np.nonzero(block_mask)
            n_cells = len(rows)
            n_resample = np.min([int(ratio * n_cells), n_max])
            picked = np.random.choice(np.arange(n_cells), n_resample)
            fresh = bernoulli(p=theta[row_c, col_c]).rvs(size=len(picked))
            X_out[rows[picked], cols[picked]] = fresh

    return X_out, labels_out

In [9]:
def generate_data_gradual_pi(X, z, theta=None, n_trans=None, idx_before=None, idx_after=-1, seed=0):
    """Move ``n_trans`` nodes from one cluster to another and resample their links.

    Parameters
    ----------
    X : ndarray, shape (N, N)
        Current binary matrix (not modified).
    z : ndarray, shape (N, K)
        One-hot cluster assignments (not modified).
    theta : ndarray, shape (K, K)
        Link probabilities between clusters.  Required despite the default.
    n_trans : int
        Number of nodes to move; they are chosen without replacement from the
        members of ``idx_before``.  Required despite the default.
    idx_before, idx_after : int
        Source and destination cluster columns.  ``idx_before`` is required
        despite the default.
    seed : int
        Seed for the global NumPy random state.

    Returns
    -------
    X_new, z_new : ndarray
        Updated copies.  For each moved node ``i`` the whole row ``X_new[i, :]``
        and column ``X_new[:, i]`` are redrawn from Bernoulli(theta) using the
        new assignments.
    """
    np.random.seed(seed)

    N = X.shape[0]

    z_new = z.copy()
    members = np.where(z[:, idx_before] == 1)[0]
    idx_change = np.random.choice(members, n_trans, replace=False)
    z_new[idx_change, idx_after] = 1
    z_new[idx_change, idx_before] = 0

    X_new = X.copy()

    # Rows of z_new are one-hot, so argmax yields each node's cluster as a
    # scalar.  Computing it once avoids re-deriving it N times per moved node,
    # and scalar labels keep the Bernoulli draws scalar: assigning a size-1
    # array into a single cell is deprecated since NumPy 1.25.
    labels = np.argmax(z_new, axis=1)

    for i in idx_change:
        k_i = labels[i]
        for j in range(N):
            k_j = labels[j]
            X_new[i, j] = bernoulli.rvs(theta[k_i, k_j])
            X_new[j, i] = bernoulli.rvs(theta[k_j, k_i])

    return X_new, z_new

In [10]:
def generate_data_gradual_theta(X, z, n_trans=None, seed=0):
    """Add or remove links inside each cluster block to shift its density.

    Parameters
    ----------
    X : ndarray, shape (N, N)
        Current binary matrix (not modified).
    z : ndarray, shape (N, K)
        One-hot cluster assignments.
    n_trans : ndarray of int, shape (K, K)
        Signed number of links to change in block (k, l): a positive entry
        turns that many 0-cells of the block into 1, a negative entry turns
        that many 1-cells into 0, and zero leaves the block untouched.
        Required despite the ``None`` default.
    seed : int
        Seed for the global NumPy random state.

    Returns
    -------
    X_new : ndarray, shape (N, N)
        Copy of ``X`` with the requested cells flipped.

    Notes
    -----
    Cells are always chosen *inside* block (k, l), and only among cells that
    do not already hold the target value, so each request changes exactly
    ``abs(n_trans[k, l])`` links.  When the block has fewer eligible cells than
    requested, every eligible cell is flipped instead of raising.
    """
    np.random.seed(seed)

    K = z.shape[1]
    X_new = X.copy()

    for k in range(K):
        for l in range(K):
            n_flip = int(n_trans[k, l])
            if n_flip == 0:
                continue

            # Cells whose row node is in cluster k and column node in cluster l.
            in_block = np.outer(z[:, k], z[:, l]).astype(bool)
            target = 1 if n_flip > 0 else 0
            # Only cells that would actually change are eligible.
            candidates = np.argwhere(in_block & (X_new != target))

            n_pick = min(abs(n_flip), len(candidates))
            if n_pick == 0:
                continue

            chosen = candidates[np.random.choice(len(candidates), n_pick, replace=False)]
            X_new[chosen[:, 0], chosen[:, 1]] = target

    return X_new

In [11]:
def generate_data_split(X, z, 
                        transmat_former, transmat_latter, 
                        pi_former=None, theta_latter=None, 
                        a0=None, b0=None, seed=0):
    """Move half of one cluster into a newly added last cluster and resample links.

    The cluster chosen is the one with the largest ``pi_former`` entry.  Half
    of its members (rounded down) are reassigned to the last column of the
    widened assignment matrix, and every row/column of ``X`` touching a moved
    node is redrawn from Bernoulli(theta_latter).

    Parameters
    ----------
    X : ndarray, shape (N, N)
        Current binary matrix (copied, not modified).
    z : ndarray, shape (N, K_former)
        One-hot cluster assignments.
    transmat_former, transmat_latter : ndarray
        Only ``shape[0]`` is read, as the cluster count before / after.
    pi_former : ndarray
        Mixing proportions before the split; required despite the default.
    theta_latter :
        Ignored: the name is rebound to a fresh array inside the function.
    a0, b0 : float
        Beta parameters for the new link probabilities; passed straight to
        ``np.random.beta``, so presumably required despite the ``None`` defaults.
    seed : int
        Seed for the global NumPy random state.

    Returns
    -------
    X_new, z_new, theta_latter

    NOTE(review): the body reads ``theta_former``, which is not a parameter,
    so it must exist in the enclosing (notebook/global) scope when called.
    """
    np.random.seed(seed)
    
    N = X.shape[0]
    
    # Cluster counts before and after the split.
    K_former = transmat_former.shape[0]
    K_latter = transmat_latter.shape[0]
        
    # Index of the largest mixing proportion; that cluster is the one split.
    i_max = np.argmax(pi_former)
    # pi_former with one extra component; the largest mass is shared equally
    # between the old cluster and the new last one.
    # NOTE(review): pi_latter is neither used below nor returned.
    pi_latter = np.hstack((pi_former.copy(), 0))
    pi_latter[i_max] = pi_latter[-1] = pi_former[i_max]/2
    
    # Rebinds the theta_latter argument; whatever the caller passed is discarded.
    theta_latter = np.zeros((K_latter, K_latter))
    # theta_former comes from the enclosing scope, not from the arguments.
    theta_latter[:K_former, :K_former] = theta_former
    # Fresh probabilities for the new cluster's row and column.
    theta_latter[-1, :] = np.random.beta(a0, b0, K_latter)
    theta_latter[:-1, -1] = np.random.beta(a0, b0, K_latter-1)
    # NOTE(review): this assigns one length-(K_latter-1) vector to the whole
    # [:-1, :-1] block, so every row of the block receives the same values.
    # When K_latter == K_former + 1 it fully overwrites the theta_former copy
    # made above. Confirm whether that is intended.
    theta_latter[:-1, :-1] = np.random.beta(a0, b0, K_latter-1)
    
    # Members of the cluster being split.
    idx_change = np.where(z[:, i_max] == 1)[0]

    # Widened assignment matrix (float dtype, from np.zeros).
    z_new = np.zeros((z.shape[0], K_latter))
    z_new[:, :K_former] = z.copy()
    # Half of the members (rounded down), chosen without replacement, move to
    # the last cluster.
    idx_new = np.random.choice(idx_change, int(len(idx_change)/2), replace=False)
    z_new[idx_new, i_max] = 0
    z_new[idx_new, -1] = 1
    
    X_new = X.copy()
    
    # Redraw every link that touches a moved node, in both directions.
    for i in idx_new:
        for j in range(N):
            # Index arrays of the columns equal to 1 (length 1 for one-hot rows).
            z_i = np.where(z_new[i, :] == 1)[0]
            z_j = np.where(z_new[j, :] == 1)[0]
            # NOTE(review): the right-hand sides are length-1 arrays written
            # into single cells; newer NumPy releases warn about this.
            X_new[i, j] = bernoulli(theta_latter[z_i, z_j]).rvs(size=1)
            X_new[j, i] = bernoulli(theta_latter[z_j, z_i]).rvs(size=1)

    return X_new, z_new, theta_latter

In [12]:
def make_split_integer(num, div):
    """Split the integer ``num`` into ``div`` integer parts that sum to ``num``.

    The parts differ by at most one; the larger parts come first.

    Parameters
    ----------
    num : int
        Total to distribute; may be negative.
    div : int
        Number of parts; must be positive.

    Returns
    -------
    ndarray of int, shape (div,)

    Examples
    --------
    ``make_split_integer(7, 5)``  -> ``[2, 2, 1, 1, 1]``
    ``make_split_integer(-7, 5)`` -> ``[-1, -1, -1, -2, -2]``
    """
    # Floor division keeps base * div + remainder == num for negative totals
    # too.  True division followed by truncation toward zero breaks that.
    base, remainder = divmod(int(num), int(div))
    splits = np.full(int(div), base, dtype=int)
    # 0 <= remainder < div for positive div: hand out the leftover one by one.
    splits[:remainder] += 1
    return splits

In [13]:
# Machine epsilon of the default float type (the builtin float replaces the
# np.float alias, which NumPy 1.24 removed).
EPS = np.finfo(float).eps

# Number of time steps per sequence and number of nodes per graph.
T = 90
N = 100

# Number of clusters before and after the model change.
K_former = 3
K_latter = 4

# Beta prior parameters for link probabilities and Dirichlet concentrations
# for the mixing proportions.
a0 = 1.0
b0 = 1.0
alpha1 = 1.0
alpha2 = 1.0
# Fraction of each block's cells redrawn per stationary time step.
ratio = 0.1

# First time step of the gradual theta change, the gradual pi change and the
# gradual change in the number of clusters.
t_theta_change = 10
t_pi_change = 35
t_model_change = 60

# Margin used when clipping perturbed link probabilities into (0, 1).
tol = 1e-6

# Accumulators: one entry per accepted trial.
X_all = []
Z_all = []

# trial drives the seeds; n_success counts accepted trials (both start at 1).
trial = 1
n_success = 1

# Generate 20 accepted sequences of T graphs each.  Every sequence goes through:
#   t = 1        initial draw
#   t = 2 - 9    stationary resampling
#   t = 10 - 14  gradual change of theta
#   t = 15 - 34  stationary resampling with the new theta
#   t = 35 - 39  gradual change of pi (nodes move between existing clusters)
#   t = 40 - 59  stationary resampling
#   t = 60 - 69  gradual change of the model (a new cluster is populated)
#   t = 70 - 90  stationary resampling with the enlarged model
# The pickles are rewritten after every accepted sequence.
while n_success <= 20:
    pi1 = None
    pi2 = None
    theta_former = None
    
    X_trial = []
    Z_trial = []
    
    # t = 1
    seed = trial*T
    X, Z1_true, Z2_true, pi1, pi2, theta_former = generate_data_random(
            N=N, K=K_former, alpha1=alpha1, alpha2=alpha2, a0=a0, b0=b0, 
            pi1=pi1, pi2=pi2, theta=theta_former, seed=seed)
    trial += 1
    # Reject draws with a cluster of fewer than 10 nodes or with any link
    # probability below 0.1.  Series.value_counts() replaces the deprecated
    # top-level pd.value_counts().
    cluster_sizes = pd.Series(np.argmax(Z1_true, axis=1)).value_counts()
    if np.any(cluster_sizes < 10):
        continue
    if np.any(theta_former < 0.1):
        continue

    n_success += 1
    X_trial.append(X)
    Z_trial.append(Z1_true)
    
    # t = 2 - 9
    # NOTE: trial was already advanced above, so the per-step seeds below are
    # offset by one trial from the base seed used at t = 1.
    for t in range(2, 10):
        seed = trial*T + t
        X, Z1_true = generate_data_sequential_randomtrans(
            X, Z1_true, theta=theta_former, ratio=ratio, seed=seed)
        X_trial.append(X)
        Z_trial.append(Z1_true)
 
    # t = 10 - 14
    # Perturb every link probability by up to +-0.1 and clip into (0, 1).
    u = np.random.uniform(-0.1, 0.1, (K_former, K_former))
    theta_former2 = theta_former + u
    theta_former2[theta_former2 > 1.00] = 1.0 - tol
    theta_former2[theta_former2 < 0.00] = tol
    # Relative change per block, converted into a signed number of links using
    # the current link count of each block.
    theta_ratio = (theta_former2 - theta_former) / theta_former
    idxes = np.argmax(Z1_true, axis=1)
    nlinks = np.array([ [ np.sum(X[idxes==i, :][:, idxes==j]) for j in range(Z1_true.shape[1]) ] for i in range(Z1_true.shape[1]) ])
    # The builtin int replaces the np.int alias, which NumPy 1.24 removed.
    n_trans = (theta_ratio * nlinks).astype(int)
    # Spread each block's change over the 5 transition steps.
    n_trans_per = np.zeros((n_trans.shape[0], n_trans.shape[1], 5), dtype=int)
    for i in range(n_trans.shape[0]):
        for j in range(n_trans.shape[1]):
            n_trans_per[i, j, :] = make_split_integer(n_trans[i, j], 5)

    for t in range(t_theta_change, t_theta_change+5):
        seed = trial*T + t
        X = generate_data_gradual_theta(
            X, Z1_true, n_trans_per[:, :, t-t_theta_change], seed=seed)
        X_trial.append(X)
        Z_trial.append(Z1_true)

    # t = 15 - 34
    for t in range(t_theta_change+5, t_theta_change+25):
        seed = trial*T + t
        X, Z1_true = generate_data_sequential_randomtrans(
            X, Z1_true, theta=theta_former2, ratio=ratio, seed=seed)
        X_trial.append(X)
        Z_trial.append(Z1_true)

    # t = 35 - 39
    # change of pi: move half of the size gap from the largest cluster to the
    # second largest, spread over 5 steps.
    n_z = np.sum(Z1_true, axis=0)

    idx_max_first, idx_max_second = n_z.argsort()[::-1][:2]
    idx_min_first = np.argmin(n_z)
    n_z_max_first, n_z_max_second = np.sort(n_z)[::-1][:2]
    n_z_min_first = np.min(n_z)
    n_trans = make_split_integer(int((n_z_max_first - n_z_max_second)/2), 5)
    
    for t in range(t_pi_change, t_pi_change+5):
        seed = trial*T + t
        X, Z1_true = generate_data_gradual_pi(
            X, Z1_true, theta=theta_former2, n_trans=n_trans[t-t_pi_change], 
            idx_before=idx_max_first, idx_after=idx_max_second, seed=seed)            
        X_trial.append(X)
        Z_trial.append(Z1_true)

    # t = 40 - 59
    for t in range(t_pi_change+5, t_pi_change+25):
        seed = trial*T + t
        X, Z1_true = generate_data_sequential_randomtrans(
            X, Z1_true, theta=theta_former2, ratio=ratio, seed=seed)
        X_trial.append(X)
        Z_trial.append(Z1_true)

    # t = 60 - 69
    # model change: widen the assignment matrix with empty column(s) for the
    # new cluster(s).  The result is a float array.
    Z1_true = np.hstack((Z1_true, np.zeros((Z1_true.shape[0], K_latter - K_former))))
   
    # Keep the existing link probabilities and draw fresh ones for the new
    # cluster's row and column.
    theta_trans = np.zeros((K_latter, K_latter))
    theta_trans[:K_former, :K_former] = theta_former2.copy()
    theta_trans[-1, :] = np.random.beta(a0, b0, K_latter)
    theta_trans[:-1, -1] = np.random.beta(a0, b0, K_latter - 1)

    # A quarter of the largest cluster moves to the new cluster over 10 steps.
    i_max = np.argmax(np.sum(Z1_true, axis=0))
    n_trans = int(np.sum(Z1_true[:, i_max] == 1)/4)
    n_trans_per = make_split_integer(n_trans, 10)

    for t in range(t_model_change, t_model_change+10):
        seed = trial*T + t
        X, Z1_true = generate_data_gradual_pi(
            X, Z1_true, theta=theta_trans, n_trans=n_trans_per[t-t_model_change], 
            idx_before=i_max, idx_after=K_latter - 1, seed=seed)
        X_trial.append(X)
        Z_trial.append(Z1_true)

    # t = 70 - 90
    theta_latter = theta_trans.copy()
    for t in range(t_model_change+10, T+1):
        seed = trial*T + t
        X, Z1_true = generate_data_sequential_randomtrans(
            X, Z1_true, theta=theta_latter, ratio=ratio, seed=seed)
        
        X_trial.append(X)
        Z_trial.append(Z1_true) 
    
    trial += 1
    
    # Checkpoint: rewrite both pickles with everything accepted so far.
    X_all.append(X_trial)
    X_all_array = np.array(X_all)

    with open(os.path.join(outdir, 'X_gradual.pkl'), 'wb') as f:
        pickle.dump(X_all_array, f)
        
    # Z is kept as a nested list because its arrays change width at t = 60.
    Z_all.append(Z_trial)
    with open(os.path.join(outdir, 'Z_gradual.pkl'), 'wb') as f:
        pickle.dump(Z_all, f)