In [1]:
import numpy as np
import pandas as pd

import tqdm

In [2]:
import pickle

In [3]:
import os

In [4]:
from scipy.stats import bernoulli
from scipy.special import loggamma

In [5]:
# Raise the element count above which NumPy summarises printed arrays with "...",
# so arrays of up to 100000 elements are printed in full.
np.set_printoptions(threshold=100000)

In [6]:
# Directory that receives the pickled data sets written by the generation cell.
outdir = './data'
# exist_ok=True creates the directory only when missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(outdir, exist_ok=True)

In [7]:
def generate_data_random(N=100, K=2, alpha1=1.0, alpha2=1.0, a0=0.5, b0=0.5, 
                         pi1=None, pi2=None, theta=None, seed=0):
    """Sample an initial binary N x N relation matrix from a K-block model.

    Parameters
    ----------
    N : int
        Number of nodes (rows and columns of ``X``).
    K : int
        Number of clusters.
    alpha1, alpha2 : float
        Symmetric Dirichlet concentrations used when ``pi1`` / ``pi2`` are drawn.
    a0, b0 : float
        Beta parameters used when ``theta`` is drawn.
    pi1, pi2 : array-like of length K, optional
        Cluster proportions. When omitted they are drawn from the Dirichlet
        and sorted in ascending order. A supplied vector is used for the first
        assignment draw; if that draw leaves a cluster empty, the proportions
        are re-drawn from the Dirichlet (same policy for both vectors).
    theta : ndarray of shape (K, K), optional
        Link probability for every ordered pair of clusters.
    seed : int
        Seed passed to ``np.random.seed``.

    Returns
    -------
    X : ndarray of int, shape (N, N)
        Binary matrix; entry (i, j) is Bernoulli with probability
        ``theta[k, l]`` where k / l are the ``Z1_true`` clusters of i / j.
    Z1_true, Z2_true : ndarray, shape (N, K)
        One-hot assignments with every cluster non-empty. Only ``Z1_true``
        is used to build ``X``.
    pi1, pi2, theta
        The proportions and link probabilities actually used.

    Raises
    ------
    ValueError
        If ``N < K`` (non-empty clusters would be impossible).
    """
    if N < K:
        # The rejection loops below could never terminate in this case.
        raise ValueError('N must be at least K so that every cluster can be non-empty')

    np.random.seed(seed)
    # Dirichlet dist.
    alpha1 = alpha1 * np.ones(K)
    alpha2 = alpha2 * np.ones(K)
    if pi1 is None:
        pi1 = np.sort(np.random.dirichlet(alpha1, 1).ravel())

    # Re-draw until every cluster owns at least one node.
    Z1_true = np.random.multinomial(1, pi1, N)
    while not np.all(np.sum(Z1_true, axis=0) >= 1):
        pi1 = np.sort(np.random.dirichlet(alpha1, 1).ravel())
        Z1_true = np.random.multinomial(1, pi1, N)

    pi2_supplied = pi2 is not None
    if not pi2_supplied:
        # This draw is immediately superseded inside the loop; it is kept so
        # that default-argument calls consume the random stream exactly as
        # they always have (same seed -> same data).
        pi2 = np.sort(np.random.dirichlet(alpha2, 1).ravel())

    Z2_true = np.zeros((N, K))
    first_attempt = True
    while not np.all(np.sum(Z2_true, axis=0) >= 1):
        # A caller-supplied pi2 is honoured on the first attempt instead of
        # being unconditionally overwritten.
        if not (pi2_supplied and first_attempt):
            pi2 = np.sort(np.random.dirichlet(alpha2, 1).ravel())
        first_attempt = False
        Z2_true = np.random.multinomial(1, pi2, N)

    if theta is None:
        theta = np.random.beta(a0, b0, (K, K))

    # Builtin int replaces the np.int alias removed in NumPy 1.24.
    X = np.zeros((N, N), dtype=int)
    for k in range(K):
        for l in range(K):
            # Boolean mask of all (row in cluster k, column in cluster l) cells.
            point = Z1_true[:, k].reshape(-1, 1).dot(Z1_true[:, l].reshape(1, -1)).astype(bool)
            X[point] = bernoulli(p=theta[k, l]).rvs(size=np.sum(point))

    return X, Z1_true, Z2_true, pi1, pi2, theta

In [8]:
def generate_data_sequential(X, z, theta=None, n_trans=None, idx_before=None, idx_after=-1, ratio=0.1, seed=0):
    """Move some nodes to another cluster and resample their links.

    Two modes are supported:

    * ``n_trans`` and ``idx_before`` both omitted: for every ordered pair of
      distinct clusters (i, j), ``int(ratio * smallest cluster size)`` nodes
      that belonged to cluster i in ``z`` are moved to cluster j.
    * ``n_trans`` and ``idx_before`` both given: ``n_trans`` randomly chosen
      members of cluster ``idx_before`` are moved to cluster ``idx_after``.

    Parameters
    ----------
    X : ndarray, shape (N, N)
        Current binary relation matrix (not modified).
    z : ndarray, shape (N, K)
        Current one-hot cluster assignments (not modified).
    theta : ndarray, shape (K, K)
        Link probabilities; required whenever at least one node moves.
    n_trans, idx_before, idx_after, ratio
        See the two modes above.
    seed : int
        Seed passed to ``np.random.seed``.

    Returns
    -------
    X_new, z_new : ndarray
        Copies of ``X`` / ``z`` in which every row and column of a moved node
        has been redrawn from ``theta`` under the new assignments.

    Raises
    ------
    ValueError
        If only one of ``n_trans`` / ``idx_before`` is given, or if nodes
        move but ``theta`` is ``None``.
    """
    np.random.seed(seed)

    N = X.shape[0]
    n_clusters = z.shape[1]

    if (n_trans is None) and (idx_before is None):
        n_change = int(ratio * np.min(np.sum(z, axis=0)))
        z_new = z.copy()
        idx_change = []
        for i in range(n_clusters):
            # Membership is read from the ORIGINAL z, so nodes that were just
            # moved into cluster i are never moved a second time.
            idxes_i = np.where(z[:, i] == 1)[0]
            idxes_used = []
            for j in range(n_clusters):
                if i == j:
                    continue
                i_change = np.random.choice(np.setdiff1d(idxes_i, idxes_used), n_change, replace=False)
                idxes_used += list(i_change)
                # Builtin int replaces the np.int alias removed in NumPy 1.24.
                z_replace = np.zeros(n_clusters, dtype=int)
                z_replace[j] = 1
                z_new[i_change, :] = z_replace
            idx_change += idxes_used
    elif (n_trans is not None) and (idx_before is not None):
        z_new = z.copy()
        idx_change = np.random.choice(np.where(z[:, idx_before] == 1)[0], n_trans, replace=False)
        z_replace = np.zeros(n_clusters)
        z_replace[idx_after] = 1
        z_new[idx_change, :] = z_replace
    else:
        # Previously this fell through and failed later on an unbound variable.
        raise ValueError('n_trans and idx_before must be given together or both omitted')

    X_new = X.copy()

    if len(idx_change) > 0:
        if theta is None:
            raise ValueError('theta is required to resample the links of moved nodes')
        # Cluster label per node, computed once; assumes one-hot rows.
        labels = np.argmax(z_new == 1, axis=1)
        for i in idx_change:
            for j in range(N):
                # Scalar probabilities and [0] keep the assigned value a true
                # scalar (assigning a size-1 array to one cell is deprecated).
                X_new[i, j] = bernoulli(p=theta[labels[i], labels[j]]).rvs(size=1)[0]
                X_new[j, i] = bernoulli(p=theta[labels[j], labels[i]]).rvs(size=1)[0]

    return X_new, z_new

In [9]:
def generate_data_sequential_randomtrans(X, z, theta=None, ratio=0.1, seed=0):
    """Move a fixed number of nodes between every ordered pair of clusters.

    For every ordered pair of distinct clusters (i, j),
    ``int(ratio * smallest cluster size)`` nodes that belonged to cluster i
    in ``z`` are reassigned to cluster j, and every row and column of a moved
    node is redrawn from ``theta`` under the new assignments. When that count
    is 0 the inputs are returned as unchanged copies.

    Parameters
    ----------
    X : ndarray, shape (N, N)
        Current binary relation matrix (not modified).
    z : ndarray, shape (N, K)
        Current one-hot cluster assignments (not modified).
    theta : ndarray, shape (K, K)
        Link probabilities; required whenever at least one node moves.
    ratio : float
        Fraction of the smallest cluster moved per ordered cluster pair.
    seed : int
        Seed passed to ``np.random.seed``.

    Returns
    -------
    X_new, z_new : ndarray

    Raises
    ------
    ValueError
        If nodes move but ``theta`` is ``None``.
    """
    np.random.seed(seed)

    N = X.shape[0]
    n_clusters = z.shape[1]

    n_change = int(ratio * np.min(np.sum(z, axis=0)))
    z_new = z.copy()
    idx_change = []
    for i in range(n_clusters):
        # Membership is read from the ORIGINAL z, so nodes that were just
        # moved into cluster i are never moved a second time.
        idxes_i = np.where(z[:, i] == 1)[0]
        idxes_used = []
        for j in range(n_clusters):
            if i == j:
                continue
            i_change = np.random.choice(np.setdiff1d(idxes_i, idxes_used), n_change, replace=False)
            idxes_used += list(i_change)
            # Builtin int replaces the np.int alias removed in NumPy 1.24.
            z_replace = np.zeros(n_clusters, dtype=int)
            z_replace[j] = 1
            z_new[i_change, :] = z_replace
        idx_change += idxes_used

    X_new = X.copy()
    if len(idx_change) > 0:
        if theta is None:
            raise ValueError('theta is required to resample the links of moved nodes')
        # Cluster label per node, computed once; assumes one-hot rows.
        labels = np.argmax(z_new == 1, axis=1)
        for i in idx_change:
            for j in range(N):
                # Scalar probabilities and [0] keep the assigned value a true
                # scalar (assigning a size-1 array to one cell is deprecated).
                X_new[i, j] = bernoulli(p=theta[labels[i], labels[j]]).rvs(size=1)[0]
                X_new[j, i] = bernoulli(p=theta[labels[j], labels[i]]).rvs(size=1)[0]

    return X_new, z_new

In [10]:
def generate_data_gradual_pi(X, z, theta=None, n_trans=None, idx_before=None, idx_after=-1, seed=0):
    """Move ``n_trans`` nodes from one cluster to another and resample their links.

    Parameters
    ----------
    X : ndarray, shape (N, N)
        Current binary relation matrix (not modified).
    z : ndarray, shape (N, K)
        Current one-hot cluster assignments (not modified).
    theta : ndarray, shape (K, K)
        Link probabilities; required whenever at least one node moves.
    n_trans : int
        Number of members of cluster ``idx_before`` to move (chosen at random
        without replacement).
    idx_before, idx_after : int
        Source and destination cluster columns.
    seed : int
        Seed passed to ``np.random.seed``.

    Returns
    -------
    X_new, z_new : ndarray
        Copies of ``X`` / ``z`` in which every row and column of a moved node
        has been redrawn from ``theta`` under the new assignments.

    Raises
    ------
    ValueError
        If ``n_trans`` or ``idx_before`` is ``None``, or if nodes move but
        ``theta`` is ``None``.
    """
    if n_trans is None or idx_before is None:
        # With the None defaults NumPy treats `None` as np.newaxis, which
        # silently produced a malformed assignment matrix instead of failing.
        raise ValueError('n_trans and idx_before are required')

    np.random.seed(seed)

    N = X.shape[0]

    z_new = z.copy()
    idx_change = np.random.choice(np.where(z[:, idx_before] == 1)[0], n_trans, replace=False)
    z_new[idx_change, idx_after] = 1
    z_new[idx_change, idx_before] = 0

    X_new = X.copy()

    if len(idx_change) > 0:
        if theta is None:
            raise ValueError('theta is required to resample the links of moved nodes')
        # Cluster label per node, computed once; assumes one-hot rows.
        labels = np.argmax(z_new == 1, axis=1)
        for i in idx_change:
            for j in range(N):
                # Scalar probabilities and [0] keep the assigned value a true
                # scalar (assigning a size-1 array to one cell is deprecated).
                X_new[i, j] = bernoulli(p=theta[labels[i], labels[j]]).rvs(size=1)[0]
                X_new[j, i] = bernoulli(p=theta[labels[j], labels[i]]).rvs(size=1)[0]

    return X_new, z_new

In [11]:
def generate_data_gradual_theta(X, z, n_trans=None, seed=0):
    """Add or remove a prescribed number of links inside each cluster block.

    For every ordered pair of clusters (k, l), ``n_trans[k, l] > 0`` turns
    that many currently-empty cells of block (k, l) into links, and
    ``n_trans[k, l] < 0`` removes ``|n_trans[k, l]|`` existing links from
    block (k, l). A value of 0 leaves the block untouched.

    The earlier implementation built the positive-case mask as
    ``1 - block``, i.e. it selected every cell OUTSIDE block (k, l), so the
    additional links requested for one block were scattered over all other
    blocks. Candidates are now restricted to the block itself and to cells
    whose value would actually change.

    Parameters
    ----------
    X : ndarray, shape (N, N)
        Current binary relation matrix (not modified).
    z : ndarray, shape (N, K)
        One-hot cluster assignments.
    n_trans : array-like of int, shape (K, K)
        Signed number of links to add (positive) or remove (negative) per block.
    seed : int
        Seed passed to ``np.random.seed``.

    Returns
    -------
    X_new : ndarray, shape (N, N)

    Raises
    ------
    ValueError
        If ``n_trans`` is ``None``, or (raised by ``np.random.choice``) if a
        block has fewer eligible cells than requested.
    """
    if n_trans is None:
        raise ValueError('n_trans is required')
    n_trans = np.asarray(n_trans)

    np.random.seed(seed)

    K = z.shape[1]

    X_new = X.copy()

    for k in range(K):
        for l in range(K):
            n_flip = int(n_trans[k, l])
            if n_flip == 0:
                continue
            # Boolean mask of all (row in cluster k, column in cluster l) cells.
            block = z[:, k].reshape(-1, 1).dot(z[:, l].reshape(1, -1)).astype(bool)
            if n_flip > 0:
                # Only cells without a link can gain one.
                candidates = block & (X_new == 0)
                new_value = 1
            else:
                # Only existing links can be removed.
                candidates = block & (X_new != 0)
                new_value = 0
            cells = np.vstack(np.where(candidates)).T
            picked = np.random.choice(range(cells.shape[0]), abs(n_flip), replace=False)
            X_new[cells[picked, 0], cells[picked, 1]] = new_value

    return X_new

In [12]:
def generate_data_split(X, z, 
                        transmat_former, transmat_latter, 
                        pi_former=None, theta_latter=None, 
                        a0=None, b0=None, seed=0, theta_former=None):
    """Split one cluster in two by moving half of its members to a new, last cluster.

    Parameters
    ----------
    X : ndarray, shape (N, N)
        Current binary relation matrix (not modified).
    z : ndarray, shape (N, K_former)
        Current one-hot cluster assignments (not modified).
    transmat_former, transmat_latter : ndarray
        Only their first dimension is read; it defines the number of clusters
        before (``K_former``) and after (``K_latter``) the split.
    pi_former : array-like, optional
        Cluster proportions; the cluster with the largest entry is split.
        When omitted, the cluster with the most members in ``z`` is split.
    theta_latter : ndarray, shape (K_latter, K_latter), optional
        Link probabilities after the split. When given it is used as is
        (previously this argument was silently overwritten).
    a0, b0 : float
        Beta parameters for the new cluster's row and column; only needed
        when ``theta_latter`` is not supplied.
    seed : int
        Seed passed to ``np.random.seed``.
    theta_former : ndarray, shape (K_former, K_former), optional
        Link probabilities before the split, copied into the top-left block
        of the constructed ``theta_latter``. For backward compatibility a
        module-level variable named ``theta_former`` is used when this
        argument is omitted; prefer passing it explicitly.

    Returns
    -------
    X_new : ndarray, shape (N, N)
        ``X`` with every row and column of a moved node redrawn.
    z_new : ndarray of float, shape (N, K_latter)
    theta_latter : ndarray, shape (K_latter, K_latter)

    Raises
    ------
    ValueError
        If neither ``theta_latter`` nor ``theta_former`` is available.
    """
    np.random.seed(seed)

    N = X.shape[0]

    K_former = transmat_former.shape[0]
    K_latter = transmat_latter.shape[0]

    if pi_former is not None:
        i_max = np.argmax(pi_former)
    else:
        i_max = np.argmax(np.sum(z, axis=0))

    if theta_latter is None:
        if theta_former is None:
            # Legacy fallback: the original body read this name as a global.
            theta_former = globals().get('theta_former')
        if theta_former is None:
            raise ValueError('pass theta_former (or a complete theta_latter)')
        theta_latter = np.zeros((K_latter, K_latter))
        theta_latter[:K_former, :K_former] = theta_former
        # Only the new cluster's row and column are drawn; the block copied
        # from theta_former is left intact (it used to be overwritten by a
        # third, broadcast beta draw).
        theta_latter[-1, :] = np.random.beta(a0, b0, K_latter)
        theta_latter[:-1, -1] = np.random.beta(a0, b0, K_latter-1)
    else:
        theta_latter = np.array(theta_latter, dtype=float)

    idx_change = np.where(z[:, i_max] == 1)[0]

    z_new = np.zeros((z.shape[0], K_latter))
    z_new[:, :K_former] = z
    # Half of the split cluster (rounded down) moves to the new last cluster.
    idx_new = np.random.choice(idx_change, len(idx_change) // 2, replace=False)
    z_new[idx_new, i_max] = 0
    z_new[idx_new, -1] = 1

    X_new = X.copy()

    # Cluster label per node, computed once; assumes one-hot rows.
    labels = np.argmax(z_new == 1, axis=1)
    for i in idx_new:
        for j in range(N):
            # Scalar probabilities and [0] keep the assigned value a true
            # scalar (assigning a size-1 array to one cell is deprecated).
            X_new[i, j] = bernoulli(theta_latter[labels[i], labels[j]]).rvs(size=1)[0]
            X_new[j, i] = bernoulli(theta_latter[labels[j], labels[i]]).rvs(size=1)[0]

    return X_new, z_new, theta_latter

In [13]:
def make_split_integer(num, div):
    """Split the integer ``num`` into ``div`` integer parts that sum to ``num``.

    The parts differ by at most one in magnitude; the larger parts come
    first. Negative ``num`` yields the mirrored split of ``abs(num)``
    (the earlier float-division version returned parts that did not sum to
    ``num`` for negative input).

    Parameters
    ----------
    num : int
        Total to distribute.
    div : int
        Number of parts.

    Returns
    -------
    splits : ndarray of int, shape (div,)

    Raises
    ------
    ValueError
        If ``div`` is negative.
    ZeroDivisionError
        If ``div`` is 0.
    """
    if div < 0:
        raise ValueError('div must be positive')

    sign = -1 if num < 0 else 1
    quotient, remainder = divmod(abs(int(num)), int(div))

    # Builtin int replaces the np.int alias removed in NumPy 1.24.
    splits = np.full(int(div), quotient, dtype=int)
    # The first `remainder` parts absorb the leftover units.
    splits[:remainder] += 1

    return sign * splits

In [14]:
# NOTE(review): not referenced anywhere in the visible part of this notebook;
# presumably an upper bound on the number of clusters used by a later cell — confirm.
cluster_range_max = 6

In [15]:
# ---------------------------------------------------------------------------
# Generate the "gradual change" data set: n_trials independent sequences of
# T relation matrices each, with three gradual changes per sequence
# (link probabilities, cluster proportions, number of clusters).
# ---------------------------------------------------------------------------
theta_latter_range_max = 6
# Builtin float replaces the np.float alias removed in NumPy 1.24.
EPS = np.finfo(float).eps

T = 100                # time steps per trial

N = 100                # nodes
K_former = 3           # clusters before the model change
K_latter = 4           # clusters after the model change
# The model change below appends exactly one cluster column.
assert K_latter == K_former + 1

a0 = 1.0
b0 = 1.0
alpha1 = 1.0
alpha2 = 1.0
# NOTE(review): generate_data_sequential_randomtrans moves
# int(ratio * smallest_cluster_size) nodes per cluster pair. With ratio = 0.01
# and at most N = 100 nodes per cluster this is 0, so the "stationary"
# segments below return unchanged copies. Confirm this is intended.
ratio = 0.01

t_theta_change = 10    # first step of the gradual theta change
t_pi_change = 40       # first step of the gradual pi change
t_model_change = 70    # first step of the gradual cluster split
n_gradual_steps = 10   # every gradual change is spread over this many steps
n_trials = 10

X_all = []
Z_all = []

trial = 1
n_try = 0

while trial <= n_trials:
    print('trial =', trial)
    pi1 = None
    pi2 = None
    theta = None

    X_trial = []
    Z_trial = []

    # t = 1: initial draw
    seed = trial*T + 100
    X, Z1_true, Z2_true, pi1, pi2, theta_former = generate_data_random(
        N=N, K=K_former, alpha1=alpha1, alpha2=alpha2, a0=a0, b0=b0,
        pi1=pi1, pi2=pi2, theta=theta, seed=seed)
    n_try += 1

    X_trial.append(X)
    Z_trial.append(Z1_true)
    # NOTE(review): the counter is advanced here, not at the end of the
    # iteration, so every seed below is derived from trial + 1. Moving this
    # line would change all generated data.
    trial += 1

    # t = 2 .. t_theta_change - 1: stationary
    for t in range(2, t_theta_change):
        seed = trial*T + t
        X, Z1_true = generate_data_sequential_randomtrans(
            X, Z1_true, theta=theta_former, ratio=ratio, seed=seed)
        X_trial.append(X)
        Z_trial.append(Z1_true)

    # Target link probabilities after the theta change: raise every entry by
    # U(0.05, 0.10), capped at 1. Uses the global random state left behind by
    # the previous seeded call.
    u = np.random.uniform(0.05, 0.10, (K_former, K_former))
    theta_former2 = theta_former + u
    theta_former2[theta_former2 > 1.00] = 1.0
    theta_ratio = (theta_former2 - theta_former) / theta_former
    idxes = np.argmax(Z1_true, axis=1)
    # Current number of links in every (cluster i, cluster j) block.
    nlinks = np.array([[np.sum(X[idxes == i, :][:, idxes == j])
                        for j in range(Z1_true.shape[1])]
                       for i in range(Z1_true.shape[1])])
    # Links to add per block, split evenly over the gradual steps.
    n_trans = (theta_ratio * nlinks).astype(int)
    n_trans_per = np.zeros((n_trans.shape[0], n_trans.shape[1], n_gradual_steps), dtype=int)
    for i in range(n_trans.shape[0]):
        for j in range(n_trans.shape[1]):
            n_trans_per[i, j, :] = make_split_integer(n_trans[i, j], n_gradual_steps)

    # t = 10 .. 19: gradual change of theta
    for t in range(t_theta_change, t_theta_change + n_gradual_steps):
        seed = trial*T + t
        X = generate_data_gradual_theta(
            X, Z1_true, n_trans_per[:, :, t - t_theta_change], seed=seed)
        X_trial.append(X)
        Z_trial.append(Z1_true)

    # t = 20 .. 39: stationary under the new theta
    for t in range(t_theta_change + n_gradual_steps, t_pi_change):
        seed = trial*T + t
        X, Z1_true = generate_data_sequential_randomtrans(
            X, Z1_true, theta=theta_former2, ratio=ratio, seed=seed)
        X_trial.append(X)
        Z_trial.append(Z1_true)

    # Change of pi: move half of the size difference between the two largest
    # clusters from the largest to the second largest.
    n_z = np.sum(Z1_true, axis=0)
    idx_max_first, idx_max_second = n_z.argsort()[::-1][:2]
    n_z_max_first, n_z_max_second = np.sort(n_z)[::-1][:2]
    n_trans = make_split_integer(int((n_z_max_first - n_z_max_second)/2), n_gradual_steps)

    # t = 40 .. 49: gradual change of pi
    for t in range(t_pi_change, t_pi_change + n_gradual_steps):
        seed = trial*T + t
        X, Z1_true = generate_data_gradual_pi(
            X, Z1_true, theta=theta_former2, n_trans=n_trans[t - t_pi_change],
            idx_before=idx_max_first, idx_after=idx_max_second, seed=seed)
        X_trial.append(X)
        Z_trial.append(Z1_true)

    # t = 50 .. 69: stationary
    for t in range(t_pi_change + n_gradual_steps, t_model_change):
        seed = trial*T + t
        X, Z1_true = generate_data_sequential_randomtrans(
            X, Z1_true, theta=theta_former2, ratio=ratio, seed=seed)
        X_trial.append(X)
        Z_trial.append(Z1_true)

    # Model change: append an empty cluster column and draw its link
    # probabilities (last row, then last column).
    Z1_true = np.hstack((Z1_true, np.zeros(Z1_true.shape[0]).reshape(-1, 1)))
    new_cluster = K_latter - 1

    theta_trans = np.zeros((K_latter, K_latter))
    theta_trans[:K_former, :K_former] = theta_former2.copy()
    theta_trans[-1, :] = np.random.beta(a0, b0, K_latter)
    theta_trans[:-1, -1] = np.random.beta(a0, b0, K_former)

    # Half of the largest cluster migrates to the new one, spread over the steps.
    i_max = np.argmax(np.sum(Z1_true, axis=0))
    n_trans = int(np.sum(Z1_true[:, i_max] == 1)/2)
    n_trans_per = make_split_integer(n_trans, n_gradual_steps)

    # t = 70 .. 79: gradual split of the largest cluster
    for t in range(t_model_change, t_model_change + n_gradual_steps):
        seed = trial*T + t
        X, Z1_true = generate_data_gradual_pi(
            X, Z1_true, theta=theta_trans, n_trans=n_trans_per[t - t_model_change],
            idx_before=i_max, idx_after=new_cluster, seed=seed)
        X_trial.append(X)
        Z_trial.append(Z1_true)

    # t = 80 .. 100: stationary under the K_latter-cluster model
    theta_latter = theta_trans.copy()
    for t in range(t_model_change + n_gradual_steps, T + 1):
        seed = trial*T + t
        X, Z1_true = generate_data_sequential_randomtrans(
            X, Z1_true, theta=theta_latter, ratio=ratio, seed=seed)
        X_trial.append(X)
        Z_trial.append(Z1_true)

    # Both files are rewritten after every trial, so they always hold all
    # trials completed so far.
    X_all.append(X_trial)
    X_all_array = np.array(X_all)

    with open(os.path.join(outdir, 'X_gradual.pkl'), 'wb') as f:
        pickle.dump(X_all_array, f)

    # Z is pickled as a nested list: its arrays change width at the model change.
    Z_all.append(Z_trial)
    with open(os.path.join(outdir, 'Z_gradual.pkl'), 'wb') as f:
        pickle.dump(Z_all, f)

trial = 1
trial = 2
trial = 3
trial = 4
trial = 5
trial = 6
trial = 7
trial = 8
trial = 9
trial = 10
