In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from numpy import random
from scipy.stats import binom
import scipy.stats as stats
from statsmodels.tsa.arima_process import ArmaProcess
from scipy.special import digamma,polygamma
from scipy.linalg import sqrtm
from tqdm import tqdm

In [2]:
# Time-varying probability mu_t of the lag-p model with exogenous variables, returned as a vector.
def threshold_logit(X_t,c=0):
    """
    Clamp every value of X_t to [c, 1-c] and return its logit.

    X_t -- iterable of values to transform
    c   -- threshold; values below c are replaced by c and values above
           1-c by 1-c before the logit is taken. With the default c=0 a
           value of exactly 0 or 1 is left as is, where the logit is not
           finite.

    Returns a list holding log(x/(1-x)) for each clamped x, in input order.
    """
    clamped = [min(max(c, value), 1 - c) for value in X_t]
    return [np.log(value / (1 - value)) for value in clamped]

def re_mu_t(phi_true,gamma_true,X_t,W_t,functiontype, c=0):
    """
    Return the list of conditional means mu_t implied by the parameters.

    phi_true     -- [intercept, lag coefficients...]; p = len(phi_true)-1
    gamma_true   -- coefficients of the exogenous series; q = len(gamma_true)
    X_t          -- the observed series; one mean is produced for every
                    position i in range(len(X_t)-p)
    W_t          -- the exogenous series; W_t[i:i+q] is read for position i
    functiontype -- 'logit' to pass the lagged X values through
                    threshold_logit(., c), 'identity' to use them as they are
    c            -- threshold forwarded to threshold_logit

    For position i the linear predictor is
        phi_true[0] + <phi_true[1:], h(X_t[i:i+p])> + <W_t[i:i+q], gamma_true>
    so phi_true[1] multiplies the oldest value of the window and
    gamma_true[0] the first value of the W window. The mean is the inverse
    logit of that predictor.

    Raises ValueError if functiontype is neither 'logit' nor 'identity'.
    """
    # Reject unknown link choices up front instead of failing later on an
    # unassigned local variable.
    if functiontype not in ('logit', 'identity'):
        raise ValueError(
            "functiontype must be 'logit' or 'identity', got %r" % (functiontype,)
        )

    p = len(phi_true)-1
    q = len(gamma_true)
    m = len(X_t)-p
    mu_list = []

    for i in range(m):
        X_tsub = X_t[i:(i+p)]
        W_tsub = W_t[i:(i+q)]
        if functiontype == 'logit':
            hX_tsub = threshold_logit(X_tsub,c)
        else:
            hX_tsub = X_tsub
        mu_t = (1+np.exp(-phi_true[0]-np.inner(phi_true[1:],hX_tsub)-np.inner(W_tsub, gamma_true)))**(-1)
        mu_list.append(mu_t)
    return mu_list

def Gm(X_t,W_t,phi_true,tau_true,gamma_true,functiontype = 'logit', c=0):
    """
    Build the matrix of per-observation score contributions.

    Every column belongs to one of the len(X_t)-p observations that have p
    predecessors; every row belongs to one parameter:

        row 0        -- tau_true
        row 1        -- phi_true[0] (the intercept)
        next p rows  -- lagged-X regressors, nearest lag first
        last q rows  -- exogenous regressors, last element of each
                        W window first

    with p = len(phi_true)-1 and q = len(gamma_true). The lagged X values
    are passed through threshold_logit(., c) when functiontype is 'logit'
    and used unchanged when it is 'identity'.

    Returns a numpy array with 2+p+q rows.

    NOTE(review): re_mu_t pairs phi_true[1:] with X_t[i:i+p] (oldest value
    first) and gamma_true with W_t[i:i+q], so the lag rows and the
    exogenous rows built here come out in the reverse order of
    phi_true[1:] and gamma_true. Confirm that callers expect this ordering.
    """
    n_lags = len(phi_true) - 1
    n_exo = len(gamma_true)
    n_obs = len(X_t) - n_lags

    means = re_mu_t(phi_true, gamma_true, X_t, W_t, functiontype, c)
    responses = X_t[n_lags:]

    # Logit of each response minus its expectation under the beta law.
    logit_resp = [np.log(x / (1 - x)) for x in responses]
    logit_mean = [digamma(tau_true * mu) - digamma(tau_true * (1 - mu)) for mu in means]
    resid = np.subtract(logit_resp, logit_mean)

    # Derivative of the inverse logit at each mean: mu * (1 - mu).
    slope = [mu * (1 - mu) for mu in means]

    if functiontype == 'logit':
        lag_source = threshold_logit(X_t, c)
    if functiontype == 'identity':
        lag_source = X_t

    rows = [
        # dispersion parameter
        [mu * r + np.log(1 - x) - digamma(tau_true * (1 - mu)) + digamma(tau_true)
         for mu, r, x in zip(means, resid, responses)],
        # intercept
        [tau_true * r * s for r, s in zip(resid, slope)],
    ]

    for k in range(n_lags):
        start = n_lags - 1 - k
        window = lag_source[start:(n_obs + n_lags - k - 1)]
        rows.append([tau_true * r * s * z for r, s, z in zip(resid, slope, window)])

    for k in range(n_exo):
        start = n_exo - 1 - k
        window = W_t[start:(n_obs + n_exo - k - 1)]
        rows.append([tau_true * r * s * z for r, s, z in zip(resid, slope, window)])

    return np.array(rows)

# Partial score vector: the first derivatives summed over the observations that have a full set of p lags.
def PSVm(X_t,W_t,phi_true,tau_true,gamma_true,functiontype, c=0):
    """
    Return the partial score vector: the rows of Gm summed over observations.

    X_t, W_t     -- observed series and exogenous series
    phi_true     -- [intercept, lag coefficients...]
    tau_true     -- dispersion parameter
    gamma_true   -- coefficients of the exogenous series
    functiontype -- 'logit' or 'identity', forwarded to Gm
    c            -- threshold forwarded to Gm

    Returns a 1-D numpy array with one entry per row of Gm.
    """
    # c has to be forwarded as given; passing a literal 0 here would make
    # the threshold chosen by the caller silently ineffective.
    G_mat = Gm(X_t,W_t,phi_true,tau_true,gamma_true,functiontype, c=c)
    PSV_m = np.sum(G_mat, axis = 1)

    return PSV_m

def PSVmT(X_t,W_t,phi_true,tau_true,gamma_true,functiontype, c=0):
    """
    Return the rescaled partial score vector.

    The vector from PSVm is taken and every component except the first is
    divided by tau_true. The first component is left untouched.
    """
    scores = PSVm(X_t, W_t, phi_true, tau_true, gamma_true, functiontype, c)
    # Rescale everything after the leading entry in one slice assignment.
    scores[1:] = scores[1:] / tau_true
    return np.asarray(scores)

# Given the true parameters, simulate the series; the first 100+p values of the simulated path are discarded as burn-in.
def geneX_wth_exo(m,tau_true, phi_true, gamma_true,mu_0,W_t,randomSeed, functiontype='logit', c=0):
    """
    Simulate a beta AR(p) series driven by its own lags and an exogenous series.

    m            -- sample size; the returned series holds m+p values
    tau_true     -- dispersion parameter of the beta distribution
    phi_true     -- [intercept, lag coefficients...]; p = len(phi_true)-1
    gamma_true   -- coefficients of the exogenous series; q = len(gamma_true)
    mu_0         -- mean of the p starting values
    W_t          -- exogenous series; when q > 0 it needs at least
                    m+p+q+99 values
    randomSeed   -- seed passed to np.random.seed before any draw, so the
                    global NumPy generator is reset as a side effect
    functiontype -- 'logit' to pass the lagged values through
                    threshold_logit(., c), 'identity' to use them directly
    c            -- threshold forwarded to threshold_logit

    Returns (X_t, W_t_used): the simulated values as a list after dropping
    the first 100+p values of the path, and the slice
    W_t[100+p : 100+m+p+q] that lines up with them.

    Raises ValueError for an unknown functiontype or a W_t that is too short.
    """
    if functiontype not in ('logit', 'identity'):
        raise ValueError(
            "functiontype must be 'logit' or 'identity', got %r" % (functiontype,)
        )

    p = len(phi_true)-1
    q = len(gamma_true)
    burn_in = 100+p
    n_steps = m+burn_in

    # The last step reads W_t[n_steps-1 : n_steps-1+q]; check this before
    # drawing anything so a short series gives a readable error instead of
    # a shape mismatch deep inside the loop.
    if q > 0 and len(W_t) < n_steps-1+q:
        raise ValueError(
            "W_t has %d values but at least %d are required" % (len(W_t), n_steps-1+q)
        )

    np.random.seed(randomSeed)
    # p starting values drawn from a beta distribution with mean mu_0
    X_t = list(np.random.beta(mu_0*tau_true, (1-mu_0)*tau_true, p))

    for i in range(n_steps):
        X_tsub = X_t[i:(i+p)]
        W_tsub = W_t[i:(i+q)]
        if functiontype == 'logit':
            hX_tsub = threshold_logit(X_tsub,c)
        else:
            hX_tsub = X_tsub
        mu_t = (1+np.exp(-phi_true[0]-np.inner(phi_true[1:],hX_tsub)-np.inner(W_tsub, gamma_true)))**(-1)

        # size=1 keeps the draw an array, so extend() appends one value
        X_t.extend(np.random.beta(mu_t*tau_true, (1-mu_t)*tau_true, 1))

    X_t = X_t[burn_in:]
    W_t_used = W_t[burn_in:(burn_in+m+q)]

    return X_t, W_t_used

def geneX_and_W(m,tau_true, phi_true, gamma_true,mu_0,ar_coef, ma_coef ,randomSeed, functiontype, c=0):
    """
    Simulate an ARMA exogenous series and the beta AR(p) series driven by it.

    m                -- sample size handed to geneX_wth_exo
    tau_true         -- dispersion parameter
    phi_true         -- [intercept, lag coefficients...]
    gamma_true       -- coefficients of the exogenous series
    mu_0             -- mean of the starting values
    ar_coef, ma_coef -- lag polynomials passed to ArmaProcess
    randomSeed       -- seed for the exogenous draw; also forwarded to
                        geneX_wth_exo
    functiontype, c  -- forwarded to geneX_wth_exo

    Returns the pair (X_t, W_t_used) produced by geneX_wth_exo.
    """
    # Seed before drawing W so the whole output is reproducible from
    # randomSeed, as simulate_X_for_Ha does. geneX_wth_exo seeds again with
    # the same value before drawing X.
    # NOTE(review): both draws therefore start from the same generator
    # state; confirm that this is acceptable for the study.
    np.random.seed(randomSeed)

    arma_process = ArmaProcess(ar_coef, ma_coef)
    # geneX_wth_exo reads W up to index m+p+q+99, so m+200 values are
    # enough as long as p+q <= 101.
    W_t_full = arma_process.generate_sample(nsample=m+200)

    X_t, W_t_used = geneX_wth_exo(m,tau_true,phi_true,gamma_true,mu_0,W_t_full,randomSeed, functiontype, c)

    return X_t, W_t_used

def PSVm_sp_opt(parVector,*args):
    """
    Return the Euclidean norm of the rescaled partial score vector.

    Presumably meant as an objective for a numerical optimiser -- confirm
    against the caller.

    parVector -- [tau, phi_0, ..., phi_p, gamma...]; the first p+2 entries
                 are tau and phi, everything after them is taken as gamma
    args      -- (X_t, W_t, p, q, functiontype), optionally followed by c.
                 c is only read when exactly six extra arguments are given;
                 otherwise 0 is used. q (args[3]) is not read.
    """
    X_t, W_t, p = args[0], args[1], args[2]
    functiontype = args[4]
    if len(args) == 6:
        c = args[5]
    else:
        c = 0

    n_head = p + 2
    tau = parVector[0]
    phi = parVector[1:n_head]
    gamma = parVector[n_head:]

    return np.linalg.norm(PSVmT(X_t, W_t, phi, tau, gamma, functiontype, c=c))


In [29]:
# Settings read by simulate_X_for_Ha below.
m = 1000                          # sample size passed to geneX_wth_exo
tau_true = 100                    # dispersion parameter
phi_true = [0.5,0.1,0.2,0.2]      # intercept followed by three lag coefficients
gamma_true = [0.5,0.5,0.5]        # coefficients of the exogenous series

def simulate_X_for_Ha(randomSeed, save=True, out_dir=None):
    """
    Simulate one series whose exogenous input changes halfway through.

    The exogenous series is the concatenation of two ARMA samples of
    int(m/2)+100 values each. The second one uses different coefficients
    and is shifted by -0.1. The series itself is then generated by
    geneX_wth_exo with the module-level m, tau_true, phi_true and
    gamma_true, the 'logit' link, mu_0 = 0.5 and c = 0.01.

    randomSeed -- seeds NumPy's global generator before the exogenous draws
                  and is also forwarded to geneX_wth_exo
    save       -- when true, write the series to X_seed_<randomSeed>.csv
    out_dir    -- directory for the CSV file; defaults to
                  <current working directory>/ds/Ha_X_datasets and is
                  created if it does not exist

    Returns (X_t, W_t_used) as produced by geneX_wth_exo, so the result is
    also available when save is False.
    """
    np.random.seed(randomSeed)
    half = int(m/2)+100

    # First half of the exogenous series.
    ar_coef1 = np.array([1, 0.1])
    ma_coef1 = np.array([1])
    W_t1 = ArmaProcess(ar_coef1, ma_coef1).generate_sample(nsample=half)

    # Second half: the original author describes this as a change in
    # variance of W_t.
    ar_coef2 = np.array([10, 2,3])
    ma_coef2 = np.array([1, 0.5])
    W_t2 = ArmaProcess(ar_coef2, ma_coef2).generate_sample(nsample=half)-0.1

    # Alternative scenario kept from the original: a pure mean shift.
    # W_t2 = ArmaProcess(ar_coef1, ma_coef1).generate_sample(nsample=half)+0.1

    W_t = np.append(W_t1, W_t2)

    functiontype = 'logit'
    mu_0 = 0.5
    X_t, W_t_used = geneX_wth_exo(m,tau_true, phi_true, gamma_true,mu_0,W_t,randomSeed, functiontype, c=0.01)

    if save:
        if out_dir is None:
            out_dir = os.path.join(os.getcwd(), "ds", "Ha_X_datasets")
        # Create the target directory so a fresh checkout does not fail here.
        os.makedirs(out_dir, exist_ok=True)
        pd.Series(X_t).to_csv(os.path.join(out_dir, f"X_seed_{randomSeed}.csv"), index=False)

    return X_t, W_t_used

In [30]:
# Generate and save one dataset for every seed from 200 to 299 (100 files in total).
for seed in range(200, 300):
    simulate_X_for_Ha(seed)