In [14]:
import pandas as pd 
import numpy as np 
import scipy
# import xlrd 
import sklearn

from Gibbs_model import Gibbs_sampling

from sklearn.model_selection import train_test_split

from utils import baseline_lr,baseline_esnet

from scipy.stats import binom 
from scipy.stats import norm
from sklearn.linear_model import Ridge
from tqdm import trange
import time

In [2]:
# Seed the global NumPy RNG so the draws in this cell (and the later
# np.random calls in this notebook) are repeatable on a fresh run.
np.random.seed(123)

# --- simulated regression problem ---
N_sample = 500
N_feature_t = 6     # informative features: the ones that should be selected
N_feature_f = 44    # uninformative features: they never enter Y
noise_level = 0.1   # multiplier on the standard-normal noise added to Y
N_feature = N_feature_t + N_feature_f

# Informative columns ~ N(0, 1); uninformative columns ~ N(1, 0.5)
feature_t = np.random.normal(loc=0, scale=1, size=(N_sample, N_feature_t))
feature_f = np.random.normal(loc=1, scale=0.5, size=(N_sample, N_feature_f))

# Fixed ground-truth weights, one per informative feature
W_t = np.array([-3, 3, -2, 2, -1, 1])

# Linear response from the informative columns only, then additive noise
Y = feature_t @ W_t
Y = Y + noise_level * np.random.normal(loc=0, scale=1, size=Y.shape)

# Design matrix: informative columns first, uninformative columns after
X = np.hstack((feature_t, feature_f))
print(X.shape, Y.shape)


(500, 50) (500,)


In [3]:
# Split the feature columns of X into K groups of five columns each.
#   columns 0-5  : relevant features
#   columns 6-49 : irrelevant features
K = 10  # number of groups
group_ind = []

# The first three groups each hold 2 relevant + 3 irrelevant features; the
# remaining groups hold 5 irrelevant features each. A running offset hands
# out the irrelevant columns in order, so the two branches cannot drift
# apart if the group sizes are changed.
next_f = N_feature_t  # first irrelevant column that has not been assigned yet
for i in range(K):
    if i < 3:
        N_t = 2
        N_f = 3
        idx_t = [N_t*i + j for j in range(N_t)]
        idx_f = list(range(next_f, next_f + N_f))
        group_ind.append(idx_t + idx_f)
    else:
        N_f = 5
        idx_f = list(range(next_f, next_f + N_f))
        group_ind.append(idx_f)
    next_f += N_f

# Every column of X must land in exactly one group.
assert sorted(idx for group in group_ind for idx in group) == list(range(N_feature))

group_ind

[[0, 1, 6, 7, 8],
 [2, 3, 9, 10, 11],
 [4, 5, 12, 13, 14],
 [15, 16, 17, 18, 19],
 [20, 21, 22, 23, 24],
 [25, 26, 27, 28, 29],
 [30, 31, 32, 33, 34],
 [35, 36, 37, 38, 39],
 [40, 41, 42, 43, 44],
 [45, 46, 47, 48, 49]]

In [4]:
# Reorder the columns of X so that the features of each group are adjacent,
# following the group order in group_ind.
X_new = np.hstack([X[:, group_ind[i]] for i in range(K)])

# Append a constant column of ones as the last column.
bias_col = np.ones((N_sample, 1))
X_new = np.hstack((X_new, bias_col))

print(X_new.shape)

(500, 51)


In [5]:
# Hold out 20% of the rows as a test set (no random_state is passed here).
X_train, X_test, y_train, y_test = train_test_split(
    X_new,
    Y.squeeze(),
    test_size=0.2,
)

# Bundle the four arrays; this dict is what the baselines and the sampler receive.
data_dict = dict(X_tr=X_train, y_tr=y_train, X_test=X_test, y_test=y_test)

In [6]:
# Hyper-parameters handed to the sampler.
alpha, beta = 0.5, 0.5   # Bernoulli probabilities used when z and s are initialised
r0, r1 = 1e-3, 1.0       # standard deviations of the spike (r0) and slab (r1) draws
a0, b0 = 1.0, 1.0        # not part of hyper_paras; passed on through init_paras
                         # (presumably a Gamma prior - confirm in Gibbs_model)
JITTER = 1e-3

# Run-length controls. The recorded run shows a 10000-step progress bar and an
# RMSE report roughly every 1000 steps; INTERVAL and BURNING are presumably the
# thinning interval and burn-in length - confirm in Gibbs_model.
INTERVAL = 500
VALITA_INTERVAL = 1000
BURNING = 5000
MAX_NUMBER = 10000

hyper_paras = dict(
    INTERVAL=INTERVAL,
    BURNING=BURNING,
    MAX_NUMBER=MAX_NUMBER,
    VALITA_INTERVAL=VALITA_INTERVAL,
    alpha=alpha,
    beta=beta,
    r0=r0,
    r1=r1,
    JITTER=JITTER,
)

In [7]:
# Random starting state for the sampler.
z_array_init = np.random.binomial(n=1, p=alpha, size=K)  # one 0/1 flag per group
s_list_init = [
    np.random.binomial(n=1, p=beta, size=len(members))   # one 0/1 flag per feature
    for members in group_ind
]
b_init = np.random.normal(loc=0.0, scale=r1)
# NOTE(review): tau is drawn with shape=alpha and rate=beta (the Bernoulli
# probabilities above) although a0/b0 are passed alongside it - confirm that
# alpha/beta is really what is intended here.
tau_init = np.random.gamma(shape=alpha, scale=1.0 / beta)


def _init_group_weights(z_k, s_k):
    """Draw the starting weights of one group.

    A feature whose group flag ``z_k`` and own flag in ``s_k`` are both 1
    takes its value from the slab draw (std ``r1``); every other feature
    takes its value from the spike draw (std ``r0``).
    """
    active = z_k * s_k
    spike = np.random.normal(loc=0.0, scale=r0, size=len(s_k))
    slab = np.random.normal(loc=0.0, scale=r1, size=len(s_k))
    return spike * (1 - active) + slab * active


W_init = [_init_group_weights(z_array_init[i], s_list_init[i]) for i in range(K)]

init_paras = dict(
    z=z_array_init,
    s=s_list_init,
    b=b_init,
    tau=tau_init,
    W=W_init,
    a0=a0,
    b0=b0,
)

In [13]:
# Baseline from utils run on the same train/test split; the return value is
# discarded and the recorded output reports its RMSE ("with lr, ...").
# Presumably a plain linear regression - confirm in utils.
_ = baseline_lr(data_dict)

with lr, rmse is 0.11033


In [15]:
# Second baseline from utils on the same split; the return value is discarded
# and the recorded output reports its RMSE ("with elanet, ...").
# Presumably an elastic net - confirm in utils.
_ = baseline_esnet(data_dict)

with elanet, rmse is 2.54218


In [9]:
# Build the sampler from the data split, the random starting state and the
# hyper-parameters, then run it. The recorded output shows a 10000-step
# progress bar with test/train RMSE printed about every 1000 steps.
# NOTE(review): the recorded output is cut off mid-line and ends with
# "NameError: name 'model' is not defined" - re-run from a fresh kernel.
model = Gibbs_sampling(data_dict,init_paras, hyper_paras)
model.model_run()

  0%|          | 24/10000 [00:00<00:42, 235.92it/s]
 running test-rmse = 2.89812
running train-rmse = 2.81747

 10%|█         | 1032/10000 [00:03<00:30, 290.50it/s]
 running test-rmse = 0.12274
running train-rmse = 0.10769

 21%|██        | 2053/10000 [00:06<00:25, 307.30it/s]
 running test-rmse = 0.12072
running train-rmse = 0.10534

 31%|███       | 3056/10000 [00:10<00:22, 302.25it/s]
 running test-rmse = 0.11554
running train-rmse = 0.10517

 41%|████      | 4064/10000 [00:13<00:20, 293.41it/s]
 running test-rmse = 0.11766
running train-rmse = 0.10328

 51%|█████     | 5054/10000 [00:17<00:16, 299.95it/s]
 running test-rmse = 0.12189
running train-rmse = 0.10253

 60%|██████    | 6049/10000 [00:20<00:12, 304.36it/s]
 running test-rmse = 0.12184
running train-rmse = 0.10567

 70%|███████   | 7029/10000 [00:23<00:10, 287.47it/s]
 running test-rmse = 0.11419
running train-rmse = 0.10406

 80%|████████  | 8039/10000 [00:27<00:06, 291.40it/s]
 running test-rmse = 0.11640
running train-r

NameError: name 'model' is not defined

In [10]:
# One 0/1 array per group (see recorded output); presumably the sampler's
# per-feature selection estimate - confirm in Gibbs_model.
# NOTE(review): the output marks several columns in the groups with index 4-9
# as 1, although those groups contain only irrelevant features.
model.s_mean

[array([1, 1, 0, 0, 0]),
 array([1, 1, 0, 0, 0]),
 array([1, 1, 0, 0, 0]),
 array([0, 0, 0, 0, 0]),
 array([0, 1, 0, 1, 0]),
 array([1, 1, 0, 1, 0]),
 array([0, 1, 1, 1, 1]),
 array([0, 1, 0, 1, 1]),
 array([1, 0, 0, 0, 0]),
 array([1, 0, 1, 0, 1])]

In [11]:
# Value of tau held by the sampler after the run; presumably the noise
# precision - confirm in Gibbs_model.
model.tau

65.70450879667705

In [12]:
# Value of b held by the sampler after the run; presumably the bias term -
# confirm in Gibbs_model.
model.b

0.052693481985073905