In [20]:
import pandas as pd 
import numpy as np 
import scipy
# import xlrd 
import sklearn

from Gibbs_model_probit import Gibbs_sampling

from sklearn.model_selection import train_test_split
from scipy.stats import multivariate_normal
from utils import baseline_lr,baseline_esnet,baseline_justmean
from utils import baseline_LogitElsnet,baseline_justmode,baseline_random,baseline_LogitLR,baseline_RanForest,baseline_Gibbs_zhe
from sklearn.model_selection import KFold
from scipy.stats import binom 
from scipy.stats import norm
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestClassifier
from tqdm import trange
from Gibbd_zhe import GibbsSampling3
from sklearn.metrics import roc_auc_score
import time

In [21]:
# data_loading 
np.random.seed(123)
data_table = pd.read_csv('../data/processed/all_feature_p1_lip_specie.csv')
target = '1= death; 0=alive'


# normalization

# min-max
df = data_table[target]


# check nan
data_table[target].isnull().values.any()
data_table.fillna(data_table.mean(), inplace=True) # fill nan with column mean

In [22]:
all_feature = data_table.iloc[:,1:-7]
#lip_feature = data_table.iloc[:,1:775]#for p2
# gene_feature = data_table.iloc[:,1:-372]#for p1

Y = data_table[target].values


In [None]:
all_feature

In [23]:
feature_names = list(all_feature.columns)
K_lip=41 # group number, from data process notebook
K_gene=3
K = K_lip + K_gene 

group_ind_dict = {}
group_ind = []
group_ind_concat = []
for i in range(K_lip):
    group_ind_dict['lip'+'_'+str(i)] = []
for i in range(K_gene):
    group_ind_dict['gene'+'_'+str(i)] = []

for name in feature_names:
    cate = name.split('_')[0]
    id = name.split('_')[-1]
    group_ind_dict[cate+'_'+id].append(name)

for i in range(K_lip):
    group_ind.append(group_ind_dict['lip'+'_'+str(i)])
    group_ind_concat = group_ind_concat + group_ind_dict['lip'+'_'+str(i)]

for i in range(K_gene):
    group_ind.append(group_ind_dict['gene'+'_'+str(i)])
    group_ind_concat = group_ind_concat + group_ind_dict['gene'+'_'+str(i)]

# group_ind

In [24]:
# re-arrange the features of X based on the group split order
X_new = all_feature[group_ind_concat].values

N_sample, _ = X_new.shape
# add all-one column at the last 
bias_col = np.ones(N_sample).reshape((N_sample,1))
X_new = np.concatenate((X_new,bias_col),axis=1)

print(X_new.shape)

(71, 1140)


In [28]:

# init hyper-parameters
alpha = 0.5
beta = 0.5
r0 = 1e-6
r1 = 100.0
a0 = 1.0
b0 = 1.0
JITTER = 1e-3

INTERVAL = 1
VALITA_INTERVAL = 10
BURNING = 20
MAX_NUMBER = 100

hyper_paras = {'INTERVAL':INTERVAL, 'BURNING':BURNING,'MAX_NUMBER':MAX_NUMBER,'VALITA_INTERVAL':VALITA_INTERVAL,
'alpha':alpha, 'beta':beta,'r0':r0,'r1':r1,'JITTER':JITTER}

In [26]:
def zero_init_paras():
    z_array_init = np.ones(K) #np.random.binomial(size=K, n=1, p= alpha)
    s_list_init = [np.ones(len(item)) for item in group_ind]
    # s_list_init = [np.ones(len(item)) for item in group_ind]#[np.random.binomial(size=len(item), n=1, p= beta) for item in group_ind]
    b_init = 0.0#np.random.normal(loc=0.0, scale=r1,size=None)
    # tau_init = 1.0#np.random.gamma(shape=alpha, scale=1.0/beta, size=None)

    W_init = [np.zeros(len(item)) for item in group_ind]

    init_paras = {'z':z_array_init, 's':s_list_init, 'b':b_init,  'W':W_init,'a0':a0,'b0':b0}
    return init_paras

In [27]:
# init parameters with lr_result
def get_init_paras(w_lr):
    z_array_init = np.ones(K) #np.random.binomial(size=K, n=1, p= alpha)
    s_list_init = [np.ones(len(item)) for item in group_ind]#[np.random.binomial(size=len(item), n=1, p= beta) for item in group_ind]
    b_init = w_lr[-1]#np.random.normal(loc=0.0, scale=r1,size=None)
    # tau_init = 1.0#np.random.gamma(shape=alpha, scale=1.0/beta, size=None)

    W_init = []
    offset=0
    for i in range(K):
        # mask1 = 1-z_array_init[i] * s_list_init[i]
        # mask2 = z_array_init[i] * s_list_init[i]
        # spike = np.random.normal(loc=0.0, scale=r0,size=len(s_list_init[i]))
        # slab = np.random.normal(loc=0.0, scale=r1,size=len(s_list_init[i]))
        # W_group = spike * mask1 + slab * mask2

        
        group_len = len(s_list_init[i])
        W_group= w_lr[offset:offset+group_len]
        offset = offset + group_len
        W_init.append(W_group)

    init_paras = {'z':z_array_init, 's':s_list_init, 'b':b_init,  'W':W_init,'a0':a0,'b0':b0}
    return init_paras

In [37]:

N = 3
lr_acc = np.zeros(N)
rf_acc = np.zeros(N)
esnet_acc = np.zeros(N)
mode_acc = np.zeros(N)
random_acc = np.zeros(N)
ours_acc = np.zeros(N)
zhe_gibs_acc = np.zeros(N)

lr_auc = np.zeros(N)
rf_auc = np.zeros(N)
esnet_auc = np.zeros(N)
ours_auc = np.zeros(N)
zhe_gibs_auc = np.zeros(N)

for i in range(N):
    X_train, X_test, y_train, y_test = train_test_split(X_new, Y.squeeze(),test_size=0.3)

    data_dict = {'X_tr':X_train, 'y_tr':y_train, 'X_test':X_test, 'y_test':y_test}  
    dict_lr = baseline_LogitLR(data_dict)
    dict_els = baseline_LogitElsnet(data_dict)
    dict_rf = baseline_RanForest(data_dict)
    dict_mode = baseline_justmode(data_dict)
    dict_random = baseline_random(data_dict)
    dict_gibbs_zhe = baseline_Gibbs_zhe(data_dict,hyper_paras)

    # model = Gibbs_sampling(data_dict,get_init_paras(dict_lr['clf'].coef_.squeeze()), hyper_paras)
    model = Gibbs_sampling(data_dict,zero_init_paras(), hyper_paras)
    dict_ours = model.model_run()

    lr_acc[i] = dict_lr['acr']
    esnet_acc[i] = dict_els['acr']
    rf_acc[i] = dict_rf['acr']
    mode_acc[i] = dict_mode['acr']
    random_acc[i] = dict_random['acr']
    ours_acc[i] = dict_ours['acr']
    zhe_gibs_acc[i] = dict_gibbs_zhe['acr']

    lr_auc[i] = dict_lr['auc']
    rf_auc[i] = dict_rf['auc']
    esnet_auc[i] = dict_els['auc']
    ours_auc[i] = dict_ours['auc']
    zhe_gibs_auc[i] = dict_gibbs_zhe['auc']

print('\n\nours_acr_mean: %.4f,ours_acr_std: %.4f '%( ours_acc.mean(), ours_acc.std() ) )
print('gibbs_zhe_acr_mean: %.4f,gibbs_zhe_acr_std: %.4f '%(zhe_gibs_acc.mean(),zhe_gibs_acc.std() ) )
print('lr_acr_mean: %.4f,lr_acr_std: %.4f '%(lr_acc.mean(),lr_acc.std() ) )
print('esnet_acr_mean: %.4f,esnet_acr_std: %.4f '%(esnet_acc.mean(),esnet_acc.std() ) )
print('rf_acr_mean: %.4f,rf_acr_mean: %.4f '%(rf_acc.mean(),rf_acc.std() ) )
print('just-mode_acr_mean: %.4f,mode_acr_std: %.4f '%(mode_acc.mean(),mode_acc.std() ) )
print('just-random_acr_mean: %.4f,just-random_acr_std: %.4f '%(random_acc.mean(),random_acc.std() ) )


print('\nours_AUC_mean: %.4f,ours_AUC_std: %.4f '%(ours_auc.mean(),ours_auc.std() ) )
print('gibbs_zhe_AUC_mean: %.4f,gibbs_zhe_AUC_std: %.4f '%(zhe_gibs_auc.mean(),zhe_gibs_auc.std() ) )
print('lr_AUC_mean: %.4f,lr_AUC_std: %.4f '%(lr_auc.mean(),lr_auc.std() ) )
print('esnet_AUC_mean: %.4f,esnet_AUC_mean: %.4f '%(esnet_auc.mean(),esnet_auc.std() ) )
print('rf_AUC_mean: %.4f,rf_AUC_std: %.4f '%(rf_auc.mean(),rf_auc.std() ) )



 3.20it/s]ours:W_max:31.4637,W_min:-28.7030
 87%|████████▋ | 87/100 [00:27<00:04,  3.24it/s]ours:W_max:37.8530,W_min:-28.1590
 88%|████████▊ | 88/100 [00:27<00:03,  3.20it/s]ours:W_max:35.7719,W_min:-33.1187
 89%|████████▉ | 89/100 [00:27<00:03,  3.22it/s]ours:W_max:26.7341,W_min:-42.8702
 90%|█████████ | 90/100 [00:27<00:03,  3.18it/s]ours:W_max:34.7218,W_min:-29.4597

 running test-auc = 0.45089
running train-auc = 1.00000

 91%|█████████ | 91/100 [00:28<00:02,  3.17it/s]ours:W_max:26.1862,W_min:-29.9596
 92%|█████████▏| 92/100 [00:28<00:02,  3.11it/s]ours:W_max:37.4203,W_min:-29.2417
 93%|█████████▎| 93/100 [00:28<00:02,  3.11it/s]ours:W_max:30.6178,W_min:-32.5564
 94%|█████████▍| 94/100 [00:29<00:01,  3.17it/s]ours:W_max:29.2723,W_min:-29.7473
 95%|█████████▌| 95/100 [00:29<00:01,  3.13it/s]ours:W_max:34.0698,W_min:-31.3444
 96%|█████████▌| 96/100 [00:29<00:01,  3.17it/s]ours:W_max:27.8414,W_min:-34.8666
 97%|█████████▋| 97/100 [00:30<00:00,  3.18it/s]ours:W_max:30.8483,W_min:-30.1

[array([1, 1, 1, 1, 1, 1]),
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 array([1, 1, 1, 1, 1, 1]),
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 array([1, 1, 1, 1]),
 array([1, 1, 1, 1, 1, 1]),
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 array([1, 1, 1, 1, 1, 1]),
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1

In [22]:
# non-sparse feature of the last samples
len(np.where(np.concatenate((model.W))>0)[0])

581

In [28]:
# non-sparse feature of the mean of all samples (no truncated)
N_collect = len(model.W_collect)
if N_collect > 0:
    W_samples = np.zeros((N_collect,model.N_feature))
    for i in range(N_collect):
        W_samples[i,:-1] = np.concatenate((model.W_collect[i]))
        W_samples[i,-1] = model.b_collect[i]

    W_avg = np.mean(W_samples,axis=0).reshape(-1,1)
len(np.where(np.abs(W_avg)>0.01)[0])

1132

In [37]:
N_collect = len(model.W_collect)
if N_collect > 0:
    # W_samples = np.zeros((N_collect,model.N_feature))
    W_samples = np.zeros((model.N_feature,N_collect))
    for i in range(N_collect):
        W_samples[:-1,i] = np.concatenate((model.W_collect[i]))
        W_samples[-1,i] = model.b_collect[i]

    # W_avg = np.mean(W_samples,axis=0).reshape(-1,1)
W_samples.shape    

(1140, 30)

In [16]:
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
clf = LogisticRegressionCV(l1_ratios =[.001,.01, .1, .5, .7, .9, .95, .99, 1],penalty='elasticnet',solver='saga',fit_intercept=False).fit(data_dict['X_tr'],data_dict['y_tr'])




In [19]:
clf.C_

array([21.5443469])

In [32]:
# non-sparse feature of the Gibbs_zhe
dict_gibbs_zhe['model'][1][:, -10:].

(1140, 10)