In [1]:
import sys
sys.path.append('/home/ec2-user/CpGPython/code/')
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.base import BaseEstimator,TransformerMixin
from functools import partial
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from datetime import datetime
from sklearn.model_selection import StratifiedShuffleSplit
import Logger

In [2]:
#-----------------------------------------------------------------------------
def kl_divergence(p,q):
    return p*tf.log(p/q)+(1-p)*tf.log((1-p)/(1-q))

#----------------------------------------------------------------------------
def get_learning_rate(initial_rate=0.1,decay_steps=10000,decay_rate=1/10):
    global_step = tf.Variable(0,trainable=False,name='global_step')
    learning_rate = tf.train.exponential_decay(initial_rate,global_step,decay_steps,decay_rate)
    return learning_rate

#-----------------------------------------------------------------------------


In [3]:
home='/home/ec2-user/CpGPython/'
reg_log_dir = home+'logs/'
logger = Logger.Logger(reg_log_dir,False).get_logger()
with pd.HDFStore(home+'data/selected_features','r') as h5s:
    train_x =h5s['train_x'] 
    train_label = h5s['train_label'] 
    test_x = h5s['test_x'] 
    test_label = h5s['test_label']   
    sample_weights_train = h5s['sample_weights_train'] 
    sample_weights_test = h5s['sample_weights_test'] 

In [4]:
root_log_dir = home+'tensor_logs/' 
log_dir = "{}run-{}".format(root_log_dir,datetime.utcnow().strftime("%Y%m%d%H%M%S"))
log_writer = tf.summary.FileWriter(log_dir,tf.get_default_graph())
   
n_input = train_x.shape[1]
n_hidden1 = int(1.5*n_input)
n_output = n_input
l2_reg = 0.01
sparsity_target = 0.01
sparsity_weight = 0.2
top_k = 30
logger.info('Sparse autencoder: input layer node number: %d\n hidden layer number is: 1\n \
            hidden layer node number is: %d\n L2 regularization: %f\n \
            sparsity target: %f\n sparsity penalty: %f\n selected number of features: %d',n_input,n_hidden1,l2_reg,
            sparsity_target,sparsity_weight,top_k)
train_x = MinMaxScaler(copy=True,feature_range=(0,1)).fit_transform(train_x)
test_x = MinMaxScaler().fit_transform(test_x)

In [5]:
activation = tf.nn.sigmoid
regularizer = tf.contrib.layers.l2_regularizer(l2_reg)
he_init = tf.contrib.layers.variance_scaling_initializer()
dense_layer = partial(tf.layers.dense,kernel_regularizer=regularizer,kernel_initializer=he_init)
inputs = tf.placeholder(tf.float32,shape=(None,n_input),name='inputs')
sample_weights = tf.placeholder(tf.float32,shape=(None),name='weights')
training = tf.placeholder_with_default(False,shape=(),name='training')

In [6]:
with tf.name_scope("dnn"):
    hidden1 = dense_layer(inputs,n_hidden1,name='hidden1')
    bn1 = tf.layers.batch_normalization(hidden1,training=training,momentum=0.9)
    bn1_act = activation(bn1)
    hidden1_mean = tf.reduce_mean(bn1_act,axis=0)
    logits_before_bn = dense_layer(bn1_act,n_output,name='outputs',activation=None)
    logits = tf.layers.batch_normalization(logits_before_bn,training=training,momentum=0.9)



In [8]:
#------------------------------------------------------------------------------
with tf.name_scope('loss'):
    reg_loss = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)    
    reconstruction_loss = tf.reduce_sum(tf.nn.weighted_cross_entropy_with_logits(targets=inputs,logits=logits,pos_weight=sample_weights))
    sparsity_loss = tf.reduce_sum(kl_divergence(sparsity_target,hidden1_mean))
    loss = tf.add_n([reconstruction_loss,sparsity_loss]+reg_loss,name='loss')
    reconstruction_loss_summary = tf.summary.scalar('logloss',reconstruction_loss)


In [9]:
#------------------------------------------------------------------------------    
with tf.name_scope('train'):
    learning_rate = get_learning_rate()
    optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,momentum=0.9,use_nesterov=True)
    training_op = optimizer.minimize(loss)



In [10]:
init = tf.global_variables_initializer()
n_epochs = 50
n_batch = 20
batch_size = train_x.shape[0]//n_batch
logger.info('Training epochs: %d, training batches: %d, batch_size: %d',n_epochs,n_batch,batch_size)
saver = tf.train.Saver() 
sample_weights_train.reset_index(drop=True,inplace=True)
sample_weights_test.reset_index(drop=True,inplace=True)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

In [12]:
with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        random_ix = np.random.permutation(train_x.shape[0]) 
        for iteration in range(n_batch):
            start_ix = iteration*batch_size 
            end_ix = np.minimum((iteration+1)*batch_size,train_x.shape[0])
            ixs = random_ix[start_ix:end_ix]
            x_batch = train_x[ixs,:]
            weights_batch = sample_weights_train[ixs].values.reshape([-1,1])
            if iteration == 0:
                summary_str = reconstruction_loss_summary.eval(feed_dict={inputs:x_batch,sample_weights:weights_batch,training:True})
                step = n_epochs*n_batch
                log_writer.add_summary(summary_str,step)
                logger.info("Loss at epoch %d: %f",epoch,loss.eval(feed_dict={inputs:x_batch,sample_weights:weights_batch,training:True}))
           # print(hidden1_mean.eval(feed_dict={inputs:x_batch,sample_weights:weights_batch,training:True}))
            sess.run([training_op,update_ops],feed_dict={inputs:x_batch,sample_weights:weights_batch,training:True})
    logger.info("Loss at end: %f",loss.eval(feed_dict={inputs:x_batch,sample_weights:weights_batch,training:True}))
    total_x = np.concatenate([train_x,test_x])
    total_label = np.concatenate([np.array(train_label),np.array(test_label)])
    total_weights = pd.concat([sample_weights_train,sample_weights_test],ignore_index=True).values.reshape((-1,1))
    new_features = bn1_act.eval(feed_dict={inputs:total_x,sample_weights:total_weights})
    print(new_features)
    saver.save(sess,'/home/ec2-user/tensor_model/sparse_autoencoder.ckpt')        


[[  1.75142835e-04   1.89812417e-04   1.00000000e+00 ...,   2.90107837e-05
    6.02610235e-04   7.66981742e-04]
 [  1.96966842e-01   7.96413042e-06   2.20743345e-06 ...,   4.40971735e-06
    7.46686816e-01   5.24982903e-03]
 [  9.76211846e-01   2.02843206e-04   3.07010814e-05 ...,   1.79100280e-05
    5.82559884e-01   2.98234692e-04]
 ..., 
 [  2.29470793e-13   5.07725948e-14   9.54134334e-15 ...,   2.41967825e-07
    1.89910235e-13   2.28022863e-08]
 [  1.97556004e-01   6.95512805e-04   1.14361323e-01 ...,   2.59221797e-05
    1.02499407e-03   2.42193323e-03]
 [  2.98912615e-01   3.66853172e-04   1.95805825e-07 ...,   5.61964089e-06
    3.10334587e-03   1.06405416e-04]]


In [30]:
new_features_mean = np.mean(new_features,axis=0)
max_active_ix = np.argpartition(new_features_mean,top_k)[:-top_k-1:-1]
new_features1 = new_features[:,max_active_ix]
new_features1 = StandardScaler().fit_transform(new_features1)
split = StratifiedShuffleSplit(n_splits=1,test_size=0.1,random_state=17)
for train_index, test_index in split.split(new_features1,total_label):
    train_set = pd.DataFrame(new_features1[train_index])
    test_set = pd.DataFrame(new_features1[test_index])
    train_label = pd.Series(total_label[train_index])
    test_label = pd.Series(total_label[test_index])
    sample_weights_train = pd.DataFrame(total_weights[train_index])[0]
    sample_weights_test = pd.DataFrame(total_weights[test_index])[0]
with pd.HDFStore(home+'data/new_features','w') as h5s:
    h5s['train_x'] = train_set
    h5s['train_label'] = train_label
    h5s['test_x'] = test_set
    h5s['test_label'] = test_label
    h5s['sample_weights_train'] = sample_weights_train
    h5s['sample_weights_test'] = sample_weights_test

In [29]:
new_features1 

array([[ 0.44611529, -0.28089699,  0.58303916, ..., -1.11837137,
        -0.31271407, -0.2425992 ],
       [ 1.42341995, -0.78983718,  1.61772645, ...,  0.69567531,
        -0.31271434, -0.2425992 ],
       [-0.44995463, -0.60869014,  0.32352176, ..., -0.78662258,
        -0.31271434, -0.23492101],
       ..., 
       [-1.50923538, -0.76276058, -0.68531454, ...,  1.50223434,
         3.52411079, -0.2425992 ],
       [-0.61526322, -0.76695049, -0.68435878, ..., -1.08873916,
        -0.3127141 , -0.24259892],
       [ 0.1860432 , -0.64420503,  1.90069449, ..., -0.99953932,
        -0.31270862, -0.2425992 ]], dtype=float32)