In [1]:
import sys
sys.path.append('/home/ec2-user/CpGPython/')
import pandas as pd
import numpy as np
import tensorflow as tf
import Logger
from functools import partial
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score,accuracy_score,recall_score,precision_score,roc_auc_score

In [2]:
#---------------------------------------------------------------------------
def scores(predicts,y,average='macro'):
    class_num = len(np.unique(y))
    score_map = {}
    if class_num <=2:
        recall = recall_score(y,predicts)      
        precision = precision_score(y,predicts)      
        accuracy = accuracy_score(y,predicts)
        f1 = f1_score(y,predicts)
        auc = roc_auc_score(y,predicts)
        score_map['auc'] = auc
    else:
        recall = recall_score(y,predicts,average=average)      
        precision = precision_score(y,predicts,average=average)      
        accuracy = accuracy_score(y,predicts)
        f1 = f1_score(y,predicts,average=average)
        
    score_map['recall'] = recall
    score_map['precision'] = precision
    score_map['accuracy'] = accuracy
    score_map['f1'] = f1
    return score_map 

In [3]:
home='/home/ec2-user/CpGPython/'
log_dir = home+'logs/'
logger = Logger.Logger(log_dir,False).get_logger()

In [4]:
##features selecetd by traditional methods
with pd.HDFStore(home+'data/selected_features','r') as h5s:
    train_x =h5s['train_x'] 
    train_label = h5s['train_label'] 
    test_x = h5s['test_x'] 
    test_label = h5s['test_label']   
    sample_weights_train = h5s['sample_weights_train'] 
    sample_weights_test = h5s['sample_weights_test'] 
logger.info('Features used in training are from traditional feature selection')

In [5]:
n_inputs = train_x.shape[1]
n_hidden1= n_inputs*2
n_hidden2 = int(n_inputs*1)
n_hidden3 = int(n_inputs*0.5)
n_outputs = len(train_label.unique())
l2_reg = 0.1
dropout_rate = 0.5
train_x = MinMaxScaler(copy=True,feature_range=(0,1)).fit_transform(train_x)
test_x = MinMaxScaler().fit_transform(test_x)

In [6]:
inputs = tf.placeholder(tf.float32,shape=(None,n_inputs),name='inputs')
sample_weights = tf.placeholder(tf.float32,shape=(None),name='weights')
targets = tf.placeholder(tf.float32,shape=(None),name='targets')
training = tf.placeholder_with_default(False,shape=(),name='training')
activation = tf.nn.elu
he_init = tf.contrib.layers.variance_scaling_initializer()
regularizer = tf.contrib.layers.l2_regularizer(l2_reg)
dense_layer = partial(tf.layers.dense,kernel_regularizer=regularizer,kernel_initializer=he_init)
dropout = partial(tf.layers.dropout,rate=dropout_rate,training=training)
batch_norm = partial(tf.layers.batch_normalization,training=training,momentum=0.9)

In [7]:
with tf.name_scope('dnn'):
    input_drop = dropout(inputs,name='dropout_input')
    hidden1 = dense_layer(input_drop,n_hidden1,name='hidden1')
    bn1 = batch_norm(hidden1,name='batch_norm1')
    bn1_act = activation(bn1,name='batch_norm_act1')
    hidden1_drop = dropout(bn1_act,name='dropout1')
    
    hidden2 = dense_layer(hidden1_drop,n_hidden2,name='hidden2')
    bn2 = batch_norm(hidden2,name='batch_norm2')
    bn2_act = activation(bn2,name='batch_norm_act2')
    hidden2_drop = dropout(bn2_act,name='dropout2')
    
    hidden3 = dense_layer(hidden2_drop,n_hidden3,name='hidden3')
    bn3 = batch_norm(hidden3,name='batch_norm3')
    bn3_act = activation(bn3,name='batch_norm_act3')
    hidden3_drop = dropout(bn3_act,name='dropout3')
    
    logits_before_bn = dense_layer(hidden3_drop,n_outputs,name='logits_before_bn')
    logits = batch_norm(logits_before_bn,name='logits')

In [11]:
with tf.name_scope('loss'):
    reg_loss = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    base_loss = tf.reduce_mean(tf.nn.weighted_cross_entropy_with_logits(targets=targets,
                                                                        logits=logits,pos_weight=sample_weights),name='base_loss')
    loss = tf.add_n([base_loss]+reg_loss,name='loss')

In [12]:
with tf.name_scope('train'):
    optimizer = tf.train.AdadeltaOptimizer()
    training_op =optimizer.minimize(loss)

In [13]:
with tf.name_scope('eval'):
    pred = tf.argmax(logits,1)

In [14]:
init = tf.global_variables_initializer()
n_epochs = 200
n_batch = 30
batch_size = train_x.shape[0]//n_batch
logger.info('Training epochs: %d, training batches: %d, batch_size: %d',n_epochs,n_batch,batch_size)
saver = tf.train.Saver() 
sample_weights_train.reset_index(drop=True,inplace=True)
train_label.reset_index(drop=True,inplace=True)
sample_weights_test.reset_index(drop=True,inplace=True)
test_label.reset_index(drop=True,inplace=True)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

In [16]:
with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        random_ix = np.random.permutation(train_x.shape[0]) 
        for iteration in range(n_batch):
            start_ix = iteration*batch_size 
            end_ix = np.minimum((iteration+1)*batch_size,train_x.shape[0])
            ixs = random_ix[start_ix:end_ix]
            x_batch = train_x[ixs,:]
            weights_batch = sample_weights_train[ixs].values.reshape([-1,1])
            y_batch = train_label[ixs].values.reshape([-1,1])
            if iteration == 15:
                pred_results,losses = sess.run([pred,loss],feed_dict={inputs:x_batch,sample_weights:weights_batch,training:True,targets:y_batch})
                score_maps = scores(pred_results,y_batch)
                print("At epoch"+str(epoch)+" loss: "+str(losses)+"\n"+str(score_maps))
            # print(hidden1_mean.eval(feed_dict={inputs:x_batch,sample_weights:weights_batch,training:True}))
            sess.run([training_op,update_ops],feed_dict={inputs:x_batch,sample_weights:weights_batch,training:True,targets:y_batch})    
    pred_results,losses = sess.run([pred,loss],feed_dict={inputs:x_batch,sample_weights:weights_batch,training:True,targets:y_batch})
    score_maps = scores(pred_results,y_batch)
    print("At the end loss: "+str(losses)+"\n"+str(score_maps))
#    total_x = np.concatenate([np.array(train_x),np.array(test_x)])
#    total_label = np.concatenate([np.array(train_label),np.array(test_label)])
#    total_weights = pd.concat([sample_weights_train,sample_weights_test],ignore_index=True).values.reshape((-1,1))
    test_weights = sample_weights_test.values.reshape([-1,1])
    test_labels = test_label.values.reshape([-1,1])
    test_preds = pred.eval(feed_dict={inputs:test_x,sample_weights:test_weights,targets:test_labels})    
    test_score_maps = scores(test_preds,test_labels)
    saver.save(sess,'/home/ec2-user/tensor_model/deep_network.ckpt') 
    logger.info('Test scores: \n'+str(test_score_maps))

At epoch0 loss: 63.761467
{'auc': 0.6088709677419355, 'recall': 0.75, 'precision': 0.15384615384615385, 'accuracy': 0.5, 'f1': 0.25531914893617025}
target shape is <unknown> logits shape is (?, 2)
At epoch1 loss: 63.71978
{'auc': 0.3177083333333333, 'recall': 0.16666666666666666, 'precision': 0.02857142857142857, 'accuracy': 0.44285714285714284, 'f1': 0.048780487804878044}
target shape is <unknown> logits shape is (?, 2)
At epoch2 loss: 63.632065
{'auc': 0.7063492063492064, 'recall': 0.8571428571428571, 'precision': 0.17647058823529413, 'accuracy': 0.5857142857142857, 'f1': 0.29268292682926833}
target shape is <unknown> logits shape is (?, 2)
At epoch3 loss: 63.77637
{'auc': 0.35080645161290325, 'recall': 0.25, 'precision': 0.05555555555555555, 'accuracy': 0.42857142857142855, 'f1': 0.0909090909090909}
target shape is <unknown> logits shape is (?, 2)
At epoch4 loss: 63.766075
{'auc': 0.47580645161290325, 'recall': 0.5, 'precision': 0.10526315789473684, 'accuracy': 0.45714285714285713, 