In [1]:
import sys
sys.path.append('/home/ec2-user/CpGPython/code/')
import pandas as pd
import numpy as np
import tensorflow as tf
import Logger
from functools import partial
import re
from datetime import datetime

In [2]:
def dnn_model(features,labels,mode,params):
    if mode == tf.estimator.ModeKeys.TRAIN:
        training = True
    else:
        training = False
    if mode != tf.estimator.ModeKeys.PREDICT:
        sample_weights = features.pop('sample_weights')
        print(sample_weights)
    if_batch_norm = params['batch_normalization'] if 'batch_normalization' in params else True
    l2_reg = params['l2_reg'] if 'l2_reg' in params else 0
    n_classes = params['n_classes'] if 'n_classes' in params else 2
    hidden_layers = params['hidden_layers']
    dropout_rate = params['dropout'] if 'dropout' in params else 0
    activation = tf.nn.elu
    he_init = tf.contrib.layers.variance_scaling_initializer()
    regularizer = tf.contrib.layers.l2_regularizer(l2_reg)
    dense_layer = partial(tf.layers.dense,kernel_regularizer=regularizer,kernel_initializer=he_init)
    dropout = partial(tf.layers.dropout,rate=dropout_rate,training=training)
    batch_norm = partial(tf.layers.batch_normalization,training=training,momentum=0.9)
    net = tf.feature_column.input_layer(features,params['feature_columns'])
    for units in hidden_layers:
        net_drop = dropout(net)
        
        if if_batch_norm:
            hidden = dense_layer(net_drop,units)
            bn = batch_norm(hidden)
            net = activation(bn)
        else:
            net = dense_layer(net_drop,units,activation=activation)
    
    logits_before_bn = dense_layer(net,n_classes)
    logits = batch_norm(logits_before_bn)
    #prediction 
    predicts = tf.arg_max(logits,1)
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
                'class_ids':predicts[:,tf.newaxis],
                'probabilities': tf.nn.softmax(logits),
                'logits': logits,}
        return tf.estimator.EstimatorSpec(mode,predictions=predictions)
    
    ##loss
    unweighted_base_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels,logits=logits,name='base_loss')
    tf.summary.scalar('unweighted base losses',tf.reduce_mean(unweighted_base_losses))
    base_losses = tf.reduce_mean(tf.multiply(tf.cast(sample_weights,dtype=tf.float32),unweighted_base_losses))
    tf.summary.scalar('weighted base losses',base_losses)
    weight_max = tf.reduce_max(sample_weights)
    weight_min = tf.reduce_min(sample_weights)
    tf.summary.scalar('max weight',weight_max)
    tf.summary.scalar('min weight',weight_min)
    reg_loss = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    loss = tf.add_n([base_losses]+reg_loss,name='loss')
    
    #evaluation 
    accuracy = tf.metrics.accuracy(labels=labels,predictions=predicts,name='acc_op')
    recall = tf.metrics.recall(labels=labels,predictions=predicts,name='recall_op')
    precision = tf.metrics.precision(labels=labels,predictions=predicts,name='precision_op')
    auc = tf.metrics.auc(labels,predicts,name='auc_op')
    metrics = {'accuracy':accuracy,'recall':recall,'precision':precision,'auc':auc}
    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode,loss=base_losses,eval_metric_ops=metrics)
    
    #training
    assert mode == tf.estimator.ModeKeys.TRAIN
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    optimizer = tf.train.AdamOptimizer(learning_rate=0.1)
    with tf.control_dependencies(update_ops):
        train_op = optimizer.minimize(loss,global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode,loss=loss,train_op=train_op)

def train_input_fn(data,labels,sample_weights,batch_size):
    data = data.copy()
    data['sample_weights'] = sample_weights
    dataset = tf.data.Dataset.from_tensor_slices((dict(data),labels))
    shuffle_len = int(len(labels)*2)
    return dataset.shuffle(shuffle_len).repeat().batch(batch_size)

def eval_input_fn(data,labels,sample_weights,batch_size):
    features = dict(data)
    if labels is None:
        inputs = features
    else:
        data = data.copy()
        data['sample_weights'] = sample_weights
        inputs = (dict(data),labels)
    
    dataset = tf.data.Dataset.from_tensor_slices(inputs)
    assert batch_size is not None
    return dataset.batch(batch_size)
 

In [3]:

home='/home/ec2-user/CpGPython/'
log_dir = home+'logs/'
logger = Logger.Logger(log_dir,False).get_logger()
tensorboard_log = home+'tensor_logs/'+datetime.utcnow().strftime("%Y%m%d%H%M%S")
##features selecetd by traditional methods
with pd.HDFStore(home+'data/selected_features','r') as h5s:
    train_x =h5s['train_x'] 
    train_label = h5s['train_label'] 
    test_x = h5s['test_x'] 
    test_label = h5s['test_label']   
    sample_weights_train = h5s['sample_weights_train'] 
    sample_weights_test = h5s['sample_weights_test'] 
sample_weights_train = np.power(sample_weights_train,1.5)
sample_weights_test = np.power(sample_weights_test,1.5)

In [4]:
train_label = train_label.astype('i8')
test_label = test_label.astype('i8')

In [5]:
pattern = ".*[+].*"
reg = re.compile(pattern)
for key in train_x.keys():
    if len(reg.findall(key))>0:
        key1 = key.replace('+','plus')
        train_x.rename({key:key1},axis=1,inplace=True)
        test_x.rename({key:key1},axis=1,inplace=True)

In [6]:
feature_cols = []
for key in train_x.keys():
    feature_cols.append(tf.feature_column.numeric_column(key=key))

In [7]:
train_num = train_x.shape[0]
params = {'feature_columns':feature_cols,'batch_normalization':True,'l2_reg':0.01,'n_classes':len(train_label.unique()),'hidden_layers':[int(train_num*1.5),int(train_num*1),int(train_num*0.5)],'drop_out':0.5}
estimator = tf.estimator.Estimator(model_fn=dnn_model,params=params,model_dir=tensorboard_log)   

In [8]:
train_loss = estimator.train(input_fn=lambda:train_input_fn(train_x,train_label,sample_weights_train,30),steps=2000)    

Tensor("IteratorGetNext:176", shape=(?,), dtype=float64)


In [9]:
train_loss.model_dir

'/home/ec2-user/CpGPython/tensor_logs/20180409022157'

In [16]:
eval_results = estimator.evaluate(input_fn=lambda:eval_input_fn(test_x,test_label,sample_weights_test,235))

Tensor("IteratorGetNext:176", shape=(?,), dtype=float64)


In [17]:
scorings=['precision','recall','auc']
{key:val for key,val in eval_results.items() if key in scorings}

{'auc': 0.5, 'precision': 0.0, 'recall': 0.0}

In [11]:
print('Test set scores'+str(eval_results))

Test set scores{'accuracy': 0.5148936, 'auc': 0.48783615, 'loss': 0.88729197, 'precision': 0.08928572, 'recall': 0.45454547, 'global_step': 2000}


In [13]:
predict_results = estimator.predict(input_fn=lambda:eval_input_fn(test_x,None,None,1))

generator

In [14]:
probs = []
for pred_dict,expec in zip(predict_results,test_label):
    class_id = pred_dict['class_ids'][0]
    prob = pred_dict['probabilities']
    probs.extend(prob)
    print('Predict is {}, expect is {},Proba is {}'.format(class_id,expec,prob))

Predict is 0, expect is 0,Proba is [0.673224  0.3267761]
Predict is 0, expect is 0,Proba is [0.8650728  0.13492724]
Predict is 0, expect is 0,Proba is [0.8345747  0.16542536]
Predict is 0, expect is 0,Proba is [0.6831557  0.31684425]
Predict is 0, expect is 0,Proba is [0.88707155 0.11292844]
Predict is 0, expect is 0,Proba is [0.5387781 0.4612219]
Predict is 0, expect is 0,Proba is [0.8625271  0.13747294]
Predict is 0, expect is 0,Proba is [0.98484033 0.01515965]
Predict is 0, expect is 0,Proba is [0.8114418  0.18855825]
Predict is 0, expect is 1,Proba is [0.68408865 0.31591138]
Predict is 0, expect is 0,Proba is [0.6793191 0.3206809]
Predict is 0, expect is 0,Proba is [0.7617459  0.23825406]
Predict is 0, expect is 0,Proba is [0.84842193 0.1515781 ]
Predict is 0, expect is 0,Proba is [0.7113269  0.28867307]
Predict is 0, expect is 0,Proba is [0.9219892  0.07801083]
Predict is 0, expect is 0,Proba is [0.7232468 0.2767532]
Predict is 0, expect is 0,Proba is [0.74165213 0.2583478 ]
Predi

In [15]:
probs

[0.673224,
 0.3267761,
 0.8650728,
 0.13492724,
 0.8345747,
 0.16542536,
 0.6831557,
 0.31684425,
 0.88707155,
 0.11292844,
 0.5387781,
 0.4612219,
 0.8625271,
 0.13747294,
 0.98484033,
 0.015159647,
 0.8114418,
 0.18855825,
 0.68408865,
 0.31591138,
 0.6793191,
 0.3206809,
 0.7617459,
 0.23825406,
 0.84842193,
 0.1515781,
 0.7113269,
 0.28867307,
 0.9219892,
 0.078010835,
 0.7232468,
 0.2767532,
 0.74165213,
 0.2583478,
 0.79624754,
 0.20375249,
 0.9691831,
 0.03081697,
 0.9895662,
 0.010433776,
 0.7645466,
 0.23545341,
 0.62699145,
 0.37300846,
 0.80356294,
 0.19643706,
 0.82035315,
 0.17964691,
 0.87276816,
 0.12723179,
 0.6142474,
 0.38575262,
 0.782373,
 0.21762697,
 0.6187477,
 0.38125223,
 0.874505,
 0.125495,
 0.9719177,
 0.028082335,
 0.9017216,
 0.09827843,
 0.7064573,
 0.29354265,
 0.91411275,
 0.085887276,
 0.672141,
 0.327859,
 0.92466664,
 0.07533342,
 0.70980734,
 0.29019266,
 0.85211223,
 0.14788772,
 0.8138158,
 0.18618424,
 0.9392522,
 0.0607478,
 0.678124,
 0.3218760