# DNN evaluation across different training-set sizes for a given p-value threshold

In [1]:
%matplotlib notebook
import tensorflow as tf
import numpy as np
from pandas_plink import read_plink
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from joblib import Parallel, delayed
from sklearn import metrics
from math import sqrt

import random
from sklearn.metrics import roc_curve,roc_auc_score

In [2]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.preprocessing import OneHotEncoder

In [None]:
def shuffle_batch(X, y, batch_size):
    """Yield shuffled mini-batches that together cover every sample exactly once.

    The samples are permuted with ``np.random.permutation`` and the permuted
    indices are split into ``len(X) // batch_size`` nearly equal chunks, so
    individual batches may hold slightly more than ``batch_size`` rows.

    Parameters
    ----------
    X : array-like indexable by an integer index array (first axis = samples).
    y : array-like with the same number of samples as ``X``.
    batch_size : int
        Target number of samples per batch; must be >= 1.

    Yields
    ------
    (X_batch, y_batch) : tuple
        Matching slices of ``X`` and ``y``.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1 or ``X`` and ``y`` differ in length.
        The error surfaces when iteration starts, since this is a generator.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError("X and y must contain the same number of samples")
    if n_samples == 0:
        # Nothing to iterate over.
        return
    rnd_idx = np.random.permutation(n_samples)
    # With fewer samples than batch_size the integer division gives 0, which
    # np.array_split rejects; fall back to a single batch holding everything.
    n_batches = max(1, n_samples // batch_size)
    for batch_idx in np.array_split(rnd_idx, n_batches):
        yield X[batch_idx], y[batch_idx]

In [None]:
# Parameters for the experiment.

# p-value threshold; kept as a string because it is spliced into the PLINK
# file prefixes and the checkpoint names.
threshold = "0.01"

# Root directory containing the train/, validation/ and test/ PLINK files.
path_to_files = "/work/breastcancer/clean_test/"

# Directory receiving the model checkpoints and the result files.
path_logs = "/work/breastcancer/clean_test/logs/"

In [None]:
# Read the PLINK bim/fam/bed triplets for the training, validation and test sets.
(bim, fam, bed)=read_plink(path_to_files+"train/sig"+threshold)
(bim2, fam2, bed2)=read_plink(path_to_files+"validation/val"+threshold)
(bim3, fam3, bed3)=read_plink(path_to_files+"test/test"+threshold)

print(bim)

print(fam)


def _fill_missing_block(block):
    """Return a copy of one genotype chunk with NaN entries replaced by 2."""
    filled = np.array(block, copy=True)
    filled[np.isnan(filled)] = 2
    return filled


def _load_genotypes(bed_matrix):
    """Turn a lazy genotype matrix into an in-memory uint8 array.

    Missing calls (NaN) are replaced by 2 (homozygous major, per the original
    notebook) BEFORE the cast to uint8. The previous code cast first and
    filled afterwards; an integer array can no longer hold NaN, so the fill
    never matched anything and missing genotypes kept whatever value the cast
    produced.

    Filling and casting are applied lazily, chunk by chunk, so the full
    floating-point matrix is never materialised at once.

    Parameters
    ----------
    bed_matrix : lazy array returned by ``read_plink`` (must provide
        ``map_blocks``, ``astype`` and ``compute``).

    Returns
    -------
    (lazy_uint8, values) : the lazy uint8 matrix and its computed numpy array.
    """
    print("Filling Null Data")
    filled = bed_matrix.map_blocks(_fill_missing_block, dtype=bed_matrix.dtype)
    print("Convertion")
    lazy_uint8 = filled.astype('uint8')
    print("Compute")
    return lazy_uint8, lazy_uint8.compute()


# Creating arrays with optimal data structure and filling missing values with
# 2 --> Homozygous major.
#train
bed, X = _load_genotypes(bed)
#validation
bed2, X_val = _load_genotypes(bed2)
#test
bed3, X_test = _load_genotypes(bed3)

# Preparing data: shape=(individuals, SNP).

def _labelled_frame(genotypes, fam_table):
    """Build the 0-based label Series and the transposed genotype frame.

    The "trait" column is cast to int and shifted down by one; the genotype
    matrix is transposed and the labels are attached as column "Y".
    """
    labels = fam_table["trait"].astype("int") - 1
    frame = pd.DataFrame(genotypes.T)
    frame["Y"] = labels
    return labels, frame


def _features_and_labels(frame):
    """Split a labelled frame into a feature matrix and an (n, 1) label array."""
    features = frame.drop(['Y'], axis=1).values
    labels = frame[['Y']].values
    return features, labels


# train / validation / test frames
Y, Xdf = _labelled_frame(X, fam)
Y_val, Xdf_val = _labelled_frame(X_val, fam2)
Y_test, Xdf_test = _labelled_frame(X_test, fam3)

# Plain numpy arrays that are fed to the network.
x_train, y_train = _features_and_labels(Xdf)
x_val, y_val = _features_and_labels(Xdf_val)
x_test, y_test = _features_and_labels(Xdf_test)

In [21]:
''' MODEL '''
# Builds the TensorFlow 1.x graph: input placeholders, a three-hidden-layer
# dense network with dropout and batch normalisation, the sigmoid
# cross-entropy loss, an Adam training op and the evaluation metrics.

''' Inputs tensors '''
# Start from an empty default graph so that re-running this cell does not
# stack a second copy of the network onto the previous one.
tf.reset_default_graph()
# X: float32 features, one column per column of x_train; batch size is free.
# Y: float32 labels with a single column.
# NOTE: these rebind the names X and Y used earlier for the numpy genotype
# matrix and the label Series.
X=tf.placeholder(tf.float32,shape=(None,x_train.shape[1]),name="X")
Y=tf.placeholder(tf.float32,shape=(None,1),name="Y")

''' DNN with dropout rate=0.5 and Batch Norm'''

with tf.name_scope("dnn"):
    # Defaults to False, so dropout and batch-norm run in inference mode
    # unless training=True is fed explicitly.
    training = tf.placeholder_with_default(False, shape=(), name='training')
    initializer = tf.contrib.layers.xavier_initializer()
    # Dropout is applied directly to the inputs as well.
    hidden00_drop= tf.layers.dropout(X, 0.5, training=training)
    # Layer pattern: dense (no activation) -> batch norm -> leaky ReLU -> dropout.
    hidden01=tf.layers.dense(hidden00_drop, 1000, name="hidden01",activation=None, kernel_initializer=initializer)
    hidden01_norm=tf.layers.batch_normalization(hidden01, training=training, momentum=0.9)
    act_hidden01=tf.nn.leaky_relu(hidden01_norm)
    hidden01_drop = tf.layers.dropout(act_hidden01, 0.5, training=training)
    hidden0=tf.layers.dense(hidden01_drop, 250, name="hidden0",activation=None, kernel_initializer=initializer)
    hidden0_norm=tf.layers.batch_normalization(hidden0, training=training, momentum=0.9)
    act_hidden0=tf.nn.leaky_relu(hidden0_norm)
    hidden0_drop = tf.layers.dropout(act_hidden0, 0.5, training=training)
    hidden1=tf.layers.dense(hidden0_drop, 50, name="hidden1",activation=None, kernel_initializer=initializer)
    hidden1_norm=tf.layers.batch_normalization(hidden1, training=training, momentum=0.9)
    act_hidden1=tf.nn.leaky_relu(hidden1_norm)
    hidden1_drop = tf.layers.dropout(act_hidden1, 0.5, training=training)
    # NOTE(review): a second batch-norm is applied after the last dropout and
    # rebinds hidden1_norm; the other layers have no such extra step -- confirm
    # this is intentional.
    hidden1_norm=tf.layers.batch_normalization(hidden1_drop, training=training, momentum=0.9)
    # Single logit per sample; the sigmoid is applied in the loss and in "eval".
    output=tf.layers.dense(  hidden1_norm, 1, name="output_final",activation=None)

''' Log-Loss '''
with tf.name_scope("loss"):
    cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=Y, logits=output)
    # NOTE(review): `weights` is not used anywhere in this cell.
    weights = tf.trainable_variables()
    loss=tf.reduce_mean(cross_entropy)
    # `error` is an alias of `loss`; it is the quantity the optimizer minimises.
    error=loss

''' Adam Optimizer '''
with tf.name_scope("train"):
    optimizer =tf.train.AdamOptimizer(learning_rate=0.0001,beta1=0.9,beta2=0.999,epsilon=1e-08,use_locking=False,name='Adam')
    # Batch-norm registers its moving-average updates in UPDATE_OPS; the
    # control dependency makes them run together with every training step.
    extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(extra_update_ops):
        training_op = optimizer.minimize(error)

''' Metrics '''
with tf.name_scope("eval"):
    predicted = tf.nn.sigmoid(output)
    # Accuracy at a 0.5 decision threshold (tf.round of the probability).
    correct_pred = tf.equal(tf.round(predicted), Y)
    acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    # tf.metrics.auc returns (value, update_op); only the update op is kept.
    # It accumulates into local variables, so those must be re-initialised
    # before each independent evaluation.
    # NOTE: this rebinds `auc`, shadowing sklearn.metrics.auc imported above.
    _,auc = tf.metrics.auc(labels=Y,predictions=predicted)

# Large max_to_keep so that older checkpoints are not pruned by the saver.
saver=tf.train.Saver(max_to_keep=100000)

In [22]:
# Train one model per training-set size and record, for each size, the best
# validation AUC together with the test AUC of that best checkpoint.

# Create (or truncate) the results file as before, but close the handle right
# away instead of leaking it for the lifetime of the kernel.
with open('/work/breastcancer/Test_preprocessed_filter7.txt', 'w'):
    pass

n_epochs=200
batch_size=512

# NOTE(review): device_count limits the number of CPU *devices* TensorFlow
# creates. If the intent was to use 44 threads, the relevant ConfigProto
# options are intra_op_parallelism_threads / inter_op_parallelism_threads --
# confirm before changing.
session_config=tf.ConfigProto(device_count={ "CPU": 44})

# Build the initializer ops once; re-creating them for every size only adds
# duplicate ops to the graph.
init=tf.global_variables_initializer()
loc=tf.local_variables_initializer()

sizes=[10000,20000,30000,x_train.shape[0]]
best_val_size=[]
for size in sizes:
    # Slicing clips silently, so a size larger than the dataset uses all rows.
    x_sub=x_train[0:size,:]
    y_sub=y_train[0:size,:]
    # `size` is an int: it must be converted before concatenation, otherwise
    # the first checkpoint save fails with a TypeError.
    ckpt_path=path_logs+"preprocessed"+threshold+"_data_"+str(size)+".ckpt"

    best_auc=0
    auc_tab=[]
    accuracy_tab=[]
    loss_tab=[]
    epoch_tab=[]
    auc_tab_val=[]
    accuracy_tab_val=[]
    loss_tab_val=[]

    #Training
    # The context manager closes the session even if an epoch raises.
    with tf.Session(config=session_config) as sess:
        sess.run([init,loc])
        for epoch in range(n_epochs):
            for iteration,(x_batch,y_batch) in enumerate(shuffle_batch(x_sub, y_sub, batch_size)):
                sess.run(training_op,feed_dict={X:x_batch,Y:y_batch,training:True})
                print("%d ITERATION:%d/%d "%(epoch,iteration,len(x_sub)//batch_size),end='\r')

            # Reset the streaming-AUC accumulators before each evaluation.
            sess.run(loc)
            # NOTE(review): the "train" metrics are computed on the LAST
            # mini-batch of the epoch only, which makes them noisy; evaluate
            # on x_sub/y_sub instead if memory allows.
            loss_train,acc_train,auc_train=sess.run([loss,acc,auc],feed_dict={X:x_batch,Y:y_batch,training:False})
            print(epoch,"Train accuracy:",acc_train,"Loss:",loss_train,"AUC:",auc_train)
            auc_tab.append(auc_train)
            accuracy_tab.append(acc_train)
            epoch_tab.append(epoch)
            loss_tab.append(loss_train)

            #validation
            sess.run(loc)
            loss_val,acc_val,auc_val=sess.run([loss,acc,auc],feed_dict={X:x_val,Y:y_val,training:False})
            auc_tab_val.append(auc_val)
            accuracy_tab_val.append(acc_val)
            loss_tab_val.append(loss_val)
            # Keep only the checkpoint with the best validation AUC so far.
            if auc_val>best_auc:
                save_path = saver.save(sess,ckpt_path)
                best_auc=auc_val
            print(epoch,"Validation accuracy:",acc_val,"Loss:",loss_val,"AUC:",auc_val)
            print("\n")

    #Test score for the best model for AUC score on validation set. This final test is made on the test set
    with tf.Session(config=session_config) as sess:
        # Local (metric) variables are not stored in the checkpoint, so they
        # still have to be initialised before evaluating.
        sess.run([init,loc])
        saver.restore(sess, ckpt_path)
        auc_test=sess.run(auc,feed_dict={X:x_test,Y:y_test,training:False})
    print(best_auc,auc_test)
    best_val_size.append([best_auc,auc_test])

0 Train accuracy: 0.5605469 Loss: 0.6900287 AUC: 0.6126945
0 Validation accuracy: 0.51893324 Loss: 0.6968433 AUC: 0.5877841


1 Train accuracy: 0.5527344 Loss: 0.6890689 AUC: 0.58742785
1 Validation accuracy: 0.5437704 Loss: 0.6874343 AUC: 0.5987137


2 Train accuracy: 0.6074219 Loss: 0.66499853 AUC: 0.66092736
2 Validation accuracy: 0.5557818 Loss: 0.68379676 AUC: 0.60897225


3 Train accuracy: 0.6269531 Loss: 0.66191626 AUC: 0.65828145
3 Validation accuracy: 0.5639251 Loss: 0.6794243 AUC: 0.612411


4 Train accuracy: 0.546875 Loss: 0.67965126 AUC: 0.6190354
4 Validation accuracy: 0.56575733 Loss: 0.67785466 AUC: 0.6139107


5 Train accuracy: 0.5644531 Loss: 0.6730809 AUC: 0.6252152
5 Validation accuracy: 0.5661645 Loss: 0.67751026 AUC: 0.6146542


6 Train accuracy: 0.59765625 Loss: 0.66382754 AUC: 0.63462126
6 Validation accuracy: 0.56779313 Loss: 0.6755578 AUC: 0.61677927


7 Train accuracy: 0.5546875 Loss: 0.6838799 AUC: 0.6145605
7 Validation accuracy: 0.56820035 Loss: 0.6750002 A

64 Train accuracy: 0.5449219 Loss: 0.67012775 AUC: 0.60180235
64 Validation accuracy: 0.5828583 Loss: 0.6599215 AUC: 0.6348238


65 Train accuracy: 0.5800781 Loss: 0.65482473 AUC: 0.6562294
65 Validation accuracy: 0.58245116 Loss: 0.6596307 AUC: 0.6346357


66 Train accuracy: 0.6484375 Loss: 0.6405036 AUC: 0.6975275
66 Validation accuracy: 0.5830619 Loss: 0.6596881 AUC: 0.6346492


67 Train accuracy: 0.6425781 Loss: 0.6417783 AUC: 0.70168775
67 Validation accuracy: 0.5840798 Loss: 0.6596288 AUC: 0.6342294


68 Train accuracy: 0.609375 Loss: 0.65432316 AUC: 0.6544032
68 Validation accuracy: 0.58245116 Loss: 0.659519 AUC: 0.6346892


69 Train accuracy: 0.609375 Loss: 0.64022183 AUC: 0.6795474
69 Validation accuracy: 0.5828583 Loss: 0.65933883 AUC: 0.6340315


70 Train accuracy: 0.6640625 Loss: 0.6239643 AUC: 0.7303816
70 Validation accuracy: 0.5840798 Loss: 0.65894866 AUC: 0.63504565


71 Train accuracy: 0.6171875 Loss: 0.64672196 AUC: 0.687709
71 Validation accuracy: 0.58448696 Loss: 0.

127 Validation accuracy: 0.5950733 Loss: 0.65445995 AUC: 0.6437206


128 Train accuracy: 0.5996094 Loss: 0.64475757 AUC: 0.6637969
128 Validation accuracy: 0.5958876 Loss: 0.65407664 AUC: 0.64381284


129 Train accuracy: 0.6015625 Loss: 0.6518576 AUC: 0.65174323
129 Validation accuracy: 0.5950733 Loss: 0.653942 AUC: 0.64396775


130 Train accuracy: 0.6621094 Loss: 0.6305556 AUC: 0.73444724
130 Validation accuracy: 0.59425896 Loss: 0.6542924 AUC: 0.64310837


131 Train accuracy: 0.609375 Loss: 0.6508033 AUC: 0.6666054
131 Validation accuracy: 0.59324104 Loss: 0.6538204 AUC: 0.64298034


132 Train accuracy: 0.625 Loss: 0.6392325 AUC: 0.6859182
132 Validation accuracy: 0.59548044 Loss: 0.6542061 AUC: 0.6425667


133 Train accuracy: 0.6074219 Loss: 0.65272063 AUC: 0.6627034
133 Validation accuracy: 0.5922231 Loss: 0.6539032 AUC: 0.6427117


134 Train accuracy: 0.6074219 Loss: 0.6462543 AUC: 0.6628028
134 Validation accuracy: 0.59263027 Loss: 0.6537783 AUC: 0.6427508


135 Train accuracy: 0

190 Validation accuracy: 0.59710914 Loss: 0.6516105 AUC: 0.64700603


191 Train accuracy: 0.65234375 Loss: 0.6389883 AUC: 0.71499634
191 Validation accuracy: 0.5973127 Loss: 0.6510963 AUC: 0.6469791


192 Train accuracy: 0.63671875 Loss: 0.63871294 AUC: 0.68155223
192 Validation accuracy: 0.5969055 Loss: 0.651721 AUC: 0.64691603


193 Train accuracy: 0.6386719 Loss: 0.645851 AUC: 0.6640764
193 Validation accuracy: 0.59995925 Loss: 0.6507644 AUC: 0.6471027


194 Train accuracy: 0.6152344 Loss: 0.6476971 AUC: 0.6589778
194 Validation accuracy: 0.60016286 Loss: 0.6507624 AUC: 0.64784384


195 Train accuracy: 0.65234375 Loss: 0.62657714 AUC: 0.72216797
195 Validation accuracy: 0.59995925 Loss: 0.65133053 AUC: 0.647626


196 Train accuracy: 0.6152344 Loss: 0.63472223 AUC: 0.6810827
196 Validation accuracy: 0.59934855 Loss: 0.65084076 AUC: 0.6479344


197 Train accuracy: 0.6738281 Loss: 0.61933136 AUC: 0.7387885
197 Validation accuracy: 0.598127 Loss: 0.6507917 AUC: 0.6471975


198 Train acc

W1118 13:39:30.842445 47373910624576 deprecation.py:323] From /home/adbadre/.local/lib/python3.6/site-packages/tensorflow/python/training/saver.py:1276: checkpoint_exists (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file APIs to check for files with this prefix.


0.6479344 0.6561092


In [None]:
# Save results: write the collected best_val_size rows as a comma-separated file.
results_path = path_logs + "best_value_test_auc.csv"
results_table = np.array(best_val_size)
np.savetxt(results_path, results_table, delimiter=",")