# Deep Neural Network on reduced dataset

In [None]:
%matplotlib notebook
import tensorflow as tf
import numpy as np
from pandas_plink import read_plink
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from joblib import Parallel, delayed
from sklearn import metrics
from math import sqrt

import random
from sklearn.metrics import roc_curve,roc_auc_score

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.preprocessing import OneHotEncoder

In [None]:
''' Minibatch function'''
def shuffle_batch(X, y, batch_size):
    """Yield shuffled mini-batches that together cover X and y exactly once.

    The samples are permuted at random and split into
    ``len(X) // batch_size`` nearly equal parts, so individual batches may
    hold slightly more than ``batch_size`` rows (the remainder is spread
    over the batches instead of forming a short trailing batch).

    Parameters
    ----------
    X, y : array-like supporting integer-array indexing (e.g. numpy arrays)
        Features and targets, indexed along their first axis in lockstep.
    batch_size : int
        Target number of samples per batch.

    Yields
    ------
    (X_batch, y_batch) : tuple
        Rows of X and y selected by the same random indices.
    """
    n_samples = len(X)
    if n_samples == 0:
        # Nothing to iterate over; avoid handing an empty batch to the caller.
        return
    rnd_idx = np.random.permutation(n_samples)
    # With fewer samples than batch_size the integer division gives 0 and
    # np.array_split would raise; fall back to one batch holding everything.
    n_batches = max(1, n_samples // batch_size)
    for batch_idx in np.array_split(rnd_idx, n_batches):
        yield X[batch_idx], y[batch_idx]

In [None]:
''' Parameters for experiment '''
# Suffix appended to every PLINK file prefix and output file name below
# (presumably the p-value cut-off used to pre-select SNPs -- TODO confirm).
threshold="0.01"
# Directory that receives the training log, the checkpoint and the figures.
path_logs="/work/breastcancer/clean_test/logs/"

In [None]:
''' getting bim,fam,bed for training,validation and test sets '''
# Each call reads one PLINK file set (prefix + threshold) and returns the
# bim table, the fam table and the lazily loaded bed genotype matrix.
# Unsuffixed names are the training set, *2 validation, *3 test.
(bim, fam, bed)=read_plink("/work/breastcancer/clean_test/train/sig"+threshold)
(bim2, fam2, bed2)=read_plink("/work/breastcancer/clean_test/validation/val"+threshold)
(bim3, fam3, bed3)=read_plink("/work/breastcancer/clean_test/test/test"+threshold)

In [None]:
# Same value as the assignment in the parameters cell; kept so this cell can
# be re-run on its own.
path_logs="/work/breastcancer/clean_test/logs/"

In [None]:
# Inspect the bim table of the training set.
print(bim)

In [None]:
# Inspect the fam table of the training set (its "trait" column is used as
# the label further down).
print(fam)

In [None]:
''' Creating arrays with optimal data structure and filling missing values with 2--> Homozygous major '''
def _fill_missing(block, fill_value=2):
    """Return a copy of one genotype chunk with NaN replaced by `fill_value`.

    The chunk keeps its dtype; only the missing entries change.
    """
    filled = block.copy()
    filled[np.isnan(filled)] = fill_value
    return filled


def _materialize_genotypes(lazy_bed):
    """Fill missing calls, cast to uint8 and load a lazy bed matrix.

    The fill has to happen BEFORE the cast: once the data is uint8 it can no
    longer hold NaN, so `np.isnan` is False everywhere and the missing calls
    would silently keep whatever value the NaN -> integer cast produced.
    Both steps are queued on the lazy array so they run chunk by chunk and
    only the compact uint8 result is held in memory.

    Parameters
    ----------
    lazy_bed : lazy genotype array as returned by read_plink
        Must provide `map_blocks`, `astype` and `compute`.

    Returns
    -------
    (lazy_uint8, values) : tuple
        The lazy uint8 array and its computed in-memory counterpart.
    """
    print("Filling Null Data")
    lazy_bed = lazy_bed.map_blocks(_fill_missing, dtype=lazy_bed.dtype)
    print("Convertion")
    lazy_bed = lazy_bed.astype('uint8')
    print("Compute")
    return lazy_bed, lazy_bed.compute()


#train
bed, X = _materialize_genotypes(bed)
#validation
bed2, X_val = _materialize_genotypes(bed2)
#test
bed3, X_test = _materialize_genotypes(bed3)

In [None]:
''' Preparing data.shape=(individuals,SNP) '''
def _labelled_frame(genotypes, fam_table):
    """Build the (individuals x SNP) frame with its label column "Y".

    The "trait" column is converted to int and shifted down by one
    (presumably mapping a 1/2 coding to 0/1 -- TODO confirm), and the
    genotype matrix is transposed so that each row is one individual.
    Returns the label series and the frame.
    """
    labels = fam_table["trait"].astype("int") - 1
    frame = pd.DataFrame(genotypes.T)
    frame["Y"] = labels
    return labels, frame


# training set
Y, Xdf = _labelled_frame(X, fam)

# validation set
Y_val, Xdf_val = _labelled_frame(X_val, fam2)

# test set
Y_test, Xdf_test = _labelled_frame(X_test, fam3)

In [None]:
''' Getting np arrays '''
def _features_and_labels(frame):
    """Split a labelled frame into its feature matrix and an (n, 1) label array."""
    features = frame.drop(['Y'], axis=1).values
    labels = frame[['Y']].values  # double brackets keep the column axis
    return features, labels


x_train, y_train = _features_and_labels(Xdf)

x_val, y_val = _features_and_labels(Xdf_val)

x_test, y_test = _features_and_labels(Xdf_test)

In [None]:
''' MODEL '''

In [None]:
''' Inputs tensors '''
# Start from an empty graph so re-running the cell does not duplicate ops.
tf.reset_default_graph()
# NOTE: from here on X and Y are graph placeholders, no longer the arrays
# created in the data-preparation cells.
X = tf.placeholder(
    dtype=tf.float32,
    shape=[None, x_train.shape[1]],  # any batch size, one column per SNP
    name="X",
)
Y = tf.placeholder(dtype=tf.float32, shape=[None, 1], name="Y")

In [None]:
''' DNN with dropout rate=0.5 and Batch Norm'''

# Network: input dropout, then three dense blocks (1000 -> 250 -> 50 units),
# each followed by batch normalisation, leaky ReLU and dropout, and finally a
# single-unit linear output layer.
with tf.name_scope("dnn"):
    # Defaults to False, so dropout and batch-norm run in inference mode
    # unless the feed dict sets it to True.
    training = tf.placeholder_with_default(False, shape=(), name='training')
    # Xavier initialisation, shared by the three hidden dense layers.
    initializer = tf.contrib.layers.xavier_initializer()
    # Dropout applied directly to the input features.
    hidden00_drop= tf.layers.dropout(X, 0.5, training=training)
    # Block 1: 1000 units.
    hidden01=tf.layers.dense(hidden00_drop, 1000, name="hidden01",activation=None, kernel_initializer=initializer)
    hidden01_norm=tf.layers.batch_normalization(hidden01, training=training, momentum=0.9)
    act_hidden01=tf.nn.leaky_relu(hidden01_norm)
    hidden01_drop = tf.layers.dropout(act_hidden01, 0.5, training=training)
    # Block 2: 250 units.
    hidden0=tf.layers.dense(hidden01_drop, 250, name="hidden0",activation=None, kernel_initializer=initializer)
    hidden0_norm=tf.layers.batch_normalization(hidden0, training=training, momentum=0.9)
    act_hidden0=tf.nn.leaky_relu(hidden0_norm)
    hidden0_drop = tf.layers.dropout(act_hidden0, 0.5, training=training)
    # Block 3: 50 units.
    hidden1=tf.layers.dense(hidden0_drop, 50, name="hidden1",activation=None, kernel_initializer=initializer)
    hidden1_norm=tf.layers.batch_normalization(hidden1, training=training, momentum=0.9)
    act_hidden1=tf.nn.leaky_relu(hidden1_norm)
    hidden1_drop = tf.layers.dropout(act_hidden1, 0.5, training=training)
    # NOTE(review): hidden1_norm is rebound here to a second batch-norm layer
    # applied to the dropout output -- confirm this extra layer is intended.
    hidden1_norm=tf.layers.batch_normalization(hidden1_drop, training=training, momentum=0.9)
    # One raw logit per sample (no activation, default kernel initialiser).
    output=tf.layers.dense(  hidden1_norm, 1, name="output_final",activation=None)

In [None]:
''' Log-Loss '''
with tf.name_scope("loss"):
    # Per-sample sigmoid cross-entropy computed on the raw logits, then
    # averaged over the batch; `error` is the tensor the optimizer minimises.
    cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(logits=output, labels=Y)
    loss = tf.reduce_mean(cross_entropy)
    error = loss
    # The regulariser and the variable list are created but not added to the
    # loss in this cell.
    l2_regularizer = tf.contrib.layers.l2_regularizer(scale=0.01, scope=None)
    weights = tf.trainable_variables()

In [None]:
''' Adam Optimizer '''
with tf.name_scope("train"):
    optimizer = tf.train.AdamOptimizer(
        learning_rate=1e-4,
        beta1=0.9,
        beta2=0.999,
        epsilon=1e-08,
        use_locking=False,
        name='Adam',
    )
    # Ops registered under UPDATE_OPS (the batch-norm moving averages) must
    # run together with every optimisation step.
    extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(extra_update_ops):
        training_op = optimizer.minimize(error)

In [None]:
''' Metrics '''
with tf.name_scope("eval"):
    # Sigmoid turns the raw logit into a probability.
    predicted = tf.nn.sigmoid(output)
    # A prediction counts as correct when the rounded probability equals the label.
    correct_pred = tf.equal(tf.round(predicted), Y)
    acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    # tf.metrics.auc returns (value_tensor, update_op); `auc` is bound to the
    # update op, so evaluating it accumulates into the metric's local
    # variables (reset via the local-variables initializer before each use).
    # This binding also shadows the `auc` imported from sklearn.metrics.
    _,auc = tf.metrics.auc(labels=Y,predictions=predicted)

In [None]:
# Optional: release the session left over from a previous run of the notebook.
# On a fresh kernel (or with "Run All") no session exists yet, so the missing
# name is tolerated instead of aborting the run.
try:
    sess.close()
except NameError:
    pass

In [None]:
''' Init model '''
# Initialiser ops: global variables (model weights, optimizer slots) and
# local variables (the accumulators behind the streaming AUC metric).
init = tf.global_variables_initializer()
loc = tf.local_variables_initializer()

# InteractiveSession installs itself as the default session.
sess = tf.InteractiveSession(config=tf.ConfigProto(device_count={"CPU": 44}))
sess.run(init)
sess.run(loc)

# Per-epoch history: training metrics, epoch index, validation metrics.
auc_tab, accuracy_tab, loss_tab = [], [], []
epoch_tab = []
auc_tab_val, accuracy_tab_val, loss_tab_val = [], [], []

In [None]:
# Saver used to checkpoint the model during training and to restore it later.
saver=tf.train.Saver()

In [None]:
''' Training '''

In [None]:
import time
start = time.time()
batch_size=512
best_auc=0
# Paths and the batch count do not change during training; build them once.
log_file=path_logs+'Test_preprocessed_filter'+threshold+'.txt'
checkpoint_file=path_logs+"preprocessed"+threshold+".ckpt"
n_batches=len(x_train)//batch_size
# Truncate the log of any previous run. The `with` block closes the handle
# immediately instead of leaving an open file object behind.
with open(log_file, 'w'):
    pass
for epoch in range(200):
    # One pass over the shuffled training set, one optimisation step per batch.
    for iteration,(x_batch,y_batch) in enumerate(shuffle_batch(x_train, y_train, batch_size)):
        sess.run(training_op,feed_dict={X:x_batch,Y:y_batch,training:True})
        print("%d ITERATION:%d/%d "%(epoch,iteration,n_batches),end='\r')

    # Reset the streaming-AUC accumulators before every evaluation.
    loc.run()
    # NOTE(review): the "train" metrics are computed on the LAST mini-batch of
    # the epoch only, not on the whole training set.
    loss_train,acc_train,auc_train=sess.run([loss,acc,auc],feed_dict={X:x_batch,Y:y_batch,training:False})
    print(epoch,"Train accuracy:",acc_train,"Loss:",loss_train,"AUC:",auc_train)
    with open(log_file, 'a+') as file:
        file.write("Epoch"+str(epoch)+" Training accuracy:"+str(acc_train)+" Loss:"+str(loss_train)+" AUC:"+str(auc_train)+"\n\n")
    #train value
    auc_tab.append(auc_train)
    accuracy_tab.append(acc_train)
    epoch_tab.append(epoch)
    loss_tab.append(loss_train)
    #validation (whole validation set in a single run)
    loc.run()
    loss_val,acc_val,auc_val=sess.run([loss,acc,auc],feed_dict={X:x_val,Y:y_val,training:False})
    auc_tab_val.append(auc_val)
    accuracy_tab_val.append(acc_val)
    loss_tab_val.append(loss_val)
    # Keep only the checkpoint with the best validation AUC seen so far.
    if best_auc<auc_val:
        saver.save(sess,checkpoint_file)
        best_auc=auc_val
    with open(log_file, 'a+') as file:
        file.write("Epoch"+str(epoch)+" Validation accuracy:"+str(acc_val)+" Loss:"+str(loss_val)+" AUC:"+str(auc_val)+"\n\n")
    print(epoch,"Validation accuracy:",acc_val,"Loss:",loss_val,"AUC:",auc_val)
    print("\n")
end = time.time()

In [None]:
''' Max AUC validation '''
# Highest validation AUC recorded over all epochs.
np.max(auc_tab_val)

In [None]:
''' Max AUC validation epoch'''
# Index (0-based epoch) at which that highest validation AUC was recorded.
np.argmax(auc_tab_val)

In [None]:
''' Training time '''
# Wall-clock duration of the training cell, in seconds.
print(end - start)

In [None]:
''' Max AUC validation model retrieving'''
# Drop the training session and open a fresh one.
sess.close()
init = tf.global_variables_initializer()
loc = tf.local_variables_initializer()
sess = tf.InteractiveSession(config=tf.ConfigProto(device_count={"CPU": 44}))
sess.run(init)
sess.run(loc)
# Overwrite the freshly initialised variables with the checkpoint saved at
# the best validation AUC.
saver.restore(sess, path_logs+"preprocessed"+threshold+".ckpt")

In [None]:
''' Predictions on validation set '''
# Predicted probabilities for the validation set, with dropout/batch-norm in
# inference mode.
pred=sess.run(predicted,feed_dict={X:x_val,Y:y_val,training:False})

In [None]:
''' Predictions on test set '''
# Predicted probabilities for the test set; `training` is not fed, so its
# default of False (inference mode) applies.
pred_test=predicted.eval(feed_dict={X:x_test,Y:y_test})

In [None]:
''' Score metrics on validation set '''
# Reset the AUC accumulators, then evaluate [loss, accuracy, AUC] on the
# validation set.
loc.run()
sess.run([loss,acc,auc],feed_dict={X:x_val,Y:y_val,training:False})

In [None]:
''' Score metrics on test set '''
# Reset the AUC accumulators, then evaluate [loss, accuracy, AUC] on the
# test set.
loc.run()
sess.run([loss,acc,auc],feed_dict={X:x_test,Y:y_test,training:False})

In [None]:
''' Score metrics on test set computed with sklearn.metrics.roc_auc_score'''
# Score the TEST labels against the TEST predictions. `pred` holds the
# validation-set predictions, so pairing it with y_test was a mismatch.
roc_auc_score(y_test[:,0], pred_test[:,0])

In [None]:
''' Plot AUC and accuracy over learning '''
# Learning curves: training vs validation metric per epoch, one figure each,
# saved as PNG files in path_logs. The labels use `threshold`; the previous
# spelling `thereshold` was an undefined name and raised NameError.
#AUC
fig, ax = plt.subplots(figsize = (5,5))
ax.plot(epoch_tab,auc_tab,label='AUC')
ax.plot(epoch_tab,auc_tab_val,label='AUC Validation p='+threshold+' Dataset')
ax.set_xlabel("epoch")
ax.set_ylabel("Metrics' values")
ax.set_title('Training auc vs epoch full p='+threshold+' Dataset')
legend = ax.legend(loc='upper right', shadow=True, fontsize='x-small')
plt.savefig(path_logs+'auc_'+threshold+'.png')

#Accuracy
fig, ax = plt.subplots(figsize = (5,5))
ax.plot(epoch_tab,accuracy_tab,label='Accuracy')
ax.plot(epoch_tab,accuracy_tab_val,label='Accuracy Validation Full Dataset')
ax.set_xlabel("epoch")
ax.set_ylabel("Metrics' values")
ax.set_title('Training accuracy vs epoch p='+threshold+' Dataset')
legend = ax.legend(loc='upper right', shadow=True, fontsize='x-small')
plt.savefig(path_logs+'acc_'+threshold+'.png')

In [None]:
''' Plot prediction histogram on test set  '''
# One row per test sample: predicted probability next to the true label.
results = pd.DataFrame(pred_test)
target = pd.DataFrame(y_test, columns=["Prediction Target"])
result_pred = pd.concat([results, target], axis=1)
result_pred.columns = ["Predicted", "Prediction Target"]

# Overlay one histogram of predicted probabilities per true class.
fig, ax = plt.subplots(figsize=(5, 5))
r = result_pred.groupby('Prediction Target')['Predicted']
r.plot(kind='hist', alpha=.4, legend=True)

# Labels are set after plotting so they replace the ones pandas puts on the axes.
ax.set_xlabel("Probabilities")
ax.set_ylabel("Density")
ax.set_title('Histogram of the predicted values Full Dataset')
plt.savefig(path_logs+'histprediction_'+threshold+'.png')