- Correlation tests between the leave-one-out (loo) and connection-weights (cw) feature reductions.
- Try using zero vectors for cw.

In [1]:
import numpy as np
import pandas as pd
import pydot
import tensorflow as tf
import keras
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Embedding, concatenate, Flatten, Dropout, Lambda, Activation, BatchNormalization, LocallyConnected1D, Reshape
from keras.utils import plot_model
import time
import itertools
import random

#Number of in-situ genes (columns taken from b) and total number of genes (columns of d).
num_situ = 84
num_all = 8924

#b: in-situ reference table; d: raw expression table (transposed so genes become columns);
#labels: transposed so that 'label' becomes a column.
b = pd.read_csv('data/bdtnp.csv')
#NOTE: the names 'na' and 'nan' were changed to 'naa' and 'nana'
#(presumably in the CSV, so pandas does not parse them as missing values - TODO confirm).
d = pd.read_csv('data/dge_raw.csv', index_col=0, header=None, encoding='ISO-8859-1').T
labels = pd.read_csv('data/labels.csv', index_col=0, header=None).T

#Move the in-situ genes to the beginning. The remaining genes keep their order from the file;
#building them from a set difference would give a different column order in every kernel session
#(string hashing is randomised), which silently reshuffles the features fed to the model.
cols = list(b) + list(d.columns[~d.columns.isin(b.columns)].unique())
d = d.loc[:,cols]

#Normalise every row so that its values sum to 1.
d = d.div(d.sum(axis=1), axis=0)

#Use a plain 0..n-1 index on both frames so rows can be addressed by position.
d = d.reset_index(drop=True)
labels = labels.reset_index(drop=True)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
#Build the training triples (d_row, b_row, target). Left: row of d, right: row of b.
#Labels start from 1 in the original file. They indicate a specific row in b table,
#so subtract 1 to obtain a positional row index.
print(time.ctime(),'Create true list of tuples')
true_labels = labels['label'] - 1
d_true = [(int(row), int(pos), 1) for row, pos in zip(d.index, true_labels)]

#Create the false label pairs: every (d_row, b_row) combination that is NOT a true pair.
#The true pairs are compared as 2-tuples through a set: comparing a 2-tuple against the
#3-tuples in d_true never matches (so true pairs would leak into the negatives) and
#scanning a list for every candidate is needlessly slow.
print(time.ctime(), 'Create false list of tuples')
a_list = list(range(len(d))) # d-array rows
b_list = list(range(len(b))) # b-array rows
d_prod = list(itertools.product(a_list, b_list))
true_pairs = {(row, pos) for row, pos, _ in d_true}
d_false = [x+(0,) for x in d_prod if x not in true_pairs]

#Merge the two lists: all true pairs plus 10003 randomly sampled false pairs.
print(time.ctime(), 'Merging lists')
indicies = random.sample(range(len(d_false)), 10003)
d_false1 = [d_false[i] for i in indicies]
d_list = d_true + d_false1
random.shuffle(d_list)
len_list = len(d_list)
print(time.ctime(), f'len(d_list): {len_list}') #11300

#Gather the input arrays with one vectorised lookup per array. Any bad index or shape
#mismatch now raises immediately instead of being swallowed and leaving uninitialised
#rows in the training data.
print(time.ctime(), 'Create train input arrays')
pair_idx = np.array(d_list, dtype=int)          #columns: d row, b row, target
b_values = np.asarray(b.values, dtype=float)
d_values = np.asarray(d.values, dtype=float)
X1_train = b_values[pair_idx[:, 1]]             #row of b for each pair
X2_train = d_values[pair_idx[:, 0], :num_situ]  #first num_situ columns of d
X3_train = d_values[pair_idx[:, 0], num_situ:]  #remaining columns of d
y_train = pair_idx[:, 2].copy()                 #1 for true pairs, 0 for sampled false pairs

#The model inputs are declared with these widths, so fail early if the data disagrees.
assert X1_train.shape == (len_list, num_situ), X1_train.shape
assert X2_train.shape == (len_list, num_situ), X2_train.shape
assert X3_train.shape == (len_list, num_all - num_situ), X3_train.shape

Mon Oct  8 16:52:41 2018 Create true list of tuples
Mon Oct  8 16:52:41 2018 Create false list of tuples
Mon Oct  8 16:54:40 2018 Merging lists
Mon Oct  8 16:54:40 2018 len(d_list): 11300
Mon Oct  8 16:54:40 2018 Create train input arrays


In [5]:
#Model build: three input branches merged into a single sigmoid output.
print(time.strftime("%H:%M:%S"), ' Model build')

#First input model: num_situ features (fed with X1_train when fitting)
input_a = Input(shape=(num_situ,))
dense_a = Dense(200, activation='softplus')(input_a)

#Second input model: num_situ features (fed with X2_train when fitting)
input_b = Input(shape=(num_situ,))
dense_b = Dense(200, activation='softplus')(input_b)

#Third input model: the remaining num_all - num_situ features (fed with X3_train when fitting)
input_c = Input(shape=(num_all - num_situ,))
dense_c = Dense(50, activation='softplus')(input_c)
drop_c = Dropout(0.2)(dense_c)

#Combine the first two branches and project back to num_situ units
concat_a = concatenate([dense_a, dense_b])
dense_d = Dense(num_situ, activation='softplus')(concat_a)
drop_d = Dropout(0.2)(dense_d)

#Join with the third branch and reduce to one probability-like output
concat_b = concatenate([drop_d, drop_c])
dense_e = Dense(50, activation='softplus')(concat_b)
dense_f = Dense(1, activation='sigmoid')(dense_e)
model = Model(inputs=[input_a, input_b, input_c], outputs=[dense_f])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

#Save the freshly initialised weights so later runs can reload the same starting point.
model.save_weights('model.h5')

#summary() prints the table itself and returns None, so it must not be wrapped in print().
model.summary()
#plot_model(model, to_file='my_model.png')

16:55:29  Model build
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 84)           0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            (None, 84)           0                                            
__________________________________________________________________________________________________
dense_7 (Dense)                 (None, 200)          17000       input_4[0][0]                    
__________________________________________________________________________________________________
dense_8 (Dense)                 (None, 200)          17000       input_5[0][0]                    
_______________________________________________________________________________________

In [None]:
print(time.strftime("%H:%M:%S"), ' Fit')

#Leave-one-out: train once per in-situ gene with that gene removed from the second input,
#and record the validation accuracy. val_acc[i] = (average over epochs, best epoch).
val_acc=np.empty((num_situ,2))
for i in range(num_situ):
    #Start every run from the weights stored in model.h5.
    #NOTE(review): this restores the layer weights only; confirm whether the optimizer state
    #should be reset as well (e.g. by recompiling).
    model.load_weights('model.h5')
    #Mask gene i with zeros rather than deleting its column: the second input is declared
    #with num_situ features, so an array with num_situ - 1 columns is rejected by fit().
    X2_temp = X2_train.copy()
    X2_temp[:, i] = 0
    history = model.fit(x=[X1_train, X2_temp, X3_train],
                        y=y_train,
                        batch_size=50,
                        epochs=20,
                        verbose=0,
                        validation_split=0.3,
                        class_weight={0:1, 1:10})
    #The history key is 'val_acc' in older Keras releases and 'val_accuracy' in newer ones.
    val_key = 'val_acc' if 'val_acc' in history.history else 'val_accuracy'
    val_hist = history.history[val_key]
    val_acc[i,0] = np.average(val_hist)
    val_acc[i,1] = np.max(val_hist)
    print(time.ctime(), f'i: {i}, val_acc average: {val_acc[i,0]}, max: {val_acc[i,1]}')

#Persist the per-gene results.
np.save('val_acc.npy', val_acc)

In [6]:
print(time.strftime("%H:%M:%S"), ' Fit')

#Train on all three inputs; 30% of the samples are held out for validation and
#class 1 is weighted ten times as heavily as class 0.
history = model.fit(
    [X1_train, X2_train, X3_train],
    y_train,
    epochs=20,
    batch_size=50,
    validation_split=0.3,
    class_weight={0: 1, 1: 10},
    verbose=1,
)

16:55:33  Fit
Train on 7909 samples, validate on 3391 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
Fit with new model:
    16:49:21  Fit
Train on 7909 samples, validate on 3391 samples
Epoch 1/20
7909/7909 [==============================] - 11s 1ms/step - loss: 1.6888 - acc: 0.5183 - val_loss: 1.6191 - val_acc: 0.4397
Epoch 2/20
7909/7909 [==============================] - 8s 957us/step - loss: 1.3548 - acc: 0.5750 - val_loss: 1.2811 - val_acc: 0.5397
Epoch 3/20
7909/7909 [==============================] - 8s 953us/step - loss: 1.1800 - acc: 0.6404 - val_loss: 1.3882 - val_acc: 0.5600
Epoch 4/20
7909/7909 [==============================] - 8s 949us/step - loss: 1.0538 - acc: 0.6617 - val_loss: 1.2379 - val_acc: 0.6293
Epoch 5/20
7909/7909 [==============================] - 7s 943us/step - loss: 0.9956 - acc: 0.6874 - val_loss: 1.3259 - val_acc: 0.6101
Epoch 6/20
7909/7909 [==============================] - 7s 922us/step - loss: 0.9359 - acc: 0.7102 - val_loss: 1.1044 - val_acc: 0.7157
Epoch 7/20
7909/7909 [==============================] - 7s 920us/step - loss: 0.8704 - acc: 0.7311 - val_loss: 1.2743 - val_acc: 0.6724
Epoch 8/20
7909/7909 [==============================] - 7s 928us/step - loss: 0.8424 - acc: 0.7471 - val_loss: 1.1750 - val_acc: 0.6591
Epoch 9/20
7909/7909 [==============================] - 7s 925us/step - loss: 0.7924 - acc: 0.7631 - val_loss: 1.1649 - val_acc: 0.7623
Epoch 10/20
7909/7909 [==============================] - 7s 937us/step - loss: 0.7576 - acc: 0.7743 - val_loss: 1.0999 - val_acc: 0.8012
Epoch 11/20
7909/7909 [==============================] - 7s 930us/step - loss: 0.6911 - acc: 0.7996 - val_loss: 0.9562 - val_acc: 0.7986
Epoch 12/20
7909/7909 [==============================] - 7s 940us/step - loss: 0.6674 - acc: 0.8106 - val_loss: 3.0000 - val_acc: 0.7741
Epoch 13/20
7909/7909 [==============================] - 7s 943us/step - loss: 0.6517 - acc: 0.8158 - val_loss: 1.8273 - val_acc: 0.7729
Epoch 14/20
7909/7909 [==============================] - 7s 929us/step - loss: 0.6508 - acc: 0.8139 - val_loss: 1.2773 - val_acc: 0.8145
Epoch 15/20
7909/7909 [==============================] - 7s 931us/step - loss: 0.6247 - acc: 0.8169 - val_loss: 1.8678 - val_acc: 0.8458
Epoch 16/20
7909/7909 [==============================] - 7s 932us/step - loss: 0.5798 - acc: 0.8341 - val_loss: 2.1245 - val_acc: 0.8582
Epoch 17/20
7909/7909 [==============================] - 8s 1ms/step - loss: 0.5619 - acc: 0.8473 - val_loss: 1.8136 - val_acc: 0.8458
Epoch 18/20
7909/7909 [==============================] - 8s 950us/step - loss: 0.5455 - acc: 0.8476 - val_loss: 2.1344 - val_acc: 0.8699
Epoch 19/20
7909/7909 [==============================] - 7s 935us/step - loss: 0.5099 - acc: 0.8627 - val_loss: 3.0092 - val_acc: 0.8632
Epoch 20/20
7909/7909 [==============================] - 7s 932us/step - loss: 0.4993 - acc: 0.8588 - val_loss: 3.2009 - val_acc: 0.8841

In [None]:
from keras.models import load_model

#Replace `model` with the one stored on disk, then load its weights and compile it.
model = load_model('logs/model_str.h5')
model.load_weights('logs/model.h5')
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

#Score the first 1000 training samples using the first two inputs only.
#NOTE(review): this assumes the stored model takes two inputs - confirm against logs/model_str.h5.
score = model.evaluate(
    x=[X1_train[:1000], X2_train[:1000]],
    y=y_train[:1000],
    verbose=0,
)
print("%s: %.2f%%" % (model.metrics_names[1], score[1]*100))

In [None]:
#Probe the model with a single all-ones sample on each of two inputs.
X1 = np.ones(shape=(1, 84))
X2 = np.ones_like(X1)
#Individual entries can be set by hand for experiments, e.g. X1[0,3] = 1 or X2[0,1] = 1.
print(X2)
#A slice of the training data can be scored the same way:
#model.predict([X1_train[0:100],X2_train[0:100]])
model.predict([X1, X2])

In [None]:
from scipy import stats

#Load the two result tables and measure the rank correlation of their 'x' columns.
cw = pd.read_csv('logs/2/cw.csv')
loo = pd.read_csv('logs/2/loo.csv')
tau, p_value = stats.kendalltau(x=loo['x'], y=cw['x'])
print(f'tau - {tau}, p-value - {p_value}')

In [None]:
#Show the first 60 rows of the leave-one-out table.
loo.head(60)

In [2]:
#Positional column indices into b.
glist=[3,16,80,77,19,52,53,57,78,68,62,0,75,21,66,26,81,51,63,7,8,56,35,18,83,6,1,61,65,55,74,22,64,20,59,23,79,48,58,31,69,73,76,24,33,17,47,14,25,15,67,42,54,46,50,28,27,49,43,13]
#Identical to the first 20 entries of glist.
glist_20 = glist[:20]
glist_20_knn = [35,41,70,24,14,56,3,64,58,79,27,30,67,44,73,59,49,83,57,16]
#Select row 2000 and the glist columns purely by position. Indexing the row Series with
#[glist] relies on integer keys falling back to positions on a string index, which newer
#pandas releases deprecate and then treat as labels.
b.iloc[2000, glist]

Antp            0.272935
CG17724         0.172097
twi             0.061407
Traf4           0.320034
CG8147          0.145717
knrl            0.130250
Kr              0.023738
MESR3           0.492552
trn             0.588104
pxb             0.039282
nub             0.452095
aay             0.252578
tll             0.047353
croc            0.175709
peb             0.212944
Dfd             0.235307
zen             0.047592
kni             0.189640
numb            0.181323
bowl            0.224969
brk             0.469506
Mes2            0.137385
E(spl)m5-HLH    0.152172
CG43394         0.121330
zfh1            0.433250
bmm             0.248249
Ama             0.073663
noc             0.120742
odd             0.711892
Mdr49           0.118106
tkv             0.113421
Cyp310a1        0.036230
oc              0.082026
cnc             0.281792
Nek2            0.489942
D               0.216639
tsh             0.210204
ImpE2           0.134337
mfas            0.361207
edl             0.192922
