In [1]:
# Tensorflow Version 1.15 
# Keras Version 2.2.4
# Python Version 3.6 

import scipy
import pandas as pd
from scipy.io import arff
from sklearn.model_selection import train_test_split
from keras.layers import Dense
import numpy as np

Using TensorFlow backend.


In [3]:
# import shuffled dataset. The dataset combined Network Embedding and Morgan fingerprint. 
# t1-t14: labels, marked in "0" or "1" 
# f1-f512: fingerprint, marked in "0" or "1" 
# f513-f912: Network Embedding vector 

data, meta = scipy.io.arff.loadarff('512_400_S.arff')
df = pd.DataFrame(data)

In [4]:
# Show the dataset in dataframe, 3883 rows × 926 columns

df

Unnamed: 0,t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,...,f903,f904,f905,f906,f907,f908,f909,f910,f911,f912
0,b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'1',...,-0.018144,-0.025215,0.095249,0.003427,0.015036,-0.087453,-0.144500,0.013167,-0.044638,0.021302
1,b'1',b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'0',...,-0.104640,-0.016924,0.172780,-0.063416,0.086858,-0.131860,0.073103,-0.024830,0.109920,-0.276610
2,b'0',b'0',b'0',b'1',b'0',b'0',b'0',b'0',b'0',b'0',...,-0.089728,0.003638,0.035825,-0.170860,0.060144,0.061022,0.066092,-0.064063,0.051568,-0.087834
3,b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'1',b'0',b'0',...,-0.054098,0.016692,-0.085323,0.107890,-0.031168,0.082244,0.349440,0.059118,-0.052747,0.042570
4,b'1',b'0',b'0',b'0',b'1',b'0',b'0',b'0',b'0',b'0',...,-0.004209,0.049898,0.004564,0.084685,0.123060,0.043732,-0.207800,0.006222,-0.031472,0.151940
5,b'0',b'0',b'0',b'0',b'0',b'0',b'1',b'0',b'0',b'0',...,-0.097898,-0.001036,-0.131910,0.035836,0.034680,-0.149480,0.042433,0.003595,0.001252,0.015591
6,b'0',b'0',b'0',b'0',b'0',b'1',b'0',b'0',b'0',b'0',...,0.013768,0.209610,0.022874,0.071848,0.016253,0.000263,0.080965,0.108560,0.038364,0.018786
7,b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'0',...,0.125980,-0.047228,-0.046675,-0.018364,0.066447,-0.027180,-0.039965,0.053237,-0.001351,0.066985
8,b'0',b'0',b'0',b'0',b'0',b'0',b'1',b'0',b'0',b'0',...,0.157910,0.102270,0.046079,0.062806,-0.019416,0.021459,0.061922,-0.055947,-0.067805,-0.069081
9,b'0',b'0',b'0',b'0',b'0',b'0',b'1',b'0',b'0',b'0',...,-0.075607,0.176020,0.011429,0.024623,0.113620,0.025990,-0.057841,-0.072412,0.046958,0.059869


In [5]:
# Set 1-14 col as labels, set 15-926 col as feature vectors

X = df.iloc[:,14:926].values
y = df.iloc[:,0:14].values


In [6]:
# Building a multilayer perceptron for multi-label classification. 
# 5-layer in MLP, the nodes in each layer are 4096, 2048, 1024, 512, 256 respectively. 
# Activation function for the output layer is sigmoid. So there are 14-D output. 
# optimizer='adam'   loss='binary_crossentropy' 


def deep_model(feature_dim,label_dim):
    from keras.models import Sequential
    from keras.layers import Dense
    model = Sequential()
    print("create model. feature_dim ={}, label_dim ={}".format(feature_dim, label_dim))
    model.add(Dense(4096, activation='relu', input_dim=feature_dim))
    model.add(Dense(2048, activation='relu'))
    model.add(Dense(1024, activation='relu'))
    model.add(Dense(512, activation='relu'))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(label_dim, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [7]:
# Train a MLP, batch_size=16-3883, epochs=20-30 
# "times" here stands for training times in 10-Cross validation, times=0-9 

def train_deep(X_train,y_train,X_test,y_test,times):
    feature_dim = X_train.shape[1]
    label_dim = y_train.shape[1]
    model = deep_model(feature_dim,label_dim)   # Establish an empty model
    model.summary()
    model.fit(X_train,y_train,batch_size=16, epochs=30,validation_data=(X_test,y_test))   # Train the model
    name = 'my_model'+str(times)+'.h5'
    model.save(name)   # Save the model

In [8]:
# Training process of 10-Cross validation. 
# Here if you use cuda to train it, the minimun requirement of graph card memory is 8GB. 
# Variable i stands for the times of 10-Cross validation. 


for i in range(10):
      j = (3883/10)*(i+1)   # Variable j used as a mark to identify the beginning part of val-set.  
      print(j)
    
      X_train_1 = df.iloc[0:(int(j)-388),14:926].values   # To establish the feature vectors train-set
      X_train_2 = df.iloc[int(j):3883,14:926].values
      X_train = np.vstack((X_train_1, X_train_2))
      #print(X_train.shape)
      X_test = df.iloc[(int(j)-388):int(j),14:926].values   # To establish the feature vectors test-set
    
      y_train_1 = df.iloc[0:(int(j)-388),0:14].values   # To establish the labels train-set
      y_train_2 = df.iloc[int(j):3883,0:14].values
      y_train = np.vstack((y_train_1, y_train_2))
      #print(y_train.shape)
      y_test = df.iloc[(int(j)-388):int(j),0:14].values   # To establish the labels train-set
    
      y_train = y_train.astype(np.float64)
      y_test = y_test.astype(np.float64)
      #print(X_test.shape)
      #print(y_test.shape)
    
      train_deep(X_train,y_train,X_test,y_test,i)   # Train and save the model, you get 10 models correspond to 10-Cross Validation

.1649 - val_acc: 0.9477
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
3106.4
create model. feature_dim =912, label_dim =14
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_43 (Dense)             (None, 4096)              3739648   
_________________________________________________________________
dense_44 (Dense)             (None, 2048)              8390656   
_________________________________________________________________
dense_45 (Dense)             (None, 1024)              2098176   
_________________________________________________________________
dense_46 (Dense)             (None, 512)               