This program produces a Bi-CNN + LSTM model for predicting whether a sequence is an EF hand

In [None]:
from __future__ import print_function
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Reshape
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers.convolutional import Convolution1D, MaxPooling1D
import _pickle as cPickle
from sklearn.model_selection import train_test_split 

In [None]:
# Encodes a residue string as a list of integer codes
def trans(str):
    """Map every character of ``str`` to its integer residue code.

    The 20 standard one-letter amino-acid symbols are numbered 1..20 in
    alphabetical order and ``'X'`` is numbered 21.  Any other character
    (including whitespace and line terminators) yields ``None`` at that
    position, so the result always has the same length as the input.
    """
    # Position in this string (1-based) is the code of the symbol.
    alphabet = 'ACDEFGHIKLMNPQRSTVWYX'
    codes = {symbol: code for code, symbol in enumerate(alphabet, start=1)}
    return [codes.get(symbol) for symbol in str]

In [None]:
# Converts dataset into lists containing each sequence and its label
def createTrainData(str1):
    """Read a ``label,sequence`` text file and encode every sequence.

    Parameters
    ----------
    str1 : str
        Path of the file to read.  Each non-blank line must contain exactly
        one comma separating an integer label from the residue string.

    Returns
    -------
    (sequence_num, label_num) : tuple of lists
        ``sequence_num[i]`` is ``trans(...)`` applied to the sequence field of
        line ``i`` and ``label_num[i]`` is that line's label as an ``int``.

    Raises
    ------
    ValueError
        If a non-blank line does not split into exactly two fields or the
        label is not an integer.

    Notes
    -----
    The sequence field is deliberately passed to ``trans`` unstripped, so any
    trailing line-terminator characters are encoded too (as ``None``); the
    loader in this notebook trims trailing entries from each sequence.
    """
    sequence_num = []
    label_num = []
    # Use a context manager so the file handle is closed even on a parse error.
    with open(str1) as handle:
        for line in handle:
            # A blank line (e.g. a trailing newline at end of file) carries no
            # record; skip it instead of failing on the tuple unpacking below.
            if not line.strip():
                continue
            label, seq_text = line.split(",")
            sequence_num.append(trans(seq_text))
            label_num.append(int(label))

    return sequence_num, label_num

In [None]:
# Encode the CSV dataset and cache the (sequences, labels) pair as a pickle
a,b=createTrainData("efHandData_neg_pos_interleukins.csv")
t = (a, b)
# Close the file deterministically so the pickle is fully flushed to disk
# before a later cell reads it back.
with open("data.pkl", "wb") as pkl_file:
    cPickle.dump(t, pkl_file)

In [None]:
# Splits training data and test data
def _as_sequence_array(seqs):
    """Turn a list of sequences into an array, tolerating ragged input.

    Equal-length sequences become a regular 2-D array via ``np.array``.
    Sequences of differing lengths become a 1-D object array whose elements
    are the original lists; calling ``np.array`` directly on such input raises
    ``ValueError`` on NumPy >= 1.24.
    """
    if len({len(s) for s in seqs}) <= 1:
        return np.array(seqs)
    out = np.empty(len(seqs), dtype=object)
    for i, s in enumerate(seqs):
        out[i] = s
    return out


def createTrainTestData(str_path, nb_words=None, skip_top=0,
              maxlen=None, test_split=0.25, seed=69420,
              start_char=1, oov_char=2, index_from=3):
    """Load pickled ``(sequences, labels)``, shuffle, and split train/test.

    Parameters
    ----------
    str_path : str
        Path of a pickle file holding a ``(X, labels)`` pair, where ``X`` is a
        list of lists and ``labels`` a list of the same length.
    maxlen : int, optional
        When truthy, only sequences strictly shorter than ``maxlen`` (after
        trimming, see below) are kept.
    test_split : float
        Fraction of the samples placed in the test set; the first
        ``int(n * (1 - test_split))`` shuffled samples form the training set.
    seed : int
        Seed for NumPy's global random generator, applied before each of the
        two shuffles so sequences and labels get the same permutation.
    nb_words, skip_top, start_char, oov_char, index_from
        Accepted for signature compatibility only; they do not influence the
        result.

    Returns
    -------
    ((X_train, y_train), (X_test, y_test))
        NumPy arrays.  The ``X`` arrays are 2-D when all sequences in the
        split have equal length, otherwise 1-D object arrays of lists.

    Notes
    -----
    The last two entries of every sequence are dropped before anything else.
    Presumably these are the line-terminator placeholders left by the
    unstripped CSV parsing in this notebook -- TODO confirm against the data.

    This function reseeds NumPy's *global* random state as a side effect.
    """
    # NOTE(security): unpickling can execute arbitrary code; only load files
    # produced by this notebook or another trusted source.
    with open(str_path, "rb") as handle:
        X, labels = cPickle.load(handle)

    # Slice deletion trims up to two trailing entries and, unlike two
    # successive `del x[-1]`, does not raise on sequences shorter than two.
    for x in X:
        del x[-2:]

    # Re-seeding before each shuffle yields the same permutation for both
    # lists (they have equal length), keeping sequences and labels aligned.
    np.random.seed(seed)  # for reproducibility
    np.random.shuffle(X)
    np.random.seed(seed)
    np.random.shuffle(labels)

    if maxlen:
        kept = [(x, y) for x, y in zip(X, labels) if len(x) < maxlen]
        X = [x for x, _ in kept]
        labels = [y for _, y in kept]

    # Compute the boundary once so all four slices are guaranteed consistent.
    split_at = int(len(X) * (1 - test_split))

    X_train = _as_sequence_array(X[:split_at])
    y_train = np.array(labels[:split_at])

    X_test = _as_sequence_array(X[split_at:])
    y_test = np.array(labels[split_at:])

    return (X_train, y_train), (X_test, y_test)

In [None]:
# Embedding
max_features = 23  # first argument of Embedding (input vocabulary size); also passed as nb_words when loading
maxlen = 12  # length every sequence is padded/truncated to; also Embedding's input_length
embedding_size = 256  # second argument of Embedding (size of each embedding vector)

# Convolution
nb_filter = 10  # `filters` of both Convolution1D layers
pool_length = 2  # `pool_size` of both MaxPooling1D layers

# LSTM
lstm_output_size = 12  # number of units given to the LSTM layer

# Training
batch_size = 128  # used for both model.fit and model.evaluate
nb_epoch = 150  # `epochs` passed to model.fit

In [None]:
# Load the pickled dataset and split it into train and test parts (20% test)
train_part, test_part = createTrainTestData("data.pkl", nb_words=max_features, test_split=0.2)
(X_train, y_train), (X_test, y_test) = train_part, test_part
print(X_train)
# Report how many samples ended up in each part
for count, caption in ((len(X_train), 'train sequences'), (len(X_test), 'test sequences')):
    print(count, caption)

Loading data...
[list([3, 20, 5, 6, 9, 10, 4, 13, 9, 8, 16])
 list([3, 15, 11, 11, 16, 20, 10, 16, 16])
 list([3, 17, 3, 9, 16, 6, 17, 8, 17, 20, 4]) ...
 list([3, 3, 5, 18, 11, 16, 20, 10, 5, 1, 14])
 list([3, 17, 12, 9, 16, 6, 17, 10, 3, 8, 7])
 list([3, 18, 18, 16, 10, 18, 14, 7, 20, 18, 1, 16])]
3445 train sequences
862 test sequences




In [None]:
# Collapses every inner list into a single integer by concatenating its items
def listToNum(data):
    """Return one ``int`` per element of ``data``.

    Each element is an iterable whose items are rendered with ``str`` and
    joined without a separator; the joined text is then parsed with ``int``
    (so ``[10, 4]`` becomes ``104``).  An empty inner iterable raises
    ``ValueError`` because ``int('')`` is invalid.
    """
    return [int(''.join(str(item) for item in row)) for row in data]

In [None]:
# Bring every encoded sequence to the fixed length `maxlen`
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
for split_name, split in (('X_train', X_train), ('X_test', X_test)):
    print(split_name + ' shape:', split.shape)

# Assemble the network as one ordered list of layers
model = Sequential([
    # Embedding of the integer residue codes
    Embedding(max_features, embedding_size, input_length=maxlen),

    # Two stacked convolution + pooling stages (the "Bi-CNN" part),
    # preceded by dropout on the embedded input
    Dropout(0.5),
    Convolution1D(filters=nb_filter, kernel_size=1, activation='relu'),
    MaxPooling1D(pool_size=pool_length, padding='same'),
    Convolution1D(filters=nb_filter, kernel_size=1, activation='relu'),
    MaxPooling1D(pool_size=pool_length, padding='same'),

    # Recurrent layer followed by a single sigmoid output unit
    LSTM(lstm_output_size),
    Dense(1),
    Activation('sigmoid'),
])

# Binary classification set-up: cross-entropy loss, Adam, accuracy metric
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Fit on the training part; the test part is passed as validation data
model.fit(X_train, y_train,
          validation_data=(X_test, y_test),
          epochs=nb_epoch,
          batch_size=batch_size)

Pad sequences (samples x time)
X_train shape: (3445, 12)
X_test shape: (862, 12)
Build model...
Train...
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 7

<keras.callbacks.History at 0x7f7196ecb810>

In [None]:
# Persist the trained model to disk
model.save("epoch150.h5")
# Evaluate the accuracy of the model on the test part
evaluation = model.evaluate(X_test, y_test, batch_size=batch_size)
score, acc = evaluation
for caption, value in (('Test score:', score), ('Test accuracy:', acc)):
    print(caption, value)

Test score: 0.06801965832710266
Test accuracy: 0.9791183471679688
