In [1]:
import csv
import numpy as np

from keras.layers.normalization import BatchNormalization
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.layers.core import Dense, Flatten, Reshape, Activation, Dropout
from keras.models import Sequential
from keras.utils.np_utils import to_categorical
from keras import optimizers
from keras.layers.recurrent import LSTM

Using TensorFlow backend.


In [44]:
def read_inpt_file(file_name):
    csvFile = open(file_name,'r')
    reader = csv.reader(csvFile)
    features = []
    labels = []
    for item in reader:    
        if reader.line_num == 1:# ignore line 1
            continue
        feature = alpha_to_num(item[1])
        features.append(feature)
        labels.append(item[2])
    csvFile.close()
    features = np.array(features)
    features = features.astype('float64')
    labels = np.array(labels)
    labels = labels.astype('float64')
    return features, labels

#convert 'A,C,T,G'to integer '0,1,2,3'
def alpha_to_num(alpha):
    feature_num = np.zeros(len(alpha))
    for i in range(len(alpha)):
        if alpha[i] == 'A':
            feature_num[i] = 0
        elif alpha[i] == 'C':
            feature_num[i] = 1
        elif alpha[i] == 'T':
            feature_num[i] = 2
        elif alpha[i] == 'G':
            feature_num[i] = 3
        else:
            raise AssertionError("cannot handle: " + alpha[i] )
    return feature_num

def one_hot_encoding(features, dna_length):
    y = np.zeros((features.shape[0], dna_length, 4))
    for i in range(0, len(features)):
        y[i] = to_categorical(features[i],4)
    return y

def read_test_file(file_name):
    csvFile = open(file_name,'r')
    reader = csv.reader(csvFile)
    features = []
    for item in reader:
        if reader.line_num == 1:# ignore line 1
            continue
        feature = alpha_to_num(item[1])
        features.append(feature)
    csvFile.close()
    features = np.array(features)
    features = features.astype('float64')
    return features

In [28]:
#create a CNN model
model_CNN = Sequential()

model_CNN.add(BatchNormalization(input_shape =(14, 4, 1) ))

model_CNN.add(Conv2D(128, (4, 4), padding='valid', input_shape=(14, 4, 1),activation='relu'))

model_CNN.add(MaxPooling2D(pool_size=(3,1), padding='valid'))

#model_CNN.add(Conv2D(32, (2, 1), padding='valid',activation='relu'))

#model_CNN.add(Conv2D(16, (1, 1), padding='valid', activation='relu'))

model_CNN.add(Flatten())
model_CNN.add(Dense(32, activation='relu')) #fully connected layer with 32 neurons

model_CRNN.add(Dropout(rate=0.5))

model_CNN.add(Dense(1, activation='sigmoid'))

adam = optimizers.Adam(lr=0.001)

sgd = optimizers.SGD(lr=0.003, momentum=0.9)

model_CNN.compile(loss='mean_squared_error', optimizer= adam)

model_CNN.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
batch_normalization_9 (Batch (None, 14, 4, 1)          4         
_________________________________________________________________
conv2d_16 (Conv2D)           (None, 11, 1, 128)        2176      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 3, 1, 128)         0         
_________________________________________________________________
flatten_7 (Flatten)          (None, 384)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 32)                12320     
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 33        
Total params: 14,533
Trainable params: 14,531
Non-trainable params: 2
_________________________________________________________________


In [4]:
#create a RNN model
model_RNN = Sequential()

model_RNN.add(LSTM(10, input_shape=(14, 4),return_sequences=True))
model_RNN.add(LSTM(5,return_sequences=True))
model_RNN.add(LSTM(1))

sgd = optimizers.SGD(lr=0.001, momentum=0.9)

model_RNN.compile(loss='mean_squared_error', optimizer= sgd)#'mean_squared_error'

model_RNN.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 14, 10)            600       
_________________________________________________________________
lstm_2 (LSTM)                (None, 14, 5)             320       
_________________________________________________________________
lstm_3 (LSTM)                (None, 1)                 28        
Total params: 948
Trainable params: 948
Non-trainable params: 0
_________________________________________________________________


In [34]:
#create a CNN + RNN model
model_CRNN = Sequential()

model_CRNN.add(BatchNormalization(input_shape =(14, 4, 1) ))

model_CRNN.add(Conv2D(128, (4, 4), padding='valid', input_shape=(14, 4, 1),activation='relu'))

#model_CRNN.add(MaxPooling2D(pool_size=(3,1), padding='valid'))

model_CRNN.add(Conv2D(16, (2, 1), padding='valid',activation='relu'))

#model_CRNN.add(Conv2D(16, (1, 1), padding='valid', activation='relu'))


model_CRNN.add(Reshape((16,10)))
#model_CRNN.add(Reshape((128,11)))


model_CRNN.add(LSTM(16, return_sequences=True))
#model_CRNN.add(LSTM(4, return_sequences=True))

model_CRNN.add(Flatten())
model_CRNN.add(Dense(32, activation='sigmoid'))
model_CRNN.add(Dropout(rate=0.5))
model_CRNN.add(Dense(1, activation='sigmoid'))

adam = optimizers.Adam(lr=0.001)

#(y_true, y_pred)
model_CRNN.compile(loss='mean_squared_error', optimizer= adam)
model_CRNN.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
batch_normalization_14 (Batc (None, 14, 4, 1)          4         
_________________________________________________________________
conv2d_21 (Conv2D)           (None, 11, 1, 128)        2176      
_________________________________________________________________
conv2d_22 (Conv2D)           (None, 10, 1, 16)         4112      
_________________________________________________________________
reshape_11 (Reshape)         (None, 16, 10)            0         
_________________________________________________________________
lstm_15 (LSTM)               (None, 16, 16)            1728      
_________________________________________________________________
flatten_11 (Flatten)         (None, 256)               0         
_________________________________________________________________
dense_17 (Dense)             (None, 32)                8224      
__________

In [40]:
#train
def train(model_name = 'CNN',batch=16, epoch = 5):
    if model_name == 'CNN':
        model = model_CNN
    elif model_name == 'RNN':
        model = model_RNN
    else:
         model = model_CRNN
    features, labels = read_inpt_file('train.csv')
    train_features = features[0:1600]
    train_labels = labels[0:1600]
    validate_features = features[1600:]
    validate_lables = labels[1600:]
    t = one_hot_encoding(train_features, 14)
    v = one_hot_encoding(validate_features, 14)
    if model_name == 'CNN' or  model_name == 'CRNN': #reshape input for conv layer
        t = t.reshape((t.shape[0], t.shape[1], t.shape[2], 1))
        v = v.reshape((v.shape[0], v.shape[1], v.shape[2], 1))
    model.fit(t, train_labels,
          batch_size=16, epochs=epoch,
          validation_data=(v, validate_lables ))

In [7]:
def predict(model_name = 'CNN'):
    if model_name == 'CNN':
        model = model_CNN
    elif model_name == 'RNN':
        model = model_RNN
    else:
         model = model_CRNN
    test = read_test_file('test.csv')
    t = one_hot_encoding(test, 14)
    if model_name == 'CNN' or  model_name == 'CRNN': #reshape input for conv layer
        t = t.reshape((t.shape[0], t.shape[1], t.shape[2], 1))
    predicted = model.predict(t, batch_size=32).flatten()
    predicted_list =[]
    for p in predicted:   
        if p < 0.5:
            predicted_list.append(0)
        else:
            predicted_list.append(1)
    print (predicted_list)
    sum = 0
    for i in predicted_list:
        sum+=i
    print(sum)
    return predicted_list

In [42]:
train(model_name = 'CNN',batch=16, epoch = 3)#model_name = CNN, RNN or CRNN

Train on 1600 samples, validate on 400 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [36]:
predicted_list = predict('CRNN')#return predicted result

[1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 

In [37]:
#write result predicted into a csv file
fileHeader = ['id', 'prediction']
csvFile = open('Predictions.csv','w',newline='')
writer = csv.writer(csvFile)
writer.writerow(fileHeader)
for i in range(len(predicted_list)):
    writer.writerow([i,predicted_list[i]])
csvFile.close()