In [1]:
# with open('dataset/raw/polyA_cs.fasta') as f:
#     lines = f.readlines()
#     for line in lines[:2]:
#         print(line)
#     print(len(lines))

In [4]:
!pip install --upgrade scikit-learn

Requirement already up-to-date: scikit-learn in /opt/conda/lib/python3.6/site-packages (0.22.1)


In [5]:
import sys
import numpy as np
import random
import argparse
from textwrap import dedent
from time import strftime
from itertools import product
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Embedding, Bidirectional, Flatten, Dropout, GRU
from keras import optimizers
from sklearn.model_selection import KFold
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import classification_report, confusion_matrix
import sklearn

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [11]:
# Integer code for each nucleotide letter (A -> 0, C -> 1, G -> 2, T -> 3).
# Lower-case letters are added afterwards and share the code of their
# upper-case counterpart.
BASES = {base: code for code, base in enumerate('ACGT')}
BASES.update({base.lower(): code for base, code in list(BASES.items())})

# Version tag of this script.
VERSION = 'v0.0.1'


def one_hot_encoding(seq, k_list, kmer_encoding):
    """Turn one sequence into a flat vector of k-mer codes.

    For every width in ``k_list`` (in order) the upper-cased sequence is
    scanned with a sliding window of that width and step 1. Each window is
    looked up in ``kmer_encoding`` and the codes are stored back to back:
    all windows of the first width, then all windows of the second, and so on.

    Args:
        seq: Sequence string; it is upper-cased before the lookups.
        k_list: Window widths to use.
        kmer_encoding: Mapping from upper-case k-mer string to its code.

    Returns:
        1-D array created with ``np.zeros`` holding
        ``sum(len(seq) - k + 1 for k in k_list)`` codes.

    Raises:
        KeyError: If a window is not a key of ``kmer_encoding``.
    """
    # e.g. 200 * 4 - sum(4, 6, 8, 10) + 4 slots for a 200-base sequence
    n_slots = sum(len(seq) - width + 1 for width in k_list)
    codes = np.zeros(n_slots)

    upper = seq.upper()
    upper_len = len(upper)

    start = 0  # first slot that belongs to the current window width
    for width in k_list:
        n_windows = upper_len - width + 1
        for pos in range(n_windows):
            codes[start + pos] = kmer_encoding[upper[pos:pos + width]]
        start += n_windows
    return codes


def fasta_to_vectors(in_fasta, k_list):
    """Read a FASTA file and encode every record as a vector of k-mer codes.

    Records are delimited by header lines starting with ``>``. All non-empty
    lines between two headers are concatenated into one sequence, so both
    single-line and line-wrapped FASTA files are read correctly. Text that
    appears before the first header is ignored.

    K-mer codes are assigned by enumerating, for each k in ``k_list`` in
    order, every string of length k over the alphabet A, T, C, G (in
    ``itertools.product`` order) and numbering them consecutively from 0.

    Args:
        in_fasta: Path of the FASTA file.
        k_list: Iterable of k-mer widths (list, tuple or NumPy array).

    Returns:
        List with one encoded vector (see ``one_hot_encoding``) per record,
        in file order.

    Raises:
        KeyError: Propagated from ``one_hot_encoding`` when a sequence
            contains a k-mer with a letter other than A/C/G/T (after
            upper-casing).
    """
    sequences = []
    parts = None  # pieces of the record being read; None until a header is seen
    with open(in_fasta) as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if line.startswith('>'):
                if parts is not None:
                    sequences.append(''.join(parts))
                parts = []
            elif line and parts is not None:
                parts.append(line)
    if parts is not None:
        sequences.append(''.join(parts))

    # AAAA, AAAT, AAAC, ... for the first k, then the next k, and so on.
    # Built lazily so the tuples are never all held in memory at once.
    kmers = (''.join(letters)
             for k in k_list
             for letters in product(['A', 'T', 'C', 'G'], repeat=int(k)))
    kmer_encoding = {kmer: code for code, kmer in enumerate(kmers)}

    return [one_hot_encoding(seq, k_list, kmer_encoding) for seq in sequences]


def create_model(l, k, weights='', learning_rate=0.001):
    """Build and compile the embedding + dense classifier.

    Args:
        l: Sequence length used to derive the input length.
        k: K-mer widths (list, tuple or NumPy array). The input length is
            ``l * len(k) - sum(k) + len(k)`` and the embedding vocabulary is
            ``sum(4 ** width for width in k)``.
        weights: Optional path of a weights file. When non-empty the weights
            are loaded into the model; otherwise the model summary is printed.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        The compiled ``Sequential`` model (categorical cross-entropy loss,
        accuracy metric, three-way softmax output).
    """
    # Plain ints so that lists and tuples work as well as NumPy arrays.
    widths = [int(width) for width in k]
    length = l * len(widths) - sum(widths) + len(widths)
    vocabulary = sum(4 ** width for width in widths)

    model = Sequential()
    model.add(Embedding(vocabulary, 128, input_length=length))
    model.add(Flatten())

    model.add(Dense(512, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(3, activation='softmax'))

    adam = optimizers.Adam(lr=learning_rate)

    if weights:
        model.load_weights(weights)
        print("Created model and loaded weights from file: ", weights)
    else:
        # summary() prints the table itself and returns None, so wrapping it
        # in print() would add a stray "None" line.
        model.summary()
        print("adam:", learning_rate)
    model.compile(loss='categorical_crossentropy',
                  optimizer=adam,
                  metrics=['accuracy'])

    return model


# One FASTA file per class; the class number fixes the label column below.
lab1_fasta = '../dataset/raw/polyA_cs.fasta'  # args.polya
lab2_fasta = '../dataset/raw/non-polyA_cs.fasta'  # args.cs
lab3_fasta = '../dataset/raw/non-cs.fasta'

l = 200  # sequence length later handed to create_model
k = np.array([4, 6, 8, 10])  # k-mer widths

# Encoded sequences, one array per class.
seq_vector_lab1 = np.asarray(fasta_to_vectors(lab1_fasta, k))
seq_vector_lab2 = np.asarray(fasta_to_vectors(lab2_fasta, k))
seq_vector_lab3 = np.asarray(fasta_to_vectors(lab3_fasta, k))

# Matching label matrices: the same three-column row repeated once per sequence.
lab_vector_lab1 = np.repeat([[1, 0, 0]], len(seq_vector_lab1), axis=0)
lab_vector_lab2 = np.repeat([[0, 1, 0]], len(seq_vector_lab2), axis=0)
lab_vector_lab3 = np.repeat([[0, 0, 1]], len(seq_vector_lab3), axis=0)

1118464


KeyboardInterrupt: 

In [4]:
# Split every class into training (70 %) and validation (20 %) index sets and
# assemble the training / validation arrays. The indices left in test_valN
# after removing val_iN are not used in this cell.
# NOTE: only Python's `random` module is seeded here.
random.seed(123)

size1 = len(seq_vector_lab1)
size2 = len(seq_vector_lab2)
size3 = len(seq_vector_lab3)


def _indices_not_in(size, chosen):
    """Return the indices of range(size) that are absent from `chosen`, ascending."""
    chosen = set(chosen)  # O(1) membership test instead of scanning a list per index
    return [i for i in range(size) if i not in chosen]


# Class 1 is always sampled first, so the sequence of random draws is the same
# whichever branch is taken below.
train_i1 = random.sample(range(size1), int(0.7 * size1))
if size1 != size2 or size1 != size3:
    train_i2 = random.sample(range(size2), int(0.7 * size2))
    train_i3 = random.sample(range(size3), int(0.7 * size3))
else:
    # Equally sized classes reuse the same training indices.
    train_i2 = train_i1
    train_i3 = train_i1

test_val1 = _indices_not_in(size1, train_i1)
val_i1 = random.sample(test_val1, int(0.2 * size1))

test_val2 = _indices_not_in(size2, train_i2)
val_i2 = random.sample(test_val2, int(0.2 * size2))

test_val3 = _indices_not_in(size3, train_i3)
val_i3 = random.sample(test_val3, int(0.2 * size3))

x_train = np.concatenate((seq_vector_lab1[train_i1], seq_vector_lab2[train_i2], seq_vector_lab3[train_i3]))
y_train = np.concatenate((lab_vector_lab1[train_i1], lab_vector_lab2[train_i2], lab_vector_lab3[train_i3]))
x_val = np.concatenate((seq_vector_lab1[val_i1], seq_vector_lab2[val_i2], seq_vector_lab3[val_i3]))
y_val = np.concatenate((lab_vector_lab1[val_i1], lab_vector_lab2[val_i2], lab_vector_lab3[val_i3]))

#     sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Start training\n")
#     sys.stdout.flush()

In [5]:
# Checkpoint location: `weight_file` is a prefix put in front of the file name.
weight_file = ''
filepath = weight_file + "terminator.hdf5"

# Keep only the checkpoint with the lowest validation loss, and stop once the
# validation loss has not improved for 10 epochs.
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=True)
early_stop = EarlyStopping(monitor='val_loss', patience=10)

model = create_model(l, k)

model.fit(
    x=x_train,
    y=y_train,
    batch_size=64,
    epochs=100,
    verbose=1,
    validation_data=(x_val, y_val),
    callbacks=[checkpoint, early_stop],
)
    
#     sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Finished!\n")

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 776, 128)          143163392 
_________________________________________________________________
flatten_1 (Flatten)          (None, 99328)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               50856448  
_________________________________________________________________
dense_2 (Dense)              (None, 64)                32832     
_________________________________________________________________
dense_3 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 99        
Total params: 194,054,851
Trainable params: 194,054,851
Non-trainable params: 0
______________________________________________________________

  num_elements)


Train on 92757 samples, validate on 26502 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100


<keras.callbacks.History at 0x7fe6300fed68>

In [6]:
# val_loss: 0.4679 - val_acc: 0.8073

In [5]:
# Reload a saved model from disk, replacing any `model` already in memory.
# NOTE(review): the training cell writes "terminator.hdf5"; this file name
# presumably is that checkpoint renamed after its val_loss — confirm.
model = load_model('terminator-0.4679.hdf5')

  num_elements)


In [15]:
# Print the layer table of the reloaded model to check its architecture.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 776, 128)          143163392 
_________________________________________________________________
flatten_1 (Flatten)          (None, 99328)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               50856448  
_________________________________________________________________
dense_2 (Dense)              (None, 64)                32832     
_________________________________________________________________
dense_3 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 99        
Total params: 194,054,851
Trainable params: 194,054,851
Non-trainable params: 0
______________________________________________________________

In [6]:
# Sanity check: one row per validation sample, one column per class (3).
y_val.shape

(26502, 3)

In [7]:
# Sanity check: one row per validation sample, one column per encoded k-mer slot.
x_val.shape

(26502, 776)

In [8]:
# Predict class scores for the validation set, then report per-class
# precision / recall / F1 on the arg-max labels (classes named "0", "1", "2").
y_pred = model.predict(x_val)
print(
    classification_report(
        y_true=y_val.argmax(axis=-1),
        y_pred=y_pred.argmax(axis=-1),
        target_names=list(map(str, range(3))),
    )
)

              precision    recall  f1-score   support

           0       0.91      0.96      0.94     10491
           1       0.78      0.31      0.44      5519
           2       0.72      0.92      0.81     10492

    accuracy                           0.81     26502
   macro avg       0.81      0.73      0.73     26502
weighted avg       0.81      0.81      0.78     26502



In [9]:
# Confusion matrix on the validation set: true labels are passed first,
# predicted labels second.
print(confusion_matrix(y_val.argmax(axis=-1), y_pred.argmax(axis=-1)))

[[10061    60   370]
 [  488  1712  3319]
 [  449   420  9623]]


In [10]:
# Integer class labels (index of the largest entry in each row) for the
# validation targets and for the predictions.
y_val_label = y_val.argmax(axis=-1)
y_pred_label = y_pred.argmax(axis=-1)

In [11]:
# Merge classes 1 and 2 into a single label 1 (class 0 stays 0), in place.
np.putmask(y_pred_label, y_pred_label > 0, 1)
np.putmask(y_val_label, y_val_label > 0, 1)

In [12]:
# Binary labels with original class 0 as the positive class (1) and classes
# 1 and 2 merged into the negative class (0).
# They are derived from y_pred / y_val instead of from the labels' previous
# values (`1 - label`), so re-running this cell cannot flip them back.
y_pred_label = (np.argmax(y_pred, axis=-1) == 0).astype(np.intp)
y_val_label = (np.argmax(y_val, axis=-1) == 0).astype(np.intp)

In [13]:
# Two-class report: after the relabelling cells above, 1 stands for original
# class 0 and 0 stands for original classes 1 and 2 combined.
print(classification_report(y_val_label, y_pred_label))

              precision    recall  f1-score   support

           0       0.97      0.94      0.96     16011
           1       0.91      0.96      0.94     10491

    accuracy                           0.95     26502
   macro avg       0.94      0.95      0.95     26502
weighted avg       0.95      0.95      0.95     26502



In [14]:
# Show both shapes. A trailing comma does not continue the line, so with the
# two expressions on separate lines only the second one was displayed.
y_pred_label.shape, y_val_label.shape

(26502,)

In [2]:
def _binary_counts(y_pred, y_true):
    """Return (TN, FP, FN, TP) with label 0 as negative and label 1 as positive."""
    # labels=[0, 1] keeps the matrix 2x2 even when one class is missing from
    # both inputs; without it the matrix shrinks and the indexing fails.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    return tn, fp, fn, tp


def sensitivy(y_pred, y_true):
    """Sensitivity (recall of the positive class): TP / (TP + FN).

    The misspelled name is kept for existing callers; ``sensitivity`` is an
    alias. Note the argument order: predictions first, ground truth second.

    Args:
        y_pred: Predicted binary labels (0 or 1).
        y_true: True binary labels (0 or 1).

    Returns:
        TP / (FN + TP).
    """
    _, _, fn, tp = _binary_counts(y_pred, y_true)
    return tp / (fn + tp)


# Correctly spelled alias.
sensitivity = sensitivy


def acc(y_pred, y_true):
    """Accuracy computed two ways, as a cross-check.

    Args:
        y_pred: Predicted binary labels (0 or 1).
        y_true: True binary labels (0 or 1).

    Returns:
        Tuple ``(sklearn accuracy_score, (TP + TN) / (TP + TN + FN + FP))``.
    """
    tn, fp, fn, tp = _binary_counts(y_pred, y_true)
    return sklearn.metrics.accuracy_score(y_true, y_pred), (tp + tn) / (tp + tn + fn + fp)


def specificity(y_pred, y_true):
    """Specificity (recall of the negative class): TN / (TN + FP).

    Args:
        y_pred: Predicted binary labels (0 or 1).
        y_true: True binary labels (0 or 1).

    Returns:
        TN / (TN + FP).
    """
    tn, fp, _, _ = _binary_counts(y_pred, y_true)
    return tn / (tn + fp)

In [27]:
# Accuracy on the binary labels; returns the scikit-learn value and the value
# recomputed from the confusion matrix.
acc(y_pred_label, y_val_label)

(0.9484189872462455, 0.9484189872462455)

In [23]:
# Sensitivity TP / (TP + FN) on the binary labels (predictions are passed first).
sensitivy(y_pred_label, y_val_label)

0.9590124868935278

In [3]:
# Specificity TN / (TN + FP) on the binary labels.
# Needs y_pred_label and y_val_label from the evaluation cells; running this
# cell in a kernel where they have not been created raises NameError.
specificity(y_pred_label, y_val_label)

NameError: name 'y_pred_label' is not defined