In [1]:
with open('dataset/raw/polyA_cs.fasta') as f:
    lines = f.readlines()
    for line in lines[:2]:
        print(line)
    print(len(lines))

>chr19:47535882:-:T(-)

TATCACTGGCTTGCTGTCAATAAATATGTGGGTAAATCTCTGTTCAGGGCTCTTAGCTCTGAAAGCTGTGAGACCCCTGATTTCCCACTCCACTCCTCTATATTTCTGTGTGTGTCTTTAATTCCTCTAGCGCCGCTGGGTTAGGGTCTCCCCGACCGAGCTGGTCTCGGCAGCGGAGGTTGCAGTAAGCCGAGACCATG

104914


In [1]:
import sys
import numpy as np
import random
import argparse
from textwrap import dedent
from time import strftime
from itertools import product
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Embedding, Bidirectional, Flatten, Dropout, GRU
from keras import optimizers
from sklearn.model_selection import KFold
from keras.callbacks import EarlyStopping, ModelCheckpoint

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [13]:
# One hot encoding for bases
BASES = {'A': 0,  # 0001
         'C': 1,  # 0010
         'G': 2,  # 0100
         'T': 3,  # 1000
         'a': 0,
         'c': 1,
         'g': 2,
         't': 3}
VERSION = 'v0.0.1'


def one_hot_encoding(seq, k_list, kmer_encoding):
    encoded_list = np.zeros(len(seq) * len(k_list) - sum(k_list) + len(k_list))
    print(len(encoded_list))
    seq = seq.upper()
    for k_i in range(len(k_list)):
        for i in range(len(seq) - k_list[k_i] + 1):
            kmer = seq[i: i + k_list[k_i]]
            encoded_list[i + len(seq) * k_i - sum(k_list[:k_i]) + k_i] = kmer_encoding[kmer]
    print(encoded_list[:3])
    return encoded_list


def fasta_to_vectors(in_fasta, k_list):
    with open(in_fasta) as f:
        header_seq = f.readlines()[:2]

    seq = [header_seq[i * 2 + 1].strip() for i in range(int(len(header_seq)/2))]

    # Generate all unique kmers and their one hot encoding
    unique_kmers = sum(pow(4, k_list))
    print(unique_kmers, pow(4, k_list))
    all_kmers = []
    for k in k_list:
        all_kmers.extend(list(product(['A', 'T', 'C', 'G'], repeat=k)))
    all_kmers = [''.join(x) for x in all_kmers]
#     print(all_kmers[:1000])
    kmer_encoding = dict(zip(all_kmers, range(unique_kmers)))

    seq_vector = [one_hot_encoding(x, k_list, kmer_encoding) for x in seq]
    return seq_vector


def create_model(l, k, weights=''):
    model = Sequential()
    length = l * len(k) - sum(k) + len(k)
    model.add(Embedding(sum(pow(4, k)), 128, input_length=length))
    model.add(Flatten())

    model.add(Dense(512, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(3, activation='softmax'))

    adam = optimizers.Adam(lr=0.001)

    if weights:
        model.load_weights(weights)
        print("Created model and loaded weights from file: ", weights)
    else:
        print(model.summary())
        print("adam: 0.001", )
    model.compile(loss='categorical_crossentropy',
                  optimizer=adam,
                  metrics=['accuracy'])

    return model


# def main():
#     parser = argparse.ArgumentParser(
#         description=dedent('''
#         Terminitor training
#         -----------------------------------------------------------
#         Given three fasta files with different labels: poly(A) CS, 
#         non-poly(A) CS, and non-CS, train the model
#         '''),
#         formatter_class=argparse.RawDescriptionHelpFormatter)

#     parser.add_argument('-v', '--version', action='version', version='Terminitor ' + VERSION)
#     parser.add_argument('-polya', help="Poly(A) CS, fasta file", required=True)
#     parser.add_argument('-cs', help="Non-poly(A) CS, fasta file", required=True)
#     parser.add_argument('-non', help="Non-CS, fasta file", required=True)
#     parser.add_argument('-model', help="File name of trained model", required=True)
#     parser.add_argument('-l', help="Length of input sequences", required=True, type=int)

#     sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Start reading fasta\n")
#     sys.stdout.flush()

#     args = parser.parse_args()

lab1_fasta = 'dataset/raw/polyA_cs.fasta' # args.polya
lab2_fasta = 'dataset/raw/non-polyA_cs.fasta' # args.cs
lab3_fasta = 'dataset/raw/non-cs.fasta'
# weight_file = args.model
l = 200

k = np.array([4, 6, 8, 10])

seq_vector_lab1 = np.array(fasta_to_vectors(lab1_fasta, k))
# seq_vector_lab2 = np.array(fasta_to_vectors(lab2_fasta, k))
# seq_vector_lab3 = np.array(fasta_to_vectors(lab3_fasta, k))
lab_vector_lab1 = np.tile([1, 0, 0], (len(seq_vector_lab1), 1))
# lab_vector_lab2 = np.tile([0, 1, 0], (len(seq_vector_lab2), 1))
# lab_vector_lab3 = np.tile([0, 0, 1], (len(seq_vector_lab3), 1))

1118464 [    256    4096   65536 1048576]
776
[70. 24. 98.]


In [5]:
seq_vector_lab1.shape

(1, 776)

In [4]:
# Build the whole model
random.seed(123)

size1 = len(seq_vector_lab1)
size2 = len(seq_vector_lab2)
size3 = len(seq_vector_lab3)

if size1 != size2 or size1 != size3:
    train_i1 = random.sample(range(size1), int(0.7 * size1))
    train_i2 = random.sample(range(size2), int(0.7 * size2))
    train_i3 = random.sample(range(size3), int(0.7 * size3))

else:
    train_i1 = random.sample(range(size1), int(0.7 * size1))
    train_i2 = train_i1
    train_i3 = train_i1

test_val1 = [i for i in range(size1) if i not in train_i1]
val_i1 = random.sample(test_val1, int(0.2 * size1))

test_val2 = [i for i in range(size2) if i not in train_i2]
val_i2 = random.sample(test_val2, int(0.2 * size2))

test_val3 = [i for i in range(size3) if i not in train_i3]
val_i3 = random.sample(test_val3, int(0.2 * size3))

x_train = np.concatenate((seq_vector_lab1[train_i1], seq_vector_lab2[train_i2], seq_vector_lab3[train_i3]))
y_train = np.concatenate((lab_vector_lab1[train_i1], lab_vector_lab2[train_i2], lab_vector_lab3[train_i3]))
x_val = np.concatenate((seq_vector_lab1[val_i1], seq_vector_lab2[val_i2], seq_vector_lab3[val_i3]))
y_val = np.concatenate((lab_vector_lab1[val_i1], lab_vector_lab2[val_i2], lab_vector_lab3[val_i3]))

#     sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Start training\n")
#     sys.stdout.flush()

In [None]:
weight_file = ''
model = create_model(l, k)
early_stop = EarlyStopping(monitor='val_loss', patience=10)
filepath = weight_file + "terminator.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=True)

model.fit(x_train, y_train, batch_size=64, verbose=1, epochs=100,
          validation_data=(x_val, y_val),
          callbacks=[checkpoint, early_stop])
    
#     sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Finished!\n")

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 776, 128)          143163392 
_________________________________________________________________
flatten_1 (Flatten)          (None, 99328)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               50856448  
_________________________________________________________________
dense_2 (Dense)              (None, 64)                32832     
_________________________________________________________________
dense_3 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 99        
Total params: 194,054,851
Trainable params: 194,054,851
Non-trainable params: 0
______________________________________________________________

  num_elements)


Train on 92757 samples, validate on 26502 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100