In [2]:
import os
import sys

import matplotlib as mpl
import numpy as np
import tensorflow as tf
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import Dense, Dropout, Embedding, LSTM, Conv1D, MaxPooling1D, Flatten, Bidirectional, Merge, GRU
from keras.models import Sequential, load_model, save_model
from keras.optimizers import Adam
from sklearn import cluster
from sklearn import preprocessing as prep
from sklearn.model_selection import StratifiedKFold

mpl.use('Agg')
import matplotlib.pyplot as plt



In [3]:
def pep1(path, seq_len):
	result = []
	resultLabel = []
	data = open(path).readlines()
	for i in range(len(data)):
		cut = (len(data[i].split()[0]) - 1 - seq_len) // 2
		trainData = data[i].split()[0][cut:-cut]
		trainLabel = int(data[i].split()[-1])
		trainDataArray = []
		for j in range(len(trainData)):
			if trainData[j] == 'A':
				trainDataArray.append([1, 0, 0, 0])
			elif trainData[j] == 'C':
				trainDataArray.append([0, 1, 0, 0])
			elif trainData[j] == 'G':
				trainDataArray.append([0, 0, 1, 0])
			elif trainData[j] == 'U':
				trainDataArray.append([0, 0, 0, 1])
			else:
				trainDataArray.append([0, 0, 0, 0])
		result.append(trainDataArray)
		resultLabel.append(trainLabel)
	result = np.array(result)
	resultLabel = np.array(resultLabel)
	result = result.reshape((result.shape[0], -1, 1))
	return result, resultLabel


In [20]:
result = []
resultLabel = []
data = open("C:/Users/Crow/Desktop/keras_chemical/finalresult_CV.txt").readlines()
seq_len = 2
#data = "GGGCGGGGUAGCUUUGUAAAUGUUUUUCUGAUUAACAAUGUCUUCUCUUUUUUAGCUAUUACAGGGAGGAGUGUUGAGACCAGAUGUCAUCUACUGUCUCUUGGGUCAGCAGCACGCAUGACAGGACCAAGGAAUGGCAGCAACACGCAGAAUCUUAGCUAGCGCUCUCCAGCAGUCUUCUCUAUUAGGGAAUGUCUAAUUGGCAUGAGAUGUUCAAAUCAGACCUGGGAUUGGACAACUGUAAAGUAUGACUGGAUCGUCAGGGCGUCGCUUGGACUCUGUAAUCUAAUGUUUAGGGCAUAUUGAAGUUGAGGUGCUGCCUUCCAGAACUUAAACAUGUAGCUCACUUUCCCCCAUCUUUUUGACAAAUGACCAGUAGUUAAUUUCUAAGGUUUUUCUUAUCAAGAGAAGAAUACUUAAAAACUCUUCCUUGUUUGCAAAAGAAACUUCUGAGGACUGAGUCUUAACUAUGCACACAAUGUCACCAGACAUCUUGAAACU ENSMUST00000000001 37026 1"
for i in range(len(data)):
	cut = (len(data[i].split()[0]) - 1 - seq_len) // 2
	trainData = data[i].split()[0][cut:-cut]
	trainLabel = int(data[i].split()[-1])
	trainDataArray = []
	for j in range(len(trainData)):
		if trainData[j] == 'A':
			trainDataArray.append([1, 0, 0, 0])
		elif trainData[j] == 'C':
			trainDataArray.append([0, 1, 0, 0])
		elif trainData[j] == 'G':
			trainDataArray.append([0, 0, 1, 0])
		elif trainData[j] == 'U':
			trainDataArray.append([0, 0, 0, 1])
		else:
			trainDataArray.append([0, 0, 0, 0])
	result.append(trainDataArray)
	resultLabel.append(trainLabel)
result = np.array(result)
resultLabel = np.array(resultLabel)
result = result.reshape((result.shape[0], -1, 1))
print(result[0:3], resultLabel[0:3])

[[[0]
  [0]
  [1]
  [0]
  [1]
  [0]
  [0]
  [0]
  [0]
  [1]
  [0]
  [0]]

 [[0]
  [0]
  [1]
  [0]
  [1]
  [0]
  [0]
  [0]
  [0]
  [1]
  [0]
  [0]]

 [[1]
  [0]
  [0]
  [0]
  [1]
  [0]
  [0]
  [0]
  [0]
  [1]
  [0]
  [0]]] [1 1 1]


In [4]:
def plot(result, out):
	plt.figure(1)
	plt.subplot(211)
	plt.plot(result.epoch, result.history['acc'], label="acc")
	plt.plot(result.epoch, result.history['val_acc'], label="val_acc")
	# plt.scatter(result.epoch, result.history['val_acc'])
	plt.legend(loc='lower right')
	plt.subplot(212)
	plt.plot(result.epoch, result.history['loss'], label="loss")
	plt.plot(result.epoch, result.history['val_loss'], label="val_loss")
	# plt.scatter(result.epoch, result.history['val_loss'], marker='*')
	plt.legend(loc='upper right')
	plt.savefig(out, dpi=300)
	plt.close()

In [5]:
def DLM(input_dim, neurons=(160, 100)):
	model = Sequential()
	model.add(Dense(neurons[0], input_dim=input_dim, activation='relu'))
	model.add(Dropout(0.5))
	# model.add(Dense(neurons[1], activation='relu'))
	# model.add(Dropout(0.8))
	model.add(Dense(1, activation='sigmoid'))
	optimizer = Adam(lr=1e-4)
	# mean_squared_error, categorical_crossentropy
	model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
	return model


In [6]:
def CNN(neurons, input_shape, dropout=0.2, window=2):
	model = Sequential()
	model.add(Conv1D(neurons, window, activation='relu', padding='same', input_shape=input_shape))
	model.add(MaxPooling1D(2))
	model.add(Dropout(dropout))
	model.add(Conv1D(neurons, window, activation='relu', padding='same'))
	model.add(MaxPooling1D(2))
	model.add(Dropout(dropout))
	model.add(Conv1D(neurons, window, activation='relu', padding='same'))
	model.add(MaxPooling1D(2))
	model.add(Dropout(dropout))
	model.add(Conv1D(neurons, window, activation='relu', padding='same'))
	model.add(MaxPooling1D(2))
	model.add(Dropout(dropout))
	model.add(Flatten())
	model.add(Dense(neurons, activation='relu'))
	model.add(Dropout(dropout))
	model.add(Dense(1, activation='sigmoid'))
	model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
	return model

In [7]:
def RNN(embed_input_dim, embed_output_dim, lstm_dim, input_length, dropout=0.2, weights=None):
	model = Sequential()
	model.add(Embedding(embed_input_dim, embed_output_dim, input_length=input_length, weights=weights, trainable=True))

	# model.add(GRU(lstm_dim, implementation=2, return_sequences=True))
	# model.add(Dropout(dropout))

	model.add(GRU(lstm_dim, implementation=2))
	model.add(Dropout(dropout))

	model.add(Dense(1024, activation='relu'))
	model.add(Dropout(dropout))
	model.add(Dense(1, activation='sigmoid'))
	model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
	return model

In [8]:
def BRNN(lstm_dim, input_shape, dropout=0.2, weights=None):
	model = Sequential()
	# model.add(Embedding(embed_input_dim, embed_output_dim, input_length=input_length, weights=weights, trainable=True))
	# model.add(Bidirectional(GRU(lstm_dim, implementation=2, return_sequences=True)))
	# model.add(Dropout(dropout))
	model.add(Dense(lstm_dim, input_shape=input_shape))

	model.add(Bidirectional(GRU(lstm_dim, implementation=2)))
	model.add(Dropout(dropout))

	model.add(Dense(lstm_dim, activation='relu'))
	model.add(Dropout(dropout))

	model.add(Dense(1, activation='sigmoid'))
	model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
	return model

In [9]:
def evaluate(X, y, model, params, out, indep=None, batch_size=32, epochs=100):
	cvs = np.zeros((len(y), 2))
	folds = StratifiedKFold(5).split(X, y)
	if indep:
		inds = np.zeros((len(indep[1]), 2))
	for i, (trained, valided) in enumerate(folds):
		X_train, y_train = X[trained], y[trained]
		X_valid, y_valid = X[valided], y[valided]
		instance = model(*params)
		if not os.path.exists('%s.%d.h5' % (out, i)):
			best_saving = ModelCheckpoint(filepath='%s.%d.h5' % (out, i), monitor='val_loss',
										  verbose=1, save_best_only=True, save_weights_only=True)
			early_stopping = EarlyStopping(monitor='val_loss', patience=100)
			instance.fit(X_train, y_train, epochs=epochs, validation_data=(X_valid, y_valid), verbose=2,
						 callbacks=[best_saving, early_stopping], batch_size=batch_size)
		instance.load_weights('%s.%d.h5' % (out, i))
		print("Validation test:", instance.evaluate(X_valid, y_valid, batch_size=batch_size))
		cvs[valided, 0], cvs[valided, 1] = instance.predict(X_valid, batch_size=batch_size)[:, 0], y_valid
		if indep:
			print("Independent test:", instance.evaluate(indep[0], indep[1], batch_size=batch_size))
			inds[:, 0] += instance.predict(indep[0], batch_size=batch_size)[:, 0]
			inds[:, 1] += indep[1]
	if indep:
		np.savetxt(out + '.ind.txt', inds / 5, fmt='%f', delimiter='\t')
	np.savetxt(out + '.cv.txt', cvs, fmt='%f', delimiter='\t')


In [10]:
def model_test(indep, path, out):
	scores = np.zeros((len(indep[1]), 2))
	instance = load_model(path)
	scores[:, 0], scores[:, 1] = indep[1], instance.predict_proba(indep[0])[:, 0]
	np.savetxt(out, scores, fmt='%f', delimiter='\t')


In [11]:
def main():
	os.chdir('./')
	# word_vector()
	dst = sys.argv[1] if len(sys.argv) > 1 else 'finalresult'
	cut = 60
	X, y = pep1('%s_CV.txt' % dst, seq_len=cut)
	indep = pep1('%s_IND.txt' % dst, seq_len=cut) if os.path.exists('%s_IND.txt' % dst) else None
	params = [64, X.shape[1:], 0.2]
	evaluate(X, y, BRNN, params, indep=indep, out='brnn_binary_%s_%d' % (dst, cut + 1), epochs=1000,
			 batch_size=2 ** 10)

In [12]:
main()

FileNotFoundError: [Errno 2] No such file or directory: '-f_CV.txt'