# Model training with the CRISPRoff dataset

In [None]:
# -*- coding: UTF-8 -*-
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Conv2D, BatchNormalization, Activation, MaxPool2D, Dropout, Flatten, Dense
from tensorflow.keras import Model
from tensorflow.keras.models import load_model
from sklearn.metrics import mean_squared_error, mean_absolute_error
import scipy as sp
import math
import matplotlib.pyplot as plt
from function import *
%matplotlib inline

# Print NumPy arrays in full instead of summarising long ones with "...".
np.set_printoptions(threshold=np.inf) 
# Turn off TensorFlow eager execution; this is called here before any model is built.
tf.compat.v1.disable_eager_execution()

In [4]:
# Read the dataset and assemble the network input and regression target.
# `grna_preprocess` and `epi_progress` are not defined in this file; presumably
# they come from the star import of `function` — confirm.
FILE = pd.read_csv("./dataset/testfile.csv")

# Sequence feature: the '40mer' column, encoded by grna_preprocess with length 40.
data = pd.DataFrame(columns=['40mer'])
data['40mer'] = FILE['40mer']
x_data = grna_preprocess(data.iloc[:, 0], 40)

# Regression target: 'efficiency' as a 2-D array with one row per sample.
y_data = np.array(FILE['efficiency'])
y_data = y_data.reshape(len(y_data), -1)

# Pre-normalised per-sample feature columns, each converted by epi_progress.
tss1, tss2, tss3, tss4 = (
    epi_progress(FILE[column])
    for column in ('nor_tss1', 'nor_tss2', 'nor_tss3', 'nor_tss4')
)
Methylation, ATAC, RNA = (
    epi_progress(FILE[column])
    for column in ('nor_methylation', 'nor_atac', 'nor_rna')
)

# Stack the sequence encoding and the seven feature arrays along axis 3.
# NOTE(review): presumably this yields the 11 channels expected by
# Input(shape=(1, 40, 11)) in model_build — confirm against the helpers.
model_input = np.concatenate(
    (x_data, tss1, tss2, tss3, tss4, Methylation, ATAC, RNA),
    axis=3,
)

In [None]:
# Hold out 10% of the samples as the test set; the rest is used for training.
# NOTE(review): no random_state is passed, so the split is not fixed between
# runs — confirm this is intended.
x_train, x_test, y_train, y_test = train_test_split(model_input, y_data, test_size=0.1)

In [None]:
def model_build():
    """Build the multi-kernel CNN regressor (dense head with 0.5 dropout).

    Five parallel ``Conv2D`` branches (30 filters each, kernel widths 1 to 5,
    ReLU, ``same`` padding) all read the same ``(1, 40, 11)`` input. Their
    outputs are concatenated, max-pooled and flattened, then fed through a
    batch-normalised dense head of 80, 60 and 40 ReLU units, each followed by
    batch normalisation and dropout, ending in a single linear unit.

    NOTE: a later cell in this file defines ``model_build`` again; whichever
    definition is executed last is the one that gets called.

    Returns:
        An uncompiled Keras ``Model`` with one output layer named "output".
    """
    inputs = Input(shape=(1, 40, 11))

    # One convolution branch per kernel width, created in ascending order.
    branches = [
        Conv2D(filters=30, kernel_size=(1, width), padding='same', activation='relu')(inputs)
        for width in range(1, 6)
    ]
    merged = tf.keras.layers.Concatenate()(branches)
    pooled = MaxPool2D(strides=2, padding='same')(merged)

    # Fully connected head: Dense -> BatchNorm -> Dropout, three times.
    x = Flatten()(pooled)
    x = BatchNormalization()(x)
    for units in (80, 60, 40):
        x = Dense(units, activation='relu')(x)
        x = BatchNormalization()(x)
        x = Dropout(0.5)(x)

    prediction = Dense(1, activation="linear", name="output")(x)
    return Model(inputs=inputs, outputs=[prediction])

In [None]:
def model_build():
    """Build the multi-kernel CNN regressor (0.4 dropout throughout).

    The ``(1, 40, 11)`` input feeds five parallel ``Conv2D`` branches
    (30 filters each, kernel widths 1 to 5, ReLU, ``same`` padding). The
    branch outputs are concatenated, max-pooled and passed through dropout
    before being flattened. The dense head applies batch normalisation, then
    Dense(80), Dense(60) and Dense(40) ReLU layers, each followed by batch
    normalisation and dropout, and ends in a single linear unit.

    NOTE: this definition replaces the ``model_build`` defined in the earlier
    cell when both cells are executed in order.

    Returns:
        An uncompiled Keras ``Model`` with one output layer named "output".
    """
    drop_rate = 0.4
    hidden_units = (80, 60, 40)

    net_input = Input(shape=(1, 40, 11))

    # Parallel convolutions differing only in kernel width (1 through 5).
    conv_outputs = []
    for kernel_width in (1, 2, 3, 4, 5):
        conv = Conv2D(filters=30, kernel_size=(1, kernel_width), padding='same', activation='relu')
        conv_outputs.append(conv(net_input))

    features = tf.keras.layers.Concatenate()(conv_outputs)
    features = MaxPool2D(strides=2, padding='same')(features)
    features = Dropout(drop_rate)(features)

    # Dense head: every hidden layer is followed by BatchNorm and Dropout.
    hidden = Flatten()(features)
    hidden = BatchNormalization()(hidden)
    for width in hidden_units:
        hidden = Dense(width, activation='relu')(hidden)
        hidden = BatchNormalization()(hidden)
        hidden = Dropout(drop_rate)(hidden)

    out = Dense(1, activation="linear", name="output")(hidden)
    return Model(inputs=net_input, outputs=[out])

In [None]:
# Build the network, print its layer summary and compile it for MSE regression.
model = model_build()
model.summary()
model.compile(
    # Use `learning_rate`: the old `lr` alias is deprecated and is rejected by
    # newer Keras releases.
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    loss='mse',
    metrics=['mse'],
)

# Train for 60 epochs with batches of 256 samples; 20% of the training arrays
# is set aside for validation. `history` keeps the per-epoch loss values.
history = model.fit(x_train, y_train, batch_size=256, epochs=60, validation_split=0.2)

In [None]:
# Evaluate on the held-out test set with the Spearman rank correlation between
# measured and predicted efficiency.
# `import scipy as sp` alone does not guarantee that the `stats` submodule is
# loaded, so import it explicitly before using `sp.stats`.
import scipy.stats

y_test_pred = model.predict(x_test)
# spearmanr returns (correlation, p-value); keep only the correlation.
spermanr = sp.stats.spearmanr(y_test, y_test_pred)[0]
# Print the variable that was actually assigned (the original printed the
# undefined name `spermanrs`, which raised NameError).
print(spermanr)

In [None]:
# Save the trained model to a single file with the .h5 extension.
# NOTE(review): presumably the ./model directory is expected to exist already — confirm.
model.save("./model/CRISPRoff_seq_sper.h5")

In [None]:
# Plot the per-epoch training and validation loss recorded by model.fit.
loss, val_loss = history.history['loss'], history.history['val_loss']

plt.plot(loss, label='loss')
plt.plot(val_loss, label='val_loss')

plt.xlabel('epoch')
plt.ylabel('loss')
plt.title('model loss')
# The explicit label list is what the legend shows, in plotting order:
# 'train' for the first curve, 'valid' for the second.
plt.legend(['train', 'valid'], loc='upper right')