# Building the 1D CNN

### Import the relevant packages

In [1]:
%matplotlib inline
import expressyeaself.construct_neural_net as construct
import expressyeaself.encode_sequences as encode
import expressyeaself.organize_data as organize 
import itertools
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
import subprocess
import sys
from tqdm import tqdm

import tensorflow as tf
# tf.get_variable('test_bool', 1, tf.bool)
from tensorflow.python.keras import backend as K
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.models import Sequential, Model
from tensorflow.python.keras.layers import (Add, Concatenate, Input, Dense, 
                                            Dropout, Embedding, Conv1D, 
                                            MaxPooling1D, GlobalAveragePooling1D, 
                                            Flatten)
from tensorflow.python.keras import regularizers
from tensorflow.python.keras.utils import to_categorical
from tensorflow.python.keras.callbacks import ModelCheckpoint, TensorBoard
from sklearn.model_selection import StratifiedKFold

ROOT_DIR = os.getcwd() + '/'
CHECKPOINTS_DIR = ROOT_DIR + 'expressyeaself/models/1dcnn/checkpoints/'

### Define the input data

#### Using the full data set

In [2]:
sample_filename = ('20190612130111781831_percentiles_els_binarized_homogeneous'
                   '_deflanked_sequences_with_exp_levels.txt.gz')

#### Using a smaller sample set

In [3]:
sample_filename = '10000_from_' + sample_filename 

#### Define the absolute path

In [4]:
sample_path = ROOT_DIR + 'example/processed_data/' + sample_filename

### Encode sequences

In [5]:
import time as t
t0 = t.time()
X_padded, y_scaled, abs_max_el = encode.encode_sequences_with_method(sample_path, method='One-Hot', scale_els=True, model_type='1DCNN', binarized_els=True)
num_seqs, max_sequence_len = organize.get_num_and_len_of_seqs_from_file(sample_path)
t1 = t.time()
print(t1-t0)


1.1355481147766113


### Reshape expression levels

In [6]:
y_scaled = y_scaled.reshape((len(y_scaled), 1))
# scaler = MinMaxScaler()
# scaler.fit(y_scaled)
# y_scaled = scaler.transform(y_scaled)

### Perform a train-test split

In [7]:
test_size = 0.20

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_scaled, test_size=test_size)

## Build the model architectures

### Sequential Model

In [9]:
# Define the model parameters
batch_size = len(y_scaled) * 0.01 # no bigger than 1 % of data
filters = 15
kernel_size = 3
strides = 1
epochs = 20
dropout = 0.5

# Define the tensorboard and checkpointer if desired
tb = TensorBoard(log_dir='./logs', 
                 histogram_freq=3, 
                 batch_size=batch_size, 
                 write_graph=True, 
                 write_grads=True, 
                 write_images=True)
checkpointer = ModelCheckpoint(monitor='val_acc', 
                               filepath=(CHECKPOINTS_DIR + '1dcnn_onehot.hdf5'), 
                               verbose=1, 
                               save_best_only=True)


# Define the model
model = Sequential()

# Build up the layers
model.add(Conv1D(filters, kernel_size, activation='relu', 
                 input_shape=(max_sequence_len, 5), 
                 kernel_regularizer=regularizers.l2(0.01)))
model.add(Conv1D(filters, kernel_size, activation='relu'))
model.add(MaxPooling1D(3, strides))
#     keras.layers.Flatten(data_format=None)
# model.add(GlobalAveragePooling1D())
model.add(Conv1D(filters, kernel_size, activation='relu'))
model.add(Conv1D(filters, kernel_size, activation='relu'))
model.add(GlobalAveragePooling1D())

# Add some dense and dropout layers
model.add(Dropout(dropout))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='mse', optimizer='rmsprop', metrics=['accuracy'])

# Print model summary
print(model.summary())
    

W0612 13:29:52.200875 4496975296 callbacks.py:1466] `write_grads` will be ignored in TensorFlow 2.0 for the `TensorBoard` Callback.
W0612 13:29:52.201920 4496975296 callbacks.py:1469] `batch_size` is no longer needed in the `TensorBoard` Callback and will be ignored in TensorFlow 2.0.


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, 78, 15)            240       
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 76, 15)            690       
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 74, 15)            0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 72, 15)            690       
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 70, 15)            690       
_________________________________________________________________
global_average_pooling1d (Gl (None, 15)                0         
_________________________________________________________________
dropout (Dropout)            (None, 15)                0

### Parallel Model

In [10]:
# Define the model parameters
batch_size = len(y_scaled) * 0.01 # no bigger than 1 % of data
filters = 15
# kernel_size
strides = 1
epochs = 10
dropout = 0.1
num_layers = 10

# Define the tensorboard and checkpointer if desired
tb = TensorBoard(log_dir='./logs', 
                 histogram_freq=3, 
                 batch_size=batch_size, 
                 write_graph=True, 
                 write_grads=True, 
                 write_images=True)
checkpointer = ModelCheckpoint(monitor='val_acc', 
                               filepath=(CHECKPOINTS_DIR + '1dcnn_onehot.hdf5'), 
                               verbose=1, 
                               save_best_only=True)

# Define the inputs
inputs = Input(shape=(max_sequence_len, 5))
layers = []

# Build up the layers
for i in range(1, num_layers + 1):
    layer = Conv1D(filters, (2 * i - 1), strides)(inputs)
    layers.append(layer)

# Combine the layers
combined = Concatenate(axis=1)(layers)

# Add some flatten, dense, and dropout layers
out = Flatten()(combined)
# out = Dropout(dropout)(out)
out = Dense(500, activation='sigmoid')(out)
out = Dropout(dropout)(out)
out = Dense(1, activation='sigmoid')(out)
out = Dropout(dropout)(out)

# Define the model with inputs and outputs, and compile.
model = Model(inputs=inputs, outputs=out)
model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])

# Print model summary
print(model.summary())

W0612 13:29:52.417435 4496975296 callbacks.py:1466] `write_grads` will be ignored in TensorFlow 2.0 for the `TensorBoard` Callback.
W0612 13:29:52.418480 4496975296 callbacks.py:1469] `batch_size` is no longer needed in the `TensorBoard` Callback and will be ignored in TensorFlow 2.0.


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 80, 5)]      0                                            
__________________________________________________________________________________________________
conv1d_4 (Conv1D)               (None, 80, 15)       90          input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_5 (Conv1D)               (None, 78, 15)       240         input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_6 (Conv1D)               (None, 76, 15)       390         input_1[0][0]                    
______________________________________________________________________________________________

### Fit and Evaluate the model

In [11]:
# Fit the model
history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,verbose=1,
                    validation_data=(X_test, y_test))#, callbacks=[checkpointer])


# Evaluate the model
score = max(history.history['val_acc'])
print("%s: %.2f%%" % (model.metrics_names[1], score*100))
plt = construct.plot_results(history.history)
plt.show()

Train on 8000 samples, validate on 2000 samples
Epoch 1/10


TypeError: slice indices must be integers or None or have an __index__ method