# KE5006 Applied Research

### Identifying enhancers and their strength with deep neural networks

# Layer 1 Model with Physiochemical Property Features - Results

## Summary of Findings
* xxx

## Load libraries

In [1]:
# Set the working directory (which contains the directories source, data, etc.).
# The location can be overridden with the ENHANCER_PROJECT_DIR environment variable so the
# notebook is not tied to a single machine; the original path is kept as the default.
import os
project_dir = os.environ.get(
    'ENHANCER_PROJECT_DIR',
    os.path.join(os.path.sep, 'home', 'tkokkeng', 'Documents', 'KE5006-AppliedResearch', 'enhancer'))
os.chdir(project_dir)
os.getcwd()

'/home/tkokkeng/Documents/KE5006-AppliedResearch/enhancer'

In [2]:
# Make sure the directory containing the project source files is on the import path
# (it is appended only once, so re-running the cell does not duplicate the entry).
import sys
source_dir = os.path.join(os.getcwd(), 'source')
if source_dir not in sys.path:
    sys.path.append(source_dir)
sys.path

['/home/tkokkeng/python/python367/tsfvenv/lib/python36.zip',
 '/home/tkokkeng/python/python367/tsfvenv/lib/python3.6',
 '/home/tkokkeng/python/python367/tsfvenv/lib/python3.6/lib-dynload',
 '/usr/lib/python3.6',
 '',
 '/home/tkokkeng/python/python367/tsfvenv/lib/python3.6/site-packages',
 '/home/tkokkeng/.local/lib/python3.6/site-packages',
 '/usr/local/lib/python3.6/dist-packages',
 '/usr/lib/python3/dist-packages',
 '/home/tkokkeng/python/python367/tsfvenv/lib/python3.6/site-packages/IPython/extensions',
 '/home/tkokkeng/.ipython',
 '/home/tkokkeng/Documents/KE5006-AppliedResearch/enhancer/source']

In [3]:
%matplotlib inline

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import datetime
import pickle

import myUtilities as mu

from sklearn.metrics import classification_report, confusion_matrix, matthews_corrcoef, make_scorer, recall_score, roc_auc_score, roc_curve 
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.preprocessing import MinMaxScaler

from keras.preprocessing.text import Tokenizer
from keras.models import Sequential, load_model
from keras import layers
from keras.optimizers import RMSprop
from keras.wrappers.scikit_learn import KerasClassifier
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard

Using TensorFlow backend.


## Load data

In [4]:
enhancer_df = pd.read_csv(os.path.join('data', 'enhancer.csv'))
enhancer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1484 entries, 0 to 1483
Data columns (total 2 columns):
id          1484 non-null object
sequence    1484 non-null object
dtypes: object(2)
memory usage: 23.3+ KB


In [5]:
enhancer_df['enhancer'] = np.ones((len(enhancer_df),))

In [6]:
enhancer_df.head()

Unnamed: 0,id,sequence,enhancer
0,CHRX_48897056_48897256,CACAATGTAGAAGCAGAGACACAGGAACCAGGCTTGGTGATGGCTC...,1.0
1,CHR12_6444339_6444539,GCCCTCACATTCCCTGGCCCATCCCCTCCACCTCAAAATTTACAAA...,1.0
2,CHR12_6444939_6445139,GAGCAGGAGGCCAGTCACCCTGAGTCAGCCACGGGGAGACGCTGCA...,1.0
3,CHR12_6445139_6445339,CCTCTGCTGAGAACAGGACTGGGGCTTCCAGGGCAACAGGAAGGGT...,1.0
4,CHR12_6445339_6445539,ACAGCCTTAAAGGGAGCTTTTCAGGGACCTCTGGCCAGTGGGGGAT...,1.0


In [7]:
non_enhancer_df = pd.read_csv(os.path.join('data', 'non_enhancer.csv'))
non_enhancer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1484 entries, 0 to 1483
Data columns (total 2 columns):
id          1484 non-null object
sequence    1484 non-null object
dtypes: object(2)
memory usage: 23.3+ KB


In [8]:
non_enhancer_df['enhancer'] = np.zeros((len(non_enhancer_df),))

In [9]:
non_enhancer_df.head()

Unnamed: 0,id,sequence,enhancer
0,CHRX_2970600_2970800,CAGTCACATCTGTAATCACAATACGTTGGGAGGCTGAGGCAGGAGG...,0.0
1,CHRX_6179400_6179600,ACTTTGAAGAAGTCAGTCATCAAGATGAGAGACCCAACTGTCAAGC...,0.0
2,CHRX_11003079_11003279,TCGGCCTCCCAAAGTGCTGGGATTATAGGCATGAGCTACTGCACCC...,0.0
3,CHRX_22042679_22042879,TGGGAGCTGTATCAATCATGTTTTTTATTTTCTATATTTTATGATG...,0.0
4,CHRX_23280479_23280679,TACAGCAAATAGCCTTGGCAGATACAGTGTTTCCCTCCAGAGCAAA...,0.0


## Combine the data frames to form a single dataset

In [10]:
# Stack enhancers on top of non-enhancers and renumber the rows 0..n-1 in the same step.
all_data_df = pd.concat([enhancer_df, non_enhancer_df], ignore_index=True)
all_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2968 entries, 0 to 2967
Data columns (total 3 columns):
id          2968 non-null object
sequence    2968 non-null object
enhancer    2968 non-null float64
dtypes: float64(1), object(2)
memory usage: 69.6+ KB


In [11]:
all_data_df.head()

Unnamed: 0,id,sequence,enhancer
0,CHRX_48897056_48897256,CACAATGTAGAAGCAGAGACACAGGAACCAGGCTTGGTGATGGCTC...,1.0
1,CHR12_6444339_6444539,GCCCTCACATTCCCTGGCCCATCCCCTCCACCTCAAAATTTACAAA...,1.0
2,CHR12_6444939_6445139,GAGCAGGAGGCCAGTCACCCTGAGTCAGCCACGGGGAGACGCTGCA...,1.0
3,CHR12_6445139_6445339,CCTCTGCTGAGAACAGGACTGGGGCTTCCAGGGCAACAGGAAGGGT...,1.0
4,CHR12_6445339_6445539,ACAGCCTTAAAGGGAGCTTTTCAGGGACCTCTGGCCAGTGGGGGAT...,1.0


All the sequences are of length 200 characters.

In [12]:
# Tabulate how many sequences there are of each length.
sequence_lengths = all_data_df['sequence'].map(len)
sequence_lengths.value_counts()

200    2968
Name: sequence, dtype: int64

## Load the physiochemical property data

In [13]:
pcp_df = pd.read_csv(os.path.join('data', 'S2.csv'), index_col=0)
pcp_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16 entries, AA to TT
Data columns (total 6 columns):
Rise     16 non-null float64
Roll     16 non-null float64
Shift    16 non-null float64
Slide    16 non-null float64
Tilt     16 non-null float64
Twist    16 non-null float64
dtypes: float64(6)
memory usage: 896.0+ bytes


In [14]:
# Rescale each property column with MinMaxScaler (default settings) and write the scaled
# values back into pcp_df in place, so the dinucleotide index and column labels are kept.
scaler = MinMaxScaler()
pcp_df.loc[:, :] = scaler.fit_transform(pcp_df.values)
pcp_df

Unnamed: 0,Rise,Roll,Shift,Slide,Tilt,Twist
AA,0.430303,0.403042,1.0,0.545455,0.4,0.833333
AC,0.818182,0.695817,0.618557,1.0,0.7,0.833333
AG,0.257576,0.315589,0.762887,0.772727,0.3,0.791667
AT,0.860606,1.0,0.319588,0.863636,0.6,0.75
CA,0.045455,0.220532,0.360825,0.090909,0.1,0.291667
CC,0.548485,0.171103,0.731959,0.545455,0.3,1.0
CG,0.0,0.304183,0.371134,0.0,0.0,0.333333
CT,0.257576,0.315589,0.762887,0.772727,0.3,0.791667
GA,0.706061,0.277567,0.618557,0.5,0.4,0.833333
GC,1.0,0.536122,0.494845,0.5,1.0,0.75


## Prepare the sequence data for modelling

Create a transformation pipeline to prepare the training dataset for the RNN.

In [15]:
# This class selects the desired attributes and drops the rest.
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only the requested columns of a data frame.

    Parameters
    ----------
    attribute_names : column label or list of column labels
        Used as-is to index the incoming data frame in ``transform``.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned; present only to satisfy the transformer interface."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the stored attribute names."""
        wanted = self.attribute_names
        return X[wanted]

In [16]:
# This class converts a nucleotide base (A, C, G, T) to one-hot-encoding.
class one_hot_encoder(BaseEstimator, TransformerMixin):
    """Encode every character of the first data frame column as a 4-wide binary row.

    NOTE(review): the tokenizer is built with ``num_words=4``. Keras documents that only the
    ``num_words - 1`` most frequent tokens are kept, so one of the four bases is presumably
    encoded as an all-zero row and column 0 is never set. Models already trained on this
    encoding depend on it - confirm before changing.
    """

    def __init__(self):
        self.tokenizer = Tokenizer(num_words=4, lower=False, char_level=True)

    def fit(self, X, y=None):
        """Build the character vocabulary and record the sequence length.

        Note that X is a data frame. Only its first sequence is used, both to fit the
        tokenizer and to define ``len_sequence``.
        """
        first_sequence = X.iloc[0, 0]
        self.tokenizer.fit_on_texts(first_sequence)
        self.len_sequence = len(first_sequence)
        return self

    def transform(self, X):
        """Return an array reshaped to (-1, len_sequence, 4) for the sequences in column 0."""
        # Note that X is a data frame.
        sequences = X.iloc[:, 0]
        # One binary matrix per sequence (one row per character).
        per_sequence = [self.tokenizer.texts_to_matrix(seq, mode='binary') for seq in sequences]
        stacked = np.concatenate(per_sequence)
        return np.reshape(stacked, (-1, self.len_sequence, 4))

In [17]:
# This class converts a sequence of nucleotide bases (A, C, G, T) to a sequence of dinucleotides and then to a sequence of physiochemical properties of each dinucleotide.
class pcp_encoder(BaseEstimator, TransformerMixin):
    """Encode each sequence as per-position dinucleotide physiochemical properties.

    Parameters
    ----------
    pcp_df : pandas.DataFrame
        Lookup table indexed by dinucleotide (e.g. 'AA') with one column per property.
    """

    def __init__(self, pcp_df):
        self.pcp_df = pcp_df

    def fit(self, X, y=None):
        """Nothing is learned; present only to satisfy the transformer interface."""
        return self

    def transform(self, X):
        """Look up the property vector of every overlapping dinucleotide.

        Parameters
        ----------
        X : pandas.DataFrame
            Data frame whose first column holds the sequences (all of equal length).

        Returns
        -------
        numpy.ndarray of shape (n_sequences, sequence_length, n_properties); the last
        position of every sequence is filled with -1.

        Raises
        ------
        KeyError
            If a dinucleotide is not present in the lookup table index.
        ValueError
            If the sequences do not all have the same length, or X has no rows.
        """
        # Note that X is a data frame.
        # Use the table handed to the constructor rather than a global that happens to be
        # called pcp_df, so the encoder works with whatever table it was built with.
        table = self.pcp_df
        num_props = len(table.columns)
        # Dinucleotide -> row of property values (in column order), built once per call.
        lookup = dict(zip(table.index, table.values.astype(float)))
        # Pad with -1 for last element of sequence; it does not have an associated di-nucleotide
        pad_row = np.full(num_props, -1.)
        encoded = []
        for seq in X.iloc[:, 0]:
            rows = [lookup[seq[pos:pos + 2]] for pos in range(len(seq) - 1)]
            if len(seq) > 0:
                rows.append(pad_row)
            encoded.append(np.array(rows, dtype=float).reshape((-1, num_props)))
        # Stack the per-sequence 2-D arrays into one 3-D array.
        return np.stack(encoded)

In [18]:
# This class shapes a numpy array.
class Array_Shaper(BaseEstimator, TransformerMixin):
    """Pipeline step that reshapes its input array to a fixed target shape.

    Parameters
    ----------
    shape : tuple
        Passed unchanged to the array's ``reshape`` method.
    """

    def __init__(self, shape):
        self.shape = shape

    def fit(self, X, y=None):
        """Nothing is learned; present only to satisfy the transformer interface."""
        return self

    def transform(self, X):
        """Return ``X`` reshaped to the stored shape."""
        target_shape = self.shape
        return X.reshape(target_shape)

In [19]:
# Column(s) of the data frame handed to both encoders.
attrbs = ['sequence']
num_bases = 4  # number of nucleotide bases
num_pcp = 6  # number of di-nucleotide physiochemical properties
len_seq = len(all_data_df['sequence'][0])

# Branch 1: one-hot encoding, flattened to one row per sequence position.
one_hot_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(attrbs)),
    ('one_hot_encoder', one_hot_encoder()),
    ('array_shaper2D', Array_Shaper((-1, num_bases))),
])

# Branch 2: physiochemical properties, flattened the same way.
pcp_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(attrbs)),
    ('pcp_encoder', pcp_encoder(pcp_df)),
    ('array_shaper2D', Array_Shaper((-1, num_pcp))),
])

# Put the two 2-D feature blocks side by side ...
union_pipeline = FeatureUnion([
    ("one_hot_pipeline", one_hot_pipeline),
    ("pcp_pipeline", pcp_pipeline),
])

# ... and fold the result back into (sequence, position, feature).
my_pipeline = Pipeline(steps=[
    ('feature_combiner', union_pipeline),
    ('array_shaper3D', Array_Shaper((-1, len_seq, num_bases + num_pcp))),
])

In [20]:
X = my_pipeline.fit_transform(all_data_df)
X.shape

(2968, 200, 10)

Check the 1st sequence is correctly encoded.

In [21]:
X[0, :10, :]

array([[0.        , 0.        , 0.        , 0.        , 0.04545455,
        0.22053232, 0.36082474, 0.09090909, 0.1       , 0.29166667],
       [0.        , 0.        , 0.        , 1.        , 0.81818182,
        0.69581749, 0.6185567 , 1.        , 0.7       , 0.83333333],
       [0.        , 0.        , 0.        , 0.        , 0.04545455,
        0.22053232, 0.36082474, 0.09090909, 0.1       , 0.29166667],
       [0.        , 0.        , 0.        , 1.        , 0.43030303,
        0.40304183, 1.        , 0.54545455, 0.4       , 0.83333333],
       [0.        , 0.        , 0.        , 1.        , 0.86060606,
        1.        , 0.31958763, 0.86363636, 0.6       , 0.75      ],
       [0.        , 0.        , 1.        , 0.        , 0.04545455,
        0.22053232, 0.36082474, 0.09090909, 0.1       , 0.29166667],
       [0.        , 1.        , 0.        , 0.        , 0.81818182,
        0.69581749, 0.6185567 , 1.        , 0.7       , 0.83333333],
       [0.        , 0.        , 1.       

In [22]:
X[0, -10:, :]

array([[ 0.        ,  0.        ,  1.        ,  0.        ,  0.43030303,
         0.40304183,  1.        ,  0.54545455,  0.4       ,  0.83333333],
       [ 0.        ,  0.        ,  1.        ,  0.        ,  0.04545455,
         0.22053232,  0.36082474,  0.09090909,  0.1       ,  0.29166667],
       [ 0.        ,  1.        ,  0.        ,  0.        ,  0.81818182,
         0.69581749,  0.6185567 ,  1.        ,  0.7       ,  0.83333333],
       [ 0.        ,  0.        ,  1.        ,  0.        ,  0.70606061,
         0.27756654,  0.6185567 ,  0.5       ,  0.4       ,  0.83333333],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.54848485,
         0.17110266,  0.73195876,  0.54545455,  0.3       ,  1.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.25757576,
         0.31558935,  0.7628866 ,  0.77272727,  0.3       ,  0.79166667],
       [ 0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.13636364

In [23]:
y = all_data_df['enhancer'].values
y.shape

(2968,)

In [24]:
y[:10]

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

## Split the dataset into train / validation sets

For the initial base model, we will use a simple train / validation split. 5-fold cross-validation will be used during model fine-tuning to obtain the final model.

In [25]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=123)

In [26]:
X_train.shape

(1988, 200, 10)

In [27]:
X_train[0][:10]

array([[0.        , 0.        , 0.        , 0.        , 0.25757576,
        0.31558935, 0.7628866 , 0.77272727, 0.3       , 0.79166667],
       [0.        , 0.        , 1.        , 0.        , 0.        ,
        0.        , 0.        , 0.13636364, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 1.        , 0.81818182,
        0.69581749, 0.6185567 , 1.        , 0.7       , 0.83333333],
       [0.        , 0.        , 0.        , 0.        , 0.54848485,
        0.17110266, 0.73195876, 0.54545455, 0.3       , 1.        ],
       [0.        , 0.        , 0.        , 0.        , 0.04545455,
        0.22053232, 0.36082474, 0.09090909, 0.1       , 0.29166667],
       [0.        , 0.        , 0.        , 1.        , 0.43030303,
        0.40304183, 1.        , 0.54545455, 0.4       , 0.83333333],
       [0.        , 0.        , 0.        , 1.        , 0.86060606,
        1.        , 0.31958763, 0.86363636, 0.6       , 0.75      ],
       [0.        , 0.        , 1.       

In [28]:
y_train.shape

(1988,)

In [29]:
y_train[0]

1.0

## Load models

### 1x16 Conv1D, 2x16 GRU Bi-directional, 1x8 Dense with Dropouts (.6/.6/.6), warm restarts cycle = 10 (last model in cycle)

In [204]:
from numpy.random import seed
seed(123)
from tensorflow import set_random_seed
set_random_seed(456)

In [205]:
# Best model
model = load_model(os.path.join('models', 'pcp-1x16cv-2x16gruB-1x8d-dropout060606-wr1.best-epch575.h5'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 192, 16)           1456      
_________________________________________________________________
batch_normalization_1 (Batch (None, 192, 16)           64        
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 96, 16)            0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 96, 32)            3264      
_________________________________________________________________
bidirectional_2 (Bidirection (None, 32)                4800      
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 8)                 264       
__________

In [206]:
results_df = pd.DataFrame()

In [207]:
results_df['best'] = pd.Series(model.predict_classes(X_val, batch_size=128, verbose=1).flatten())



In [208]:
results_df.head()

Unnamed: 0,best
0,0
1,0
2,1
3,1
4,0


In [209]:
# Validation accuracy (%) of the best checkpoint: 100 * (samples - mismatches) / samples.
n_val = len(y_val)
n_wrong = np.abs(results_df['best'].values - y_val).sum()
best_acc = 100. * (n_val - n_wrong) / n_val
best_acc

76.22448979591837

In [210]:
# Weight files of the snapshots to ensemble, identified by the epoch in the file name.
weights_dir = os.path.join('models', 'pcp-1x16cv-2x16gruB-1x8d-dropout060606-wr1-weights')
path = [os.path.join(weights_dir, 'model_wgts_epch{:04d}.h5'.format(epoch))
        for epoch in (279, 379, 479, 679, 779, 879)]
path

['models/pcp-1x16cv-2x16gruB-1x8d-dropout060606-wr1-weights/model_wgts_epch0279.h5',
 'models/pcp-1x16cv-2x16gruB-1x8d-dropout060606-wr1-weights/model_wgts_epch0379.h5',
 'models/pcp-1x16cv-2x16gruB-1x8d-dropout060606-wr1-weights/model_wgts_epch0479.h5',
 'models/pcp-1x16cv-2x16gruB-1x8d-dropout060606-wr1-weights/model_wgts_epch0679.h5',
 'models/pcp-1x16cv-2x16gruB-1x8d-dropout060606-wr1-weights/model_wgts_epch0779.h5',
 'models/pcp-1x16cv-2x16gruB-1x8d-dropout060606-wr1-weights/model_wgts_epch0879.h5']

In [211]:
# Load each snapshot's weights into the same model and store its class predictions.
for idx, a_file in enumerate(path):
    model.load_weights(filepath=a_file, by_name=False)
    # Flatten to 1-D, as done for the 'best' column, so every column is filled the same way.
    results_df['model' + str(idx)] = model.predict_classes(X_val, batch_size=128, verbose=1).flatten()



In [212]:
results_df.head()

Unnamed: 0,best,model0,model1,model2,model3,model4,model5
0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0
2,1,1,1,1,1,1,1
3,1,1,1,1,1,1,1
4,0,0,0,0,0,0,0


In [213]:
# Number of validation predictions on which each snapshot disagrees with the best checkpoint.
# The snapshot columns are read from results_df so the loop follows however many were loaded.
model_cols = [c for c in results_df.columns if c.startswith('model')]
for col in model_cols:
    print((results_df['best'] - results_df[col]).abs().sum())

46
35
27
24
39
59


In [219]:
# Majority-vote cut-off: half (rounded down) of the number of voting columns.
voter_cols = [c for c in results_df.columns if c != 'ensemble']
threshold = len(voter_cols) // 2
print('threshold = {}'.format(threshold))

threshold = 3


In [220]:
# Majority vote: predict 1 when more than `threshold` of the voting columns predict 1.
voters = results_df.loc[:, results_df.columns != 'ensemble']
results_df['ensemble'] = (voters.sum(axis=1) > threshold).astype('int64')
results_df.head()

Unnamed: 0,best,model0,model1,model2,model3,model4,model5,ensemble
0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0
2,1,1,1,1,1,1,1,1
3,1,1,1,1,1,1,1,1
4,0,0,0,0,0,0,0,0


In [221]:
# Validation accuracy (%) of the ensemble vote.
n_val = len(y_val)
n_wrong = np.abs(results_df['ensemble'].values - y_val).sum()
acc = 100. * (n_val - n_wrong) / n_val
acc

75.61224489795919

### 2x16 GRU, 1x16 Dense with Dropouts (.1/.1/.1), warm restarts cycle = 10 (last model in cycle)

In [30]:
from numpy.random import seed
seed(123)
from tensorflow import set_random_seed
set_random_seed(456)

In [31]:
# Best model
model = load_model(os.path.join('models', 'pcp-2x16gru1x16dense-dropout010101-wr1.best-epch940.h5'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_1 (GRU)                  (None, None, 16)          1344      
_________________________________________________________________
gru_2 (GRU)                  (None, 16)                1632      
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                272       
_________________________________________________________________
dropout_2 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 17        
Total params: 3,265
Trainable params: 3,265
Non-trainable params: 0
_________________________________________________________________


In [32]:
results_df = pd.DataFrame()

In [33]:
results_df['best'] = pd.Series(model.predict_classes(X_val, batch_size=128, verbose=1).flatten())



In [34]:
results_df.head()

Unnamed: 0,best
0,0
1,0
2,1
3,0
4,0


In [35]:
# Validation accuracy (%) of the best checkpoint: 100 * (samples - mismatches) / samples.
n_val = len(y_val)
n_wrong = np.abs(results_df['best'].values - y_val).sum()
best_acc = 100. * (n_val - n_wrong) / n_val
best_acc

76.73469387755102

In [44]:
# Weight files of the snapshots to ensemble, identified by the epoch in the file name.
weights_dir = os.path.join('models', 'pcp-2x16gru1x16dense-dropout010101-wr1-weights')
path = [os.path.join(weights_dir, 'model_wgts_epch{:04d}.h5'.format(epoch))
        for epoch in (639, 739, 839, 1039, 1139, 1239)]
path

['models/pcp-2x16gru1x16dense-dropout010101-wr1-weights/model_wgts_epch0639.h5',
 'models/pcp-2x16gru1x16dense-dropout010101-wr1-weights/model_wgts_epch0739.h5',
 'models/pcp-2x16gru1x16dense-dropout010101-wr1-weights/model_wgts_epch0839.h5',
 'models/pcp-2x16gru1x16dense-dropout010101-wr1-weights/model_wgts_epch1039.h5',
 'models/pcp-2x16gru1x16dense-dropout010101-wr1-weights/model_wgts_epch1139.h5',
 'models/pcp-2x16gru1x16dense-dropout010101-wr1-weights/model_wgts_epch1239.h5']

In [45]:
# Load each snapshot's weights into the same model and store its class predictions.
for idx, a_file in enumerate(path):
    model.load_weights(filepath=a_file, by_name=False)
    # Flatten to 1-D, as done for the 'best' column, so every column is filled the same way.
    results_df['model' + str(idx)] = model.predict_classes(X_val, batch_size=128, verbose=1).flatten()



In [46]:
results_df.head()

Unnamed: 0,best,model0,model1,model2,model3,model4,model5,ensemble
0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0
2,1,1,1,1,1,1,1,1
3,0,1,1,1,1,1,0,1
4,0,0,0,0,0,0,0,0


In [47]:
# Number of validation predictions on which each snapshot disagrees with the best checkpoint.
# The snapshot columns are read from results_df so the loop follows however many were loaded.
model_cols = [c for c in results_df.columns if c.startswith('model')]
for col in model_cols:
    print((results_df['best'] - results_df[col]).abs().sum())

92
68
60
64
69
73


In [48]:
# Majority-vote cut-off: half (rounded down) of the number of voting columns.
voter_cols = [c for c in results_df.columns if c != 'ensemble']
threshold = len(voter_cols) // 2
print('threshold = {}'.format(threshold))

threshold = 3


In [49]:
# Majority vote: predict 1 when more than `threshold` of the voting columns predict 1.
voters = results_df.loc[:, results_df.columns != 'ensemble']
results_df['ensemble'] = (voters.sum(axis=1) > threshold).astype('int64')
results_df.head()

Unnamed: 0,best,model0,model1,model2,model3,model4,model5,ensemble
0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0
2,1,1,1,1,1,1,1,1
3,0,1,1,1,1,1,0,1
4,0,0,0,0,0,0,0,0


In [50]:
# Validation accuracy (%) of the ensemble vote.
n_val = len(y_val)
n_wrong = np.abs(results_df['ensemble'].values - y_val).sum()
acc = 100. * (n_val - n_wrong) / n_val
acc

75.61224489795919

### 2x16 GRU, 1x16 Dense with Dropouts (.1/.1/.1), warm restarts cycle = 200, lr=0.003, best in cycle

In [106]:
from numpy.random import seed
seed(123)
from tensorflow import set_random_seed
set_random_seed(456)

In [107]:
# Best model
model = load_model(os.path.join('models', 'pcp-2x16gru1x16dense-dropout010101-wr41.best-epch1283.h5'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_1 (GRU)                  (None, None, 16)          1344      
_________________________________________________________________
gru_2 (GRU)                  (None, 16)                1632      
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                272       
_________________________________________________________________
dropout_2 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 17        
Total params: 3,265
Trainable params: 3,265
Non-trainable params: 0
_________________________________________________________________


In [108]:
results_df = pd.DataFrame()

In [109]:
results_df['best'] = pd.Series(model.predict_classes(X_val, batch_size=128, verbose=1).flatten())



In [110]:
# Validation accuracy (%) of the best checkpoint: 100 * (samples - mismatches) / samples.
n_val = len(y_val)
n_wrong = np.abs(results_df['best'].values - y_val).sum()
best_acc = 100. * (n_val - n_wrong) / n_val
best_acc

76.73469387755102

In [111]:
# Weight files saved every 200 epochs, from cyc0400 up to and including cyc3000.
weights_dir = os.path.join('models', 'pcp-2x16gru1x16dense-dropout010101-wr41-weights')
path = [os.path.join(weights_dir, 'model_wgts_cyc{:04d}.h5'.format(epoch))
        for epoch in range(400, 3001, 200)]
path

['models/pcp-2x16gru1x16dense-dropout010101-wr41-weights/model_wgts_cyc0400.h5',
 'models/pcp-2x16gru1x16dense-dropout010101-wr41-weights/model_wgts_cyc0600.h5',
 'models/pcp-2x16gru1x16dense-dropout010101-wr41-weights/model_wgts_cyc0800.h5',
 'models/pcp-2x16gru1x16dense-dropout010101-wr41-weights/model_wgts_cyc1000.h5',
 'models/pcp-2x16gru1x16dense-dropout010101-wr41-weights/model_wgts_cyc1200.h5',
 'models/pcp-2x16gru1x16dense-dropout010101-wr41-weights/model_wgts_cyc1400.h5',
 'models/pcp-2x16gru1x16dense-dropout010101-wr41-weights/model_wgts_cyc1600.h5',
 'models/pcp-2x16gru1x16dense-dropout010101-wr41-weights/model_wgts_cyc1800.h5',
 'models/pcp-2x16gru1x16dense-dropout010101-wr41-weights/model_wgts_cyc2000.h5',
 'models/pcp-2x16gru1x16dense-dropout010101-wr41-weights/model_wgts_cyc2200.h5',
 'models/pcp-2x16gru1x16dense-dropout010101-wr41-weights/model_wgts_cyc2400.h5',
 'models/pcp-2x16gru1x16dense-dropout010101-wr41-weights/model_wgts_cyc2600.h5',
 'models/pcp-2x16gru1x16dens

In [112]:
# Load each snapshot's weights into the same model and store its class predictions.
for idx, a_file in enumerate(path):
    model.load_weights(filepath=a_file, by_name=False)
    # Flatten to 1-D, as done for the 'best' column, so every column is filled the same way.
    results_df['model' + str(idx)] = model.predict_classes(X_val, batch_size=128, verbose=1).flatten()



In [113]:
# Number of validation predictions on which each snapshot disagrees with the best checkpoint.
# The snapshot columns are read from results_df so the loop follows however many were loaded.
model_cols = [c for c in results_df.columns if c.startswith('model')]
for col in model_cols:
    print((results_df['best'] - results_df[col]).abs().sum())

116
99
86
64
58
0
31
44
66
61
65
74
83
82


In [134]:
# Validation accuracy (%) of every individual snapshot.
# The snapshot columns are read from results_df so the loop follows however many were loaded.
n_val = len(y_val)
model_cols = [c for c in results_df.columns if c.startswith('model')]
for col in model_cols:
    n_wrong = np.abs(results_df[col].values - y_val).sum()
    acc = 100. * (n_val - n_wrong) / n_val
    print(col + ' = ', acc)

model0 =  75.51020408163265
model1 =  76.22448979591837
model2 =  76.53061224489795
model3 =  76.53061224489795
model4 =  76.53061224489795
model5 =  76.73469387755102
model6 =  76.42857142857143
model7 =  75.91836734693878
model8 =  75.91836734693878
model9 =  75.40816326530613
model10 =  75.61224489795919
model11 =  75.3061224489796
model12 =  75.40816326530613
model13 =  75.71428571428571


In [114]:
# Voting columns: every snapshot except the best checkpoint and model0.
# 'ensemble' is excluded as well so that re-running the cell after the vote has been stored
# does not count the previous vote as an extra voter.
excluded = {'best', 'model0', 'ensemble'}
cols = [c for c in results_df.columns if c not in excluded]
# Majority-vote cut-off: half (rounded down) of the number of voting columns.
threshold = len(cols) // 2
print('threshold = {}'.format(threshold))

threshold = 6


In [115]:
# 13-snapshot ensemble (model1 .. model13).
# 'ensemble' is excluded as well so that re-running the cell after the vote has been stored
# does not count the previous vote as an extra voter.
excluded = {'best', 'model0', 'ensemble'}
cols = [c for c in results_df.columns if c not in excluded]
threshold = len(cols) // 2
print('threshold = {}'.format(threshold))
# Majority vote: predict 1 when more than `threshold` of the voting columns predict 1.
results_df['ensemble'] = (results_df.loc[:, cols].sum(axis=1) > threshold).astype('int64')
results_df.head()

threshold = 6


Unnamed: 0,best,model0,model1,model2,model3,model4,model5,model6,model7,model8,model9,model10,model11,model12,model13,ensemble
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1
3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [116]:
# Validation accuracy (%) of the ensemble vote.
n_val = len(y_val)
n_wrong = np.abs(results_df['ensemble'].values - y_val).sum()
acc = 100. * (n_val - n_wrong) / n_val
acc

76.53061224489795

In [117]:
# 11-snapshot ensemble (model2 .. model12).
# Columns are filtered by name instead of list.remove(), so the cell also runs when the
# 'ensemble' column has not been created yet.
excluded = {'best', 'model0', 'model1', 'model13', 'ensemble'}
cols = [c for c in results_df.columns if c not in excluded]
threshold = len(cols) // 2
print('threshold = {}'.format(threshold))
# Majority vote: predict 1 when more than `threshold` of the voting columns predict 1.
results_df['ensemble'] = (results_df.loc[:, cols].sum(axis=1) > threshold).astype('int64')
results_df.head()

threshold = 5


Unnamed: 0,best,model0,model1,model2,model3,model4,model5,model6,model7,model8,model9,model10,model11,model12,model13,ensemble
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1
3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [118]:
# Validation accuracy (%) of the ensemble vote.
n_val = len(y_val)
n_wrong = np.abs(results_df['ensemble'].values - y_val).sum()
acc = 100. * (n_val - n_wrong) / n_val
acc

76.63265306122449

In [119]:
# 9-snapshot ensemble (model2 .. model10).
# Columns are filtered by name instead of list.remove(), so the cell also runs when the
# 'ensemble' column has not been created yet.
excluded = {'best', 'model0', 'model1', 'model11', 'model12', 'model13', 'ensemble'}
cols = [c for c in results_df.columns if c not in excluded]
threshold = len(cols) // 2
print('threshold = {}'.format(threshold))
# Majority vote: predict 1 when more than `threshold` of the voting columns predict 1.
results_df['ensemble'] = (results_df.loc[:, cols].sum(axis=1) > threshold).astype('int64')
results_df.head()

threshold = 4


Unnamed: 0,best,model0,model1,model2,model3,model4,model5,model6,model7,model8,model9,model10,model11,model12,model13,ensemble
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1
3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [120]:
# Validation accuracy (%) of the ensemble vote.
n_val = len(y_val)
n_wrong = np.abs(results_df['ensemble'].values - y_val).sum()
acc = 100. * (n_val - n_wrong) / n_val
acc

77.34693877551021

In [121]:
# 7-snapshot ensemble (model2 .. model8).
# Columns are filtered by name instead of list.remove(), so the cell also runs when the
# 'ensemble' column has not been created yet.
excluded = {'best', 'model0', 'model1', 'model9', 'model10', 'model11', 'model12', 'model13', 'ensemble'}
cols = [c for c in results_df.columns if c not in excluded]
threshold = len(cols) // 2
print('threshold = {}'.format(threshold))
# Majority vote: predict 1 when more than `threshold` of the voting columns predict 1.
results_df['ensemble'] = (results_df.loc[:, cols].sum(axis=1) > threshold).astype('int64')
results_df.head()

threshold = 3


Unnamed: 0,best,model0,model1,model2,model3,model4,model5,model6,model7,model8,model9,model10,model11,model12,model13,ensemble
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1
3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [122]:
# Validation accuracy (%) of the ensemble vote.
n_val = len(y_val)
n_wrong = np.abs(results_df['ensemble'].values - y_val).sum()
acc = 100. * (n_val - n_wrong) / n_val
acc

77.24489795918367

In [125]:
# 5-snapshot ensemble (model3 .. model7).
# Columns are filtered by name instead of list.remove(), so the cell also runs when the
# 'ensemble' column has not been created yet.
excluded = {'best', 'model0', 'model1', 'model2', 'model8', 'model9', 'model10', 'model11', 'model12',
            'model13', 'ensemble'}
cols = [c for c in results_df.columns if c not in excluded]
threshold = len(cols) // 2
print('threshold = {}'.format(threshold))
# Majority vote: predict 1 when more than `threshold` of the voting columns predict 1.
results_df['ensemble'] = (results_df.loc[:, cols].sum(axis=1) > threshold).astype('int64')
results_df.head()

threshold = 2


Unnamed: 0,best,model0,model1,model2,model3,model4,model5,model6,model7,model8,model9,model10,model11,model12,model13,ensemble
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1
3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [126]:
# Validation accuracy (%) of the ensemble vote.
n_val = len(y_val)
n_wrong = np.abs(results_df['ensemble'].values - y_val).sum()
acc = 100. * (n_val - n_wrong) / n_val
acc

77.44897959183673

In [127]:
# 3-snapshot ensemble (model4 .. model6).
# Columns are filtered by name instead of list.remove(), so the cell also runs when the
# 'ensemble' column has not been created yet.
excluded = {'best', 'model0', 'model1', 'model2', 'model3', 'model7', 'model8', 'model9', 'model10',
            'model11', 'model12', 'model13', 'ensemble'}
cols = [c for c in results_df.columns if c not in excluded]
threshold = len(cols) // 2
print('threshold = {}'.format(threshold))
# Majority vote: predict 1 when more than `threshold` of the voting columns predict 1.
results_df['ensemble'] = (results_df.loc[:, cols].sum(axis=1) > threshold).astype('int64')
results_df.head()

threshold = 1


Unnamed: 0,best,model0,model1,model2,model3,model4,model5,model6,model7,model8,model9,model10,model11,model12,model13,ensemble
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1
3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [128]:
# Validation accuracy (%) of the ensemble vote.
n_val = len(y_val)
n_wrong = np.abs(results_df['ensemble'].values - y_val).sum()
acc = 100. * (n_val - n_wrong) / n_val
acc

76.83673469387755

In [133]:
# Ensemble of three snapshots taken far apart (model5, model9, model13).
cols = ['model5', 'model9', 'model13']
threshold = len(cols) // 2
print('threshold = {}'.format(threshold))
# Majority vote: predict 1 when more than `threshold` of the voting columns predict 1.
results_df['ensemble'] = (results_df.loc[:, cols].sum(axis=1) > threshold).astype('int64')
# Validation accuracy (%) of the vote.
n_val = len(y_val)
n_wrong = np.abs(results_df['ensemble'].values - y_val).sum()
acc = 100. * (n_val - n_wrong) / n_val
acc

threshold = 1


76.12244897959184

In [135]:
# Ensemble of the five snapshots model2 .. model6.
cols = ['model2', 'model3', 'model4', 'model5', 'model6']
threshold = len(cols) // 2
print('threshold = {}'.format(threshold))
# Majority vote: predict 1 when more than `threshold` of the voting columns predict 1.
results_df['ensemble'] = (results_df.loc[:, cols].sum(axis=1) > threshold).astype('int64')
# Validation accuracy (%) of the vote.
n_val = len(y_val)
n_wrong = np.abs(results_df['ensemble'].values - y_val).sum()
acc = 100. * (n_val - n_wrong) / n_val
acc

threshold = 2


76.83673469387755

* best only (5) = 76.73469387755102
* 13 models (1-13) = 76.53061224489795
* 11 models (2-12) = 76.63265306122449
* 9 models (2-10) = 77.34693877551021
* 7 models (2-8) = 77.24489795918367
* **5 models (3-7) = 77.44897959183673**
* 3 models (4-6) = 76.83673469387755
* 3 models (5, 9, 13) = 76.12244897959184
* 5 best models (2-6) = 76.83673469387755

It is probably better to take a total of 5-9 models around the best model.

### 2x16 GRU, 1x16 Dense with Dropouts (.1/.1/.1), warm restarts cycle = 200, lr=0.001, best in cycle

In [68]:
# Fix the NumPy (123) and TensorFlow (456) global random seeds before the
# models of this section are loaded and evaluated.
from numpy.random import seed
seed(123)
from tensorflow import set_random_seed
set_random_seed(456)

In [69]:
# Load the best checkpoint of the warm-restart run (wr45) and print its layers.
best_model_file = os.path.join('models', 'pcp-2x16gru1x16dense-dropout010101-wr45.best-epch1479.h5')
model = load_model(best_model_file)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_1 (GRU)                  (None, None, 16)          1344      
_________________________________________________________________
gru_2 (GRU)                  (None, 16)                1632      
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                272       
_________________________________________________________________
dropout_2 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 17        
Total params: 3,265
Trainable params: 3,265
Non-trainable params: 0
_________________________________________________________________


In [70]:
# Start from an empty frame; the following cells add one prediction column per model.
results_df = pd.DataFrame()

In [71]:
# Validation-set class predictions of the best model, flattened to 1-D.
best_pred = model.predict_classes(X_val, batch_size=128, verbose=1)
results_df['best'] = pd.Series(best_pred.flatten())



In [72]:
# Validation accuracy (%) of the best single model.
# NOTE(review): assumes y_val holds 0/1 labels — confirm.
n_val = len(y_val)
n_wrong = np.abs(results_df['best'].values - y_val).sum()
best_acc = 100. * (n_val - n_wrong) / n_val
best_acc

77.14285714285714

In [73]:
# Weight files saved every 200 epochs (0400 ... 3000) during the wr45 run.
weights_dir = os.path.join('models', 'pcp-2x16gru1x16dense-dropout010101-wr45-weights')
weight_files = [
    'model_wgts_cyc0400.h5', 'model_wgts_cyc0600.h5', 'model_wgts_cyc0800.h5',
    'model_wgts_cyc1000.h5', 'model_wgts_cyc1200.h5', 'model_wgts_cyc1400.h5',
    'model_wgts_cyc1600.h5', 'model_wgts_cyc1800.h5', 'model_wgts_cyc2000.h5',
    'model_wgts_cyc2200.h5', 'model_wgts_cyc2400.h5', 'model_wgts_cyc2600.h5',
    'model_wgts_cyc2800.h5', 'model_wgts_cyc3000.h5',
]
path = [os.path.join(weights_dir, fname) for fname in weight_files]
path

['models/pcp-2x16gru1x16dense-dropout010101-wr45-weights/model_wgts_cyc0400.h5',
 'models/pcp-2x16gru1x16dense-dropout010101-wr45-weights/model_wgts_cyc0600.h5',
 'models/pcp-2x16gru1x16dense-dropout010101-wr45-weights/model_wgts_cyc0800.h5',
 'models/pcp-2x16gru1x16dense-dropout010101-wr45-weights/model_wgts_cyc1000.h5',
 'models/pcp-2x16gru1x16dense-dropout010101-wr45-weights/model_wgts_cyc1200.h5',
 'models/pcp-2x16gru1x16dense-dropout010101-wr45-weights/model_wgts_cyc1400.h5',
 'models/pcp-2x16gru1x16dense-dropout010101-wr45-weights/model_wgts_cyc1600.h5',
 'models/pcp-2x16gru1x16dense-dropout010101-wr45-weights/model_wgts_cyc1800.h5',
 'models/pcp-2x16gru1x16dense-dropout010101-wr45-weights/model_wgts_cyc2000.h5',
 'models/pcp-2x16gru1x16dense-dropout010101-wr45-weights/model_wgts_cyc2200.h5',
 'models/pcp-2x16gru1x16dense-dropout010101-wr45-weights/model_wgts_cyc2400.h5',
 'models/pcp-2x16gru1x16dense-dropout010101-wr45-weights/model_wgts_cyc2600.h5',
 'models/pcp-2x16gru1x16dens

In [74]:
# Load each saved set of weights into the same architecture and keep its
# validation-set class predictions as one column per model. The predictions
# are flattened to 1-D, as is done for the 'best' column, instead of relying on
# pandas to accept whatever shape predict_classes returns.
for idx, a_file in enumerate(path):
    model.load_weights(filepath=a_file, by_name=False)
    snapshot_pred = model.predict_classes(X_val, batch_size=128, verbose=1)
    results_df['model' + str(idx)] = snapshot_pred.flatten()



In [75]:
# For every model column, count the validation samples on which it disagrees
# with the best model. The columns are discovered from results_df instead of a
# hard-coded count (14), which had to be kept in sync with `path` by hand.
model_cols = [c for c in results_df.columns if c.startswith('model')]
for col in model_cols:
    print((results_df['best'] - results_df[col]).abs().sum())

89
93
95
92
40
19
0
19
48
56
42
50
63
67


In [76]:
# Validation accuracy (%) of every individual model column. The columns are
# discovered from results_df instead of a hard-coded count (14).
# NOTE(review): assumes y_val holds 0/1 labels — confirm.
model_cols = [c for c in results_df.columns if c.startswith('model')]
for col in model_cols:
    acc = 100. * (len(y_val) - (np.abs(results_df[col].values - y_val)).sum()) / len(y_val)
    print(col + ' = ', acc)

model0 =  75.20408163265306
model1 =  75.40816326530613
model2 =  75.20408163265306
model3 =  75.3061224489796
model4 =  76.12244897959184
model5 =  76.83673469387755
model6 =  77.14285714285714
model7 =  76.83673469387755
model8 =  76.93877551020408
model9 =  76.73469387755102
model10 =  76.3265306122449
model11 =  76.12244897959184
model12 =  75.81632653061224
model13 =  75.81632653061224


In [77]:
# Preview the majority-vote threshold when every column except 'best' and
# 'model0' takes part in the vote.
cols = results_df.columns.tolist()
for unused in ('best', 'model0'):
    cols.remove(unused)
threshold = len(cols) // 2
print('threshold = {}'.format(threshold))

threshold = 6


In [78]:
# Majority vote over every model column except model0.
# The voters are selected by name rather than "all columns minus 'best' and
# 'model0'", so re-running this cell does not pull an already existing
# 'ensemble' column into the vote (which would also shift the threshold).
cols = [c for c in results_df.columns if c.startswith('model') and c != 'model0']
threshold = len(cols) // 2
print('threshold = {}'.format(threshold))
# A sample is predicted positive when more than half of the voters say 1.
results_df['ensemble'] = (results_df[cols].sum(axis=1) > threshold).astype('int64')
results_df.head()

threshold = 6


Unnamed: 0,best,model0,model1,model2,model3,model4,model5,model6,model7,model8,model9,model10,model11,model12,model13,ensemble
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0
3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [79]:
# Validation accuracy (%) of the current 'ensemble' column.
n_val = len(y_val)
n_wrong = np.abs(results_df['ensemble'].values - y_val).sum()
acc = 100. * (n_val - n_wrong) / n_val
acc

76.42857142857143

In [80]:
# Majority vote over model2..model12: drop the columns that do not vote,
# including the previous 'ensemble' result.
cols = results_df.columns.tolist()
for unused in ('best', 'model0', 'model1', 'model13', 'ensemble'):
    cols.remove(unused)
threshold = len(cols) // 2
print('threshold = {}'.format(threshold))
n_positive = results_df[cols].sum(axis=1)
results_df['ensemble'] = n_positive.map(lambda n: 1 if n > threshold else 0)
results_df.head()

threshold = 5


Unnamed: 0,best,model0,model1,model2,model3,model4,model5,model6,model7,model8,model9,model10,model11,model12,model13,ensemble
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0
3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [81]:
# Validation accuracy (%) of the current 'ensemble' column.
n_val = len(y_val)
n_wrong = np.abs(results_df['ensemble'].values - y_val).sum()
acc = 100. * (n_val - n_wrong) / n_val
acc

76.53061224489795

In [82]:
# Majority vote over model2..model10: drop the columns that do not vote,
# including the previous 'ensemble' result.
cols = results_df.columns.tolist()
for unused in ('best', 'model0', 'model1', 'model11', 'model12', 'model13', 'ensemble'):
    cols.remove(unused)
threshold = len(cols) // 2
print('threshold = {}'.format(threshold))
n_positive = results_df[cols].sum(axis=1)
results_df['ensemble'] = n_positive.gt(threshold).astype('int64')
results_df.head()

threshold = 4


Unnamed: 0,best,model0,model1,model2,model3,model4,model5,model6,model7,model8,model9,model10,model11,model12,model13,ensemble
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,1
3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [83]:
# Validation accuracy (%) of the current 'ensemble' column.
n_val = len(y_val)
n_wrong = np.abs(results_df['ensemble'].values - y_val).sum()
acc = 100. * (n_val - n_wrong) / n_val
acc

76.83673469387755

In [84]:
# Majority vote over model3..model9: drop the columns that do not vote,
# including the previous 'ensemble' result.
cols = results_df.columns.tolist()
for unused in ('best', 'model0', 'model1', 'model2', 'model10', 'model11', 'model12', 'model13', 'ensemble'):
    cols.remove(unused)
threshold = len(cols) // 2
print('threshold = {}'.format(threshold))
n_positive = results_df[cols].sum(axis=1)
results_df['ensemble'] = n_positive.map(lambda n: 1 if n > threshold else 0)
results_df.head()

threshold = 3


Unnamed: 0,best,model0,model1,model2,model3,model4,model5,model6,model7,model8,model9,model10,model11,model12,model13,ensemble
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,1
3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [85]:
# Validation accuracy (%) of the current 'ensemble' column.
n_val = len(y_val)
n_wrong = np.abs(results_df['ensemble'].values - y_val).sum()
acc = 100. * (n_val - n_wrong) / n_val
acc

76.73469387755102

In [86]:
# Majority vote over model4..model8: drop the columns that do not vote,
# including the previous 'ensemble' result.
cols = results_df.columns.tolist()
for unused in ('best', 'model0', 'model1', 'model2', 'model3', 'model9', 'model10', 'model11', 'model12', 'model13', 'ensemble'):
    cols.remove(unused)
threshold = len(cols) // 2
print('threshold = {}'.format(threshold))
n_positive = results_df[cols].sum(axis=1)
results_df['ensemble'] = n_positive.gt(threshold).astype('int64')
results_df.head()

threshold = 2


Unnamed: 0,best,model0,model1,model2,model3,model4,model5,model6,model7,model8,model9,model10,model11,model12,model13,ensemble
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,1
3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [87]:
# Validation accuracy (%) of the current 'ensemble' column.
n_val = len(y_val)
n_wrong = np.abs(results_df['ensemble'].values - y_val).sum()
acc = 100. * (n_val - n_wrong) / n_val
acc

76.93877551020408

In [88]:
# Majority vote over model5..model7: drop the columns that do not vote,
# including the previous 'ensemble' result.
cols = results_df.columns.tolist()
for unused in ('best', 'model0', 'model1', 'model2', 'model3', 'model4', 'model8', 'model9', 'model10', 'model11', 'model12', 'model13', 'ensemble'):
    cols.remove(unused)
threshold = len(cols) // 2
print('threshold = {}'.format(threshold))
n_positive = results_df[cols].sum(axis=1)
results_df['ensemble'] = n_positive.map(lambda n: 1 if n > threshold else 0)
results_df.head()

threshold = 1


Unnamed: 0,best,model0,model1,model2,model3,model4,model5,model6,model7,model8,model9,model10,model11,model12,model13,ensemble
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,1
3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [89]:
# Validation accuracy (%) of the current 'ensemble' column.
n_val = len(y_val)
n_wrong = np.abs(results_df['ensemble'].values - y_val).sum()
acc = 100. * (n_val - n_wrong) / n_val
acc

77.04081632653062

In [90]:
# Majority vote over three spread-out models (6, 10 and 13), followed by the
# validation accuracy (%) of that vote.
cols = ['model6', 'model10', 'model13']
threshold = len(cols) // 2
print('threshold = {}'.format(threshold))
n_positive = results_df[cols].sum(axis=1)
results_df['ensemble'] = n_positive.gt(threshold).astype('int64')
n_wrong = np.abs(results_df['ensemble'].values - y_val).sum()
acc = 100. * (len(y_val) - n_wrong) / len(y_val)
acc

threshold = 1


76.53061224489795

In [91]:
# Majority vote over models 5-9, followed by the validation accuracy (%) of
# that vote.
cols = ['model5', 'model6', 'model7', 'model8', 'model9']
threshold = len(cols) // 2
print('threshold = {}'.format(threshold))
n_positive = results_df[cols].sum(axis=1)
results_df['ensemble'] = n_positive.map(lambda n: 1 if n > threshold else 0)
n_wrong = np.abs(results_df['ensemble'].values - y_val).sum()
acc = 100. * (len(y_val) - n_wrong) / len(y_val)
acc

threshold = 2


76.83673469387755

* best only (6) = 77.14285714285714
* 13 models (1-13) = 76.42857142857143
* 11 models (2-12) = 76.53061224489795
* 9 models (2-10) = 76.83673469387755
* 7 models (3-9) = 76.73469387755102
* 5 models (4-8) = 76.93877551020408
* 3 models (5-7) = 77.04081632653062
* 3 models (6, 10, 13) = 76.53061224489795
* 5 best models (5-9) = 76.83673469387755

The ensembles fail to outperform the best model. It is possible that max_lr is not high enough to produce diverse models between cycles: comparing each model's outputs with those of the best model, the differences are larger when max_lr is .003 than when it is .001.

### 1x16 conv1D, 2x16 GRU bidirectional, 1x8 Dense, dropouts (.6/.6/.6), warm restarts cycle = 200, max_lr=.001, best in cycle

In [121]:
# Fix the NumPy (123) and TensorFlow (456) global random seeds before the
# models of this section are loaded and evaluated.
from numpy.random import seed
seed(123)
from tensorflow import set_random_seed
set_random_seed(456)

In [122]:
# Load the best checkpoint of the warm-restart run (wr44) and print its layers.
best_model_file = os.path.join('models', 'pcp-1x16cv-2x16gruB-1x8d-dropout060606-wr44.best-epch871.h5')
model = load_model(best_model_file)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 192, 16)           1456      
_________________________________________________________________
batch_normalization_1 (Batch (None, 192, 16)           64        
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 96, 16)            0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 96, 32)            3264      
_________________________________________________________________
bidirectional_2 (Bidirection (None, 32)                4800      
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 8)                 264       
__________

In [123]:
# Start from an empty frame; the following cells add one prediction column per model.
results_df = pd.DataFrame()

In [124]:
# Validation-set class predictions of the best model, flattened to 1-D.
best_pred = model.predict_classes(X_val, batch_size=128, verbose=1)
results_df['best'] = pd.Series(best_pred.flatten())



In [125]:
# Validation accuracy (%) of the best single model.
# NOTE(review): assumes y_val holds 0/1 labels — confirm.
n_val = len(y_val)
n_wrong = np.abs(results_df['best'].values - y_val).sum()
best_acc = 100. * (n_val - n_wrong) / n_val
best_acc

76.22448979591837

In [126]:
# Weight files saved every 200 epochs (0200 ... 2000) during the wr44 run.
weights_dir = os.path.join('models', 'pcp-1x16cv-2x16gruB-1x8d-dropout060606-wr44-weights')
weight_files = [
    'model_wgts_cyc0200.h5', 'model_wgts_cyc0400.h5', 'model_wgts_cyc0600.h5',
    'model_wgts_cyc0800.h5', 'model_wgts_cyc1000.h5', 'model_wgts_cyc1200.h5',
    'model_wgts_cyc1400.h5', 'model_wgts_cyc1600.h5',
    'model_wgts_cyc1800.h5', 'model_wgts_cyc2000.h5',
]
path = [os.path.join(weights_dir, fname) for fname in weight_files]
path

['models/pcp-1x16cv-2x16gruB-1x8d-dropout060606-wr44-weights/model_wgts_cyc0200.h5',
 'models/pcp-1x16cv-2x16gruB-1x8d-dropout060606-wr44-weights/model_wgts_cyc0400.h5',
 'models/pcp-1x16cv-2x16gruB-1x8d-dropout060606-wr44-weights/model_wgts_cyc0600.h5',
 'models/pcp-1x16cv-2x16gruB-1x8d-dropout060606-wr44-weights/model_wgts_cyc0800.h5',
 'models/pcp-1x16cv-2x16gruB-1x8d-dropout060606-wr44-weights/model_wgts_cyc1000.h5',
 'models/pcp-1x16cv-2x16gruB-1x8d-dropout060606-wr44-weights/model_wgts_cyc1200.h5',
 'models/pcp-1x16cv-2x16gruB-1x8d-dropout060606-wr44-weights/model_wgts_cyc1400.h5',
 'models/pcp-1x16cv-2x16gruB-1x8d-dropout060606-wr44-weights/model_wgts_cyc1600.h5',
 'models/pcp-1x16cv-2x16gruB-1x8d-dropout060606-wr44-weights/model_wgts_cyc1800.h5',
 'models/pcp-1x16cv-2x16gruB-1x8d-dropout060606-wr44-weights/model_wgts_cyc2000.h5']

In [127]:
# Load each saved set of weights into the same architecture and keep its
# validation-set class predictions as one column per model. The predictions
# are flattened to 1-D, as is done for the 'best' column, instead of relying on
# pandas to accept whatever shape predict_classes returns.
for idx, a_file in enumerate(path):
    model.load_weights(filepath=a_file, by_name=False)
    snapshot_pred = model.predict_classes(X_val, batch_size=128, verbose=1)
    results_df['model' + str(idx)] = snapshot_pred.flatten()



In [128]:
# For every model column, count the validation samples on which it disagrees
# with the best model. The columns are discovered from results_df instead of a
# hard-coded count (10), which had to be kept in sync with `path` by hand.
model_cols = [c for c in results_df.columns if c.startswith('model')]
for col in model_cols:
    print((results_df['best'] - results_df[col]).abs().sum())

85
58
54
37
0
55
32
37
61
61


In [129]:
# Validation accuracy (%) of every individual model column. The columns are
# discovered from results_df instead of a hard-coded count (10).
# NOTE(review): assumes y_val holds 0/1 labels — confirm.
model_cols = [c for c in results_df.columns if c.startswith('model')]
for col in model_cols:
    acc = 100. * (len(y_val) - (np.abs(results_df[col].values - y_val)).sum()) / len(y_val)
    print(col + ' = ', acc)

model0 =  74.6938775510204
model1 =  76.22448979591837
model2 =  75.81632653061224
model3 =  75.91836734693878
model4 =  76.22448979591837
model5 =  75.51020408163265
model6 =  75.61224489795919
model7 =  75.51020408163265
model8 =  75.91836734693878
model9 =  75.71428571428571


In [130]:
# Preview the majority-vote threshold when every column except 'best' and
# 'model0' takes part in the vote.
cols = results_df.columns.tolist()
for unused in ('best', 'model0'):
    cols.remove(unused)
threshold = len(cols) // 2
print('threshold = {}'.format(threshold))

threshold = 4


In [131]:
# Majority vote over every model column except model0.
# The voters are selected by name rather than "all columns minus 'best' and
# 'model0'", so re-running this cell does not pull an already existing
# 'ensemble' column into the vote (which would also shift the threshold).
cols = [c for c in results_df.columns if c.startswith('model') and c != 'model0']
threshold = len(cols) // 2
print('threshold = {}'.format(threshold))
# A sample is predicted positive when more than half of the voters say 1.
results_df['ensemble'] = (results_df[cols].sum(axis=1) > threshold).astype('int64')
results_df.head()

threshold = 4


Unnamed: 0,best,model0,model1,model2,model3,model4,model5,model6,model7,model8,model9,ensemble
0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,1,1,1,1,1,1,1,1
3,1,1,1,1,1,1,1,1,1,1,1,1
4,0,0,0,0,0,0,0,0,0,0,0,0


In [132]:
# Validation accuracy (%) of the current 'ensemble' column.
n_val = len(y_val)
n_wrong = np.abs(results_df['ensemble'].values - y_val).sum()
acc = 100. * (n_val - n_wrong) / n_val
acc

76.0204081632653

In [133]:
# Majority vote over model2..model8: drop the columns that do not vote,
# including the previous 'ensemble' result.
cols = results_df.columns.tolist()
for unused in ('best', 'model0', 'model1', 'model9', 'ensemble'):
    cols.remove(unused)
threshold = len(cols) // 2
print('threshold = {}'.format(threshold))
n_positive = results_df[cols].sum(axis=1)
results_df['ensemble'] = n_positive.gt(threshold).astype('int64')
results_df.head()

threshold = 3


Unnamed: 0,best,model0,model1,model2,model3,model4,model5,model6,model7,model8,model9,ensemble
0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,1,1,1,1,1,1,1,1
3,1,1,1,1,1,1,1,1,1,1,1,1
4,0,0,0,0,0,0,0,0,0,0,0,0


In [134]:
# Validation accuracy (%) of the current 'ensemble' column.
n_val = len(y_val)
n_wrong = np.abs(results_df['ensemble'].values - y_val).sum()
acc = 100. * (n_val - n_wrong) / n_val
acc

75.81632653061224

In [135]:
# Majority vote over model2..model6: drop the columns that do not vote,
# including the previous 'ensemble' result.
cols = results_df.columns.tolist()
for unused in ('best', 'model0', 'model1', 'model7', 'model8', 'model9', 'ensemble'):
    cols.remove(unused)
threshold = len(cols) // 2
print('threshold = {}'.format(threshold))
n_positive = results_df[cols].sum(axis=1)
results_df['ensemble'] = n_positive.map(lambda n: 1 if n > threshold else 0)
results_df.head()

threshold = 2


Unnamed: 0,best,model0,model1,model2,model3,model4,model5,model6,model7,model8,model9,ensemble
0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,1,1,1,1,1,1,1,1
3,1,1,1,1,1,1,1,1,1,1,1,1
4,0,0,0,0,0,0,0,0,0,0,0,0


In [136]:
# Validation accuracy (%) of the current 'ensemble' column.
n_val = len(y_val)
n_wrong = np.abs(results_df['ensemble'].values - y_val).sum()
acc = 100. * (n_val - n_wrong) / n_val
acc

75.81632653061224

In [137]:
# Majority vote over model3..model5: drop the columns that do not vote,
# including the previous 'ensemble' result.
cols = results_df.columns.tolist()
for unused in ('best', 'model0', 'model1', 'model2', 'model6', 'model7', 'model8', 'model9', 'ensemble'):
    cols.remove(unused)
threshold = len(cols) // 2
print('threshold = {}'.format(threshold))
n_positive = results_df[cols].sum(axis=1)
results_df['ensemble'] = n_positive.gt(threshold).astype('int64')
results_df.head()

threshold = 1


Unnamed: 0,best,model0,model1,model2,model3,model4,model5,model6,model7,model8,model9,ensemble
0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,1,1,1,1,1,1,1,1
3,1,1,1,1,1,1,1,1,1,1,1,1
4,0,0,0,0,0,0,0,0,0,0,0,0


In [138]:
# Validation accuracy (%) of the current 'ensemble' column.
n_val = len(y_val)
n_wrong = np.abs(results_df['ensemble'].values - y_val).sum()
acc = 100. * (n_val - n_wrong) / n_val
acc

75.91836734693878

In [139]:
# Majority vote over models 1, 4 and 7, followed by the validation accuracy (%)
# of that vote.
cols = ['model1', 'model4', 'model7']
threshold = len(cols) // 2
print('threshold = {}'.format(threshold))
n_positive = results_df[cols].sum(axis=1)
results_df['ensemble'] = n_positive.map(lambda n: 1 if n > threshold else 0)
n_wrong = np.abs(results_df['ensemble'].values - y_val).sum()
acc = 100. * (len(y_val) - n_wrong) / len(y_val)
acc

threshold = 1


76.42857142857143

In [140]:
# Majority vote over models 1, 4 and 8, followed by the validation accuracy (%)
# of that vote.
cols = ['model1', 'model4', 'model8']
threshold = len(cols) // 2
print('threshold = {}'.format(threshold))
n_positive = results_df[cols].sum(axis=1)
results_df['ensemble'] = n_positive.gt(threshold).astype('int64')
n_wrong = np.abs(results_df['ensemble'].values - y_val).sum()
acc = 100. * (len(y_val) - n_wrong) / len(y_val)
acc

threshold = 1


76.63265306122449

* best only (1 or 4) = 76.22448979591837
* 9 models (1-9) = 76.0204081632653
* 7 models (2-8) = 75.81632653061224
* 5 models (2-6) = 75.81632653061224
* 3 models (3-5) = 75.91836734693878
* **3 models (1, 4, 7) = 76.42857142857143**
* 3 best models (1, 4, 8) = 76.63265306122449

Taking 3 models around the best model (1, 4, 7) beats the best model (76.43% vs. 76.22% validation accuracy).

 ## Evaluate models on the test set

In [142]:
# Read the independent test set and show its columns, dtypes and non-null counts.
test_csv = os.path.join('data', 'independent.csv')
test_df = pd.read_csv(test_csv)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 3 columns):
id          400 non-null object
type        400 non-null object
sequence    400 non-null object
dtypes: object(3)
memory usage: 9.5+ KB


In [143]:
# Show the first rows of the raw test set.
test_df.head()

Unnamed: 0,id,type,sequence
0,Chr11_6627824_6628024,strong,ATGCTGCCAGAAGGAAAAGGGGTGGAATTAATGAAACTGGAAGGTT...
1,Chr11_9587224_9587424,strong,GGCATTTTTTAACCTGTGTTTCATTTTCATCTGTGAAATGTGAATA...
2,Chr11_65187024_65187224,strong,GAAACCACAGAGCTGACCTGGCTTCAGAACAAGATGTGGGGCTCCA...
3,Chr10_74014594_74014794,strong,TTTGCATAGGGGCATTACCACTGGACTTGGGCTCAGAGCAAGTGTT...
4,Chr10_105667810_105668010,strong,CGGGAGGCGGGGGTTGCAGTGAGCCAAGATCACACCACTGCACTCC...


In [146]:
# Number of test sequences per value of the 'type' column.
test_df['type'].value_counts()

non-enhancer    200
weak            100
strong          100
Name: type, dtype: int64

In [148]:
# Binary target: 0.0 for 'non-enhancer', 1.0 for every other type.
is_enhancer = test_df['type'] != 'non-enhancer'
test_df['enhancer'] = is_enhancer.astype(float)

In [151]:
# Show the first rows again, now including the 'enhancer' column.
test_df.head()

Unnamed: 0,id,type,sequence,enhancer
0,Chr11_6627824_6628024,strong,ATGCTGCCAGAAGGAAAAGGGGTGGAATTAATGAAACTGGAAGGTT...,1.0
1,Chr11_9587224_9587424,strong,GGCATTTTTTAACCTGTGTTTCATTTTCATCTGTGAAATGTGAATA...,1.0
2,Chr11_65187024_65187224,strong,GAAACCACAGAGCTGACCTGGCTTCAGAACAAGATGTGGGGCTCCA...,1.0
3,Chr10_74014594_74014794,strong,TTTGCATAGGGGCATTACCACTGGACTTGGGCTCAGAGCAAGTGTT...,1.0
4,Chr10_105667810_105668010,strong,CGGGAGGCGGGGGTTGCAGTGAGCCAAGATCACACCACTGCACTCC...,1.0


In [156]:
# First five rows whose type is 'weak'.
test_df[test_df['type'] == 'weak'].head(5)

Unnamed: 0,id,type,sequence,enhancer
100,hg19_ct_UserTrack_3545_11005,weak,TTATGGTCACCTTCGACCCCAGAAATAATGGTCTCTGTTGTCAGAT...,1.0
101,hg19_ct_UserTrack_3545_8529,weak,CATCCAGGCTTGGTCCTGGTTGTTCCTTGCTGTTATACCAGCCTGG...,1.0
102,hg19_ct_UserTrack_3545_7245,weak,TTGTTTTTTTCTGTTTTGAGACGGAGTTTCGCTCTTGTTGCCCAGG...,1.0
103,hg19_ct_UserTrack_3545_12669,weak,ACTGTTAAATAGCAAAAATTATTGAGCTCAAACCATCTAACCAGGT...,1.0
104,hg19_ct_UserTrack_3545_5404,weak,GAGAATTAAGTTTGTATTAAGTTGGAGACCAGGGCAGATGGAAAGA...,1.0


In [157]:
# First five rows whose type is 'non-enhancer'.
test_df[test_df['type'] == 'non-enhancer'].head(5)

Unnamed: 0,id,type,sequence,enhancer
200,hg19_ct_UserTrack_3545_158,non-enhancer,AATTTTCTCATTTTCTCATAAAGTTTAACAGTTGTTTATTTGAGTC...,0.0
201,hg19_ct_UserTrack_3545_57,non-enhancer,ACTGGTTATCTTTTAGGACTAGTTAATATAACCCATTCTCTAACCA...,0.0
202,hg19_ct_UserTrack_3545_762,non-enhancer,ATGCATATGTTCTTCAGTAAACAGAGCAGCCACTGGTACCACAGGA...,0.0
203,hg19_ct_UserTrack_3545_78,non-enhancer,CTGCTCTCCTCGCTCTATAAAAGTCAGAGTGCCTAAGCTGTTAATT...,0.0
204,hg19_ct_UserTrack_3545_9,non-enhancer,GCTTGGGTATATATTGTCCAATATAGCAGGCCTCATGTGCTCCTTA...,0.0


In [158]:
# Build the test features with the preprocessing pipeline as it was already
# fitted. Calling fit_transform here would re-fit every stateful step (e.g. a
# scaler) on the test set, so the test features could end up on a different
# scale from the one the models were trained on.
# NOTE(review): assumes my_pipeline was fitted on the training data earlier in
# the notebook — confirm.
X_test = my_pipeline.transform(test_df)
X_test.shape

(400, 200, 10)

In [159]:
# Inspect the first 10 entries along axis 1 of the first test sample, with all features.
X_test[0, :10, :]

array([[0.        , 0.        , 0.        , 1.        , 0.86060606,
        1.        , 0.31958763, 0.86363636, 0.6       , 0.75      ],
       [0.        , 0.        , 1.        , 0.        , 0.04545455,
        0.22053232, 0.36082474, 0.09090909, 0.1       , 0.29166667],
       [0.        , 1.        , 0.        , 0.        , 1.        ,
        0.53612167, 0.49484536, 0.5       , 1.        , 0.75      ],
       [0.        , 0.        , 0.        , 0.        , 0.25757576,
        0.31558935, 0.7628866 , 0.77272727, 0.3       , 0.79166667],
       [0.        , 0.        , 1.        , 0.        , 0.04545455,
        0.22053232, 0.36082474, 0.09090909, 0.1       , 0.29166667],
       [0.        , 1.        , 0.        , 0.        , 1.        ,
        0.53612167, 0.49484536, 0.5       , 1.        , 0.75      ],
       [0.        , 0.        , 0.        , 0.        , 0.54848485,
        0.17110266, 0.73195876, 0.54545455, 0.3       , 1.        ],
       [0.        , 0.        , 0.       

In [161]:
# Ground-truth labels for the test set, taken from the 'enhancer' column as a NumPy array.
y_test = test_df['enhancer'].values
y_test.shape

(400,)

In [162]:
# Peek at the first test labels. The original cell displayed `y`, which is not
# the y_test array created in the previous cell.
y_test[:10]

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

#### 2x16gru, 1x16dense, dropout 010101, best epch 865

In [164]:
# Load the best checkpoint of the plain 2x16 GRU / 1x16 dense model and print its layers.
best_model_file = os.path.join('models', 'pcp-2x16gru1x16dense-dropout0101.best-epch865.h5')
model = load_model(best_model_file)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_1 (GRU)                  (None, None, 16)          1344      
_________________________________________________________________
gru_2 (GRU)                  (None, 16)                1632      
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                272       
_________________________________________________________________
dropout_2 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 17        
Total params: 3,265
Trainable params: 3,265
Non-trainable params: 0
_________________________________________________________________


In [166]:
# Evaluate the loaded model on the independent test set.
# NOTE(review): the returned list is presumably [loss, accuracy] — confirm
# against the metrics the model was compiled with.
model.evaluate(X_test, y_test, batch_size=None, verbose=1)



[0.5632989072799682, 0.74]

#### 1x16cv, 2x16gruB, 1x8d, dropout 060606, best epch 693

In [167]:
# Load the best checkpoint of the conv1D + bidirectional GRU model and print its layers.
best_model_file = os.path.join('models', 'pcp-1x16cv-2x16gruB-1x8d-dropout060606.best-epch693.h5')
model = load_model(best_model_file)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_3 (Conv1D)            (None, 192, 16)           1456      
_________________________________________________________________
batch_normalization_3 (Batch (None, 192, 16)           64        
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 96, 16)            0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 96, 32)            3264      
_________________________________________________________________
bidirectional_2 (Bidirection (None, 32)                4800      
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 8)                 264       
__________

In [168]:
# Evaluate the loaded model on the independent test set.
# NOTE(review): the returned list is presumably [loss, accuracy] — confirm
# against the metrics the model was compiled with.
model.evaluate(X_test, y_test, batch_size=None, verbose=1)



[0.5652854883670807, 0.7375]

#### Model Ensemble for base model 2x16gru1x16dense-dropout010101 using 5 models (3-7) 

In [177]:
# Load the wr41 best checkpoint; the cells below load other saved weights into
# this same model object before predicting.
best_model_file = os.path.join('models', 'pcp-2x16gru1x16dense-dropout010101-wr41.best-epch1283.h5')
model = load_model(best_model_file)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_1 (GRU)                  (None, None, 16)          1344      
_________________________________________________________________
gru_2 (GRU)                  (None, 16)                1632      
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                272       
_________________________________________________________________
dropout_2 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 17        
Total params: 3,265
Trainable params: 3,265
Non-trainable params: 0
_________________________________________________________________


In [178]:
# Start from an empty frame; the following cells add one test-set prediction column per model.
results_df = pd.DataFrame()

In [179]:
# Weight files of the five ensemble members (cycles 1000 ... 1800) from the wr41 run.
weights_dir = os.path.join('models', 'pcp-2x16gru1x16dense-dropout010101-wr41-weights')
weight_files = [
    'model_wgts_cyc1000.h5', 'model_wgts_cyc1200.h5', 'model_wgts_cyc1400.h5',
    'model_wgts_cyc1600.h5', 'model_wgts_cyc1800.h5',
]
path = [os.path.join(weights_dir, fname) for fname in weight_files]
path

['models/pcp-2x16gru1x16dense-dropout010101-wr41-weights/model_wgts_cyc1000.h5',
 'models/pcp-2x16gru1x16dense-dropout010101-wr41-weights/model_wgts_cyc1200.h5',
 'models/pcp-2x16gru1x16dense-dropout010101-wr41-weights/model_wgts_cyc1400.h5',
 'models/pcp-2x16gru1x16dense-dropout010101-wr41-weights/model_wgts_cyc1600.h5',
 'models/pcp-2x16gru1x16dense-dropout010101-wr41-weights/model_wgts_cyc1800.h5']

In [180]:
# Load each member's weights in turn and store its flattened test-set class
# predictions as one column per member.
for member_no, weights_file in enumerate(path):
    model.load_weights(filepath=weights_file, by_name=False)
    member_pred = model.predict_classes(X_test, batch_size=128, verbose=1)
    results_df['model' + str(member_no)] = pd.Series(member_pred.flatten())



In [181]:
# Show the first rows of the per-model test-set predictions.
results_df.head()

Unnamed: 0,model0,model1,model2,model3,model4
0,1,1,1,1,1
1,0,0,1,1,0
2,1,1,1,1,1
3,1,1,1,1,1
4,0,1,1,1,1


In [182]:
# Majority vote over the individual model columns.
# Voters are selected by name, so re-running this cell does not count an
# already existing 'ensemble' column as an extra voter (the original used every
# column of results_df for both the threshold and the vote).
model_cols = [c for c in results_df.columns if c.startswith('model')]
threshold = len(model_cols) // 2
print('threshold = {}'.format(threshold))
# A sample is predicted positive when more than half of the voters say 1.
results_df['ensemble'] = (results_df[model_cols].sum(axis=1) > threshold).astype('int64')
results_df.head()

threshold = 2


Unnamed: 0,model0,model1,model2,model3,model4,ensemble
0,1,1,1,1,1,1
1,0,0,1,1,0,0
2,1,1,1,1,1,1
3,1,1,1,1,1,1
4,0,1,1,1,1,1


In [183]:
# Test-set accuracy (%) of the ensemble vote; labels and predictions are 0/1,
# so the summed absolute difference counts the misclassified samples.
n_test = len(y_test)
n_wrong = np.abs(results_df['ensemble'].values - y_test).sum()
acc = 100. * (n_test - n_wrong) / n_test
acc

75.25

#### Model Ensemble for base learner 1x16cv, 2x16gruB, 1x8d, dropout 060606 using 3 models (1, 4 and 7) 

In [190]:
# Load the wr44 best checkpoint; the cells below load other saved weights into
# this same model object before predicting.
best_model_file = os.path.join('models', 'pcp-1x16cv-2x16gruB-1x8d-dropout060606-wr44.best-epch871.h5')
model = load_model(best_model_file)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 192, 16)           1456      
_________________________________________________________________
batch_normalization_1 (Batch (None, 192, 16)           64        
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 96, 16)            0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 96, 32)            3264      
_________________________________________________________________
bidirectional_2 (Bidirection (None, 32)                4800      
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 8)                 264       
__________

In [191]:
# Start from an empty frame; the following cells add one test-set prediction column per model.
results_df = pd.DataFrame()

In [192]:
# Weight files of the three ensemble members (cycles 0400, 1000 and 1600) from the wr44 run.
weights_dir = os.path.join('models', 'pcp-1x16cv-2x16gruB-1x8d-dropout060606-wr44-weights')
weight_files = ['model_wgts_cyc0400.h5', 'model_wgts_cyc1000.h5', 'model_wgts_cyc1600.h5']
path = [os.path.join(weights_dir, fname) for fname in weight_files]
path

['models/pcp-1x16cv-2x16gruB-1x8d-dropout060606-wr44-weights/model_wgts_cyc0400.h5',
 'models/pcp-1x16cv-2x16gruB-1x8d-dropout060606-wr44-weights/model_wgts_cyc1000.h5',
 'models/pcp-1x16cv-2x16gruB-1x8d-dropout060606-wr44-weights/model_wgts_cyc1600.h5']

In [193]:
# Load each member's weights in turn and store its flattened test-set class
# predictions as one column per member.
for member_no, weights_file in enumerate(path):
    model.load_weights(filepath=weights_file, by_name=False)
    member_pred = model.predict_classes(X_test, batch_size=128, verbose=1)
    results_df['model' + str(member_no)] = pd.Series(member_pred.flatten())



In [194]:
# Show the first rows of the per-model test-set predictions.
results_df.head()

Unnamed: 0,model0,model1,model2
0,1,1,1
1,1,1,1
2,1,1,1
3,1,1,1
4,1,1,1


In [195]:
# Majority vote over the individual model columns.
# Voters are selected by name, so re-running this cell does not count an
# already existing 'ensemble' column as an extra voter (the original used every
# column of results_df for both the threshold and the vote).
model_cols = [c for c in results_df.columns if c.startswith('model')]
threshold = len(model_cols) // 2
print('threshold = {}'.format(threshold))
# A sample is predicted positive when more than half of the voters say 1.
results_df['ensemble'] = (results_df[model_cols].sum(axis=1) > threshold).astype('int64')
results_df.head()

threshold = 1


Unnamed: 0,model0,model1,model2,ensemble
0,1,1,1,1
1,1,1,1,1
2,1,1,1,1
3,1,1,1,1
4,1,1,1,1


In [196]:
# Test-set accuracy (%) of the ensemble vote; labels and predictions are 0/1,
# so the summed absolute difference counts the misclassified samples.
n_test = len(y_test)
n_wrong = np.abs(results_df['ensemble'].values - y_test).sum()
acc = 100. * (n_test - n_wrong) / n_test
acc

75.5