# Building a k-NN Classifier

In [1]:
%matplotlib inline
import expressyeaself.construct_neural_net as construct
import expressyeaself.encode_sequences as encode
import expressyeaself.organize_data as organize 
import matplotlib.pyplot as plt
import numpy 
import os
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

ROOT_DIR = os.getcwd() + '/'

Using TensorFlow backend.


### Define the input data

#### Using the full data set

In [2]:
sample_filename = ('20190612130111781831_percentiles_els_binarized_homogeneous'
                   '_deflanked_sequences_with_exp_levels.txt.gz')

#### Using a smaller sample set

In [3]:
sample_filename = '10000_from_' + sample_filename 

#### Define the absolute path

In [4]:
sample_path = ROOT_DIR + 'example/processed_data/' + sample_filename

### Encode sequences

In [5]:
X_padded, y_scaled, abs_max_el = encode.encode_sequences_with_method(sample_path, method='One-Hot', scale_els=True, model_type='LSTM', binarized_els=True)
num_seqs, max_sequence_len = organize.get_num_and_len_of_seqs_from_file(sample_path)

### Reshape expression levels

In [6]:
y_scaled = y_scaled.reshape((len(y_scaled), 1))

In [7]:
X_padded.shape

(10000, 1, 400)

### Perform a train-test split

In [8]:
test_size = 0.20

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_scaled, test_size=test_size)

In [10]:
X_train.shape

(8000, 1, 400)

## Build the model architectures

### kNN Classifier

In [11]:
classifier = Sequential()
#First Hidden Layer
classifier.add(Dense(4, activation='relu', kernel_initializer='random_normal', input_dim=3))
#Second  Hidden Layer
classifier.add(Dense(4, activation='relu', kernel_initializer='random_normal'))
#Output Layer
classifier.add(Dense(1, activation='sigmoid', kernel_initializer='random_normal'))
# Compile
classifier.compile(optimizer ='adam',loss='binary_crossentropy', metrics =['accuracy'])
# Print summary
print(classifier.summary())
# Fit the model
classifier.fit(X_train, y_train, batch_size=10, epochs=100)

W0613 09:27:00.000183 4472735168 deprecation_wrapper.py:118] From /Users/joe.abbott/miniconda3/envs/yeast/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0613 09:27:00.017385 4472735168 deprecation_wrapper.py:118] From /Users/joe.abbott/miniconda3/envs/yeast/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0613 09:27:00.020622 4472735168 deprecation_wrapper.py:118] From /Users/joe.abbott/miniconda3/envs/yeast/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4115: The name tf.random_normal is deprecated. Please use tf.random.normal instead.

W0613 09:27:00.078565 4472735168 deprecation_wrapper.py:118] From /Users/joe.abbott/miniconda3/envs/yeast/lib/python3.7/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use t

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 4)                 16        
_________________________________________________________________
dense_2 (Dense)              (None, 4)                 20        
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 5         
Total params: 41
Trainable params: 41
Non-trainable params: 0
_________________________________________________________________
None


ValueError: Error when checking input: expected dense_1_input to have 2 dimensions, but got array with shape (8000, 1, 400)

In [None]:
# Define the model parameters
batch_size = len(y_scaled) * 0.01 # no bigger than 1 % of data
filters = 15
# kernel_size
strides = 1
epochs = 10
dropout = 0.1
num_layers = 10

# Define the tensorboard and checkpointer if desired
tb = TensorBoard(log_dir='./logs', 
                 histogram_freq=3, 
                 batch_size=batch_size, 
                 write_graph=True, 
                 write_grads=True, 
                 write_images=True)
checkpointer = ModelCheckpoint(monitor='val_acc', 
                               filepath=(CHECKPOINTS_DIR + '1dcnn_onehot.hdf5'), 
                               verbose=1, 
                               save_best_only=True)

# Define the inputs
inputs = Input(shape=(max_sequence_len, 5))
layers = []

# Build up the layers
# for i in range(1, num_layers + 1):
#     layer = Conv1D(filters, (2 * i - 1), strides)(inputs)
#     layers.append(layer)

# Combine the layers
# combined = Concatenate(axis=1)(layers)

# Add some flatten, dense, and dropout layers
out = Flatten()(combined)
# out = Dropout(dropout)(out)
out = Dense(500, activation='relu',
            kernel_initializer='normal')(out)
out = Dropout(dropout)(out)
out = Dense(1, activation='relu',
            kernel_initializer='normal')(out)
out = Dropout(dropout)(out)

# Define the model with inputs and outputs, and compile.
model = Model(inputs=inputs, outputs=out)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print model summary
print(model.summary())