# Stacked Autoencoder and Classsification

In [1]:
%run ConMatAndStats.py

In [2]:
# load data

from pandas import read_csv

path = '../Data/'
filename = 'selected_data.csv' 
testfilename = 'selected_test_data.csv'
data = read_csv(path + filename)
test = read_csv(path + testfilename)

allnames = data.columns
Xnames = data.columns[0:data.values.shape[1]-1]

array = data.values 
X = array[:,0:array.shape[1]-1] 
Y = array[:,array.shape[1]-1]

testarray = test.values 
Xtest = testarray[:,0:array.shape[1]-1] 
Ytest = testarray[:,array.shape[1]-1]

Xsize = X.shape[1]

import time
start = time.time()

#### PCA

Rerun the PCA on the selected feature to get idea of how many indeoendent vectors there are

In [3]:
import numpy
u, s, vh = numpy.linalg.svd(data.values, full_matrices=False)

In [4]:
for i in range (5, 30, 5):
    print('The first %d vectors explain %5.2f%% of all the variance in the dataset' %(i, 100 * s[0:i].sum() / s.sum()))

The first 5 vectors explain 72.55% of all the variance in the dataset
The first 10 vectors explain 89.50% of all the variance in the dataset
The first 15 vectors explain 97.17% of all the variance in the dataset
The first 20 vectors explain 99.84% of all the variance in the dataset
The first 25 vectors explain 100.00% of all the variance in the dataset


.

.

.

.

# Simple NN

In [5]:
import tensorflow as tf
print(tf.__version__)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
import numpy
# fix random seed for reproducibility
random_seed = 7
numpy.random.seed(random_seed)

NNLayerSize = 5

2.0.0


In [6]:
from sklearn.model_selection import train_test_split
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.33, random_state=random_seed, stratify=Y)

In [7]:
tf.random.set_seed(random_seed)
BasicNNmodel = Sequential()
BasicNNmodel.add(Dense(Xsize, input_dim=Xsize, activation='relu')) 
BasicNNmodel.add(Dense(NNLayerSize, activation='relu')) 
BasicNNmodel.add(Dense(1, activation='sigmoid'))
BasicNNmodel.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
BasicNNmodel.fit(X_train, Y_train, validation_data=(X_val, Y_val), epochs=10, batch_size=500)

Train on 65019 samples, validate on 32025 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fe0b80174a8>

In [8]:
Ypred = BasicNNmodel.predict_classes(Xtest)
a = ConMatAndStats (Ytest, Ypred)

Confusion Matrix
[[19651   428]
 [ 5181 14898]]
Accuracy: 0.860  F1: 0.875  MCC: 0.742


simple NN has reasonable predictive ability

.

.

.

# The Stacked Autoencoder

First import required modules and split the training set into training and validation components

#### The Stacked Autoencoder mdel

Input layer takes the dimension of number of features read from file these are fed to a dense layer with number of neurons given by the Encoder_layer1_size variable

The stacked layer is a second dense layer of size Encoder_stack_size ad this will be used as the output from the autoencoder.

A final decoder layer is required for training, this has size the same as the input features and a sigmoid activation.

The whole model will be trained by using the same X matrix as input and classification, as the aim of the model is to reduce the dimensions of the input to the number of neurons in the stacked layer, but still be able to retrieve as much of the input information as possible when this is decoded. The model is effectively a lossy compression and the trainin is to minimise the loss in the compression algorithm.

Setting the model up with the separate layers and explicit connections rather than using Keras Sequential mechanism allows the intermediate layers to be called afterwards by defining separate models that use appropriate ayers as their input and output.

In [9]:
tf.random.set_seed(random_seed)

Encoder_layer1_size = 15
Encoder_stack_size = 8

input_data = Input(shape=(Xsize,))
first_encoded = Dense(Encoder_layer1_size, activation='relu')(input_data)
stacked_encoded = Dense(Encoder_stack_size, activation='relu')(first_encoded)
decoded = Dense(Xsize, activation = 'sigmoid')(stacked_encoded)
stacked_autoencoder = Model(input_data, decoded)

decoder_layer = stacked_autoencoder.layers[-1]
stacked_encoded_input = Input(shape=(Encoder_stack_size,))
decoder = Model(stacked_encoded_input, decoder_layer(stacked_encoded_input))
stacked_encoder = Model(input_data, stacked_encoded)


In [10]:
stacked_autoencoder.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
stacked_autoencoder.fit(X_train, X_train, validation_data=(X_val, X_val), epochs=25, batch_size=500)

Train on 65019 samples, validate on 32025 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7fe0104810b8>

#### Encode the datasets

Having trained the encoder it is now possible to use the stacked_encoder to compress the three X datasets down to the size defined for the second dense layer

In [11]:
EncodeX = stacked_encoder.predict(X)
Encode_X_train = stacked_encoder.predict(X_train)
Encode_X_val = stacked_encoder.predict(X_val)
Encode_Xtest = stacked_encoder.predict(Xtest)

## Check the Models

Having seen from prvious runs that the decison tree and logistic regression gave the best results, these two models will now be run against the output from the stacked autoencoder

In [12]:
from pandas import read_csv 
from matplotlib import pyplot 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.linear_model import LogisticRegression 

# prepare models 
models = []  
models.append(('CART', DecisionTreeClassifier())) 
models.append(('LR', LogisticRegression(solver='liblinear'))) 


results = [] 
names = [] 
stats =[]
scoring = 'accuracy' 
for name, model in models:   
    print(name)
    print("Check against Validation Set ...")
    model.fit(Encode_X_train, Y_train)
    Ypred  = model.predict(Encode_X_val)
    a = ConMatAndStats(Y_val, Ypred)
    print("Test check ...")
    Ypred = model.predict(Encode_Xtest)
    stats.append(ConMatAndStats (Ytest, Ypred))
    
    print() 

CART
Check against Validation Set ...
Confusion Matrix
[[15973    39]
 [   27 15986]]
Accuracy: 0.998  F1: 0.998  MCC: 0.996
Test check ...
Confusion Matrix
[[19821   258]
 [18751  1328]]
Accuracy: 0.527  F1: 0.676  MCC: 0.137

LR
Check against Validation Set ...
Confusion Matrix
[[15562   450]
 [  246 15767]]
Accuracy: 0.978  F1: 0.978  MCC: 0.957
Test check ...
Confusion Matrix
[[19396   683]
 [  176 19903]]
Accuracy: 0.979  F1: 0.978  MCC: 0.958



## Result

The Logistic Regression model maintains better accuracy against the Test dataset and so would be the best available result so far, fir future tests only include LR

In [13]:
models = []  
models.append(('LR', LogisticRegression(solver='liblinear'))) 

.

.

.

.

# Mutual Information Theoretic Selection

Concatenate the encoded X matrices onto the train and test sets.

For the expanded training set determine the features with highest mutual information against the test class. 

In [None]:
XplusEncode = numpy.concatenate((X, EncodeX), axis=1)
XtestplusEncode = numpy.concatenate((Xtest, Encode_Xtest), axis=1)
XtrainplusEncode = numpy.concatenate((X_train, Encode_X_train), axis=1)
XvalplusEncode = numpy.concatenate((X_val, Encode_X_val), axis=1)

In [None]:
from sklearn.feature_selection import mutual_info_classif
BestFits = mutual_info_classif(XplusEncode,Y)

### Select

And select the top features to be retained for modelling

In [None]:
Features_selected = 5

TopX = XplusEncode[:,BestFits.argsort()[-Features_selected:]]
TopXtest = XtestplusEncode[:,BestFits.argsort()[-Features_selected:]]
TopXtrain = XtrainplusEncode[:,BestFits.argsort()[-Features_selected:]]
TopXval = XvalplusEncode[:,BestFits.argsort()[-Features_selected:]]

### Retest Models

Then retest the decision tree and logistic regression models agains these

In [None]:
results = [] 
names = [] 
stats =[]
scoring = 'accuracy' 
for name, model in models:   
    print(name)
    print("Check against Validation Set ...")
    model.fit(TopXtrain, Y_train)
    Ypred  = model.predict(TopXval)
    a = ConMatAndStats(Y_val, Ypred)
    print("Test check ...")
    Ypred = model.predict(TopXtest)
    stats.append(ConMatAndStats (Ytest, Ypred))
    
    print() 

## Similar Results?

results vary on runs but are similar to just using the output from the stacked autoencoder

.

.

.

.

# Decision Tree Wrapper

Now try with a Decison Tree Wrapper. Select larger group of top features from the previous Mutual Information ranking, then pass these to the decion tree wrapper t select the same number of features as previously used in the models. See if this gives a better result

In [None]:
# needed to install package for tree wrapper
# !pip install mlxtend 

In [None]:
More_selected = 30

MoreX = XplusEncode[:,BestFits.argsort()[-More_selected:]]
MoreXtest = XtestplusEncode[:,BestFits.argsort()[-More_selected:]]
MoreXtrain = XtrainplusEncode[:,BestFits.argsort()[-More_selected:]]
MoreXval = XvalplusEncode[:,BestFits.argsort()[-More_selected:]]

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector

In [None]:
feature_selector = SequentialFeatureSelector(DecisionTreeClassifier(),
           k_features=8, #Features_selected,
           forward=True,
           verbose=1,
           scoring='roc_auc',
           cv=4,
           n_jobs=-1)

In [None]:
features = feature_selector.fit(MoreX, Y)

In [None]:
features.k_feature_idx_

In [None]:
TopTreeX = MoreX[:,features.k_feature_idx_]
TopTreeXtest = MoreXtest[:,features.k_feature_idx_]
TopTreeXtrain = MoreXtrain[:,features.k_feature_idx_]
TopTreeXval = MoreXval[:,features.k_feature_idx_]

In [None]:
results = [] 
names = [] 
stats =[]
scoring = 'accuracy' 
for name, model in models:   
    print(name)
    print("Check against Validation Set ...")
    model.fit(TopTreeXtrain, Y_train)
    Ypred  = model.predict(TopTreeXval)
    a = ConMatAndStats(Y_val, Ypred)
    print("Test check ...")
    Ypred = model.predict(TopTreeXtest)
    stats.append(ConMatAndStats (Ytest, Ypred))
    
    print()  

# More Neural Nets 

Basd on the outputs from the selections above


# 1) NN using All X features, plus autoencoder output

In [None]:
tf.random.set_seed(random_seed)

DataAndAutoencoderModel = Sequential()
DataAndAutoencoderModel.add(Dense(XtrainplusEncode.shape[1], input_dim=XtrainplusEncode.shape[1], activation='relu')) 
DataAndAutoencoderModel.add(Dense(NNLayerSize, activation='relu')) 
DataAndAutoencoderModel.add(Dense(1, activation='sigmoid'))
DataAndAutoencoderModel.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
DataAndAutoencoderModel.fit(XtrainplusEncode, Y_train, validation_data=(XvalplusEncode, Y_val), epochs=10, batch_size=500)

In [None]:
Ypred = DataAndAutoencoderModel.predict_classes(XtestplusEncode)
a = ConMatAndStats (Ytest, Ypred)

# 2) NN using output from the MI filter

In [None]:
tf.random.set_seed(random_seed)

MIFilteredModel = Sequential()
MIFilteredModel.add(Dense(TopXtrain.shape[1], input_dim=TopXtrain.shape[1], activation='relu')) 
MIFilteredModel.add(Dense(NNLayerSize, activation='relu')) 
MIFilteredModel.add(Dense(1, activation='sigmoid'))
MIFilteredModel.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
MIFilteredModel.fit(TopXtrain, Y_train, validation_data=(TopXval, Y_val), epochs=10, batch_size=500)

In [None]:
Ypred = MIFilteredModel.predict_classes(TopXtest)
a = ConMatAndStats (Ytest, Ypred)

# 3) NN using Output from the Tree Wrapper

In [None]:
tf.random.set_seed(random_seed)

MIFilteredPLusTreeWrapperModel = Sequential()
MIFilteredPLusTreeWrapperModel.add(Dense(TopTreeXtrain.shape[1], input_dim=TopTreeXtrain.shape[1], activation='relu')) 
MIFilteredPLusTreeWrapperModel.add(Dense(NNLayerSize, activation='relu')) 
MIFilteredPLusTreeWrapperModel.add(Dense(1, activation='sigmoid'))
MIFilteredPLusTreeWrapperModel.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
MIFilteredPLusTreeWrapperModel.fit(TopTreeXtrain, Y_train, validation_data=(TopTreeXval, Y_val), epochs=10, batch_size=500)

In [None]:
Ypred = MIFilteredPLusTreeWrapperModel.predict_classes(TopTreeXtest)
a = ConMatAndStats (Ytest, Ypred)

In [None]:
end = time.time()
print("Full Run Time = %.3f" % (end - start))