# Evaluation of Classifiers on full dataset (05/17/2017)

The following are details of my experiments using the Random Forest, Logistic Regression and Multilayer Perceptron learning methods.

In [1]:
import time
import matplotlib
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.manifold import Isomap
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from utils import load_data_h5, combine_positive_negative_data
from keras_utils import precision, recall
from keras import losses
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline

# Apply the ggplot look to every figure drawn in this notebook.
matplotlib.style.use('ggplot')

# Shared colour mapping: values in [0, 1] are mapped onto the Spectral colormap.
# No plt.clf() here: with no figure open it only creates (and displays) an
# empty figure as the cell's output.
norm = mpl.colors.Normalize(vmin=0, vmax=1)
cmap = cm.Spectral
m = cm.ScalarMappable(norm=norm, cmap=cmap)

Using TensorFlow backend.


<matplotlib.figure.Figure at 0x7fe24fe607b8>

In [2]:
# Seed shared by the train/test split below and by the estimators in later cells.
random_state = 0

# Load the two subsets of the feature file; mode=1 / mode=0 select what is
# stored as X_p / X_n (presumably the positive and negative examples).
# time.perf_counter() replaces time.clock(), which was removed in Python 3.8.
t0 = time.perf_counter()
X_p, y_p = load_data_h5("data/ml_pro_features_labels.h5", mode=1)
X_n, y_n = load_data_h5("data/ml_pro_features_labels.h5", mode=0)

print("Data loaded in ", (time.perf_counter() - t0), " seconds.")

# NOTE(review): the saved output of this cell ends in
# "TypeError: only integer scalar arrays can be converted to a scalar index"
# right after the load message. The failing call is not visible here; check
# combine_positive_negative_data in utils first.
X = combine_positive_negative_data(X_n, X_p)
y = combine_positive_negative_data(y_n, y_p)

# Hold out 20% of the samples for testing; the split is seeded so that it is
# reproducible across runs.
t1 = time.perf_counter()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=random_state)
t2 = time.perf_counter()

print("Train-Test split: ", X_train.shape[0], "(train)", "\t", X_test.shape[0], "(test)")
# Elapsed time is end minus start (the original printed t1 - t2, a negative value).
print("Train-Test split completed in ", (t2 - t1), " seconds.")

Data loaded in  1.7797090000000004  seconds.


TypeError: only integer scalar arrays can be converted to a scalar index

In [None]:
# Baseline: logistic regression trained on the full feature set.
logistic0 = LogisticRegression(random_state=random_state)

# time.perf_counter() replaces time.clock(), which was removed in Python 3.8.
t5 = time.perf_counter()
logistic0.fit(X_train, y_train)
print("Logistic Regression trained on full features in ", (time.perf_counter() - t5), " seconds.")

logistic_preds0 = logistic0.predict(X_test)
# NOTE(review): generate_report is neither defined nor imported in the visible
# cells; confirm where it comes from before running on a fresh kernel.
generate_report("Logistic: Full Features", logistic0, logistic_preds0, y_test)

In [None]:
# Multilayer perceptron: one sigmoid hidden layer as wide as the input,
# followed by a single sigmoid output unit for the binary label.
n_features = X_train.shape[1]

model0 = Sequential()
model0.add(Dense(n_features, input_shape=(n_features,), activation='sigmoid'))
model0.add(Dense(1, activation='sigmoid'))

model0.compile(optimizer=Adam(lr=1e-4), loss=losses.binary_crossentropy, metrics=[precision, recall])

# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that would emit a stray "None" line).
model0.summary()

# time.perf_counter() replaces time.clock(), which was removed in Python 3.8.
t7 = time.perf_counter()
# EarlyStopping() is used with its default settings; 25% of the training data
# is held out for validation.
model0.fit(X_train, y_train, batch_size=1000, epochs=100, validation_split=0.25, callbacks=[EarlyStopping()])
print("MLP trained on full features in ", (time.perf_counter() - t7), " seconds")

# Keep the raw sigmoid outputs and derive hard 0/1 labels at the 0.5 threshold
# in a separate array (same dtype), instead of overwriting the probabilities.
pred_probs = model0.predict(X_test)
preds = (pred_probs >= 0.5).astype(pred_probs.dtype)

# NOTE(review): generate_report is neither defined nor imported in the visible
# cells; confirm where it comes from before running on a fresh kernel.
generate_report("MLP: Full Features", model0, preds, y_test)

The multilayer perceptron outperforms the logistic regression in terms of correctly distinguishing true positives from false positives.

In [None]:
# Random forest on the full feature set, trees capped at depth 16.
rforest0 = RandomForestClassifier(max_depth=16, random_state=random_state)

# time.perf_counter() replaces time.clock(), which was removed in Python 3.8.
t1 = time.perf_counter()
rforest0.fit(X_train, y_train)
print("Random forest trained on full features in ", (time.perf_counter() - t1), " seconds.")

rforest_preds0 = rforest0.predict(X_test)
# NOTE(review): generate_report is neither defined nor imported in the visible
# cells; confirm where it comes from before running on a fresh kernel.
generate_report("Random Forest: Full Features", rforest0, rforest_preds0, y_test)

The random forest is, perhaps unsurprisingly, able to improve upon the results of the multilayer perceptron.


# Dimensionality Reduction

In [None]:
# Re-read both subsets from the feature file so this section starts from the
# data as stored on disk.
features_file = "data/ml_pro_features_labels.h5"
X_p, y_p = load_data_h5(features_file, mode=1)
X_n, y_n = load_data_h5(features_file, mode=0)

# Combine features and labels with the same argument order (mode=0 data
# first) so that rows of X and y stay aligned.
X = combine_positive_negative_data(X_n, X_p)
y = combine_positive_negative_data(y_n, y_p)

### Linear Dimensionality Reduction (PCA)

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.metrics import mean_squared_error

# Rescale the samples with sklearn's normalize (default settings) before PCA.
X_norm = normalize(X)

# Project onto two principal components, then map the projection back to the
# original feature space.
pca = PCA(n_components=2)
reduced_x_ = pca.fit_transform(X_norm)
reconstructed_x = pca.inverse_transform(reduced_x_)

# Mean squared reconstruction error: how much information the 2-D projection loses.
error = mean_squared_error(y_true=X_norm, y_pred=reconstructed_x)
print(error)

In [None]:
# pandas is used below but is not imported in any earlier visible cell;
# import it here so the cell does not fail with a NameError on a fresh kernel.
import pandas as pd

# Wrap the PCA projection and the labels in DataFrames for pandas plotting.
reduced_x_ = pd.DataFrame(reduced_x_)
y_labels = pd.DataFrame(y)

# scatter_matrix creates its own figure, so no plt.clf() is issued beforehand
# (it would only add a stray empty figure). Points are coloured by label.
sm = pd.plotting.scatter_matrix(reduced_x_, alpha=0.2, c=y_labels[0], figsize=(12, 12), diagonal='kde')

# Tidy every panel with a plain loop rather than list comprehensions that are
# evaluated only for their side effects.
for panel in sm.reshape(-1):
    panel.xaxis.label.set_rotation(45)
    panel.yaxis.label.set_rotation(0)
    # May need to offset label when rotating to prevent overlap of figure
    panel.get_yaxis().set_label_coords(-0.3, 0.5)
    # Hide all ticks
    panel.set_xticks(())
    panel.set_yticks(())

plt.show()

### Non-linear Dimensionality Reduction
Due to limitations in the scikit-learn dimensionality reduction techniques, an autoencoder deep network is trained to perform the non-linear dimensionality reduction.

### Autoencoder

In [None]:
from keras.layers import Input, Dense, BatchNormalization
from keras.models import Model
from keras import objectives
from keras import optimizers
from keras.callbacks import ModelCheckpoint
from keras.layers.advanced_activations import LeakyReLU, PReLU
from sklearn.preprocessing import normalize
from utils import load_data_h5
import matplotlib.pyplot as plt
import time
plt.style.use('ggplot')

# Wall-clock stamp that gives this run's checkpoint and figure unique file
# names. time.clock() returned elapsed CPU time rather than a timestamp and
# was removed in Python 3.8.
time_stamp = time.time()

X_, y_ = load_data_h5("data/ml_pro_features_labels.h5")
X_ = normalize(X_)

num_epochs = 1000
encoding_dim = 2
learning_rate = 1e-2

# Take the input width from the data instead of hard-coding 188, so the
# network follows the feature file if its column count changes.
n_features = X_.shape[1]
hidden_dims = (100, 50, 25)
# Initializer name passed to every PReLU layer as alpha_initializer.
alpha = 'glorot_uniform'

input_data = Input(shape=(n_features,))

# Encoder: Dense + PReLU pairs narrowing 100 -> 50 -> 25 -> encoding_dim.
encoded = input_data
for units in hidden_dims + (encoding_dim,):
    encoded = Dense(units)(encoded)
    encoded = PReLU(alpha_initializer=alpha)(encoded)

# Decoder: the mirror image, widening 25 -> 50 -> 100 -> n_features.
decoded = encoded
for units in hidden_dims[::-1] + (n_features,):
    decoded = Dense(units)(decoded)
    decoded = PReLU(alpha_initializer=alpha)(decoded)

# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that would emit a stray "None" line).
autoencoder = Model(input_data, decoded)
autoencoder.summary()

encoder = Model(input_data, encoded)
encoder.summary()

# Stand-alone decoder: feed a fresh encoding-sized input through the trailing
# decoder layers of the autoencoder (one Dense and one PReLU per decoder
# stage), reusing their weights.
n_decoder_layers = 2 * (len(hidden_dims) + 1)
encoded_input = Input(shape=(encoding_dim,))
decoder_output = encoded_input
for layer in autoencoder.layers[-n_decoder_layers:]:
    decoder_output = layer(decoder_output)

decoder = Model(encoded_input, decoder_output)
decoder.summary()

# Use the Adam class, as the MLP cell does, rather than the lowercase alias.
autoencoder.compile(optimizer=optimizers.Adam(lr=learning_rate), loss=objectives.mean_squared_error)

# Train the network to reproduce its own input; the checkpoint file is
# rewritten after each epoch.
autoencoder.fit(
    X_,
    X_,
    epochs=num_epochs,
    batch_size=64,
    shuffle=True,
    validation_split=0.2,
    callbacks=[ModelCheckpoint(str(time_stamp) + "_model.h5")],
)

# Plot the 2-D codes produced by the encoder, coloured by label, and save the figure.
reduced_x = encoder.predict(X_)
plt.clf()
plt.scatter(reduced_x[:, 0], reduced_x[:, 1], c=y_, s=10)
plt.savefig(str(time_stamp) + "_final_dim_reduction.png")