# Classification Problem using logistic regression vs. using a Neural Network

Let's start by importing numpy and other important packages for this experiment.

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import make_gaussian_quantiles
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score, recall_score, classification_report
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

# Using sklearn datasets, create a gaussian quantiles dataset and load it into variables X and y.

In [3]:
def load_dataset():
    """
     This method creates a data set
     :param X input
     :param y labels labels for roc curves
     :param batch_size size of the batch
     :return X and y
    """
    N = 2000
    X, y = make_gaussian_quantiles(n_samples=N, n_features=2, n_classes=2)
    # X, y = make_moons(N, noise=0.20)
    return X, y

First see how logistic regression performs on this problem. 
We can use sklearn's built-in functions to do that. 
Run the code below to train a logistic regression classifier on the dataset.

In [4]:
X, y = load_dataset()
print('X={}, Y={}, #sample_points={}'.format(X.shape, y.shape, X.shape[0]))
scaler = RobustScaler()
X = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
lr = LogisticRegression(solver='newton-cg')
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

X=(2000, 2), Y=(2000,), #sample_points=2000


Adding a method to report on classification metrics.

In [6]:
def classifiers_metric_report(models, labels, X_test, y_test):
    """
    This method generates performance classification metric reports for models
    :param models: list of models
    :param labels labels for roc curves
    :param X_test: testing inout
    :param y_test: ground truth
    :return report

    """

    columns = ['Precision', 'Recall', 'F1-Score', 'Specificity']
    results = pd.DataFrame(0.0, columns=columns, index=labels)

    for i, model in enumerate(models):
        y_pred = model.predict(X_test)

        precision = precision_score(y_test, y_pred)
        results.iloc[i, 0] = precision

        recall = recall_score(y_test, y_pred, average='weighted')
        results.iloc[i, 1] = recall

        f1 = f1_score(y_test, y_pred, average='weighted')
        results.iloc[i, 2] = f1

        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

        specificity = tn / (tn + fp)
        results.iloc[i, 3] = specificity

    return results

We can now check the performance of our classifier.

In [7]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.46      0.45      0.45       202
           1       0.45      0.46      0.46       198

   micro avg       0.46      0.46      0.46       400
   macro avg       0.46      0.46      0.45       400
weighted avg       0.46      0.46      0.45       400



In [10]:
report = classifiers_metric_report([lr], ['Logistic Regression'], X_test, y_test)
report.round(3)

Unnamed: 0,Precision,Recall,F1-Score,Specificity
Logistic Regression,0.451,0.455,0.455,0.446


# Now lets build a Neural Network model with one hidden layer.

* We define the fully connected Neural Network and use
    1. One hidden layer with 3 nodes
    2. The activation function for the inputs is tanh and for the output, we use the sigmoid

In [11]:
class ANN:

    ## Fully Connected Neural Network

    def __init__(self, feature_size, learning_rate):
        # Trainable parameters.
        hidden_size = 3
        output_size = 1

        self.W1 = np.random.randn(hidden_size, feature_size) * 0.01
        self.b1 = np.zeros(shape=(hidden_size, 1))
        self.W2 = np.random.randn(output_size, hidden_size) * 0.01
        self.b2 = np.zeros(shape=(output_size, 1))
        print('W1={}, b1={}, W2={}, b2={}'.format(self.W1.shape, self.b1.shape, self.W2.shape, self.b2.shape))
        self.learning_rate = learning_rate

    def feedforward(self, X):
        self.z_h = np.dot(self.W1, X) + self.b1
        self.z = np.tanh(self.z_h)
        self.z0 = np.dot(self.W2, self.z) + self.b2
        self.output = self.sigmoid(self.z0)
        return self.output

    def backprop(self, X, y):
        # Application of the chain rule to find derivative of the loss function with respect to weights2 and weights1
        m = X.shape[1]
        E0 = (self.output - y)
        dW2 = (1 / m) * np.dot(E0, self.z.T)
        db2 = (1 / m) * np.sum(E0, axis=1, keepdims=True)
        E1 = np.dot(self.W2.T, E0) * (1 - np.power(self.z, 2))
        dW1 = (1 / m) * np.dot(E1, X.T)
        db1 = (1 / m) * np.sum(E1, axis=1, keepdims=True)
        self.W1 -= self.learning_rate * dW1
        self.b1 -= self.learning_rate * db1
        self.W2 -= self.learning_rate * dW2
        self.b2 -= self.learning_rate * db2

    def compute_logloss_cost(self, output, y):
        m = y.shape[1]  # number of examples

        # Compute the cross-entropy cost
        logprobs = np.multiply(np.log(output), y) + np.multiply((1 - y), np.log(1 - output))
        cost = - np.sum(logprobs) / m

        cost = np.squeeze(cost)  # makes sure cost is the dimension we expect.
        return cost

    def predict(self, X):
        ### Prediction based on log loss architecture
        z = np.tanh(np.dot(self.W1, X) + self.b1)
        predictions = self.sigmoid(np.dot(self.W2, z) + self.b2)
        predictions = np.squeeze(np.where(predictions > 0.5, 1, 0))
        return predictions

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def sigmoid_derivative(self, z):
        return self.sigmoid(z) * (1 - self.sigmoid(z))


## Now we can run the Model on the entire data with 1 hidden layer (3 neurons) and 100000 epochs.

In [16]:
def train(X, y, batch_size, learning_rate=0.1, num_iterations=10000):
    """
    This method trains the neural network
    :param X input
    :param y labels labels for roc curves
    :param batch_size size of the batch
    :param learning_rate
    :param num_iterations
    :return neural network
    """

    print('X={}, y={}'.format(X.shape, y.shape))

    nn = ANN(X.shape[0], learning_rate)

    # Full loop iteration
    for i in range(num_iterations):
        output = nn.feedforward(X)
        if i % 1000 == 0:
            cost = nn.compute_logloss_cost(output, y)
            print("Cost after iteration %i: %f" % (i, cost))
        nn.backprop(X, y)
    return nn

In [17]:
nn = train(X_train.T, y_train.reshape(-1, 1).T, batch_size=200)

X=(2, 1600), y=(1, 1600)
W1=(3, 2), b1=(3, 1), W2=(1, 3), b2=(1, 1)
Cost after iteration 0: 0.693145
Cost after iteration 1000: 0.693112
Cost after iteration 2000: 0.692979
Cost after iteration 3000: 0.692710
Cost after iteration 4000: 0.688988
Cost after iteration 5000: 0.537733
Cost after iteration 6000: 0.475228
Cost after iteration 7000: 0.460478
Cost after iteration 8000: 0.253094
Cost after iteration 9000: 0.195807


# Let's generate a report on the performances of our neural network classifier.

In [18]:
y_pred = nn.predict(X_test.T)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.91      0.93       202
           1       0.91      0.95      0.93       198

   micro avg       0.93      0.93      0.93       400
   macro avg       0.93      0.93      0.93       400
weighted avg       0.93      0.93      0.93       400



In [19]:
report = classifiers_metric_report([nn], ['Neural Network'], X_test.T, y_test)
report.round(3)

Unnamed: 0,Precision,Recall,F1-Score,Specificity
Neural Network,0.913,0.932,0.932,0.911


Overall the performances are much better given that we did not tune our logistic classifier hyperparameters using a grid search.
Yet unlike logistic regression, Neural Networks are able to learn even highly non-linear decision boundaries.

TODO: try more hidden layers, also check batch update.