In [1]:
# Importing NumPy and a few other libraries for utility functions

import numpy as np
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Calling make_classification to generate a synthetic binary classification problem
# Data-set size: 1 million, 50 dimensional instances, of which only 10 are informative
# Other parameters: 5 redundant features (linear combinations of useful features),
#                   weights = [0.7] (class proportions; only the first class's share is given),
#                   class_sep = 0.7 (cluster separation factor)

X, y = make_classification(n_samples=1000000, n_features=50, n_informative=10, n_redundant=5,
                           n_classes=2, weights=[0.7], class_sep=0.7, random_state=42)

In [3]:
# Show the shape of the instance matrix, followed by the shape of the label vector

print(X.shape, y.shape, sep="\n")

(1000000, 50)
(1000000,)


In [4]:
# Standardizing the data-set with a StandardScaler so that every feature has a mean of
# zero and a standard deviation of 1.
# NOTE(review): the scaler is fitted on the full data-set, i.e. before the train/test split
# performed later, so test-set statistics influence the scaling - confirm this is intended.

scaler = StandardScaler().fit(X)
X = scaler.transform(X)

In [5]:
# Sanity check: overall mean and standard deviation of the transformed data-set.

print(X.mean(), X.std(), sep="\n")

-8.288745334539272e-16
1.0000000000000016


In [6]:
# Calling train_test_split to break the data set into 75% training data and
# 25% test data (fixed random_state for a repeatable split).

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [7]:
# Show the shapes of the training instances and labels, then of the testing instances and labels.

print(*(part.shape for part in (X_train, y_train, X_test, y_test)), sep="\n")

(750000, 50)
(750000,)
(250000, 50)
(250000,)


In [8]:
# Standard implementation of the Sigmoid function
# The Sigmoid function maps the value into the range [0, 1].

def sigmoid(X_train, weights):
    """Return the logistic sigmoid of the linear scores ``np.dot(X_train, weights)``.

    Parameters
    ----------
    X_train : array-like
        Instances to score; anything ``np.dot`` accepts as its left operand.
    weights : array-like
        Weights combined with ``X_train`` through ``np.dot``.

    Returns
    -------
    Sigmoid of every score. Mathematically the values lie strictly between 0 and 1;
    in floating point, extreme scores saturate to exactly 0.0 or 1.0.
    """
    unbounded_predictions = np.dot(X_train, weights)

    # 1 / (1 + exp(-z)) == exp(-log(1 + exp(-z))). np.logaddexp(0, -z) evaluates
    # log(1 + exp(-z)) without exponentiating a large positive number, so very
    # negative scores no longer overflow inside np.exp.
    return np.exp(-np.logaddexp(0, -unbounded_predictions))

In [9]:
# Vectorized implementation of the Gradient Descent algorithm

def gradient_descent(X_train, y_train, learning_rate=0.01, n_iters=1000, random_state=None):
    """Estimate logistic-regression weights with full-batch gradient descent.

    Parameters
    ----------
    X_train : ndarray of shape (n_samples, n_features)
        Training instances. A column of 1s is prepended internally for the bias term.
    y_train : ndarray of shape (n_samples,)
        Labels aligned with the rows of ``X_train`` (the notebook passes 0/1 labels).
    learning_rate : float, default 0.01
        Step size applied to every weight update.
    n_iters : int, default 1000
        Number of passes over the whole training set.
    random_state : int or None, default None
        Seed for the random weight initialization. ``None`` keeps the previous
        behaviour of drawing from NumPy's global random state.

    Returns
    -------
    ndarray of shape (n_features + 1,)
        Estimated weights; index 0 is the bias term.

    Raises
    ------
    ValueError
        If ``X_train`` contains no instances (the averaged gradient would be 0/0).
    """
    # Appending an extra column of 1s into the training data
    X_train = np.hstack((np.ones((X_train.shape[0], 1)), X_train))
    n_samples, n_weights = X_train.shape
    if n_samples == 0:
        raise ValueError("X_train must contain at least one instance")

    # Initializing the weights with random values in [0, 1)
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    weights = rng.random_sample(n_weights)

    for _ in range(n_iters):
        # Making a prediction for the current value of weights
        predictions = sigmoid(X_train, weights)

        # Residual (label minus predicted probability) for every training instance
        errors = y_train - predictions

        # X^T (y - p) / n is the gradient of the average log-likelihood, so adding it
        # to the weights descends the average log-loss.
        gradients = np.dot(X_train.T, errors) / n_samples

        # Updating the weights
        weights += learning_rate * gradients

    # Returning the estimated weights
    return weights

In [10]:
# Making predictions for the testing data using the estimated weights

def make_prediction(X_test, weights):
    """Predict a class (0.0 or 1.0) for every row of ``X_test`` using ``weights``.

    A bias column of 1s is prepended to ``X_test``, the sigmoid of the linear scores
    is computed, and values below 0.5 map to class 0 while all others map to class 1.
    """
    # Build the design matrix: a leading column of 1s followed by the test features
    bias_column = np.ones((X_test.shape[0], 1))
    design_matrix = np.hstack((bias_column, X_test))

    # Sigmoid outputs for the estimated weights
    probabilities = sigmoid(design_matrix, weights)

    # Threshold at 0.5, keeping the dtype of the sigmoid output
    below_threshold = probabilities < 0.50
    return np.where(below_threshold, 0, 1).astype(probabilities.dtype)

In [11]:
%%time

# Fit the weights with the (full-batch) Gradient Descent algorithm.
# The cell is timed so its running time can be benchmarked against the other variants.

weights = gradient_descent(X_train=X_train, y_train=y_train)

CPU times: user 5min 7s, sys: 4min 17s, total: 9min 25s
Wall time: 25.4 s


In [12]:
# Classify the testing data with the estimated weights, then print the predicted
# classes followed by the accuracy score.

predictions = make_prediction(X_test, weights)
print(predictions, accuracy_score(y_test, predictions), sep="\n")

[0. 0. 0. ... 0. 0. 1.]
0.724364


In [13]:
# Vectorized implementation of the Stochastic Gradient Descent algorithm

def stochastic_gradient_descent(X_train, y_train, learning_rate=0.01, n_iters=1000, random_state=None):
    """Estimate logistic-regression weights with stochastic gradient descent.

    Every iteration draws one training instance uniformly at random (with
    replacement across iterations) and updates the weights from that instance alone.

    Parameters
    ----------
    X_train : ndarray of shape (n_samples, n_features)
        Training instances. A column of 1s is prepended internally for the bias term.
    y_train : ndarray of shape (n_samples,)
        Labels aligned with the rows of ``X_train`` (the notebook passes 0/1 labels).
    learning_rate : float, default 0.01
        Step size applied to every weight update.
    n_iters : int, default 1000
        Number of single-instance updates to perform.
    random_state : int or None, default None
        Seed for the weight initialization and the instance sampling. ``None`` keeps
        the previous behaviour of drawing from NumPy's global random state.

    Returns
    -------
    ndarray of shape (n_features + 1,)
        Estimated weights; index 0 is the bias term.

    Raises
    ------
    ValueError
        If ``X_train`` contains no instances to sample from.
    """
    # Appending an extra column of 1s into the training data
    X_train = np.hstack((np.ones((X_train.shape[0], 1)), X_train))
    n_samples, n_weights = X_train.shape
    if n_samples == 0:
        raise ValueError("X_train must contain at least one instance")

    # Initializing the weights with random values in [0, 1)
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    weights = rng.random_sample(n_weights)

    for _ in range(n_iters):
        # Picking a random index in the valid index range (kept as a length-1 array
        # so the sampled instance stays two-dimensional)
        idx = rng.randint(0, n_samples, size=1)

        # Picking the corresponding instance, and its label
        X_sampled = X_train[idx, :]
        y_sampled = y_train[idx]

        # Making a prediction for the current value of weights
        predictions = sigmoid(X_sampled, weights)

        # Residual (label minus predicted probability) for the picked instance only
        error = y_sampled - predictions

        # Gradient from the single instance; no averaging is needed for a batch of one
        gradient = np.dot(X_sampled.T, error)

        # Updating the weights
        weights += learning_rate * gradient

    # Returning the estimated weights
    return weights

In [14]:
%%time

# Fit the weights with the Stochastic Gradient Descent algorithm.
# The cell is timed so its running time can be benchmarked against the other variants.

weights = stochastic_gradient_descent(X_train=X_train, y_train=y_train)

CPU times: user 237 ms, sys: 517 ms, total: 754 ms
Wall time: 69.5 ms


In [15]:
# Classify the testing data with the estimated weights, then print the predicted
# classes followed by the accuracy score.

predictions = make_prediction(X_test, weights)
print(predictions, accuracy_score(y_test, predictions), sep="\n")

[0. 1. 0. ... 0. 1. 1.]
0.70068


In [16]:
# Vectorized implementation of the Mini-batch Gradient Descent algorithm

def mini_batch_gradient_descent(X_train, y_train, learning_rate=0.01, n_iters=1000,
                                batch_size=None, random_state=None):
    """Estimate logistic-regression weights with mini-batch gradient descent.

    Every iteration draws a batch of distinct training instances at random and
    updates the weights with the gradient averaged over that batch.

    Parameters
    ----------
    X_train : ndarray of shape (n_samples, n_features)
        Training instances. A column of 1s is prepended internally for the bias term.
    y_train : ndarray of shape (n_samples,)
        Labels aligned with the rows of ``X_train`` (the notebook passes 0/1 labels).
    learning_rate : float, default 0.01
        Step size applied to every weight update.
    n_iters : int, default 1000
        Number of mini-batch updates to perform.
    batch_size : int or None, default None
        Instances per batch. ``None`` uses 1/100th of the training set. The value is
        clamped to the range [1, n_samples], so small training sets no longer produce
        an empty batch (which previously turned the weights into NaN).
    random_state : int or None, default None
        Seed for the weight initialization and the batch sampling. ``None`` keeps
        the previous behaviour of drawing from NumPy's global random state.

    Returns
    -------
    ndarray of shape (n_features + 1,)
        Estimated weights; index 0 is the bias term.

    Raises
    ------
    ValueError
        If ``X_train`` contains no instances to sample from.
    """
    # Appending an extra column of 1s into the training data
    X_train = np.hstack((np.ones((X_train.shape[0], 1)), X_train))
    n_samples, n_weights = X_train.shape
    if n_samples == 0:
        raise ValueError("X_train must contain at least one instance")

    # Initializing the weights with random values in [0, 1)
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    weights = rng.random_sample(n_weights)

    # Defaulting the batch size to 1 / 100th the size of the training data, and
    # clamping it so a batch is never empty nor larger than the training set
    if batch_size is None:
        batch_size = n_samples // 100
    sample_size = min(max(int(batch_size), 1), n_samples)

    for _ in range(n_iters):
        # Picking distinct random indexes in the valid index range
        idx = rng.choice(n_samples, size=sample_size, replace=False)

        # Picking the corresponding instances, and their labels to create a batch
        X_sampled = X_train[idx, :]
        y_sampled = y_train[idx]

        # Making a prediction for the current value of weights
        predictions = sigmoid(X_sampled, weights)

        # Residuals (label minus predicted probability) for the batch
        errors = y_sampled - predictions

        # Gradient averaged over the batch
        gradients = np.dot(X_sampled.T, errors) / sample_size

        # Updating the weights
        weights += learning_rate * gradients

    # Returning the estimated weights
    return weights

In [17]:
%%time

# Fit the weights with the Mini-batch Gradient Descent algorithm.
# The cell is timed so its running time can be benchmarked against the other variants.

weights = mini_batch_gradient_descent(X_train=X_train, y_train=y_train)

CPU times: user 1min 22s, sys: 3min 25s, total: 4min 48s
Wall time: 12.6 s


In [18]:
# Classify the testing data with the estimated weights, then print the predicted
# classes followed by the accuracy score.

predictions = make_prediction(X_test, weights)
print(predictions, accuracy_score(y_test, predictions), sep="\n")

[0. 0. 0. ... 0. 0. 0.]
0.72354


| Algorithm | Training time | Accuracy Score         
| :- |-------------: | :-:
| Gradient Descent | 25.4 sec | 0.7243
| Stochastic Gradient Descent | 69.5 ms | 0.7006
| Mini-Batch Gradient Descent | 12.6 sec | 0.7235