In [1]:
# Importing the NumPy, and few other libraries for utility functions

import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

In [2]:
# Generating a synthetic regression problem with scikit-learn's make_regression function
# Data-set size: 1 million instances with 50 features each, of which only 10 features are informative
# Other parameters: noise = 2 (The std of Gaussian noise added to the output),
# random_state = 42 (a fixed seed for the data generation)

X, y = make_regression(n_samples=1000000, n_features=50, n_informative=10, noise=2, random_state=42)

In [3]:
# Printing the shape of the instance matrix X, and of the corresponding target vector y

print(X.shape) 
print(y.shape)

(1000000, 50)
(1000000,)


In [4]:
# Initializing a StandardScaler object, and using it to transform the data-set so that it has
# mean equal to zero, and standard deviation equal to 1.
# NOTE(review): the scaler is fitted on the full data-set, before the train/test split that is
# made in a later cell, so the test instances contribute to the scaling statistics. Consider
# fitting the scaler on the training split only.
# X is overwritten with its scaled version; the unscaled features are not kept.

scaler = StandardScaler()
X = scaler.fit_transform(X)

In [5]:
# Printing out the mean, and standard deviation of the transformed data-set.
# Both statistics are computed over all entries of X at once (no axis is given),
# not separately for every feature.

print(X.mean())
print(X.std())

2.1572077457676642e-18
0.9999999999999958


In [6]:
# Calling train_test_split to break the data set into 75% training data, and
# 25% test data (test_size=0.25), with a fixed random_state of 42.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [7]:
# Printing out the shapes of the training, and testing instances, along with their corresponding targets.

print(X_train.shape) 
print(y_train.shape) 
print(X_test.shape) 
print(y_test.shape)

(750000, 50)
(750000,)
(250000, 50)
(250000,)


In [8]:
# Vectorized implementation of the Gradient Descent algorithm

def gradient_descent(X_train, y_train, learning_rate=0.01, n_iters=1000, random_state=None):
    """Estimate linear-regression weights with full-batch gradient descent.

    Every iteration uses the whole training set to compute the residuals and
    moves the weights one step of size ``learning_rate`` in the direction that
    reduces the mean squared error.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Training instances. A column of ones is prepended internally so that
        the first weight acts as the intercept.
    y_train : array-like of shape (n_samples,) or (n_samples, 1)
        Training targets. They are flattened to one dimension, because a
        column vector would otherwise broadcast the residuals to a
        (n_samples, n_samples) matrix.
    learning_rate : float, default=0.01
        Step size of every weight update.
    n_iters : int, default=1000
        Number of full-batch updates to perform.
    random_state : int or None, default=None
        Seed for the random weight initialisation. With ``None`` the weights
        are drawn from NumPy's global random state.

    Returns
    -------
    numpy.ndarray of shape (n_features + 1,)
        The estimated weights; index 0 is the intercept.

    Raises
    ------
    ValueError
        If ``X_train`` is not two-dimensional, the training set is empty, or
        the number of targets differs from the number of instances.
    """
    X_train = np.asarray(X_train, dtype=float)
    y_train = np.asarray(y_train, dtype=float).reshape(-1)
    if X_train.ndim != 2:
        raise ValueError("X_train must be a 2-D array of shape (n_samples, n_features)")
    n_samples = X_train.shape[0]
    if n_samples == 0:
        raise ValueError("X_train must contain at least one instance")
    if y_train.shape[0] != n_samples:
        raise ValueError("X_train and y_train must contain the same number of instances")

    # Appending an extra column of 1s into the training data (intercept term)
    X_train = np.hstack((np.ones((n_samples, 1)), X_train))

    # Initializing the weights with random values in [0, 1); an explicit seed
    # gets its own generator so the global random state is left untouched
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    weights = rng.random_sample(X_train.shape[1])

    for _ in range(n_iters):
        # Residuals (target minus prediction) for every training instance
        residuals = y_train - np.dot(X_train, weights)

        # X^T.residuals / n points downhill on the squared error, which is why
        # it is added to (not subtracted from) the weights below
        step = np.dot(X_train.T, residuals) / n_samples

        # Updating the weights
        weights += learning_rate * step

    # Returning the estimated weights
    return weights

In [9]:
# Making predictions for the testing data using the estimated weights

def make_prediction(X_test, weights):
    """Predict targets for ``X_test`` with a linear model.

    A column of ones is placed in front of ``X_test`` so that ``weights[0]``
    acts as the intercept; the result is the product of that augmented matrix
    with ``weights``.
    """
    # Bias column: one row of 1 per test instance
    n_rows = X_test.shape[0]
    bias_column = np.ones((n_rows, 1))

    # Augmented design matrix [1 | X_test]
    design_matrix = np.concatenate((bias_column, X_test), axis=1)

    # Predicted targets for every row of the design matrix
    return design_matrix.dot(weights)

In [10]:
%%time

# Estimating the weights using the Gradient Descent algorithm
# (1000 iterations, each one computed on the full training set)
# Timing the cell to evaluate the running time, and benchmarking with other variants

weights = gradient_descent(X_train, y_train)

Wall time: 40.7 s


In [11]:
# Making the predictions on the testing data with the Gradient Descent weights, and printing
# the predictions followed by their mean squared error against the true test targets
predictions = make_prediction(X_test, weights)
print(predictions)
print(mean_squared_error(y_test, predictions))

[  68.07738807  -66.5145115    66.4842945  ... -213.02143821  -69.47491553
 -420.83642945]
3.9953282406388966


In [12]:
# Vectorized implementation of the Stochastic Gradient Descent algorithm

def stochastic_gradient_descent(X_train, y_train, learning_rate=0.01, n_iters=1000, random_state=None):
    """Estimate linear-regression weights with stochastic gradient descent.

    Every iteration picks one training instance uniformly at random (with
    replacement across iterations) and updates the weights using the residual
    of that single instance only.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Training instances. A column of ones is prepended internally so that
        the first weight acts as the intercept.
    y_train : array-like of shape (n_samples,) or (n_samples, 1)
        Training targets. They are flattened to one dimension, because a
        column vector would otherwise give the residual, and then the
        gradient, an extra dimension.
    learning_rate : float, default=0.01
        Step size of every weight update.
    n_iters : int, default=1000
        Number of single-instance updates to perform.
    random_state : int or None, default=None
        Seed for the weight initialisation and the instance sampling. With
        ``None`` both draw from NumPy's global random state.

    Returns
    -------
    numpy.ndarray of shape (n_features + 1,)
        The estimated weights; index 0 is the intercept.

    Raises
    ------
    ValueError
        If ``X_train`` is not two-dimensional, the training set is empty, or
        the number of targets differs from the number of instances.
    """
    X_train = np.asarray(X_train, dtype=float)
    y_train = np.asarray(y_train, dtype=float).reshape(-1)
    if X_train.ndim != 2:
        raise ValueError("X_train must be a 2-D array of shape (n_samples, n_features)")
    n_samples = X_train.shape[0]
    if n_samples == 0:
        raise ValueError("X_train must contain at least one instance")
    if y_train.shape[0] != n_samples:
        raise ValueError("X_train and y_train must contain the same number of instances")

    # Appending an extra column of 1s into the training data (intercept term)
    X_train = np.hstack((np.ones((n_samples, 1)), X_train))

    # An explicit seed gets its own generator so the global random state is
    # left untouched; otherwise the global state is used
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Initializing the weights with random values in [0, 1)
    weights = rng.random_sample(X_train.shape[1])

    for _ in range(n_iters):
        # Picking a random index in the valid index range (array of length 1)
        idx = rng.randint(n_samples, size=1)

        # Picking the corresponding instance, and its target
        X_sampled = X_train[idx, :]
        y_sampled = y_train[idx]

        # Residual (target minus prediction) for only the picked instance
        residual = y_sampled - np.dot(X_sampled, weights)

        # x * residual points downhill on the squared error of this instance,
        # which is why it is added to (not subtracted from) the weights
        weights += learning_rate * np.dot(X_sampled.T, residual)

    # Returning the estimated weights
    return weights

In [13]:
%%time

# Estimating the weights using the Stochastic Gradient Descent algorithm
# (1000 updates, each one computed on a single randomly picked training instance)
# Timing the cell to evaluate the running time, and benchmarking with other variants
# NOTE: the name `weights` is reused, so the Gradient Descent estimate is replaced here

weights = stochastic_gradient_descent(X_train, y_train)

Wall time: 176 ms


In [14]:
# Making the predictions on the testing data with the Stochastic Gradient Descent weights, and
# printing the predictions followed by their mean squared error against the true test targets

predictions = make_prediction(X_test, weights)
print(predictions)
print(mean_squared_error(y_test, predictions))

[  70.05574816  -68.27151893   65.44607325 ... -212.98002917  -69.36914014
 -421.48958033]
5.342540781287521


In [15]:
# Vectorized implementation of the Mini-batch Gradient Descent algorithm

def mini_batch_gradient_descent(X_train, y_train, learning_rate=0.01, n_iters=1000,
                                batch_size=None, random_state=None):
    """Estimate linear-regression weights with mini-batch gradient descent.

    Every iteration draws a random batch of training instances (without
    replacement inside the batch) and updates the weights using the mean
    residual gradient of that batch.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Training instances. A column of ones is prepended internally so that
        the first weight acts as the intercept.
    y_train : array-like of shape (n_samples,) or (n_samples, 1)
        Training targets. They are flattened to one dimension, because a
        column vector would otherwise broadcast the residuals to a
        (batch_size, batch_size) matrix.
    learning_rate : float, default=0.01
        Step size of every weight update.
    n_iters : int, default=1000
        Number of mini-batch updates to perform.
    batch_size : int or None, default=None
        Number of instances per batch. ``None`` uses 1/100th of the training
        data, but never fewer than one instance: with fewer than 100 rows the
        integer division gives 0, and an empty batch would turn every weight
        into NaN. Values larger than the training set are clamped to it.
    random_state : int or None, default=None
        Seed for the weight initialisation and the batch sampling. With
        ``None`` both draw from NumPy's global random state.

    Returns
    -------
    numpy.ndarray of shape (n_features + 1,)
        The estimated weights; index 0 is the intercept.

    Raises
    ------
    ValueError
        If ``X_train`` is not two-dimensional, the training set is empty, the
        number of targets differs from the number of instances, or
        ``batch_size`` is smaller than 1.
    """
    X_train = np.asarray(X_train, dtype=float)
    y_train = np.asarray(y_train, dtype=float).reshape(-1)
    if X_train.ndim != 2:
        raise ValueError("X_train must be a 2-D array of shape (n_samples, n_features)")
    n_samples = X_train.shape[0]
    if n_samples == 0:
        raise ValueError("X_train must contain at least one instance")
    if y_train.shape[0] != n_samples:
        raise ValueError("X_train and y_train must contain the same number of instances")

    # Setting the batch size to 1 / 100th the size of the training data by
    # default, and keeping it inside the valid range [1, n_samples]
    if batch_size is None:
        batch_size = max(1, n_samples // 100)
    elif batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    batch_size = min(int(batch_size), n_samples)

    # Appending an extra column of 1s into the training data (intercept term)
    X_train = np.hstack((np.ones((n_samples, 1)), X_train))

    # An explicit seed gets its own generator so the global random state is
    # left untouched; otherwise the global state is used
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Initializing the weights with random values in [0, 1)
    weights = rng.random_sample(X_train.shape[1])

    for _ in range(n_iters):
        # Picking distinct random indexes in the valid index range
        idx = rng.choice(n_samples, size=batch_size, replace=False)

        # Picking the corresponding instances, and their targets to create a batch
        X_sampled = X_train[idx, :]
        y_sampled = y_train[idx]

        # Residuals (target minus prediction) for the batch
        residuals = y_sampled - np.dot(X_sampled, weights)

        # X_b^T.residuals / batch_size points downhill on the squared error of
        # the batch, which is why it is added to the weights below
        step = np.dot(X_sampled.T, residuals) / batch_size

        # Updating the weights
        weights += learning_rate * step

    # Returning the estimated weights
    return weights

In [16]:
%%time

# Estimating the weights using the Mini-batch Gradient Descent algorithm
# (1000 iterations, each one computed on a random batch of 1 / 100th of the training data)
# Timing the cell to evaluate the running time, and benchmarking with other variants
# NOTE: the name `weights` is reused, so the Stochastic Gradient Descent estimate is replaced here

weights = mini_batch_gradient_descent(X_train, y_train)

Wall time: 24 s


In [17]:
# Making the predictions on the testing data with the Mini-batch Gradient Descent weights, and
# printing the predictions followed by their mean squared error against the true test targets

predictions = make_prediction(X_test, weights)
print(predictions)
print(mean_squared_error(y_test, predictions))

[  68.09432523  -66.49869118   66.46791186 ... -213.01669194  -69.48411853
 -420.83670452]
3.995530032524403


| Algorithm | Training time | MSE         
| :- |-------------: | :-:
| Gradient Descent | 40.7 sec | 3.995
| Stochastic Gradient Descent | 176 ms | 5.342
| Mini-Batch Gradient Descent | 24 sec | 3.995