# Implementing Gradient Descent

Gradient descent for **binary classification** using **logistic regression**.

## Data Preparation

This notebook was written against pandas 0.25.1; install it with `conda install pandas==0.25.1`.

In [9]:
import warnings; warnings.simplefilter('ignore')     # Stop Jupyter Notebook from printing warnings
import numpy as np
import pandas as pd

In [10]:
# Read the admissions dataset from disk and preview it.
csv_path = 'data/binary.csv'
admissions = pd.read_csv(csv_path)
admissions

Unnamed: 0,admit,gre,gpa,rank
0,0,380,3.61,3
1,1,660,3.67,3
2,1,800,4.00,1
3,1,640,3.19,4
4,0,520,2.93,4
...,...,...,...,...
395,0,620,4.00,2
396,0,560,3.04,3
397,0,460,2.63,2
398,0,700,3.65,2


In [11]:
# One-hot encode the categorical `rank` column into rank_1 ... rank_4 indicators.
# The dtype is pinned so the indicators stay numeric 0/1 on every pandas version:
# pandas >= 2.0 defaults to bool, and a mixed float/bool feature matrix converts
# to an object-dtype array, which breaks the NumPy math used for training.
rank_dummies = pd.get_dummies(admissions['rank'], prefix='rank', dtype=np.uint8)
data = pd.concat([admissions, rank_dummies], axis=1)
data

Unnamed: 0,admit,gre,gpa,rank,rank_1,rank_2,rank_3,rank_4
0,0,380,3.61,3,0,0,1,0
1,1,660,3.67,3,0,0,1,0
2,1,800,4.00,1,1,0,0,0
3,1,640,3.19,4,0,0,0,1
4,0,520,2.93,4,0,0,0,1
...,...,...,...,...,...,...,...,...
395,0,620,4.00,2,0,1,0,0
396,0,560,3.04,3,0,0,1,0
397,0,460,2.63,2,0,1,0,0
398,0,700,3.65,2,0,1,0,0


In [12]:
# The one-hot columns now carry the rank information, so drop the raw column.
data = data.drop(columns=['rank'])
data

Unnamed: 0,admit,gre,gpa,rank_1,rank_2,rank_3,rank_4
0,0,380,3.61,0,0,1,0
1,1,660,3.67,0,0,1,0
2,1,800,4.00,1,0,0,0
3,1,640,3.19,0,0,0,1
4,0,520,2.93,0,0,0,1
...,...,...,...,...,...,...,...
395,0,620,4.00,0,1,0,0
396,0,560,3.04,0,0,1,0
397,0,460,2.63,0,1,0,0
398,0,700,3.65,0,1,0,0


We **normalize** the GRE and GPA data, which means scaling the values so that they have **zero mean** and a **standard deviation of 1**.

This is necessary because the sigmoid function squashes very small and very large inputs. The gradient of the sigmoid at such inputs is close to zero, which means that the gradient descent step shrinks toward zero too. Without normalization we would have to be very careful about how we initialize the weights, or the gradient descent steps would die off and the network would not train.

In [13]:
# Standardize the continuous features to zero mean and unit standard deviation.
# Whole-column assignment (data[field] = ...) replaces the column, so the
# integer-valued `gre` column can become float. `data.loc[:, field] = ...`
# writes into the existing column instead, and newer pandas versions warn
# about (and are slated to reject) that int -> float upcast.
for field in ['gre', 'gpa']:
    mean, std = data[field].mean(), data[field].std()
    data[field] = (data[field] - mean) / std

data

Unnamed: 0,admit,gre,gpa,rank_1,rank_2,rank_3,rank_4
0,0,-1.798011,0.578348,0,0,1,0
1,1,0.625884,0.736008,0,0,1,0
2,1,1.837832,1.603135,1,0,0,0
3,1,0.452749,-0.525269,0,0,0,1
4,0,-0.586063,-1.208461,0,0,0,1
...,...,...,...,...,...,...,...
395,0,0.279614,1.603135,0,1,0,0
396,0,-0.239793,-0.919418,0,0,1,0
397,0,-1.105469,-1.996759,0,1,0,0
398,0,0.972155,0.683454,0,1,0,0


In [14]:
# Split off random 10% of the data for testing; the fixed seed makes the
# split reproducible.
np.random.seed(42)
sample = np.random.choice(data.index, size=int(len(data) * 0.9), replace=False)

# `.ix` was deprecated in pandas 0.20 and removed in 1.0. `sample` holds index
# labels drawn from data.index, so label-based `.loc` is the direct replacement.
# NOTE: `data` is rebound to the training subset here, so re-running this cell
# on its own splits the already-reduced frame again.
data, test_data = data.loc[sample], data.drop(sample)

# Split into features and targets
features, targets = data.drop('admit', axis=1), data['admit']
features_test, targets_test = test_data.drop('admit', axis=1), test_data['admit']

## Gradient Descent Algorithm

We use the mean squared error (MSE) as the loss function:

$$E = \frac{1}{2m} \sum_{\mu=1}^{m} \big( y^{\,\mu} - \hat{y}^{\,\mu}\big)^2$$

The general algorithm for updating the weights with gradient descent is:

* Set the weight step to zero: $\Delta w_i = 0$
* For each record in the training data:
  * Make a forward pass through the network, calculating the output $\hat y = f(\sum_i w_i x_i)$
  * Calculate the error term for the output unit, $\delta = (y - \hat y) \cdot f'(\sum_i w_i x_i)$
  * Update the weight step $\Delta w_i = \Delta w_i + \delta x_i$
* Update the weights $w_i = w_i + \eta \Delta w_i / m$ where $\eta$ is the learning rate and $m$ is the number of records. Here we're averaging the weight steps to help reduce any large variations in the training data.
* Repeat for $e$ epochs.

In [15]:
import numpy as np
np.random.seed(42)  # re-seed so the weight initialization that follows is repeatable

def sigmoid(x):
    """Return the logistic function 1 / (1 + e^(-x)) of ``x``.

    Works on scalars and, element-wise, on NumPy arrays.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

# Dataset dimensions: rows are records, columns are input features.
n_records = features.shape[0]
n_features = features.shape[1]

# No loss has been reported yet; the training loop compares against this.
last_loss = None

# Draw the starting weights from a zero-mean normal distribution whose
# standard deviation is 1 / sqrt(n_features).
init_scale = 1 / n_features ** .5
weights = np.random.normal(loc=0.0, scale=init_scale, size=n_features)
weights

array([ 0.2027827 , -0.05644616,  0.26441774,  0.62177434, -0.09559271,
       -0.09558601])

In [16]:
# Neural Network hyperparameters
epochs = 1000
learnrate = 0.5

# Report the training loss about ten times over the run. Integer division
# (floored at 1) keeps the modulo test exact; `epochs / 10` is a float and only
# lines up with the integer epoch counter when epochs is a multiple of 10.
report_every = max(epochs // 10, 1)

for e in range(epochs):
    # Weight step accumulated over every record in this epoch
    del_w = np.zeros(weights.shape)

    # loop through all records, x is the input, y is the target
    for x, y in zip(features.values, targets):
        output = sigmoid(np.dot(x, weights))
        error = y - output
        # error * f'(h); for the sigmoid, f'(h) = output * (1 - output)
        error_term = error * output * (1 - output)
        del_w += error_term * x

    # Average the accumulated step over all records and scale by the learning
    # rate. `weights` is updated in place, so re-running this cell continues
    # training from the current weights instead of starting over.
    weights += learnrate * del_w / n_records

    # printing out the mean square error on the training set
    if e % report_every == 0:
        out = sigmoid(np.dot(features, weights))
        loss = np.mean((out - targets) ** 2)
        # Compare against None explicitly: a previous loss of exactly 0.0 is
        # falsy and would otherwise skip the "loss increasing" check.
        if last_loss is not None and last_loss < loss:
            print("Train loss: ", loss, "  WARNING - Loss Increasing")
        else:
            print("Train loss: ", loss)
        last_loss = loss


# Calculate accuracy on test data
tes_out = sigmoid(np.dot(features_test, weights))
# Classify as admitted when the predicted probability exceeds 0.5
predictions = tes_out > 0.5
accuracy = np.mean(predictions == targets_test)
print("Prediction accuracy: {:.3f}".format(accuracy))

Train loss:  0.2627609384996635
Train loss:  0.20928619409324875
Train loss:  0.20084292908073426
Train loss:  0.19862156475527873
Train loss:  0.1977985139668603
Train loss:  0.19742577912189863
Train loss:  0.1972350774624106
Train loss:  0.1971294562509248
Train loss:  0.19706766341315082
Train loss:  0.19703005801777368
Prediction accuracy: 0.725
