# 利用神经网络来预测学生录取情况

在该 notebook 中，应用神经网络并基于以下三项特征预测学生录取情况：

- GRE 分数（测试）即 GRE Scores
- GPA 分数（成绩）即 GPA Scores
- 评级（1-4）即 Class rank (1-4)

## 1.导入相关包

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## 2.加载数据集
## 2.1. 从csv文件中读取数据

In [None]:
# Load the admissions data from "student_data.csv" (resolved relative to the
# current working directory) into a pandas DataFrame.
df = pd.read_csv("student_data.csv")

# Preview the first 10 rows; as the last expression of a notebook cell the
# result is rendered as the cell output.
df.head(10)

## 2.2. 对rank进行one-hot编码

In [None]:
# One-hot encode the 'rank' column: it is replaced by one indicator column per
# distinct rank value, while all other columns are kept as they are.
one_hot_data = pd.get_dummies(df, columns=['rank'])
# Preview the first 10 rows of the encoded table.
one_hot_data.head(10)

## 缩放数据
由于gre数据的取值在数百的量级，gpa数据的取值在个位数的量级，与其他取值为0或1的数据的范围相差较大，不利于神经网络处理。因此，需要将gre和gpa两个特征数据缩放到0-1范围内，缩放方式为：减去最小值后除以最大值与最小值的差值，即 $x' = (x - x_{min}) / (x_{max} - x_{min})$。下面的代码对所有列统一做该缩放，取值本来就是0和1的列在缩放后保持不变。

In [None]:
# Convert the DataFrame to a float NumPy array for the numeric steps below.
# The explicit dtype matters: recent pandas versions emit boolean one-hot
# columns, and mixing those with numeric columns would otherwise produce an
# object array that NumPy math functions such as np.exp cannot process.
data = np.array(one_hot_data, dtype=float)

# Min-max scale every column to [0, 1]: (value - min) / (max - min).
# A column whose values are exactly 0 and 1 (e.g. a one-hot column) is left
# unchanged by this formula.
col_min = np.min(data, axis=0)
col_range = np.max(data, axis=0) - col_min
# A constant column has zero range; divide by 1 there so it maps to 0 instead
# of producing NaN from 0/0.
col_range[col_range == 0] = 1
data = (data - col_min) / col_range

## 将数据分成训练集和测试集
将数据集拆分成训练数据和测试数据，训练数据占70%，测试数据占30%。

In [None]:
# Fix the seed so the shuffle, and therefore the split, is reproducible.
np.random.seed(1)

# Shuffle the rows in place (np.random.shuffle permutes along the first axis).
np.random.shuffle(data)

# Use the first 70% of the shuffled rows for training and the rest for
# testing. The boundary is derived from the row count with integer arithmetic
# instead of being hard-coded, so the split stays 70/30 for any data set size
# (a row count of 400 gives the previous boundary of 280).
n_train = data.shape[0] * 7 // 10

# Training set
train_set = data[:n_train, :]

# Test set
test_set = data[n_train:, :]

print(data.shape)
print(train_set.shape)
print(test_set.shape)

## 将数据分成特征和标签
将数据分为特征 (features)（X）和标签 (labels)（y）。

In [None]:
# Column 0 holds the label; the remaining columns are the input features.
# Both are laid out with one example per column: features get shape
# (n_features, m) and labels get shape (1, m).
train_X = train_set[:, 1:].T
train_y = train_set[:, 0][np.newaxis, :]
test_X = test_set[:, 1:].T
test_y = test_set[:, 0][np.newaxis, :]

# Sanity-check the resulting shapes.
print(train_X.shape)
print(train_y.shape)
print(test_X.shape)
print(test_y.shape)

## 初始化参数

In [None]:
def initialize_parameters(layer_dims):
    '''
    Create the initial weights and biases for every layer of the network.

    Arguments:
    layer_dims -- sequence of layer sizes, starting with the input size

    Returns:
    parameters -- dictionary holding, for each layer k = 1 .. len(layer_dims)-1:
                  'W<k>' -- array of shape (layer_dims[k], layer_dims[k-1]) with
                            standard-normal samples scaled by 0.01
                  'b<k>' -- zero array of shape (layer_dims[k], 1)
    '''
    # Re-seed NumPy's global generator so every call yields the same weights.
    np.random.seed(2)

    parameters = {}
    # Walk over consecutive (input size, output size) pairs; layers are numbered from 1.
    size_pairs = zip(layer_dims[:-1], layer_dims[1:])
    for index, (n_in, n_out) in enumerate(size_pairs, start=1):
        weights = np.random.randn(n_out, n_in) * 0.01
        biases = np.zeros((n_out, 1))
        assert weights.shape == (n_out, n_in)
        assert biases.shape == (n_out, 1)
        parameters['W' + str(index)] = weights
        parameters['b' + str(index)] = biases

    return parameters

In [None]:
# Sanity check: initialise a network with layer sizes 6 -> 5 -> 5 -> 1 and
# print the parameters of its first two layers.
parameters = initialize_parameters([6,5,5,1])
print("W1 = " + str(parameters["W1"]))
print("b1 = " + str(parameters["b1"]))
print("W2 = " + str(parameters["W2"]))
print("b2 = " + str(parameters["b2"]))

## 辅助功能函数

In [None]:
def relu(x):
    '''
    Apply the rectified linear unit element-wise: max(0, x).

    Arguments:
    x -- numpy array of any shape

    Returns:
    array with the same shape as x in which every negative entry is replaced by 0
    '''

    return np.maximum(0, x)

In [None]:
def sigmoid(x):
    '''
    Apply the logistic sigmoid element-wise: 1 / (1 + exp(-x)).

    Arguments:
    x -- numpy array of any shape

    Returns:
    array with the same shape as x holding the sigmoid of each entry
    '''

    exp_neg = np.exp(-x)
    return 1 / (1 + exp_neg)

In [None]:
def relu_gradient(x):
    '''
    Compute the element-wise derivative of the ReLU activation.

    Arguments:
    x -- numpy array of pre-activation values (the Z kept from the forward pass)

    Returns:
    new array shaped like x: 1 where x > 0 and 0 where x <= 0; x itself is not modified
    '''

    # Work on a copy so the cached pre-activations stay untouched.
    grad = np.array(x, copy=True)
    is_positive = x > 0
    grad[is_positive] = 1
    grad[x <= 0] = 0
    return grad

In [None]:
def sigmoid_gradient(x):
    '''
    Compute the element-wise derivative of the sigmoid activation.

    Arguments:
    x -- numpy array of pre-activation values (the Z kept from the forward pass)

    Returns:
    array shaped like x holding s * (1 - s), where s = 1 / (1 + exp(-x))
    '''

    activation = 1 / (1 + np.exp(-x))
    return activation * (1 - activation)

## L-layer Forward propagation
模型架构： [LINEAR -> RELU] × (L-1) -> LINEAR -> SIGMOID

In [None]:
def linear_activation_forward(A_prev, W, b, activation):
    '''
    Implement the forward propagation for one LINEAR->ACTIVATION layer.

    Arguments:
    A_prev -- activations of the previous layer (or the input data), one example per column
    W -- weight matrix of the current layer; np.dot(W, A_prev) must be defined
    b -- bias of the current layer, broadcast-added to np.dot(W, A_prev)
    activation -- the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"

    Returns:
    A -- output of the activation function applied to Z = np.dot(W, A_prev) + b
    cache -- tuple (A_prev, W, b, Z) kept for the backward pass

    Raises:
    ValueError -- if activation is neither "sigmoid" nor "relu"
    '''
    # Reject unknown activations up front; otherwise A would never be assigned
    # and the function would fail with an unrelated UnboundLocalError.
    if activation not in ('sigmoid', 'relu'):
        raise ValueError("activation must be 'sigmoid' or 'relu', got %r" % (activation,))

    Z = np.dot(W, A_prev) + b

    if activation == 'sigmoid':
        A = sigmoid(Z)
    else:
        A = relu(Z)

    cache = (A_prev, W, b, Z)

    return A, cache

In [None]:
def L_layer_forward(X, parameters):
    '''
    Run the full forward pass: [LINEAR -> RELU] * (L-1) -> LINEAR -> SIGMOID.

    Arguments:
    X -- input data, one example per column
    parameters -- dictionary with entries 'W1', 'b1', ..., 'WL', 'bL'

    Returns:
    AL -- output of the final sigmoid layer
    caches -- list of the L caches returned by linear_activation_forward(),
              ordered from layer 1 (index 0) to layer L (index L-1)
    '''
    # Every layer contributes one 'W' and one 'b' entry.
    n_layers = len(parameters) // 2
    caches = []
    activation = X

    # Layers 1 .. L-1 use ReLU.
    for idx in range(1, n_layers):
        activation, layer_cache = linear_activation_forward(
            activation, parameters['W' + str(idx)], parameters['b' + str(idx)], 'relu')
        caches.append(layer_cache)

    # The output layer L uses sigmoid.
    AL, output_cache = linear_activation_forward(
        activation, parameters['W' + str(n_layers)], parameters['b' + str(n_layers)], 'sigmoid')
    caches.append(output_cache)

    return AL, caches

## 损失函数
实现交叉熵损失函数

In [None]:
def compute_cost(AL, y):
    '''
    Implement the cross-entropy loss function.

    Arguments:
    AL -- probability vector corresponding to label predictions, shape (1, number of examples)
    y -- true "label" vector, shape (1, number of examples)

    Returns:
    J -- cross-entropy loss averaged over the examples (0-d numpy array)
    '''

    m = y.shape[1]

    # Keep the probabilities strictly inside (0, 1): a saturated sigmoid can
    # return exactly 0 or 1, and log(0) would turn the cost into inf or NaN.
    eps = 1e-15
    AL = np.clip(AL, eps, 1 - eps)

    J = -(np.sum(y*np.log(AL)+(1-y)*np.log(1-AL), axis=1))/m
    J = np.squeeze(J)
    return J

## L-layer Backward propagation

In [None]:
def linear_activation_backward(dA, cache, activation):
    '''
    Implement the backward propagation for the LINEAR->ACTIVATION layer.

    Arguments:
    dA -- post-activation gradient for current layer l
    cache -- tuple of values (A_prev, W, b, Z) we store for computing backward propagation efficiently
    activation -- the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"

    Returns:
    dA_prev -- Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
    dW -- Gradient of the cost with respect to W (current layer l), same shape as W
    db -- Gradient of the cost with respect to b (current layer l), same shape as b

    Raises:
    ValueError -- if activation is neither "sigmoid" nor "relu"
    '''
    # Reject unknown activations up front; otherwise dZ would never be assigned
    # and the function would fail with an unrelated UnboundLocalError.
    if activation not in ('sigmoid', 'relu'):
        raise ValueError("activation must be 'sigmoid' or 'relu', got %r" % (activation,))

    # The cached bias is not needed for any of the gradients.
    A_prev, W, _, Z = cache

    if activation == 'sigmoid':
        dZ = dA * sigmoid_gradient(Z)
    else:
        dZ = dA * relu_gradient(Z)

    # No 1/m factor is applied here: L_layer_backward() already divides the
    # initial gradient dAL by the number of examples.
    dW = np.dot(dZ, A_prev.T)
    db = np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)

    return dA_prev, dW, db

In [None]:
def L_layer_backward(AL, y, caches):
    '''
    Implement the backward propagation for the [LINEAR->RELU] * (L-1) -> LINEAR -> SIGMOID group

    Arguments:
    AL -- probability vector, output of the forward propagation (L_layer_forward())
    y -- true "label" vector, shape (1, number of examples)
    caches -- list of caches containing:
                every cache of linear_activation_forward() with "relu" (there are (L-1) of them, indexes from 0 to L-2)
                the cache of linear_activation_forward() with "sigmoid" (there is one, index L-1)

    Returns:
    grads -- A dictionary with the gradients
             grads['dA' + str(l)] = ... for l = 0 .. L
             grads['dW' + str(l)] = ... for l = 1 .. L
             grads['db' + str(l)] = ... for l = 1 .. L
    '''

    grads = {}
    L = len(caches)
    m = y.shape[1]

    # Keep the probabilities strictly inside (0, 1): a saturated sigmoid can
    # return exactly 0 or 1, and the divisions below would then yield inf/NaN
    # gradients that corrupt every parameter on the next update.
    eps = 1e-15
    AL = np.clip(AL, eps, 1 - eps)

    # Derivative of the mean cross-entropy cost with respect to AL; the 1/m
    # averaging factor is applied here, once, for the whole backward pass.
    dAL = -(y/AL-(1-y)/(1-AL))/m
    grads['dA'+str(L)] = dAL

    # Output layer (sigmoid).
    grads['dA'+str(L-1)], grads['dW'+str(L)], grads['db'+str(L)] = linear_activation_backward(dAL, caches[L-1], 'sigmoid')

    # Hidden layers (relu), from layer L-1 down to layer 1; caches[l] belongs to layer l+1.
    for l in reversed(range(L-1)):
        grads['dA'+str(l)], grads['dW'+str(l+1)], grads['db'+str(l+1)] = linear_activation_backward(grads['dA'+str(l+1)], caches[l], 'relu')

    return grads

## 更新参数

In [None]:
def update_parameters(parameters, grads, learning_rate):
    '''
    Apply one gradient-descent step to every weight matrix and bias vector.

    Arguments:
    parameters -- dictionary with entries 'W1', 'b1', ..., 'WL', 'bL'
    grads -- dictionary with the matching gradients 'dW1', 'db1', ..., 'dWL', 'dbL'
    learning_rate -- step size of the update

    Returns:
    parameters -- the same dictionary object, whose entries now refer to the updated arrays
    '''
    n_layers = len(parameters) // 2  # one 'W' and one 'b' entry per layer
    for idx in range(1, n_layers + 1):
        for prefix in ('W', 'b'):
            key = prefix + str(idx)
            # Rebind the entry to a new array rather than modifying the old one in place.
            parameters[key] = parameters[key] - learning_rate * grads['d' + key]

    return parameters

## 神经网络训练函数

In [None]:
# Layer sizes handed to train_nn() in the training cell: the input size (6)
# followed by the size of each layer, ending with the single output unit.
layers_dims = [6, 12, 4, 1]

def train_nn(X, y, layers_dims, learning_rate = 0.001, num_iterations = 500, print_cost=False):
    """
    Train a L-layer neural network: [LINEAR->RELU]*(L-1)->LINEAR->SIGMOID.

    The cost is recorded every 20 iterations and plotted against the iteration
    number once training has finished, whether or not print_cost is set.

    Arguments:
    X -- data, numpy array of shape (6, number of examples)
    y -- true "label" vector of shape (1, number of examples)
    layers_dims -- list containing the input size and each layer size, of length (number of layers + 1).
    learning_rate -- learning rate of the gradient descent update rule
    num_iterations -- number of iterations of the optimization loop
    print_cost -- if True, it prints the cost every 20 iterations

    Returns:
    parameters -- parameters learnt by the model. They can then be used to predict.
    """

    # NOTE: initialize_parameters() re-seeds NumPy's global generator itself,
    # so this seed does not influence the initial weights.
    np.random.seed(3)

    costs = []            # cost values sampled during training
    cost_iterations = []  # iteration index at which each cost was sampled

    # parameters initialization
    parameters = initialize_parameters(layers_dims)

    # Loop (gradient descent)
    for i in range(0, num_iterations):
        # forward propagation
        AL, caches = L_layer_forward(X, parameters)

        # compute cost
        cost = compute_cost(AL, y)

        # backward propagation
        grads = L_layer_backward(AL, y, caches)

        # update parameters
        parameters = update_parameters(parameters, grads, learning_rate)

        # sample the cost every 20 iterations; printing stays optional, but the
        # value is always recorded so the plot below is never empty
        if i % 20 == 0:
            costs.append(cost)
            cost_iterations.append(i)
            if print_cost:
                print ("Cost after iteration %i: %f" %(i, cost))

    # plot the cost against the iteration at which it was sampled, so the
    # x-axis really shows iterations
    plt.plot(cost_iterations, np.array(costs, dtype=float))
    plt.ylabel('cost')
    plt.xlabel('iterations')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()

    return parameters

## 神经网络预测函数

In [None]:
def predict(X, parameters):
    '''
    Predict a class for each example in X with the learned parameters.

    Arguments:
    X -- input data of size (n_x, m)
    parameters -- python dictionary containing the learned parameters

    Returns:
    predictions -- array shaped like the network output, holding each output
                   probability rounded with np.around (which rounds exact ties
                   to the nearest even value, so exactly 0.5 becomes 0)
    '''

    # Only the final activations are needed; the caches are discarded.
    probabilities, _ = L_layer_forward(X, parameters)
    return np.around(probabilities)

## 训练神经网络

In [None]:
# Train on the training split, overriding the defaults of train_nn(): learning
# rate 0.01, 1000 iterations, with the cost printed during training.
parameters = train_nn(train_X, train_y, layers_dims, learning_rate = 0.01, num_iterations = 1000, print_cost=True)

## 模型预测精度

In [None]:
# Predict a label for every example of the test split.
preds = predict(test_X, parameters)
# Accuracy is the fraction of predictions that equal the true labels.
accuracy = np.mean(preds == test_y)
print("Prediction accuracy: {:.3f}".format(accuracy))