# 数据处理

我们首先将数据归一化，由于样本不同特征的取值范围不一样，可能导致迭代很慢，为了减少特征取值的影响，可以对特征数据归一化，也就是对于每个特征x，求出它的期望x¯¯¯和标准差std(x)，然后转化为：

 $$\frac{x-\overline{x}}{std(x)}$$
 
这样特征的新期望为0，新方差为1，迭代次数可以大大加快。

对于一切非连续的值，比如等级，月份，星期，这些值的2并不表示是1的两倍， 因此，我们需要用 dummy variables 来对 这些数据进行编码，独热编码

In [1]:
import numpy as np
import pandas as pd

admissions = pd.read_csv('binary.csv')
print(admissions.head())

   admit  gre   gpa  rank
0      0  380  3.61     3
1      1  660  3.67     3
2      1  800  4.00     1
3      1  640  3.19     4
4      0  520  2.93     4


In [2]:
# 连接原数据和  dummy variables
data = pd.concat([admissions, pd.get_dummies(admissions['rank'], prefix='rank')], axis=1)
data.head()


Unnamed: 0,admit,gre,gpa,rank,rank_1,rank_2,rank_3,rank_4
0,0,380,3.61,3,0,0,1,0
1,1,660,3.67,3,0,0,1,0
2,1,800,4.0,1,1,0,0,0
3,1,640,3.19,4,0,0,0,1
4,0,520,2.93,4,0,0,0,1


In [3]:
data = data.drop('rank', axis=1)
data.head()

Unnamed: 0,admit,gre,gpa,rank_1,rank_2,rank_3,rank_4
0,0,380,3.61,0,0,1,0
1,1,660,3.67,0,0,1,0
2,1,800,4.0,1,0,0,0
3,1,640,3.19,0,0,0,1
4,0,520,2.93,0,0,0,1


In [4]:
# standarize features
for field in ['gre', 'gpa']:
    mean, std = data[field].mean(), data[field].std()
    data.loc[:, field] = (data[field] - mean) / std
data.head()

Unnamed: 0,admit,gre,gpa,rank_1,rank_2,rank_3,rank_4
0,0,-1.798011,0.578348,0,0,1,0
1,1,0.625884,0.736008,0,0,1,0
2,1,1.837832,1.603135,1,0,0,0
3,1,0.452749,-0.525269,0,0,0,1
4,0,-0.586063,-1.208461,0,0,0,1


In [5]:
# split off random 10% of  the data for testing
np.random.seed(42)
# replace 是否放回抽取
sample = np.random.choice(data.index, size=int(len(data)*0.9), replace=False)
data, test_data = data.iloc[sample], data.drop(sample)
# split into features and targets
features, targets = data.drop('admit', axis=1), data['admit']
features_test, targets_test = test_data.drop('admit', axis=1), test_data['admit']

In [10]:
np.random.seed(21)

def sigmoid(x):
    """
    Calculate sigmoid
    """
    return 1 / (1 + np.exp(-x))

n_hidden = 2
epochs = 1000
learnrate = 0.005

n_records, n_features = features.shape
last_loss = None
print(features.shape)

weights_input_hidden = np.random.normal(scale=1 / n_features ** .5,
                                        size=(n_features, n_hidden))
weights_hidden_output = np.random.normal(scale=1 / n_features ** .5,
                                         size=n_hidden)

print('input to hidden weights \n',weights_input_hidden,'\n')
print('hidden to output weights \n', weights_hidden_output, '\n')
for e in range(epochs):
    del_w_input_hidden = np.zeros(weights_input_hidden.shape)
    del_w_hidden_output = np.zeros(weights_hidden_output.shape)
    for x, y in zip(features.values, targets):
        hidden_input = np.dot(x, weights_input_hidden)
        hidden_output = sigmoid(hidden_input)
        output = sigmoid(np.dot(hidden_output, weights_hidden_output))
        ## Backward psss ##
        error = y - output
        output_error_term = error*output*(1-output)

        hidden_error = np.dot(output_error_term, weights_hidden_output)                       
        hidden_error_term = hidden_error *  hidden_output * (1-hidden_output)
        
        del_w_hidden_output  += output_error_term * hidden_output 
        del_w_input_hidden += hidden_error_term * x[:, None]
        
    weights_input_hidden += learnrate * del_w_input_hidden / n_records
    weights_hidden_output += learnrate * del_w_hidden_output / n_records
    # Printing out the mean square error on the training set
    if e % (epochs / 10) == 0:
        hidden_output = sigmoid(np.dot(x, weights_input_hidden))
        out = sigmoid(np.dot(hidden_output, weights_hidden_output))
        
        loss = np.mean((out - y) ** 2)
        if last_loss and last_loss < loss:
            print("Train loss: ", loss, "  WARNING - Loss Increasing")
        else:
            print("Train loss: ", loss)
        last_loss = loss
        

(360, 6)
input to hidden weights 
 [[-0.02121432 -0.0453956 ]
 [ 0.42531176 -0.51306167]
 [ 0.30430325 -0.69853477]
 [-0.08404378 -0.09576333]
 [ 0.46056288 -0.00515452]
 [-0.25033797  0.56080598]] 

hidden to output weights 
 [ 0.65768472 -0.28137626] 

Train loss:  0.3176332925746983
Train loss:  0.3138774282565665
Train loss:  0.3101963906450564
Train loss:  0.30658934459103165
Train loss:  0.30305540194253355
Train loss:  0.2995936270906176
Train loss:  0.2962030422587658
Train loss:  0.2928826325308384
Train loss:  0.28963135061497686
Train loss:  0.2864481213430444


In [11]:
hidden = sigmoid(np.dot(features_test, weights_input_hidden))
out = sigmoid(np.dot(hidden, weights_hidden_output))
predictions = out > 0.5

print(predictions)
accuracy = np.mean(predictions == targets_test)

print("Prediction accuracy: {:.3f}".format(accuracy))

[ True  True False  True  True  True False  True  True False  True False
 False  True  True False  True False False False  True  True  True False
 False  True  True  True  True  True False  True  True False  True  True
  True False False  True]
Prediction accuracy: 0.475
