<a href="https://colab.research.google.com/github/viki6666/Pytorch_learn/blob/master/dropout.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Dropout理解

In [1]:
# coding:utf-8
import numpy as np
 
def dropout(x, level):
    """Apply inverted dropout to ``x`` and return the rescaled result.

    Every element is kept with probability ``1 - level`` and zeroed
    otherwise; the surviving elements are divided by the keep probability.

    The input is never modified: a new array is returned.  Floating-point
    inputs keep their dtype, any other input is computed in ``float64``
    (an in-place division would fail on integer arrays).

    Args:
        x: Array-like of activations.
        level: Drop probability, ``0 <= level < 1``.

    Returns:
        A new ``numpy.ndarray`` with the same shape as ``x``.

    Raises:
        ValueError: If ``level`` is outside ``[0, 1)``; NaN is rejected too.
    """
    # Written as a chained comparison so that NaN fails the check as well.
    if not 0. <= level < 1:
        raise ValueError('Dropout level must be in interval [0, 1[.')
    retain_prob = 1. - level

    x = np.asarray(x)
    if np.issubdtype(x.dtype, np.floating):
        out_dtype = x.dtype
    else:
        out_dtype = np.dtype(np.float64)

    # One Bernoulli trial (n=1) per element: 1 keeps the unit, 0 drops it.
    random_tensor = np.random.binomial(n=1, p=retain_prob, size=x.shape)
    print(random_tensor)  # shown on purpose: this is a teaching demo

    # astype() copies, so the caller's array is left untouched.
    masked = x.astype(out_dtype) * random_tensor.astype(out_dtype)
    print(masked)

    # Rescale the survivors by the keep probability (inverted dropout).
    return masked / out_dtype.type(retain_prob)
 
# Demo: feed a sample vector through dropout() to see which entries get
# zeroed and how the remaining ones are rescaled.
x = np.arange(1, 11, dtype=np.float32)
dropout(x, 0.4)

[0 0 1 0 1 0 0 1 0 1]
[ 0.  0.  3.  0.  5.  0.  0.  8.  0. 10.]


array([ 0.      ,  0.      ,  5.      ,  0.      ,  8.333333,  0.      ,
        0.      , 13.333333,  0.      , 16.666666], dtype=float32)

Dropout实现

In [0]:
def forward(x, parameters, keep_prob=0.5):
    """Forward pass with inverted dropout on every hidden layer.

    ``parameters`` maps ``"w<i>"`` / ``"b<i>"`` to the weights and bias of
    layer ``i``; the layer count is ``len(parameters) // 2``.  Hidden layers
    apply ``sigmoid`` followed by a random keep-mask, the last layer is
    linear and is not masked.

    Args:
        x: Network input; it is stored as entry 0 of the "a" and "z" lists.
        parameters: Dict of weights and biases as described above.
        keep_prob: Probability threshold used to build each keep-mask.

    Returns:
        ``(caches, output)``.  ``caches`` holds the per-layer lists "z"
        (pre-activations), "a" (activations) and "d" (dropout masks) plus
        "keep_prob"; ``output`` is the last entry of ``caches["a"]``.
    """
    # Slot 0 of every list belongs to the input layer, which is never dropped.
    activations = [x]
    pre_activations = [x]
    masks = [np.ones(x.shape)]
    layers = len(parameters) // 2

    for layer in range(1, layers):
        weights = parameters["w" + str(layer)]
        bias = parameters["b" + str(layer)]
        pre = weights.dot(activations[layer - 1]) + bias
        # Uniform noise of the layer's shape; entries below keep_prob survive.
        keep_mask = np.random.rand(pre.shape[0], pre.shape[1]) < keep_prob
        # Zero the dropped units and rescale the rest by 1 / keep_prob.
        kept = (sigmoid(pre) * keep_mask) / keep_prob
        pre_activations.append(pre)
        activations.append(kept)
        masks.append(keep_mask)

    # Output layer: linear, no sigmoid and no dropout.
    out = (parameters["w" + str(layers)].dot(activations[layers - 1])
           + parameters["b" + str(layers)])
    pre_activations.append(out)
    activations.append(out)
    masks.append(np.ones(out.shape))

    # The masks and keep_prob are stored because backward() needs them.
    caches = {
        "z": pre_activations,
        "a": activations,
        "d": masks,
        "keep_prob": keep_prob,
    }
    return caches, activations[layers]
    
    
def backward(parameters, caches, al, y):
    """Backward pass matching the dropout-enabled ``forward``.

    Args:
        parameters: Dict with ``"w<i>"`` / ``"b<i>"`` entries per layer.
        caches: The dict returned by ``forward``; its "a", "z", "d" lists
            and "keep_prob" value are read here.
        al: Output of the last layer from the forward pass.
        y: Target values; ``y.shape[1]`` is used as the sample count ``m``.

    Returns:
        Dict ``grades`` with ``"dz<i>"``, ``"dw<i>"`` and ``"db<i>"`` for
        every layer ``i``.
    """
    layers = len(parameters) // 2
    grades = {}
    m = y.shape[1]

    # Output layer is linear, so its error term is simply al - y.
    last = str(layers)
    delta = al - y
    grades["dz" + last] = delta
    grades["dw" + last] = delta.dot(caches["a"][layers - 1].T) / m
    grades["db" + last] = np.sum(delta, axis=1, keepdims=True) / m

    # Hidden layers (sigmoid), walked from the last hidden layer down to 1.
    for layer in range(layers - 1, 0, -1):
        upstream = parameters["w" + str(layer + 1)].T.dot(delta)
        # Re-apply the mask saved by forward() so units that were switched
        # off stay switched off, and use the same 1 / keep_prob scaling.
        upstream = (caches["d"][layer] * upstream) / caches["keep_prob"]
        delta = upstream * sigmoid_prime(caches["z"][layer])
        tag = str(layer)
        grades["dz" + tag] = delta
        grades["dw" + tag] = delta.dot(caches["a"][layer - 1].T) / m
        grades["db" + tag] = np.sum(delta, axis=1, keepdims=True) / m
    return grades


![反向传播公式示意图](https://img-blog.csdnimg.cn/20181120152355651.jpg?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3FxXzI4ODg4ODM3,size_16,color_FFFFFF,t_70)

BP神经网络  
[代码来源](https://blog.csdn.net/qq_28888837/article/details/84296673)

In [0]:
def init_parameters(layers_dim):
    """Create the weights and biases for a fully connected network.

    Args:
        layers_dim: Number of units per layer, e.g. ``[2, 3, 1]`` describes
            a network with 2 inputs, 3 hidden units and 1 output.

    Returns:
        Dict with ``"w<i>"`` of shape ``(layers_dim[i], layers_dim[i-1])``
        drawn from ``np.random.random`` and ``"b<i>"`` of shape
        ``(layers_dim[i], 1)`` filled with zeros, for ``i = 1 .. len - 1``.
    """
    parameters = {}
    layer = 1
    while layer < len(layers_dim):
        fan_out = layers_dim[layer]
        fan_in = layers_dim[layer - 1]
        parameters["w" + str(layer)] = np.random.random([fan_out, fan_in])
        parameters["b" + str(layer)] = np.zeros((fan_out, 1))
        layer += 1
    return parameters

In [0]:
def sigmoid(z):
    """Return the logistic function ``1 / (1 + exp(-z))`` of ``z``."""
    denominator = 1.0 + np.exp(-z)
    return 1.0 / denominator

def sigmoid_prime(z):
    """Return the derivative of the sigmoid at ``z``: ``s * (1 - s)``."""
    s = sigmoid(z)
    return s * (1 - s)


In [0]:
def forward(x, parameters):
    """Forward pass through a sigmoid network with a linear output layer.

    ``parameters`` maps ``"w<i>"`` / ``"b<i>"`` to the weights and bias of
    layer ``i``; the layer count is ``len(parameters) // 2``.

    Args:
        x: Network input; it is stored as entry 0 of the "a" and "z" lists.
        parameters: Dict of weights and biases as described above.

    Returns:
        ``(caches, output)`` where ``caches["z"]`` and ``caches["a"]`` are
        the per-layer pre-activations and activations and ``output`` is the
        last entry of ``caches["a"]``.
    """
    a = [x]
    z = [x]
    layers = len(parameters) // 2

    # Hidden layers use sigmoid.
    for i in range(1, layers):
        # Each layer consumes the previous layer's activation a[i-1]; using
        # the raw input x here is only right for the first hidden layer and
        # breaks networks with two or more hidden layers.
        z_temp = parameters["w" + str(i)].dot(a[i - 1]) + parameters["b" + str(i)]
        z.append(z_temp)
        a.append(sigmoid(z_temp))

    # Last layer is linear (no sigmoid).
    z_temp = parameters["w" + str(layers)].dot(a[layers - 1]) + parameters["b" + str(layers)]
    z.append(z_temp)
    a.append(z_temp)

    caches = {"z": z, "a": a}
    return caches, a[layers]


In [0]:
def backward(parameters, caches, al, y):
    """Backward pass for the sigmoid network with a linear output layer.

    Args:
        parameters: Dict with ``"w<i>"`` / ``"b<i>"`` entries per layer.
        caches: The dict returned by ``forward``; its "a" and "z" lists are
            read here.
        al: Output of the last layer from the forward pass.
        y: Target values; ``y.shape[1]`` is used as the sample count ``m``.

    Returns:
        Dict ``grades`` with ``"dz<i>"``, ``"dw<i>"`` and ``"db<i>"`` for
        every layer ``i``.
    """
    layers = len(parameters) // 2
    grades = {}
    m = y.shape[1]

    # Output layer is linear, so its error term is simply al - y.
    last = str(layers)
    delta = al - y
    grades["dz" + last] = delta
    grades["dw" + last] = delta.dot(caches["a"][layers - 1].T) / m
    grades["db" + last] = np.sum(delta, axis=1, keepdims=True) / m

    # Hidden layers (sigmoid), walked from the last hidden layer down to 1.
    for layer in range(layers - 1, 0, -1):
        back_signal = parameters["w" + str(layer + 1)].T.dot(delta)
        delta = back_signal * sigmoid_prime(caches["z"][layer])
        tag = str(layer)
        grades["dz" + tag] = delta
        grades["dw" + tag] = delta.dot(caches["a"][layer - 1].T) / m
        grades["db" + tag] = np.sum(delta, axis=1, keepdims=True) / m
    return grades


In [0]:
def update_grades(parameters, grades, learning_rate):
    """Apply one gradient-descent step to every weight and bias.

    Each ``parameters["w<i>"]`` / ``parameters["b<i>"]`` is decreased in
    place by ``learning_rate`` times ``grades["dw<i>"]`` / ``grades["db<i>"]``.

    Returns:
        The same ``parameters`` dict that was passed in.
    """
    layer_count = len(parameters) // 2
    for layer in range(1, layer_count + 1):
        for prefix in ("w", "b"):
            key = prefix + str(layer)
            parameters[key] -= learning_rate * grades["d" + key]
    return parameters
def compute_loss(al, y):
    """Return the mean of the squared differences between ``al`` and ``y``."""
    residual = al - y
    return np.mean(np.square(residual))


In [0]:
import numpy as np
import matplotlib.pyplot as plt
def load_data():
    """Build the toy data set and scatter-plot it.

    Returns:
        ``(x, y)`` where ``x`` is ``np.arange(0.0, 1.0, 0.01)`` and
        ``y`` is ``20 * sin(2 * pi * x)``.
    """
    inputs = np.arange(0.0, 1.0, 0.01)  # 0.00, 0.01, ... in steps of 0.01
    targets = 20 * np.sin(2 * np.pi * inputs)
    # Visualise the samples on the current matplotlib axes.
    plt.scatter(inputs, targets)
    return inputs, targets
# Train a 1-25-1 network on the sine data, printing the loss every 100
# iterations, then plot the predictions over the targets.
x, y = load_data()
x = x.reshape(1, 100)
y = y.reshape(1, 100)
plt.scatter(x, y)
parameters = init_parameters([1, 25, 1])
al = 0
for i in range(4000):
    caches, al = forward(x, parameters)
    grades = backward(parameters, caches, al, y)
    parameters = update_grades(parameters, grades, learning_rate=0.3)
    if not i % 100:
        print(compute_loss(al, y))
plt.scatter(x, al)
plt.show()



In [0]:

# -*- coding: utf-8 -*-
"""
last layer without sigmoid
@author: cdq
"""

import numpy as np
import matplotlib.pyplot as plt
def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))`` of ``x``."""
    exp_neg = np.exp(-x)
    return 1.0 / (1.0 + exp_neg)
def dsigmoid(x):
    """Return the derivative of the sigmoid at ``x``: ``s * (1 - s)``."""
    s = sigmoid(x)
    return s * (1 - s)

class BP(object):
    """Fully connected network trained with batch gradient descent.

    Hidden layers use the configured activation; the last layer is linear.
    Weights are drawn from ``np.random.random`` and biases start at zero.

    Args:
        layers: Number of units per layer, e.g. ``[1, 6, 1]``.
        activation: Name of the hidden-layer activation.  Only
            ``'sigmoid'`` is supported.
        learning_rate: Step size used by ``backward`` when updating.

    Raises:
        ValueError: If ``activation`` is not a supported name.
    """

    def __init__(self, layers, activation='sigmoid', learning_rate=0.01):
        self.layers = layers
        self.learning_rate = learning_rate
        self.caches = {}
        self.grades = {}
        if activation == 'sigmoid':
            self.activation = sigmoid
            self.dactivation = dsigmoid
        else:
            # Fail loudly instead of leaving self.activation undefined and
            # silently training with sigmoid anyway.
            raise ValueError(
                "Unsupported activation %r; only 'sigmoid' is available."
                % (activation,))
        self.parameters = {}
        for i in range(1, len(self.layers)):
            self.parameters["w"+str(i)] = np.random.random((self.layers[i], self.layers[i-1]))
            self.parameters["b"+str(i)] = np.zeros((self.layers[i], 1))

    def forward(self, X):
        """Run a forward pass and cache the per-layer values.

        Returns:
            ``(caches, output)`` where ``caches`` is ``self.caches`` with the
            lists "z" and "a" (entry 0 of both is ``X``) and ``output`` is
            the linear output of the last layer.
        """
        a = [X]
        z = [X]

        len_layers = len(self.parameters) // 2
        for i in range(1, len_layers):
            z.append(self.parameters["w"+str(i)] @ a[i-1] + self.parameters["b"+str(i)])
            # Use the configured activation rather than a hard-coded sigmoid.
            a.append(self.activation(z[-1]))
        # last layer without activation
        z.append(self.parameters["w"+str(len_layers)] @ a[-1] + self.parameters["b"+str(len_layers)])
        a.append(z[-1])

        self.caches['z'] = z
        self.caches['a'] = a

        return self.caches, a[-1]

    def backward(self, y):
        """Compute gradients from the cached forward pass and update in place.

        ``forward`` must have been called first, because the cached "a" and
        "z" lists are read here.  ``y.shape[1]`` is used as the sample count.
        Gradients are stored in ``self.grades`` under ``"dz<i>"``,
        ``"dw<i>"`` and ``"db<i>"``.
        """
        a = self.caches['a']
        m = y.shape[1]
        # last layer gradient (linear output, so the error is a[-1] - y)
        len_layers = len(self.parameters) // 2
        last = str(len_layers)
        self.grades["dz"+last] = a[-1] - y
        self.grades["dw"+last] = self.grades["dz"+last].dot(a[-2].T) / m
        self.grades["db"+last] = np.sum(self.grades["dz"+last], axis=1, keepdims=True) / m
        # hidden layers, using the derivative of the configured activation
        for i in reversed(range(1, len_layers)):
            self.grades["dz"+str(i)] = self.parameters["w"+str(i+1)].T.dot(self.grades["dz"+str(i+1)]) * self.dactivation(self.caches["z"][i])
            self.grades["dw"+str(i)] = self.grades["dz"+str(i)].dot(self.caches["a"][i-1].T) / m
            self.grades["db"+str(i)] = np.sum(self.grades["dz"+str(i)], axis=1, keepdims=True) / m
        # update weights and biases only after every gradient is computed
        for i in range(1, len(self.layers)):
            self.parameters["w"+str(i)] -= self.learning_rate * self.grades["dw"+str(i)]
            self.parameters["b"+str(i)] -= self.learning_rate * self.grades["db"+str(i)]

    def compute_loss(self, y):
        """Return the mean squared difference between the cached output and ``y``."""
        return np.mean(np.square(self.caches['a'][-1]-y))
#%%
def test():
    """Fit a 1-6-1 ``BP`` network to ``20 * sin(2 * pi * x)`` and plot it.

    Prints the loss every 50 iterations and finally scatter-plots the
    targets together with the network output.
    """
    samples = np.arange(0.0, 1.0, 0.01)
    targets = 20 * np.sin(2 * np.pi * samples)
    plt.scatter(samples, targets)

    # The network expects inputs shaped (features, samples).
    x = samples.reshape(1, 100)
    y = targets.reshape(1, 100)

    bp = BP([1, 6, 1], learning_rate=0.01)

    for i in range(1, 50000):
        caches, al = bp.forward(x)
        bp.backward(y)

        if not i % 50:
            print(bp.compute_loss(y))
    plt.scatter(x, al)
    plt.show()


test()
