# Weight_decay

In [None]:
class MultiLayerNet:
    """Fully connected multi-layer network with L2 weight decay.

    The network is a stack of Affine layers. Each hidden Affine layer is
    followed by an optional batch-normalization layer, an activation layer
    and an optional dropout layer. The loss is produced by
    ``layers.SoftmaxWithLoss`` plus the weight-decay penalty.

    The class expects a module named ``layers`` to be in scope that provides
    ``Affine``, ``Sigmoid``, ``Relu``, ``BatchNormalization``, ``Dropout``
    and ``SoftmaxWithLoss``.

    input_size: number of nodes in the input layer
    hidden_size_list: number of nodes of each hidden layer
    output_size: number of nodes in the output layer
    activation: activation function, 'relu' or 'sigmoid'
    weight_init_std: weight initialisation; 'relu'/'he' selects the He
        initial value, 'sigmoid'/'xavier' the Xavier initial value, any
        other value is used directly as the scale of the random weights
    weight_decay_lambda: strength of the regularization
    use_dropout: whether dropout layers are used
    dropout_ratio: dropout ratio
    use_batchnorm: whether batch normalization is used
    """

    def __init__(self, input_size, hidden_size_list, output_size, activation='relu', weight_init_std='relu',
                 weight_decay_lambda=0, use_dropout=False, dropout_ratio=0.5, use_batchnorm=False):
        # Imported here so the class does not rely on a file-level import.
        from collections import OrderedDict

        self.input_size = input_size
        self.output_size = output_size
        # Copied into a list because the sizes are concatenated with lists below.
        self.hidden_size_list = list(hidden_size_list)
        self.hidden_layer_num = len(self.hidden_size_list)
        self.use_dropout = use_dropout
        self.weight_decay_lambda = weight_decay_lambda
        self.use_batchnorm = use_batchnorm
        self.params = {}

        # Initialise the weights
        self.__init_weight(weight_init_std)

        # Build the layers
        activation_layer = {'sigmoid': layers.Sigmoid, 'relu': layers.Relu}
        self.layers = OrderedDict()
        for idx in range(1, self.hidden_layer_num + 1):
            self.layers[f'Affine{idx}'] = layers.Affine(self.params[f'w{idx}'], self.params[f'b{idx}'])

            if self.use_batchnorm:
                width = self.hidden_size_list[idx - 1]
                self.params[f'gamma{idx}'] = np.ones(width)
                self.params[f'beta{idx}'] = np.zeros(width)
                self.layers[f'BatchNorm{idx}'] = layers.BatchNormalization(
                    self.params[f'gamma{idx}'], self.params[f'beta{idx}'])

            # The activation is added for every hidden layer, with or
            # without batch normalization.
            self.layers[f'Activation_function{idx}'] = activation_layer[activation]()

            if self.use_dropout:
                self.layers[f'Dropout{idx}'] = layers.Dropout(dropout_ratio)

        # Output Affine layer (no activation; the loss layer follows)
        idx = self.hidden_layer_num + 1
        self.layers[f'Affine{idx}'] = layers.Affine(self.params[f'w{idx}'], self.params[f'b{idx}'])

        self.last_layer = layers.SoftmaxWithLoss()

    def __init_weight(self, weight_init_std):
        """Fill ``self.params`` with 'w<idx>' and 'b<idx>' for every Affine layer.

        weight_init_std: 'relu'/'he' or 'sigmoid'/'xavier' (case-insensitive)
            to derive the scale from the previous layer's size, otherwise
            the value itself is used as the scale.
        """
        all_size_list = [self.input_size] + list(self.hidden_size_list) + [self.output_size]
        method = str(weight_init_std).lower()
        for idx in range(1, len(all_size_list)):
            fan_in = all_size_list[idx - 1]
            fan_out = all_size_list[idx]
            if method in ('relu', 'he'):
                scale = np.sqrt(2.0 / fan_in)  # He initial value
            elif method in ('sigmoid', 'xavier'):
                scale = np.sqrt(1.0 / fan_in)  # Xavier initial value
            else:
                scale = weight_init_std
            self.params[f'w{idx}'] = scale * np.random.randn(fan_in, fan_out)
            self.params[f'b{idx}'] = np.zeros(fan_out)

    def predict(self, x, train_flg=False):
        """Run ``x`` through every layer in order and return the result.

        ``train_flg`` is forwarded only to layers whose key contains
        'Dropout' or 'BatchNorm'.
        """
        for key, layer in self.layers.items():
            if 'Dropout' in key or 'BatchNorm' in key:
                x = layer.forward(x, train_flg)
            else:
                x = layer.forward(x)

        return x

    def loss(self, x, d, train_flg=False):
        """Return the last layer's loss for (x, d) plus the weight-decay penalty.

        The penalty is ``0.5 * weight_decay_lambda * sum(w**2)`` summed over
        the weights of every Affine layer.
        """
        y = self.predict(x, train_flg)

        weight_decay = 0
        for idx in range(1, self.hidden_layer_num + 2):
            w = self.params[f'w{idx}']
            weight_decay += 0.5 * self.weight_decay_lambda * np.sum(w ** 2)

        return self.last_layer.forward(y, d) + weight_decay

    def accuracy(self, X, D):
        """Return the fraction of rows of ``X`` whose argmax prediction equals ``D``.

        ``D`` may hold class indices (1-D) or per-class rows, in which case
        its argmax along axis 1 is used.
        """
        Y = self.predict(X, train_flg=False)
        Y = np.argmax(Y, axis=1)
        if D.ndim != 1:
            D = np.argmax(D, axis=1)

        return np.sum(Y == D) / float(X.shape[0])

    def gradient(self, X, d):
        """Return a dict with the gradient of the loss for every parameter.

        Keys are 'w<idx>' and 'b<idx>' for each Affine layer and, when batch
        normalization is enabled, 'gamma<idx>' and 'beta<idx>' for each
        hidden layer.
        """
        # Forward pass (training mode)
        self.loss(X, d, train_flg=True)

        # Backward pass, last layer first
        dout = self.last_layer.backward(1)
        for layer in reversed(list(self.layers.values())):
            dout = layer.backward(dout)

        # Collect the gradients
        grads = {}
        for idx in range(1, self.hidden_layer_num + 2):
            affine = self.layers[f'Affine{idx}']
            # The weight-decay term contributes lambda * w to the weight gradient.
            grads[f'w{idx}'] = affine.dw + self.weight_decay_lambda * self.params[f'w{idx}']
            grads[f'b{idx}'] = affine.db

            # The output layer has no batch-normalization layer.
            if self.use_batchnorm and idx != self.hidden_layer_num + 1:
                batchnorm = self.layers[f'BatchNorm{idx}']
                grads[f'gamma{idx}'] = batchnorm.dgamma
                grads[f'beta{idx}'] = batchnorm.dbeta

        return grads