In [None]:
import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets

## Assignment 2-1
ReLu activation function을 구현해보세요

- Hint : np.maximum 함수 사용하면 편리합니다
- 다른 방법 사용하셔도 무방합니다


In [None]:
def relu(x):
  """Rectified linear unit: element-wise maximum of 0 and x."""
  return np.maximum(0, x)

## Assignment 2-2
ReLu의 derivative function을 구현해보세요


In [None]:
def d_relu(x):
  """Derivative of ReLU, evaluated element-wise.

  Args:
    x: array-like of pre-activation values.

  Returns:
    A new array with the same shape and dtype as `x`, holding 1 where
    x > 0 and 0 elsewhere.

  The input is left untouched: writing the 0/1 mask into `x` itself would
  silently destroy the caller's data (e.g. cached pre-activations).
  """
  x = np.asarray(x)
  return (x > 0).astype(x.dtype)

## Assignment 2-3
Lecture 2의 2. Backpropagation with numpy 코드 참고해서
Three layer MLP를 구현한 후, 학습을 돌려 보세요

hyperparameter는 다음과 같이 설정

- <#> of train data, <#> of test data : 60000, 10000
- epochs : 100
- hiddensize : 128, 64 (two layer)
- learning_rate : 0.5

In [None]:
# Assignment 2-3 implementation goes here
from IPython import get_ipython
# Wipe the interactive namespace so this assignment starts from a clean state.
# run_line_magic is the supported replacement for the deprecated magic() call.
get_ipython().run_line_magic('reset', '-sf')
import numpy as np
import sklearn.datasets

# as_frame=False requests plain NumPy arrays, which the positional slicing
# and np.float32 / np.int32 conversions in the following cell rely on.
mnist = sklearn.datasets.fetch_openml('mnist_784', data_home="mnist_784", as_frame=False)

In [None]:
num_train = 60000
num_class = 10

# The first num_train rows are used for training, the rest for testing.
# Data is transposed so that axis 1 indexes samples, matching the
# (class, sample) layout of the one-hot label matrices built below.
x_train = np.float32(mnist.data[:num_train]).T
y_train_index = np.int32(mnist.target[:num_train]).T
x_test = np.float32(mnist.data[num_train:]).T
y_test_index = np.int32(mnist.target[num_train:]).T

# Normalization: divide by 255
x_train /= 255
x_test /= 255
x_size = x_train.shape[0]

# One-hot encode the labels with a single vectorized assignment: for every
# sample column, set the row given by its class index to 1.
y_train = np.zeros((num_class, y_train_index.shape[0]))
y_train[y_train_index, np.arange(y_train_index.shape[0])] = 1

y_test = np.zeros((num_class, y_test_index.shape[0]))
y_test[y_test_index, np.arange(y_test_index.shape[0])] = 1

In [None]:
hidden_size_1 = 128
hidden_size_2 = 64

# three-layer neural network
# Layer widths from input to output; layer k maps layer_sizes[k-1] -> layer_sizes[k].
layer_sizes = [x_size, hidden_size_1, hidden_size_2, num_class]

params = {}
for layer, (fan_in, fan_out) in enumerate(zip(layer_sizes[:-1], layer_sizes[1:]), start=1):
  # Weights: standard normal draws scaled by sqrt(1 / fan_in); biases start at zero.
  params["W" + str(layer)] = np.random.randn(fan_out, fan_in) * np.sqrt(1 / fan_in)
  params["b" + str(layer)] = np.zeros((fan_out, 1))


In [None]:
def sigmoid(x):
  """Logistic sigmoid 1 / (1 + exp(-x)), applied element-wise."""
  denominator = 1 + np.exp(-x)
  return 1 / denominator

def d_sigmoid(x):
  """Derivative of the logistic sigmoid, exp(-x) / (1 + exp(-x))**2, element-wise.

  Args:
    x: array-like of pre-activation values.

  Returns:
    Array of the same shape as `x` with the derivative at each element.
  """
  # The derivative is an even function of x, so it is evaluated at -|x|.
  # That keeps exp() from overflowing for large negative inputs, where the
  # direct formula produces inf / inf = NaN.
  decay = np.exp(-np.abs(x))
  return decay / (1 + decay) ** 2

def softmax(x):
  """Softmax along axis 0, so each column of a 2-D input sums to 1.

  Args:
    x: array of logits; axis 0 is the axis being normalized.

  Returns:
    Array of the same shape as `x` holding the normalized exponentials.
  """
  # Subtracting the per-column maximum leaves the result mathematically
  # unchanged but keeps exp() from overflowing to inf (and then NaN).
  shifted = x - np.max(x, axis=0, keepdims=True)
  exp = np.exp(shifted)
  return exp / np.sum(exp, axis=0, keepdims=True)

def compute_loss(y_true, y_pred):
  """Cross-entropy between one-hot targets and predictions, averaged over samples.

  Args:
    y_true: targets, one column per sample (axis 1 is the sample axis).
    y_pred: predicted probabilities with the same shape as `y_true`.

  Returns:
    The summed cross-entropy divided by the number of columns of `y_true`.
  """
  num_sample = y_true.shape[1]
  # Floor the predictions before the log: an exact 0 would give log(0) = -inf,
  # and 0 * -inf = NaN would poison the whole sum.
  safe_pred = np.maximum(y_pred, 1e-12)
  Li = -1 * np.sum(y_true * np.log(safe_pred))

  return Li / num_sample

In [None]:
def foward_pass(x, params):
  """Forward pass of the three-layer network, caching intermediates in `params`.

  For each layer k the pre-activation is stored under "S<k>" and the
  activation under "A<k>". Layers 1 and 2 use sigmoid, layer 3 uses softmax.
  The same (mutated) `params` dict is returned.
  """
  activations = (sigmoid, sigmoid, softmax)
  layer_input = x
  for layer, activation in enumerate(activations, start=1):
    tag = str(layer)
    pre_activation = np.dot(params["W" + tag], layer_input) + params["b" + tag]
    params["S" + tag] = pre_activation
    layer_input = activation(pre_activation)
    params["A" + tag] = layer_input

  return params

In [None]:
def foward_pass_test(x, params):
  """Forward pass for evaluation that leaves `params` unmodified.

  Reads the weights and biases from `params` and returns a fresh dict
  with the pre-activations "S1".."S3" and activations "A1".."A3".
  """
  cache = {}
  layer_input = x
  for layer, activation in enumerate((sigmoid, sigmoid, softmax), start=1):
    tag = str(layer)
    cache["S" + tag] = np.dot(params["W" + tag], layer_input) + params["b" + tag]
    cache["A" + tag] = activation(cache["S" + tag])
    layer_input = cache["A" + tag]

  return cache

def compute_accuracy(y_true, y_pred):
  """Percentage of columns whose arg-max row agrees between `y_true` and `y_pred`."""
  hits = np.argmax(y_true, axis=0) == np.argmax(y_pred, axis=0)
  total = y_true.shape[1]
  return np.sum(hits) / total * 100

In [None]:
def backward_pass(x, y_true, params):
  """Backpropagation for the three-layer sigmoid/softmax network.

  Args:
    x: input batch, one column per sample.
    y_true: one-hot targets, one column per sample.
    params: dict holding the weights "W2", "W3" and the activations
      "A1", "A2", "A3" cached by the forward pass.

  Returns:
    Dict with "dW1".."dW3" and "db1".."db3", each averaged over the batch.
  """
  num_sample = x.shape[1]
  grads = {}

  # Output layer: error signal of softmax combined with cross-entropy.
  dS3 = params["A3"] - y_true

  grads["dW3"] = np.dot(dS3, params["A2"].T) / num_sample
  # Averaged exactly once over the batch, like every other gradient here
  # (it was previously divided by the batch size twice).
  grads["db3"] = np.sum(dS3, axis=1, keepdims=True) / num_sample

  dA2 = np.dot(params["W3"].T, dS3)
  # sigmoid'(S) = A * (1 - A) with A = sigmoid(S): reusing the cached
  # activation avoids re-evaluating exp() and cannot overflow.
  dS2 = dA2 * params["A2"] * (1 - params["A2"])

  grads["dW2"] = np.dot(dS2, params["A1"].T) / num_sample
  grads["db2"] = np.sum(dS2, axis=1, keepdims=True) / num_sample

  dA1 = np.dot(params["W2"].T, dS2)
  dS1 = dA1 * params["A1"] * (1 - params["A1"])

  grads["dW1"] = np.dot(dS1, x.T) / num_sample
  grads["db1"] = np.sum(dS1, axis=1, keepdims=True) / num_sample

  return grads

In [None]:
epochs = 100
learning_rate = 0.5

for i in range(1,epochs+1):

  # The backward pass reads cached activations, so run one forward pass
  # before the very first update; later iterations reuse the forward pass
  # performed at the end of the previous iteration.
  if i == 1:
    params = foward_pass(x_train, params)

  grads = backward_pass(x_train, y_train, params)

  # Full-batch gradient-descent step on every weight and bias.
  for name in ("W1", "b1", "W2", "b2", "W3", "b3"):
    params[name] -= learning_rate * grads["d" + name]

  params = foward_pass(x_train, params)

  # Metrics are only needed on logging epochs, so the test-set forward pass
  # is skipped on the other iterations.
  if i % 10 == 0:
    train_loss = compute_loss(y_train, params["A3"])
    train_acc = compute_accuracy(y_train, params["A3"])

    params_test = foward_pass_test(x_test, params)
    test_loss = compute_loss(y_test, params_test["A3"])
    test_acc = compute_accuracy(y_test, params_test["A3"])

    # The last field reports the test accuracy (previously mislabelled as training).
    print("Epoch {}: training loss = {}, training accuracy = {}%, test loss = {}, test accuracy = {}%"
    .format(i, np.round(train_loss, 6), np.round(train_acc, 2), np.round(test_loss, 6), np.round(test_acc, 2)))

Epoch 10: training loss = 2.274707, training acuracy = 12.12%, test loss = 2.273789, training acuracy = 12.38%
Epoch 20: training loss = 2.245321, training acuracy = 25.97%, test loss = 2.243568, training acuracy = 26.93%
Epoch 30: training loss = 2.204654, training acuracy = 41.5%, test loss = 2.201735, training acuracy = 43.16%
Epoch 40: training loss = 2.143366, training acuracy = 49.86%, test loss = 2.13871, training acuracy = 51.35%
Epoch 50: training loss = 2.049696, training acuracy = 53.76%, test loss = 2.042515, training acuracy = 54.51%
Epoch 60: training loss = 1.916781, training acuracy = 56.42%, test loss = 1.906461, training acuracy = 56.86%
Epoch 70: training loss = 1.754021, training acuracy = 59.48%, test loss = 1.740753, training acuracy = 59.56%
Epoch 80: training loss = 1.581855, training acuracy = 62.52%, test loss = 1.566607, training acuracy = 62.95%
Epoch 90: training loss = 1.418251, training acuracy = 65.44%, test loss = 1.402213, training acuracy = 65.97%
Epo

## Assignment 2-4
Lecture 2의 2. Backpropagation with numpy 부분의 성능을 지금까지 배운 지식을 바탕으로 향상시켜보세요

- Hint : Activation function, hyperparameter setting

In [7]:
# Assignment 2-4 implementation goes here
from IPython import get_ipython
# Wipe the interactive namespace so this assignment starts from a clean state.
# run_line_magic is the supported replacement for the deprecated magic() call.
get_ipython().run_line_magic('reset', '-sf')

import numpy as np
import sklearn.datasets

# as_frame=False requests plain NumPy arrays, which the positional slicing
# and np.float32 / np.int32 conversions below rely on.
mnist = sklearn.datasets.fetch_openml('mnist_784', data_home="mnist_784", as_frame=False)

num_train = 60000
num_class = 10

# The first num_train rows are used for training, the rest for testing.
# Data is transposed so that axis 1 indexes samples, matching the
# (class, sample) layout of the one-hot label matrices built below.
x_train = np.float32(mnist.data[:num_train]).T
y_train_index = np.int32(mnist.target[:num_train]).T
x_test = np.float32(mnist.data[num_train:]).T
y_test_index = np.int32(mnist.target[num_train:]).T

# Normalization: divide by 255
x_train /= 255
x_test /= 255
x_size = x_train.shape[0]

# One-hot encode the labels with a single vectorized assignment: for every
# sample column, set the row given by its class index to 1.
y_train = np.zeros((num_class, y_train_index.shape[0]))
y_train[y_train_index, np.arange(y_train_index.shape[0])] = 1

y_test = np.zeros((num_class, y_test_index.shape[0]))
y_test[y_test_index, np.arange(y_test_index.shape[0])] = 1

In [21]:
hidden_size_1 = 128
hidden_size_2 = 64
hidden_size_3 = 64
hidden_size_4 = 32

# five-layer neural network
# Layer widths from input to output; layer k maps layer_sizes[k-1] -> layer_sizes[k].
layer_sizes = [x_size, hidden_size_1, hidden_size_2, hidden_size_3, hidden_size_4, num_class]

params = {}
for layer, (fan_in, fan_out) in enumerate(zip(layer_sizes[:-1], layer_sizes[1:]), start=1):
  # Weights: standard normal draws scaled by sqrt(1 / fan_in); biases start at zero.
  params["W" + str(layer)] = np.random.randn(fan_out, fan_in) * np.sqrt(1 / fan_in)
  params["b" + str(layer)] = np.zeros((fan_out, 1))

In [22]:
def relu(x):
  """Rectified linear unit: element-wise maximum of 0 and x."""
  return np.maximum(0, x)

def d_relu(x):
  """Derivative of ReLU, evaluated element-wise.

  Args:
    x: array-like of pre-activation values.

  Returns:
    A new array with the same shape and dtype as `x`, holding 1 where
    x > 0 and 0 elsewhere.

  The input is left untouched: writing the 0/1 mask into `x` itself would
  overwrite the cached pre-activations that are passed in during backprop.
  """
  x = np.asarray(x)
  return (x > 0).astype(x.dtype)

def softmax(x):
  """Softmax along axis 0, so each column of a 2-D input sums to 1.

  Args:
    x: array of logits; axis 0 is the axis being normalized.

  Returns:
    Array of the same shape as `x` holding the normalized exponentials.
  """
  # Subtracting the per-column maximum leaves the result mathematically
  # unchanged but keeps exp() from overflowing to inf (and then NaN),
  # which matters here because ReLU activations are unbounded above.
  shifted = x - np.max(x, axis=0, keepdims=True)
  exp = np.exp(shifted)
  return exp / np.sum(exp, axis=0, keepdims=True)

def compute_loss(y_true, y_pred):
  """Cross-entropy between one-hot targets and predictions, averaged over samples.

  Args:
    y_true: targets, one column per sample (axis 1 is the sample axis).
    y_pred: predicted probabilities with the same shape as `y_true`.

  Returns:
    The summed cross-entropy divided by the number of columns of `y_true`.
  """
  num_sample = y_true.shape[1]
  # Floor the predictions before the log: an exact 0 would give log(0) = -inf,
  # and 0 * -inf = NaN would poison the whole sum.
  safe_pred = np.maximum(y_pred, 1e-12)
  Li = -1 * np.sum(y_true * np.log(safe_pred))

  return Li / num_sample

In [23]:
def foward_pass(x, params):
  """Forward pass of the five-layer network, caching intermediates in `params`.

  For each layer k the pre-activation is stored under "S<k>" and the
  activation under "A<k>". Layers 1-4 use ReLU, layer 5 uses softmax.
  The same (mutated) `params` dict is returned.
  """
  activations = (relu, relu, relu, relu, softmax)
  layer_input = x
  for layer, activation in enumerate(activations, start=1):
    tag = str(layer)
    pre_activation = np.dot(params["W" + tag], layer_input) + params["b" + tag]
    params["S" + tag] = pre_activation
    layer_input = activation(pre_activation)
    params["A" + tag] = layer_input

  return params

def foward_pass_test(x, params):
  """Forward pass for evaluation that leaves `params` unmodified.

  Reads the weights and biases from `params` and returns a fresh dict
  with the pre-activations "S1".."S5" and activations "A1".."A5".
  """
  cache = {}
  layer_input = x
  for layer, activation in enumerate((relu, relu, relu, relu, softmax), start=1):
    tag = str(layer)
    cache["S" + tag] = np.dot(params["W" + tag], layer_input) + params["b" + tag]
    cache["A" + tag] = activation(cache["S" + tag])
    layer_input = cache["A" + tag]

  return cache

def compute_accuracy(y_true, y_pred):
  """Percentage of columns whose arg-max row agrees between `y_true` and `y_pred`."""
  hits = np.argmax(y_true, axis=0) == np.argmax(y_pred, axis=0)
  total = y_true.shape[1]
  return np.sum(hits) / total * 100

In [24]:
def backward_pass(x, y_true, params):
  """Backpropagation for the five-layer ReLU/softmax network.

  Args:
    x: input batch, one column per sample.
    y_true: one-hot targets, one column per sample.
    params: dict holding the weights "W2".."W5", the pre-activations
      "S1".."S4" and the activations "A1".."A5" cached by the forward pass.
      It is only read, never modified.

  Returns:
    Dict with "dW1".."dW5" and "db1".."db5", each averaged over the batch.
  """
  num_sample = x.shape[1]
  grads = {}

  # Output layer: error signal of softmax combined with cross-entropy.
  delta = params["A5"] - y_true

  for layer in range(5, 0, -1):
    # Layer 1 consumes the raw input; deeper layers consume the previous activation.
    layer_input = x if layer == 1 else params["A" + str(layer - 1)]

    grads["dW" + str(layer)] = np.dot(delta, layer_input.T) / num_sample
    # Every bias gradient is averaged exactly once over the batch
    # (db5 was previously divided by the batch size twice).
    grads["db" + str(layer)] = np.sum(delta, axis=1, keepdims=True) / num_sample

    if layer > 1:
      upstream = np.dot(params["W" + str(layer)].T, delta)
      # ReLU derivative as a fresh boolean mask, so the cached
      # pre-activations in `params` are not overwritten.
      delta = upstream * (params["S" + str(layer - 1)] > 0)

  return grads

In [25]:
epochs = 1000
learning_rate = 0.05

for i in range(0,epochs+1):

  # The backward pass reads cached activations, so run one forward pass
  # before the very first update; later iterations reuse the forward pass
  # performed at the end of the previous iteration.
  if i == 0:
    params = foward_pass(x_train, params)

  grads = backward_pass(x_train, y_train, params)

  # Full-batch gradient-descent step on every weight and bias.
  for name in ("W1", "b1", "W2", "b2", "W3", "b3", "W4", "b4", "W5", "b5"):
    params[name] -= learning_rate * grads["d" + name]

  params = foward_pass(x_train, params)

  # Metrics are only needed on logging epochs, so the test-set forward pass
  # is skipped on the other iterations.
  if i % 100 == 0 :
    train_loss = compute_loss(y_train, params["A5"])
    train_acc = compute_accuracy(y_train, params["A5"])

    params_test = foward_pass_test(x_test, params)
    test_loss = compute_loss(y_test, params_test["A5"])
    test_acc = compute_accuracy(y_test, params_test["A5"])

    # The last field reports the test accuracy (previously mislabelled as training).
    print("Epoch {}: training loss = {}, training accuracy = {}%, test loss = {}, test accuracy = {}%"
    .format(i, np.round(train_loss, 6), np.round(train_acc, 2), np.round(test_loss, 6), np.round(test_acc, 2)))

Epoch 0: training loss = 2.306689, training acuracy = 9.01%, test loss = 2.307064, training acuracy = 9.11%
Epoch 100: training loss = 1.531997, training acuracy = 58.6%, test loss = 1.517695, training acuracy = 59.04%
Epoch 200: training loss = 0.568771, training acuracy = 84.33%, test loss = 0.551862, training acuracy = 84.75%
Epoch 300: training loss = 0.400079, training acuracy = 88.68%, test loss = 0.387023, training acuracy = 89.13%
Epoch 400: training loss = 0.33278, training acuracy = 90.5%, test loss = 0.3219, training acuracy = 90.65%
Epoch 500: training loss = 0.286499, training acuracy = 91.81%, test loss = 0.277308, training acuracy = 92.0%
Epoch 600: training loss = 0.259458, training acuracy = 92.48%, test loss = 0.251373, training acuracy = 92.79%
Epoch 700: training loss = 0.234347, training acuracy = 93.28%, test loss = 0.228377, training acuracy = 93.49%
Epoch 800: training loss = 0.214651, training acuracy = 93.86%, test loss = 0.210628, training acuracy = 93.91%
Ep

**무엇을 보완하였고, 왜 보완되었는지에 대한 자유 서술 (아래에)**



```
# 코드로 형식 지정됨
```

1. 2 layer MLP를 5 layer MLP로 신경망을 다층화함
- 보다 깊은 신경망을 거쳐 정교한 결과를 유도한다.

2. training epoch를 1000회로 증가
- 더 많은 횟수의 학습으로 정확도를 상승시킨다.

3. learning rate를 0.05로 줄여 정교화
- 경험을 통해 0.05의 learning rate이 가장 이상적임을 확인하였다.

4. activation function을 sigmoid에서 relu로 변화
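- input은 0~1로 정규화되어 있지만, sigmoid는 각 층의 pre-activation 절댓값이 커지면 출력이 0 또는 1로 포화되어 gradient가 0에 가까워진다(vanishing gradient). 층이 깊어질수록 이 효과가 누적되어 relu에 비해 학습이 느려지고 cost가 크게 남는다. relu는 양수 구간에서 gradient가 1로 유지되므로 깊은 신경망에서도 학습이 잘 진행된다.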