# ReLU

Sigmoid activation function은 0 또는 1로 수렴하기 때문에 gradient가 vanishing 할 수 있는 위험이 있다.

ReLU를 사용하면 이를 방지할 수 있으므로, ReLU를 사용해 Neural Network를 구현해보자!

## MNIST Classifier

In [1]:
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

torch.manual_seed(777)
if device == 'cuda':
    torch.cuda.manual_seed_all(777)

In [2]:
batch_size = 100
learning_rate = 0.001
training_epochs = 15

In [3]:
import torchvision.datasets as dsets
import torchvision.transforms as transforms

mnist_path = "../datasets/MNIST_data/"

mnist_train = dsets.MNIST(root=mnist_path, train=True, transform=transforms.ToTensor(),
                          download=True)
mnist_test = dsets.MNIST(root=mnist_path, train=False, transform=transforms.ToTensor(),
                         download=True)

In [4]:
data_loader = torch.utils.data.DataLoader(dataset=mnist_train,
                                          batch_size=batch_size,
                                          shuffle=True,
                                          drop_last=True)

In [5]:
from torch import nn

class MNISTClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear1 = nn.Linear(784, 256, bias=True)
        self.linear2 = nn.Linear(256, 256, bias=True)
        self.linear3 = nn.Linear(256, 10, bias=True)
        self.relu = nn.ReLU()
        
        nn.init.normal_(self.linear1.weight)
        nn.init.normal_(self.linear2.weight)
        nn.init.normal_(self.linear3.weight)
        
    def forward(self, x):
        x = self.relu(self.linear1(x))
        x = self.relu(self.linear2(x))
        x = self.linear3(x)
        
        return x
    
model = MNISTClassifier().to(device)

In [6]:
from torch import optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), learning_rate)

In [7]:
num_batch = len(data_loader)

for epoch in range(training_epochs):
    avg_loss = 0
    
    for X, Y in data_loader:
        X = X.view(-1, 784).to(device)
        Y = Y.to(device)
        
        optimizer.zero_grad()
        hypothesis = model(X)
        loss = criterion(hypothesis, Y)
        loss.backward()
        optimizer.step()
        
        avg_loss += loss / num_batch
        
    print("Epoch: {:04d}, Loss: {:.9f}".format(epoch+1, avg_loss))

print("Learning Finished")        

Epoch: 0001, Loss: 129.358459473
Epoch: 0002, Loss: 36.173007965
Epoch: 0003, Loss: 23.009593964
Epoch: 0004, Loss: 16.035247803
Epoch: 0005, Loss: 11.578867912
Epoch: 0006, Loss: 8.558766365
Epoch: 0007, Loss: 6.399905682
Epoch: 0008, Loss: 4.823712826
Epoch: 0009, Loss: 3.544556618
Epoch: 0010, Loss: 2.721437454
Epoch: 0011, Loss: 2.077881336
Epoch: 0012, Loss: 1.647168875
Epoch: 0013, Loss: 1.261189938
Epoch: 0014, Loss: 0.995852590
Epoch: 0015, Loss: 0.851831138
Learning Finished


In [8]:
import random

with torch.no_grad():
    X_test = mnist_test.data.view(-1, 784).float().to(device)
    Y_test = mnist_test.targets.to(device)
    
    predictions = model(X_test)
    correct_predictions = torch.argmax(predictions, dim=1) == Y_test
    accuracy = correct_predictions.float().mean()
    print("Accuracy: ", accuracy.item())
    
    # test data의 sample 한개 예측
    r = random.randint(0, len(mnist_test) - 1)
    X_single_data = mnist_test.data[r:r + 1].view(-1, 28 * 28).float().to(device)
    Y_single_data = mnist_test.targets[r:r + 1].to(device)

    print('Label: ', Y_single_data.item())
    single_prediction = model(X_single_data)
    print('Prediction: ', torch.argmax(single_prediction, 1).item())

Accuracy:  0.9472000002861023
Label:  8
Prediction:  8
