In [2]:
import numpy as np
import re

In [3]:
data = '''나라말이 중국과 달라 한자와 서로 통하지 아니하므로, 어리석은 백성들이 말하고자 하는 바가 있어도 끝내 제 뜻을 펴지 못하는 사람이 많다 내가 이를 불쌍히 여겨 새로 스물 여덟 글자를 만드니 사람마다 하여금 쉽게 익혀 날마다 씀에 편하게 하고자 할 따름이다'''

In [4]:
def data_preprocessing(data):
    data = re.sub(r'[^가-힣\s]', '', data)
    tokens = data.split()
    vocab = list(set(tokens))
    vocab_size = len(vocab)

    word_to_ix = {word: i for i, word in enumerate(vocab)}
    ix_to_word = {i: word for i, word in enumerate(vocab)}

    return tokens, vocab_size, word_to_ix, ix_to_word

# RNN

In [22]:
def init_weights(h_size, vocab_size):
    U = np.random.randn(h_size, vocab_size) * 0.01
    W = np.random.randn(h_size, h_size) * 0.01
    V = np.random.randn(vocab_size, h_size) * 0.01
    return U, W, V

In [33]:
#MLP
def feedforward(inputs, targets, hprev): # hprev: 이전 Hidden State
    loss = 0
    xs, hs, ps, ys = {}, {}, {}, {}
    hs[-1] = np.copy(hprev)
    for i in range(seq_len):
        xs[i] = np.zeros((vocab_size, 1))
        xs[i][inputs[i]] = 1 # one-hot encoding
        hs[i] = np.tanh(np.dot(U, xs[i]) + np.dot(W, hs[i-1])) # Hidden State 계산
        ys[i] = np.dot(V, hs[i])
        ps[i] = np.exp(ys[i]) / np.sum(np.exp(ys[i])) # softmax 계산
        loss += -np.log(ps[i][targets[i], 0])
    return loss, ps, hs, xs

In [24]:
def backward(ps, hs, xs):
    # Backward Propagation Through Time (BPTT)
    # 처음에 모든 가중치들은 0으로 설정
    dV = np.zeros(V.shape)
    dW = np.zeros(W.shape)
    dU = np.zeros(U.shape)

    for i in range(seq_len)[::-1]:
        output = np.zeros((vocab_size, 1))
        output[targets[i]] = 1
        ps[i] = ps[i] - output.reshape(-1, 1)

        dV_step_i = ps[i] @ (hs[i].T) # (y_hat - y) @ hs.T - for each step
        dV += dV_step_i

        delta_recent = (V.T @ ps[i]) * (1 - hs[i] ** 2)

        for j in range(i+1)[::-1]:
            dW_ij = delta_recent @ hs[j-1].T
            dW += dW_ij

            dU_ij = delta_recent @ xs[j].reshape(1, -1)
            dU += dU_ij

            delta_recent = (W.T @ delta_recent) * (1 - hs[j-1] ** 2)
        
        for d in [dU, dW, dV]:
            np.clip(d, -1, 1, out=d)

    return dU, dW, dV, hs[len(inputs)-1]

In [25]:
def predict(word, length):
    x = np.zeros((vocab_size, 1))
    x[word_to_ix[word]] = 1
    ixes = []
    h = np.zeros((h_size, 1))

    for t in range(length):
        h = np.tanh(np.dot(U, x) + np.dot(W, h))
        y = np.dot(V, h)
        p = np.exp(y) / np.sum(np.exp(y))
        ix = np.argmax(p) # 가장 높은 확률의 단어를 선택
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        ixes.append(ix)
    
    prep_words = ' '.join([ix_to_word[ix] for ix in ixes])
    return prep_words

### Paramter

In [26]:
epochs = 10000
h_size = 100
seq_len = 3
learning_rate = 0.01

### Run

In [27]:
tokens, vocab_size, word_to_ix, ix_to_word = data_preprocessing(data)

In [28]:
tokens

['나라말이',
 '중국과',
 '달라',
 '한자와',
 '서로',
 '통하지',
 '아니하므로',
 '어리석은',
 '백성들이',
 '말하고자',
 '하는',
 '바가',
 '있어도',
 '끝내',
 '제',
 '뜻을',
 '펴지',
 '못하는',
 '사람이',
 '많다',
 '내가',
 '이를',
 '불쌍히',
 '여겨',
 '새로',
 '스물',
 '여덟',
 '글자를',
 '만드니',
 '사람마다',
 '하여금',
 '쉽게',
 '익혀',
 '날마다',
 '씀에',
 '편하게',
 '하고자',
 '할',
 '따름이다']

In [29]:
vocab_size

39

In [30]:
U, W, V = init_weights(h_size, vocab_size)

In [31]:
U

array([[-6.50252988e-03, -3.67714625e-03, -3.34210828e-05, ...,
        -1.29653317e-02, -1.17891387e-02,  5.09585015e-03],
       [-1.00150819e-03, -1.07953846e-02,  3.01023079e-03, ...,
        -4.02864948e-03,  6.88232275e-03,  3.07877789e-03],
       [-3.65943042e-03, -1.82963063e-02, -9.58025665e-03, ...,
         4.39615447e-03, -1.14761029e-03, -1.05240217e-02],
       ...,
       [ 1.45902631e-02, -1.62929595e-03,  5.74625118e-03, ...,
        -1.33285618e-02,  4.98080517e-04,  1.52250661e-02],
       [ 5.98281381e-03,  8.14329480e-03, -1.71042616e-02, ...,
         5.96537100e-03,  1.29977773e-02,  1.98031474e-02],
       [-1.27315087e-03, -6.33662520e-03, -9.96767504e-03, ...,
         1.12440736e-02, -6.85301754e-03,  7.18290946e-03]])

In [34]:
p = 0
hprev = np.zeros((h_size, 1))
for epoch in range(epochs):
    for p in range(len(tokens) - seq_len):
        inputs = [word_to_ix[tok] for tok in tokens[p:p + seq_len]] # 나라의 
        targets = [word_to_ix[tok] for tok in tokens[p + 1:p + seq_len + 1]] # 말이

        loss, ps, hs, xs = feedforward(inputs, targets, hprev)

        dU, dW, dV, hprev = backward(ps, hs, xs) # Backward Propagation

        W -= learning_rate * dW
        U -= learning_rate * dU
        V -= learning_rate * dV

    if epoch % 100 == 0:
        print(f'Epoch: {epoch}, Loss: {loss}')

Epoch: 0, Loss: 10.989632194500821
Epoch: 100, Loss: 2.872124129622122
Epoch: 200, Loss: 0.3756356976937405
Epoch: 300, Loss: 0.1743231949969733
Epoch: 400, Loss: 0.10975351178868817
Epoch: 500, Loss: 0.08031774632046923
Epoch: 600, Loss: 0.06419006421472803
Epoch: 700, Loss: 0.05348128018233771
Epoch: 800, Loss: 0.04527186480421673
Epoch: 900, Loss: 0.0388750068164998
Epoch: 1000, Loss: 0.03391673147266552
Epoch: 1100, Loss: 0.030036738385053153
Epoch: 1200, Loss: 0.026936534610948908
Epoch: 1300, Loss: 0.02439685523776114
Epoch: 1400, Loss: 0.022266367232185606
Epoch: 1500, Loss: 0.0204426882787836
Epoch: 1600, Loss: 0.01885718284019442
Epoch: 1700, Loss: 0.01746441099207391
Epoch: 1800, Loss: 0.0162334435072776
Epoch: 1900, Loss: 0.0151412529611402
Epoch: 2000, Loss: 0.01416906391610472
Epoch: 2100, Loss: 0.01330087590533279
Epoch: 2200, Loss: 0.012522940074742354
Epoch: 2300, Loss: 0.011823482904942829
Epoch: 2400, Loss: 0.011192441209060893
Epoch: 2500, Loss: 0.010621195315575557


In [35]:
while True:
    try:
        user_input = input("input word: ") # 나라말이
        if user_input == 'break':
            break
        response = predict(user_input, 20)
        print(response)

    except:
        print('Try Again!')

Try Again!
중국과 달라 한자와 달라 한자와 서로 한자와 서로 통하지 서로 통하지 아니하므로 어리석은 아니하므로 어리석은 백성들이 어리석은 백성들이 말하고자 백성들이


# LSTM

In [5]:
from tqdm import tqdm

In [6]:
def sigmoid(input):
    return 1 / (1 + np.exp(-input))

def sigmoid_derivative(input):
    return input * (1 - input)

def tanh(input, derivative = False):
    return np.tanh(input)

def tanh_derivative(input):
    return 1 - input ** 2

def softmax(input):
    return np.exp(input) / np.sum(np.exp(input))

In [16]:
class LSTM:
    def __init__(self, input_size, hidden_size, output_size, num_epochs, learning_rate):
        self.learning_rate = learning_rate
        self.hidden_size = hidden_size
        self.num_epochs = num_epochs

        # Forget Gate
        self.Wf = np.random.randn(hidden_size, input_size) * 0.1
        self.bf = np.zeros((hidden_size, 1))

        # Input Gate
        self.Wi = np.random.randn(hidden_size, input_size) * 0.1
        self.bi = np.zeros((hidden_size, 1))

        # Candidate Gate
        self.Wc = np.random.randn(hidden_size, input_size) * 0.1
        self.bc = np.zeros((hidden_size, 1))
        
        # Output Gate
        self.Wo = np.random.randn(hidden_size, input_size) * 0.1
        self.bo = np.zeros((hidden_size, 1))

        # Final Gate
        self.Wy = np.random.randn(output_size, hidden_size)
        self.by = np.zeros((output_size, 1))
    
    def reset(self): # Epoch 마다 초기화
        self.X = {}

        self.HS = {-1: np.zeros((self.hidden_size, 1))}
        self.CS = {-1: np.zeros((self.hidden_size, 1))}

        self.C = {}
        self.O = {}
        self.F = {}
        self.I = {}
        self.outputs = {}

    # Forward Propagation
    def forward(self, inputs):
        x = {}
        outputs = []
        for t in range(len(inputs)):
            x[t] = np.zeros((vocab_size, 1))
            x[t][inputs[t]] = 1 # one-hot encoding
            self.X[t] = np.concatenate((self.HS[t-1], x[t]))

            self.F[t] = sigmoid(np.dot(self.Wf, self.X[t]) + self.bf)
            self.I[t] = sigmoid(np.dot(self.Wi, self.X[t]) + self.bi)
            self.C[t] = tanh(np.dot(self.Wc, self.X[t]) + self.bc)
            self.O[t] = sigmoid(np.dot(self.Wo, self.X[t]) + self.bo)

            self.CS[t] = self.F[t] * self.CS[t-1] + self.I[t] * self.C[t]
            self.HS[t] = self.O[t] * tanh(self.CS[t])

            outputs += [np.dot(self.Wy, self.HS[t]) + self.by]

        return outputs
    
    def backward(self, errors, inputs):
        dLdWf, dLdbf = 0, 0
        dLdWi, dLdbi = 0, 0
        dLdWc, dLdbc = 0, 0
        dLdWo, dLdbo = 0, 0
        dLdWy, dLdby = 0, 0

        dh_next, dc_next = np.zeros_like(self.HS[0]), np.zeros_like(self.CS[0])
        for t in reversed(range(len(inputs))):
            error = errors[t]

            # Final Gate Weights and Biases Errors
            dLdWy += np.dot(error, self.HS[t].T)
            dLdby += error

            # Calculate the hidden layer error
            dLdHS = np.dot(self.Wy.T, error) + dh_next

            # Output Gate Weights and Biases Errors
            dLdO = tanh(self.CS[t]) * dLdHS * sigmoid_derivative(self.O[t])
            dLdWo += np.dot(dLdO, inputs[t].T)
            dLdbo += dLdO

            # Cell State Errors
            dLdCS = tanh_derivative(tanh(self.CS[t])) * dLdHS * self.O[t] + dc_next

            # Forget Gate Weights and Biases Errors
            dLdf = dLdCS * self.CS[t-1] * sigmoid_derivative(self.F[t])
            dLdWf += np.dot(dLdf, inputs[t].T)
            dLdbf += dLdf

            # Input Gate Weights and Biases Errors
            dLdi = dLdCS * self.C[t] * sigmoid_derivative(self.I[t])
            dLdWi += np.dot(dLdi, inputs[t].T)
            dLdbi += dLdi

            # Candidate Gate Weights and Biases Errors
            dLdc = dLdCS * self.I[t] * tanh_derivative(self.C[t])
            dLdWc += np.dot(dLdc, inputs[t].T)
            dLdbc += dLdc

            # Concatenated Input Errors
            d_z = np.dot(self.Wf.T, dLdf) + np.dot(self.Wi.T, dLdi) + np.dot(self.Wc.T, dLdc) + np.dot(self.Wo.T, dLdO)

            # Error of Hidden State and Cell State at next time step
            dh_next = d_z[:self.hidden_size, :]
            dc_next = self.F[t] * dLdCS
        
        for d_ in (dLdWf, dLdbf, dLdWi, dLdbi, dLdWc, dLdbc, dLdWo, dLdbo, dLdWy, dLdby):
            np.clip(d_, -1, 1, out=d_)
        
        self.Wf += dLdWf * self.learning_rate * (-1)
        self.bf += dLdbf * self.learning_rate * (-1)
        
        self.Wi += dLdWi * self.learning_rate * (-1)
        self.bi += dLdbi * self.learning_rate * (-1)

        self.Wc += dLdWc * self.learning_rate * (-1)
        self.bc += dLdbc * self.learning_rate * (-1)

        self.Wo += dLdWo * self.learning_rate * (-1)
        self.bo += dLdbo * self.learning_rate * (-1)

        self.Wy += dLdWy * self.learning_rate * (-1)
        self.by += dLdby * self.learning_rate * (-1)

    def train(self, inputs, labels):
        for _ in tqdm(range(self.num_epochs)):
            self.reset()
            input_idx = [Word_to_ix[input] for input in inputs]
            predictions = self.forward(input_idx)

            errors = []
            for t in range(len(predictions)):
                errors += [softmax(predictions[t])]
                errors[-1][Word_to_ix[labels[t]]] -= 1

            self.backward(errors, self.X)

    def test(self, inputs, labels):
        accuracy = 0
        probabilities = self.forward([Word_to_ix[input] for input in inputs])

        gt = ''
        output = '나라말이 '
        for q in range(len(labels)):
            prediction = ix_to_Word[np.argmax(softmax(probabilities[q].reshape(-1)))]
            gt += inputs[q] + ' '
            output += prediction + ' '

            if prediction == labels[q]:
                accuracy += 1
        print('실제값: ', gt)
        print('예측값: ', output)

### Parameter

In [17]:
hidden_size = 25

### Run

In [18]:
# data preprocessing
tokens, vocab_size, Word_to_ix, ix_to_Word = data_preprocessing(data)
train_X, train_y = tokens[:-1], tokens[1:]

lstm = LSTM(input_size = vocab_size + hidden_size, hidden_size = hidden_size, output_size = vocab_size, num_epochs = 1000, learning_rate = 0.05)

# Train
lstm.train(train_X, train_y)
lstm.test(train_X, train_y)

100%|██████████| 1000/1000 [00:02<00:00, 441.01it/s]

실제값:  나라말이 중국과 달라 한자와 서로 통하지 아니하므로 어리석은 백성들이 말하고자 하는 바가 있어도 끝내 제 뜻을 펴지 못하는 사람이 많다 내가 이를 불쌍히 여겨 새로 스물 여덟 글자를 만드니 사람마다 하여금 쉽게 익혀 날마다 씀에 편하게 하고자 할 
예측값:  나라말이 중국과 달라 한자와 서로 통하지 아니하므로 어리석은 백성들이 말하고자 하는 바가 있어도 끝내 제 뜻을 펴지 못하는 사람이 많다 내가 이를 불쌍히 여겨 새로 스물 여덟 글자를 만드니 사람마다 하여금 쉽게 익혀 날마다 씀에 편하게 하고자 할 따름이다 



