In [1]:
import random
import numpy as np

import torch
import torch.nn as nn
from torch.autograd import Variable

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# many to many 구조 사용

### Load Data

In [3]:
nltk.download("gutenberg")

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [4]:
raw = nltk.corpus.gutenberg.raw("shakespeare-hamlet.txt")
print(len(raw), '\n')
print(raw[:500])

162881 

[The Tragedie of Hamlet by William Shakespeare 1599]


Actus Primus. Scoena Prima.

Enter Barnardo and Francisco two Centinels.

  Barnardo. Who's there?
  Fran. Nay answer me: Stand & vnfold
your selfe

   Bar. Long liue the King

   Fran. Barnardo?
  Bar. He

   Fran. You come most carefully vpon your houre

   Bar. 'Tis now strook twelue, get thee to bed Francisco

   Fran. For this releefe much thankes: 'Tis bitter cold,
And I am sicke at heart

   Barn. Haue you had quiet Guard?
  Fran. Not


### Char to Dic

word가 아닌 character 단위 RNN

In [5]:
char2index = {}
index2char = []

In [6]:
for char in raw :
    if char not in char2index.keys() :
        char2index[char] = len(char2index)
        index2char.append(char)

In [7]:
char2index

{'\n': 27,
 ' ': 4,
 '!': 64,
 '&': 43,
 "'": 39,
 '(': 61,
 ')': 62,
 ',': 48,
 '-': 54,
 '.': 32,
 '1': 23,
 '5': 24,
 '9': 25,
 ':': 42,
 ';': 58,
 '?': 40,
 'A': 28,
 'B': 35,
 'C': 38,
 'D': 55,
 'E': 34,
 'F': 36,
 'G': 51,
 'H': 12,
 'I': 49,
 'K': 46,
 'L': 45,
 'M': 52,
 'N': 41,
 'O': 56,
 'P': 31,
 'Q': 59,
 'R': 53,
 'S': 19,
 'T': 1,
 'V': 63,
 'W': 18,
 'Y': 47,
 'Z': 66,
 '[': 0,
 ']': 26,
 'a': 6,
 'b': 16,
 'c': 29,
 'd': 8,
 'e': 3,
 'f': 11,
 'g': 7,
 'h': 2,
 'i': 9,
 'j': 65,
 'k': 20,
 'l': 14,
 'm': 13,
 'n': 33,
 'o': 10,
 'p': 22,
 'q': 50,
 'r': 5,
 's': 21,
 't': 15,
 'u': 30,
 'v': 44,
 'w': 37,
 'x': 57,
 'y': 17,
 'z': 60}

In [8]:
len(char2index)

67

In [9]:
char2vec = {}
eye = np.eye(len(char2index))   # identity matrix (대각행렬) --> one hot encoding

for item in char2index.keys() :
    char2vec[item] = eye[char2index[item],:]

In [10]:
char2vec['a']

array([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [11]:
char2vec['b']

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [12]:
# text 문서의 전체 문자를 데이터 행렬로 변환

data = np.array([char2vec[char] for char in raw])
data.shape

(162881, 67)

In [13]:
len(data[0])

67

In [14]:
data.shape[1]

67

### Define Model

Parameters:
* input_size – The number of expected features in the input x
* hidden_size – The number of features in the hidden state h
* num_layers – Number of recurrent layers. E.g., setting num_layers=2 would mean stacking two RNNs together to form a stacked RNN, with the second RNN taking in outputs of the first RNN and computing the final results. Default: 1
* nonlinearity – The non-linearity to use. Can be either ‘tanh’ or ‘relu’. Default: ‘tanh’
* bias – If False, then the layer does not use bias weights b_ih and b_hh. Default: True
* batch_first – If True, then the input and output tensors are provided as (batch, seq, feature). Default: False
* dropout – If non-zero, introduces a Dropout layer on the outputs of each RNN layer except the last layer, with dropout probability equal to dropout. Default: 0
* bidirectional – If True, becomes a bidirectional RNN. Default: False

In [15]:
class CharRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super(CharRNN, self).__init__()        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        
#         self.rnn = nn.GRU(input_size, hidden_size, num_layers)  # GRU는 nonlinearity 지원 안함.
#         self.rnn = nn.RNN(input_size, hidden_size, num_layers, dropout=0.5)

        self.rnn = nn.RNN(input_size, hidden_size, num_layers)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, input, hidden):
        out, hidden = self.rnn(input.view(1,1,-1), hidden)  # 1*1*67
        out = self.fc(out.view(1,-1))
        return out, hidden

    def init_hidden(self):
        hidden = Variable(torch.zeros(self.num_layers, 1, self.hidden_size)).cuda() # weight 초기화
        return hidden

In [16]:
# input_size, hidden_size, output_size, num_layers

model = CharRNN(data.shape[1], 500, data.shape[1], 1).cuda()

In [17]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss = nn.CrossEntropyLoss()

### Training

In [18]:
step = 100
num_epochs = 5

for epoch in range(num_epochs):
    
    # starting point
    sp = list(range(0, len(data) - 2 * step, step))
    sp = np.add(sp, random.randint(0, step))
    random.shuffle(sp)
   
    print(len(sp))
    
    for i in range(len(sp)) :
    
        hidden = model.init_hidden()
        cost = 0

        for pos in range(sp[i], sp[i] + step):
            X = Variable(torch.from_numpy(data[pos]).type(torch.FloatTensor)).cuda()
            y = torch.from_numpy(data[pos+1]).cuda()
            
            _, y = y.max(dim=0)
            y = y.unsqueeze(0)
            
            pred, hidden = model(X, hidden)
            
            cost += loss(pred, Variable(y).cuda())

        cost.backward()
        
        nn.utils.clip_grad_norm_(model.parameters(), 5)  # explosion 방지.
        
        optimizer.step()
        
        if (i+1) == len(sp):
            print('Epoch [%d/%d], Iter [%d/%d] Loss: %.4f'%(epoch+1, num_epochs, i+1, len(sp), cost.item()))

1627
Epoch [1/5], Iter [100/1627] Loss: 326.3845
1627
Epoch [2/5], Iter [100/1627] Loss: 223.6533
1627
Epoch [3/5], Iter [100/1627] Loss: 186.1924
1627
Epoch [4/5], Iter [100/1627] Loss: 232.0170
1627
Epoch [5/5], Iter [100/1627] Loss: 167.7863


In [19]:
start_num = 1
text = index2char[start_num]

model.eval()
hidden = model.init_hidden()

X_test = Variable(torch.from_numpy(data[start_num]).type(torch.FloatTensor)).cuda()
    
for i in range(500) :

    pre, hidden = model(X_test, hidden)

    temp = pre.cpu().data.numpy()[0]  # 확률

    best_5 = np.argsort(temp)[::-1][:5]
    
    # softmax
    temp = np.exp(temp[best_5])
    temp = temp / temp.sum()
    
    pre = np.random.choice(best_5, 1, p = temp)[0]
    
    curr_char = index2char[pre]
    
    text += curr_char
    
    # 다음 y 입력
    X_test = Variable(torch.from_numpy(char2vec[curr_char]).type(torch.FloatTensor)).cuda()
    
print("* Generated Text : \n", text)

* Generated Text : 
 That in what is the fore in to the King on a Poltendinge

   Ham. A sight and mence this wat has whan in when with as the ferriner and

   Polon. There holdon that the heare of tise wall seefe:
Anght and stone

   Hor. I my Lort

   Hor. There shat with has, wath his do a Prechiosens. But bout not mige thos of in mone same of your who eat ingre sore werce,
I know mide in heaue the fantring of on of in hend
To his not, where'd and serale: who han heere
A there windent of my Laer? she thy fould war
