In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
sentence = "Leonardo da Vinci was an Italian polymath of the High Renaissance who is widely considered one of the most diversely talented individuals ever to have lived.".split()
print(sentence) #split : space 단위로 split

['Leonardo', 'da', 'Vinci', 'was', 'an', 'Italian', 'polymath', 'of', 'the', 'High', 'Renaissance', 'who', 'is', 'widely', 'considered', 'one', 'of', 'the', 'most', 'diversely', 'talented', 'individuals', 'ever', 'to', 'have', 'lived.']


In [3]:
vocab = list(set(sentence)) # 중복제거

In [4]:
word2index={tkn: i for i, tkn in enumerate(vocab,1)} 
word2index['unk']=0
print(word2index) # 각 단어에 정수 부여한 dictionary

{'Vinci': 1, 'individuals': 2, 'an': 3, 'have': 4, 'widely': 5, 'Italian': 6, 'of': 7, 'was': 8, 'lived.': 9, 'the': 10, 'da': 11, 'who': 12, 'Leonardo': 13, 'diversely': 14, 'most': 15, 'one': 16, 'Renaissance': 17, 'High': 18, 'is': 19, 'ever': 20, 'polymath': 21, 'talented': 22, 'to': 23, 'considered': 24, 'unk': 0}


enumerate : (index 번호, 원소) tuple 반환

In [5]:
index2word={v:k for k,v in word2index.items()} #items(): key 와 value 한꺼번에
print(index2word)

{1: 'Vinci', 2: 'individuals', 3: 'an', 4: 'have', 5: 'widely', 6: 'Italian', 7: 'of', 8: 'was', 9: 'lived.', 10: 'the', 11: 'da', 12: 'who', 13: 'Leonardo', 14: 'diversely', 15: 'most', 16: 'one', 17: 'Renaissance', 18: 'High', 19: 'is', 20: 'ever', 21: 'polymath', 22: 'talented', 23: 'to', 24: 'considered', 0: 'unk'}


In [6]:
def build_data(sentence, word2index):
  encoded=[word2index[token]for token in sentence] #각 단어를 정수로
  input_seq,label_seq =encoded[0:-1],encoded[1:] # 입력 시퀀스, 레이블 시퀀스 - 다음 단어를 예측?
  print(input_seq)
  print(label_seq)
  input_seq = torch.LongTensor(input_seq).unsqueeze(0) # 배치 차원 추가
  label_seq = torch.LongTensor(label_seq).unsqueeze(0) # 배치 차원 추가
  return input_seq, label_seq

input 과 label 설정 함수


In [7]:
X,Y= build_data(sentence,word2index)

[13, 11, 1, 8, 3, 6, 21, 7, 10, 18, 17, 12, 19, 5, 24, 16, 7, 10, 15, 14, 22, 2, 20, 23, 4]
[11, 1, 8, 3, 6, 21, 7, 10, 18, 17, 12, 19, 5, 24, 16, 7, 10, 15, 14, 22, 2, 20, 23, 4, 9]


In [15]:
class Net(nn.Module):
  def __init__(self,vocab_size,input_size,hidden_size,batch_first=True): # 생성자
    super(Net,self).__init__() # nn.Module의 생성자 호출
    self.embedding_layer = nn.Embedding(num_embeddings=vocab_size,embedding_dim=input_size) #word embedding 
    self.rnn_layer=nn.RNN(input_size, hidden_size, batch_first=batch_first) #inputsize :입력차원, hidden_size:은닉 상태의 크기
    self.linear = nn.Linear(hidden_size, vocab_size)

  def forward(self,x):
    output = self.embedding_layer(x)
    #크기변화 : 배치크기, 시퀀스 길이 -> 배치크기, 시퀀스 길이, 임베딩 차원
    output, hidden = self.rnn_layer(output)
    output=self.linear(output)
    return output.view(-1,output.size(2))

In [16]:
vocab_size =len(word2index) #단어장의 크기
input_size=5 # RNN층 입력차원 크기 , 임베딩 된 차원의 크기
hidden_size=20 #RNN의 은닉층

In [17]:
model = Net(vocab_size,input_size,hidden_size,batch_first=True)

In [18]:
#loss 함수
loss_function=nn.CrossEntropyLoss()

In [19]:
#optimizer
optimizer = optim.Adam(params=model.parameters())

In [20]:
output=model(X)
print(output)

tensor([[-0.3776,  0.3011, -0.1392, -0.4326, -0.1811,  0.0712,  0.3543, -0.1683,
          0.1419,  0.2920, -0.0438,  0.0343, -0.3666, -0.1203,  0.2777, -0.0687,
         -0.2741,  0.0914, -0.1746,  0.4913,  0.1229,  0.6166,  0.1829, -0.0480,
          0.3907],
        [-0.6193,  0.0997,  0.2073, -0.4324, -0.4974,  0.2198,  0.1996, -0.1955,
         -0.2228,  0.4228,  0.0794,  0.0055, -0.3299, -0.0673,  0.2764, -0.1537,
          0.1822,  0.0671,  0.1754,  0.3929,  0.4090,  0.6248,  0.0478, -0.0837,
          0.3521],
        [-0.6039, -0.5462,  0.4990, -0.0993,  0.0713,  0.2089,  0.3072,  0.2507,
         -0.1966, -0.0014,  0.2882, -0.2676, -0.1704, -0.1414,  0.0718,  0.2180,
          0.1076, -0.2918,  0.0311,  0.0236,  0.3958,  0.1882, -0.2351, -0.2065,
         -0.0326],
        [-0.1347,  0.1048, -0.4293, -0.4550,  0.0236, -0.0448,  0.2253, -0.1704,
          0.0885, -0.0474, -0.0485, -0.0113, -0.3872, -0.3271,  0.0777,  0.1028,
         -0.6863,  0.2121, -0.0663,  0.2128, -0.0144

In [21]:
print(output.shape)

torch.Size([25, 25])


In [22]:
#수치화된 데이터를 단어로 전환
decode = lambda y:[index2word.get(x) for x in y]

In [23]:
# 훈련 시작
for step in range(201):
    # 경사 초기화
    optimizer.zero_grad()
    # 순방향 전파
    output = model(X)
    # 손실값 계산
    loss = loss_function(output, Y.view(-1))
    # 역방향 전파
    loss.backward()
    # 매개변수 업데이트
    optimizer.step()
    # 기록
    if step % 40 == 0:
        print("[{:02d}/201] {:.4f} ".format(step+1, loss))
        pred = output.softmax(-1).argmax(-1).tolist()
        print(" ".join(["Repeat"] + decode(pred)))
        print()

[01/201] 3.2185 
Repeat polymath polymath individuals considered ever ever talented individuals ever ever polymath widely Italian polymath ever ever the polymath considered polymath polymath polymath ever considered Leonardo

[41/201] 2.7742 
Repeat da Vinci of have is widely considered the ever of the is of considered one of the most diversely the the ever to diversely is

[81/201] 2.1829 
Repeat da Vinci of an Italian polymath of the most Renaissance who is widely considered one of the most diversely talented individuals ever to have lived.

[121/201] 1.5632 
Repeat da Vinci was an Italian polymath of the High Renaissance who is widely considered one of the most diversely talented individuals ever to have lived.

[161/201] 1.0483 
Repeat da Vinci was an Italian polymath of the High Renaissance who is widely considered one of the most diversely talented individuals ever to have lived.

[201/201] 0.6989 
Repeat da Vinci was an Italian polymath of the High Renaissance who is widely cons