In [1]:
import torch
import torch.nn as nn
from torch.optim import SGD 
import numpy as np

In [2]:
class WordDataSet:
    
    def __init__(self, word):
        self.chars2idx = {}
        self.indexs  = []
        for c in word: 
            if c not in self.chars2idx:
                self.chars2idx[c] = len(self.chars2idx)
                
            self.indexs.append(self.chars2idx[c])
            
        self.vec_size = len(self.chars2idx)
        self.seq_len  = len(word)
        
    def get_one_hot(self, idx):
        x = torch.zeros(self.vec_size)
        x[idx] = 1
        return x
    
    def __iter__(self):
        return zip(self.indexs[:-1], self.indexs[1:])
    
    def __len__(self):
        return self.seq_len
    
    def get_char_by_id(self, id):
        for c, i in self.chars2idx.items():
            if id == i: return c
        return None

In [3]:
#тестовое слово
word = 'ololoasdasddqweqw123456789'

## LSTM

In [4]:
class LSTM(nn.Module):
    def __init__(self, in_size = 5, hidden_size = 3, out_size = 5):
        super(LSTM, self).__init__() 
        #input gate
        self.hidden_ii   = nn.Linear(in_features = in_size, out_features = hidden_size)
        self.hidden_hi   = nn.Linear(in_features = hidden_size, out_features = hidden_size)
        # forget gate
        self.hidden_if   = nn.Linear(in_features = in_size, out_features = hidden_size)
        self.hidden_hf   = nn.Linear(in_features = hidden_size, out_features = hidden_size)
        # gate gate
        self.hidden_ig   = nn.Linear(in_features = in_size, out_features = hidden_size)
        self.hidden_hg   = nn.Linear(in_features = hidden_size, out_features = hidden_size)
        # output gate
        self.hidden_io   = nn.Linear(in_features = in_size, out_features = hidden_size)
        self.hidden_ho   = nn.Linear(in_features = hidden_size, out_features = hidden_size)
        # activations
        self.tanh        = nn.Tanh()
        self.sigmoid     = nn.Sigmoid()
        # out
        self.out         = nn.Linear(in_features = hidden_size, out_features = out_size)
    
    def forward(self, x, prev_c, prev_h):
        i = self.sigmoid(self.hidden_ii(x) + self.hidden_hi(prev_h))
        f = self.sigmoid(self.hidden_if(x) + self.hidden_hf(prev_h))
        g = self.tanh(self.hidden_ig(x) + self.hidden_hg(prev_h))
        o = self.sigmoid(self.hidden_io(x) + self.hidden_ho(prev_h))
        c = f * prev_c + i * g
        h = o * self.tanh(c)
        return self.out(h), c, h

## Инициализация переменных 

In [5]:
ds        = WordDataSet(word = word)
lstm      = LSTM(in_size = ds.vec_size, hidden_size = 10, out_size = ds.vec_size)
criterion = nn.CrossEntropyLoss()
e_cnt     = 100
optim     = SGD(lstm.parameters(), lr = 0.1, momentum = 0.9)

## Обучение 

In [6]:
CLIP_GRAD = True

for epoch in range(e_cnt):
    hh = torch.zeros(lstm.hidden_hi.in_features)
    cc = torch.zeros(lstm.hidden_hi.in_features)
    loss = 0
    optim.zero_grad()
    
    for sample, next_sample in ds:
        x = ds.get_one_hot(sample).unsqueeze(0)
        target = torch.LongTensor([next_sample])
        y, cc, hh = lstm(x, cc, hh)   
        loss += criterion(y, target)
     
    loss.backward()
    
    if epoch % 10 == 0:
        print (loss.data.item())
        if CLIP_GRAD: print("Clip gradient : ", torch.nn.utils.clip_grad_norm_(lstm.parameters(), max_norm = 5))
    else: 
        if CLIP_GRAD: torch.nn.utils.clip_grad_norm_(lstm.parameters(), max_norm = 1)
    
    optim.step()

70.697265625
Clip gradient :  2.691818253015925
65.51197814941406
Clip gradient :  3.04446056192994
41.48557662963867
Clip gradient :  6.498533868747408
27.55721664428711
Clip gradient :  8.200489264492742
20.307024002075195
Clip gradient :  8.618617800689991
14.103322982788086
Clip gradient :  16.021302304462274
9.740650177001953
Clip gradient :  15.963398406759852
4.4009318351745605
Clip gradient :  2.7400219023757737
0.9384341239929199
Clip gradient :  0.5041099492626817
0.2759885787963867
Clip gradient :  0.1785601245306558


## Тестирование 

In [7]:
lstm.eval()
hh = torch.zeros(lstm.hidden_hi.in_features)
cc = torch.zeros(lstm.hidden_hi.in_features)
id = 0
softmax  = nn.Softmax(dim=1)
predword = ds.get_char_by_id(id)
for c in enumerate(word[:-1]):
    x = ds.get_one_hot(id).unsqueeze(0)
    y, cc, hh = lstm(x, cc, hh)
    y = softmax(y)
    m, id = torch.max(y, 1)
    id = id.data[0]
    predword += ds.get_char_by_id(id)
print ('Prediction:\t' , predword)
print("Original:\t", word)
assert(predword == word)

Prediction:	 ololoasdasddqweqw123456789
Original:	 ololoasdasddqweqw123456789


## GRU

In [8]:
class GRU(nn.Module):
    def __init__(self, in_size = 5, hidden_size = 3, out_size = 5):
        super(GRU, self).__init__() 
        #update gate
        self.hidden_iu   = nn.Linear(in_features = in_size, out_features = hidden_size)
        self.hidden_hu   = nn.Linear(in_features = hidden_size, out_features = hidden_size)
        # reset gate
        self.hidden_ir   = nn.Linear(in_features = in_size, out_features = hidden_size)
        self.hidden_hr   = nn.Linear(in_features = hidden_size, out_features = hidden_size)
        # hidde_cell gate
        self.hidden_ihc   = nn.Linear(in_features = in_size, out_features = hidden_size)
        self.hidden_hhc   = nn.Linear(in_features = hidden_size, out_features = hidden_size)
        # activations
        self.tanh        = nn.Tanh()
        self.sigmoid     = nn.Sigmoid()
        # out
        self.out         = nn.Linear(in_features = hidden_size, out_features = out_size)
    
    def forward(self, x, prev_h):
        u = self.sigmoid(self.hidden_iu(x) + self.hidden_hu(prev_h))
        r = self.sigmoid(self.hidden_ir(x) + self.hidden_hr(prev_h))
        hc = self.tanh(self.hidden_ihc(x) + self.hidden_hhc(r * prev_h))
        h = (1 - u) * hc + u * prev_h
        return self.out(h), h

## Инициализация переменных 

In [9]:
ds        = WordDataSet(word = word)
gru       = GRU(in_size = ds.vec_size, hidden_size = 15, out_size = ds.vec_size)
criterion = nn.CrossEntropyLoss()
e_cnt     = 100
optim     = SGD(gru.parameters(), lr = 0.1, momentum = 0.9)

## Обучение 

In [10]:
CLIP_GRAD = True

for epoch in range(e_cnt):
    hh = torch.zeros(gru.hidden_hu.in_features)

    loss = 0
    optim.zero_grad()
    
    for sample, next_sample in ds:
        x = ds.get_one_hot(sample).unsqueeze(0)
        target = torch.LongTensor([next_sample])
        y, hh = gru(x, hh)   
        loss += criterion(y, target)
     
    loss.backward()
    
    if epoch % 10 == 0:
        print (loss.data.item())
        if CLIP_GRAD: print("Clip gradient : ", torch.nn.utils.clip_grad_norm_(gru.parameters(), max_norm = 5))
    else: 
        if CLIP_GRAD: torch.nn.utils.clip_grad_norm_(gru.parameters(), max_norm = 1)
    
    optim.step()

70.8460922241211
Clip gradient :  4.4838711183361575
52.6328125
Clip gradient :  9.973733043087073
27.040254592895508
Clip gradient :  9.989557519635085
12.121700286865234
Clip gradient :  4.136827103818908
3.3191709518432617
Clip gradient :  3.5985249242969495
0.8404607772827148
Clip gradient :  1.9510484040243632
0.18989944458007812
Clip gradient :  0.40503203241825914
0.07626724243164062
Clip gradient :  0.0950081273625977
0.04526805877685547
Clip gradient :  0.0384570612505573
0.03318595886230469
Clip gradient :  0.023802809413985673


## Тестирование 

In [11]:
gru.eval()
hh = torch.zeros(gru.hidden_hu.in_features)
id = 0
softmax  = nn.Softmax(dim=1)
predword = ds.get_char_by_id(id)
for c in enumerate(word[:-1]):
    x = ds.get_one_hot(id).unsqueeze(0)
    y, hh = gru(x, hh)
    y = softmax(y)
    m, id = torch.max(y, 1)
    id = id.data[0]
    predword += ds.get_char_by_id(id)
print ('Prediction:\t' , predword)
print("Original:\t", word)
assert(predword == word)

Prediction:	 ololoasdasddqweqw123456789
Original:	 ololoasdasddqweqw123456789
