# Рекуррентные нейросети

Построим простейшую нейросеть для посимвольной генерации текста

In [1]:
import pandas as pd  # для работы с данными
import time  # для оценки времени
import torch  # для написания нейросети

## Загрузка данных

Будем работать с датасетом реплик из Симпсонов. Нам нужно извлечь предобработанные тексты и закодировать их числами

In [3]:
df = pd.read_csv('simpsons_script_lines.csv')
df.head()

  df = pd.read_csv('simpsons_script_lines.csv')


Unnamed: 0,id,episode_id,number,raw_text,timestamp_in_ms,speaking_line,character_id,location_id,raw_character_text,raw_location_text,spoken_words,normalized_text,word_count
0,9549,32,209,"Miss Hoover: No, actually, it was a little of ...",848000,True,464.0,3.0,Miss Hoover,Springfield Elementary School,"No, actually, it was a little of both. Sometim...",no actually it was a little of both sometimes ...,31
1,9550,32,210,Lisa Simpson: (NEAR TEARS) Where's Mr. Bergstrom?,856000,True,9.0,3.0,Lisa Simpson,Springfield Elementary School,Where's Mr. Bergstrom?,wheres mr bergstrom,3
2,9551,32,211,Miss Hoover: I don't know. Although I'd sure l...,856000,True,464.0,3.0,Miss Hoover,Springfield Elementary School,I don't know. Although I'd sure like to talk t...,i dont know although id sure like to talk to h...,22
3,9552,32,212,Lisa Simpson: That life is worth living.,864000,True,9.0,3.0,Lisa Simpson,Springfield Elementary School,That life is worth living.,that life is worth living,5
4,9553,32,213,Edna Krabappel-Flanders: The polls will be ope...,864000,True,40.0,3.0,Edna Krabappel-Flanders,Springfield Elementary School,The polls will be open from now until the end ...,the polls will be open from now until the end ...,33


In [4]:
phrases = df['normalized_text'].tolist()  # колонка с предобработанными текстами
phrases[:10]

['no actually it was a little of both sometimes when a disease is in all the magazines and all the news shows its only natural that you think you have it',
 'wheres mr bergstrom',
 'i dont know although id sure like to talk to him he didnt touch my lesson plan what did he teach you',
 'that life is worth living',
 'the polls will be open from now until the end of recess now just in case any of you have decided to put any thought into this well have our final statements martin',
 'i dont think theres anything left to say',
 'bart',
 'victory party under the slide',
 nan,
 'mr bergstrom mr bergstrom']

In [5]:
text = [[c for c in ph] for ph in phrases if type(ph) is str]

## Создаём массив с данными

Нужно

1. Разбить данные на токены (у нас символы)
2. Закодировать числами
3. Превратить в эмбеддинги

In [6]:
CHARS = set('abcdefghijklmnopqrstuvwxyz ')  # все символы, которые мы хотим использовать для кодировки = наш словарь
INDEX_TO_CHAR = ['none'] + [w for w in CHARS]  # все неизвестные символы будут получать тег none
CHAR_TO_INDEX = {w: i for i, w in enumerate(INDEX_TO_CHAR)}  # словарь токен-индекс

In [9]:
CHAR_TO_INDEX

{'none': 0,
 ' ': 1,
 'w': 2,
 'm': 3,
 'd': 4,
 'y': 5,
 'u': 6,
 'a': 7,
 'b': 8,
 'p': 9,
 'c': 10,
 'k': 11,
 'n': 12,
 'h': 13,
 'v': 14,
 'e': 15,
 't': 16,
 'q': 17,
 'z': 18,
 'l': 19,
 'g': 20,
 'x': 21,
 'j': 22,
 'f': 23,
 's': 24,
 'i': 25,
 'r': 26,
 'o': 27}

In [None]:
len(INDEX_TO_CHAR)

28

In [10]:
MAX_LEN = 50  # мы хотим ограничить максимальную длину ввода
X = torch.zeros((len(text), MAX_LEN), dtype=int)  # создаём пустой вектор для текста, чтобы класть в него индексы токенов
for i in range(len(text)):  # для каждого предложения
    for j, w in enumerate(text[i]):  # для каждого токена
        if j >= MAX_LEN:
            break
        X[i, j] = CHAR_TO_INDEX.get(w, CHAR_TO_INDEX['none'])

In [12]:
X[0:5]

tensor([[12, 27,  1,  7, 10, 16,  6,  7, 19, 19,  5,  1, 25, 16,  1,  2,  7, 24,
          1,  7,  1, 19, 25, 16, 16, 19, 15,  1, 27, 23,  1,  8, 27, 16, 13,  1,
         24, 27,  3, 15, 16, 25,  3, 15, 24,  1,  2, 13, 15, 12],
        [ 2, 13, 15, 26, 15, 24,  1,  3, 26,  1,  8, 15, 26, 20, 24, 16, 26, 27,
          3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [25,  1,  4, 27, 12, 16,  1, 11, 12, 27,  2,  1,  7, 19, 16, 13, 27,  6,
         20, 13,  1, 25,  4,  1, 24,  6, 26, 15,  1, 19, 25, 11, 15,  1, 16, 27,
          1, 16,  7, 19, 11,  1, 16, 27,  1, 13, 25,  3,  1, 13],
        [16, 13,  7, 16,  1, 19, 25, 23, 15,  1, 25, 24,  1,  2, 27, 26, 16, 13,
          1, 19, 25, 14, 25, 12, 20,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [16, 13, 15,  1,  9, 27, 19, 19, 24,  1,  2, 25, 19, 19,  1,  8, 15,  1,
       

## Embedding и RNN ячейки

Каждому токену мы хотим сопоставить не просто число, но вектор. Поэтому вектор текста нам нужно умножить на матрицу эмбеддингов, которая тоже будет учиться в процессе обучения нейросети. Для создания такой матрицы нам нужен слой `nn.Embedding`

In [13]:
X[0:5].shape

torch.Size([5, 50])

In [14]:
embeddings = torch.nn.Embedding(len(INDEX_TO_CHAR), 28)  # размер словаря * размер вектора для кодировки каждого слова
t = embeddings(X[0:5])
t.shape

torch.Size([5, 50, 28])

In [16]:
t

tensor([[[ 0.3689,  0.3359,  0.1294,  ..., -2.2670, -0.5084,  2.0914],
         [-0.0124, -0.4274,  0.0569,  ..., -0.8373,  0.2788,  0.2409],
         [ 0.4870, -0.6032,  0.2132,  ...,  1.2992, -1.1362, -0.7522],
         ...,
         [ 1.1295,  0.6400,  0.3355,  ...,  0.5713,  0.1381, -1.9632],
         [-1.7195, -1.1320, -0.4221,  ..., -1.0422, -2.1570, -0.1842],
         [ 0.3689,  0.3359,  0.1294,  ..., -2.2670, -0.5084,  2.0914]],

        [[ 0.6711, -0.1875,  0.3068,  ..., -1.3658, -0.1113,  0.1579],
         [ 1.1295,  0.6400,  0.3355,  ...,  0.5713,  0.1381, -1.9632],
         [-1.7195, -1.1320, -0.4221,  ..., -1.0422, -2.1570, -0.1842],
         ...,
         [ 2.1302,  0.0217, -1.4572,  ...,  0.0211,  0.0140, -0.3286],
         [ 2.1302,  0.0217, -1.4572,  ...,  0.0211,  0.0140, -0.3286],
         [ 2.1302,  0.0217, -1.4572,  ...,  0.0211,  0.0140, -0.3286]],

        [[ 0.5599, -0.4547, -0.1584,  ..., -1.0144,  0.6576, -0.3028],
         [ 0.4870, -0.6032,  0.2132,  ...,  1

In [17]:
t.shape, X[0:5].shape

(torch.Size([5, 50, 28]), torch.Size([5, 50]))

In [18]:
rnn = torch.nn.RNN(28, 128, batch_first=True)  # на вход - размер эмбеддинга, размер скрытого состояния и порядок размерностей
o, s = rnn(t)
# вектора для слов: батч * число токенов * размер скрытого состояния
# вектор скрытого состояния: число вектров (один) * батч * размер скрытого состояния
o.shape, s.shape

(torch.Size([5, 50, 128]), torch.Size([1, 5, 128]))

Можно применять несколько рекуррентных ячеек подряд

In [19]:
o, s2 = rnn(t, s)
o.shape, s2.shape

(torch.Size([5, 50, 128]), torch.Size([1, 5, 128]))

## Реализация сети с RNN
3 слоя:
1. Embeding (30)
2. RNN (hidden_dim=128)
3. Полносвязный слой для предсказания буквы (28, то есть размер словаря)

In [20]:
class Network(torch.nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        self.embedding = torch.nn.Embedding(28, 30)
        self.rnn = torch.nn.RNN(30, 128)
        self.out = torch.nn.Linear(128, 28)

    def forward(self, sentences, state=None):
        x = self.embedding(sentences)
        x, s = self.rnn(x) # берём выход с последнего слоя для всех токенов, а не скрытое состояние
        return self.out(x)

In [21]:
model = Network()

In [22]:
criterion = torch.nn.CrossEntropyLoss()  # типичный лосс многоклассовой классификации
optimizer = torch.optim.SGD(model.parameters(), lr=.05)

Обучение:

In [23]:
for ep in range(20):
    start = time.time()
    train_loss = 0.
    train_passed = 0

    for i in range(int(len(X) / 100)):
        # берём батч в 100 элементов
        batch = X[i * 100:(i + 1) * 100]
        X_batch = batch[:, :-1]
        Y_batch = batch[:, 1:].flatten()

        optimizer.zero_grad()
        answers = model.forward(X_batch)
        answers = answers.view(-1, len(INDEX_TO_CHAR))
        loss = criterion(answers, Y_batch)
        train_loss += loss.item()

        loss.backward()
        optimizer.step()
        train_passed += 1

    print("Epoch {}. Time: {:.3f}, Train loss: {:.3f}".format(ep, time.time() - start, train_loss / train_passed))

Epoch 0. Time: 16.475, Train loss: 1.826
Epoch 1. Time: 16.602, Train loss: 1.727
Epoch 2. Time: 16.502, Train loss: 1.712
Epoch 3. Time: 16.549, Train loss: 1.703
Epoch 4. Time: 16.506, Train loss: 1.695
Epoch 5. Time: 16.520, Train loss: 1.688
Epoch 6. Time: 16.545, Train loss: 1.684
Epoch 7. Time: 16.511, Train loss: 1.680
Epoch 8. Time: 16.507, Train loss: 1.677
Epoch 9. Time: 16.500, Train loss: 1.674
Epoch 10. Time: 16.581, Train loss: 1.672
Epoch 11. Time: 16.560, Train loss: 1.670
Epoch 12. Time: 16.750, Train loss: 1.669
Epoch 13. Time: 16.698, Train loss: 1.667
Epoch 14. Time: 16.963, Train loss: 1.666
Epoch 15. Time: 16.794, Train loss: 1.665
Epoch 16. Time: 16.501, Train loss: 1.664
Epoch 17. Time: 16.474, Train loss: 1.663
Epoch 18. Time: 16.841, Train loss: 1.662
Epoch 19. Time: 16.694, Train loss: 1.664


## Генерация


- Сначала отправлем в модель буквы из предложения (прогревая состояние)
- Затем берём самую вероятную букву и добавляем её в предложение
- Повторяем пока не получим none (0)

In [24]:
CHAR_TO_INDEX['none']

0

In [25]:
def generate_sentence(word):
    sentence = list(word)
    sentence = [CHAR_TO_INDEX.get(s, 0) for s in sentence]
    answers = model.forward(torch.tensor(sentence))
    probas, indices = answers.topk(1)
    return ''.join([INDEX_TO_CHAR[ind.item()] for ind in indices.flatten()])

In [26]:
generate_sentence('dog')

' uo'

In [35]:
generate_sentence('no actually it was a little of both')

' uita  tll t  t t tttls    iu teu e'

### Задание про шифр Цезаря

### Функция шифрования

In [133]:
def caesar(sym: str, k: int) -> str:
    current_ord = ord(sym)
    # print(current_ord)
    if current_ord > 120:
        current_ord -= 26
    return chr(current_ord + k)

In [134]:
caesar('g', 2)

'i'

### Создание датасета

In [75]:
!pip install english_words

Collecting english_words
  Downloading english_words-2.0.2-py3-none-any.whl.metadata (3.6 kB)
Downloading english_words-2.0.2-py3-none-any.whl (8.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m1.8 MB/s[0m  [33m0:00:04[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: english_words
Successfully installed english_words-2.0.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.3[0m[39;49m -> [0m[32;49m26.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [80]:
from english_words import get_english_words_set
import random

In [96]:
words = list(get_english_words_set(['web2'], alpha=True, lower=True))
random_10k_words = random.sample(words, 10000)

In [112]:
words = random_10k_words
encrypted_words = []
for word in words:
    enc_word = ''
    for sym in word:
        enc_word += caesar(sym, 2)
    encrypted_words.append(enc_word)

In [113]:
words[:5]

['volplanist', 'psoadic', 'contraindicative', 'scatteration', 'forehill']

In [114]:
encrypted_words[:5]

['xqnrncpkuv', 'ruqcfke', 'eqpvtckpfkecvkxg', 'uecvvgtcvkqp', 'hqtgjknn']

### Подготовка данных

In [115]:
CHARS = set('abcdefghijklmnopqrstuvwxyz ')  # все символы, которые мы хотим использовать для кодировки = наш словарь
INDEX_TO_CHAR = ['none'] + [w for w in CHARS]  # все неизвестные символы будут получать тег none
CHAR_TO_INDEX = {w: i for i, w in enumerate(INDEX_TO_CHAR)}  # словарь токен-индекс

In [117]:
MAX_LEN = 20
X = torch.zeros((len(words), MAX_LEN), dtype=int)  # создаём пустой вектор для текста, чтобы класть в него индексы токенов
for i in range(len(words)):  # для каждого слова
    for j, w in enumerate(words[i]):  # для каждого токена
        if j >= MAX_LEN:
            break
        X[i, j] = CHAR_TO_INDEX.get(w, CHAR_TO_INDEX['none'])

In [118]:
Y = torch.zeros((len(encrypted_words), MAX_LEN), dtype=int)  # создаём пустой вектор для текста, чтобы класть в него индексы токенов
for i in range(len(encrypted_words)):  # для каждого слова
    for j, w in enumerate(encrypted_words[i]):  # для каждого токена
        if j >= MAX_LEN:
            break
        Y[i, j] = CHAR_TO_INDEX.get(w, CHAR_TO_INDEX['none'])

In [120]:
X[:5]

tensor([[14, 27, 19,  9, 19,  7, 12, 25, 24, 16,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [ 9, 24, 27,  7,  4, 25, 10,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [10, 27, 12, 16, 26,  7, 25, 12,  4, 25, 10,  7, 16, 25, 14, 15,  0,  0,
          0,  0],
        [24, 10,  7, 16, 16, 15, 26,  7, 16, 25, 27, 12,  0,  0,  0,  0,  0,  0,
          0,  0],
        [23, 27, 26, 15, 13, 25, 19, 19,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0]])

In [121]:
Y[:5]

tensor([[21, 17, 12, 26, 12, 10,  9, 11,  6, 14,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [26,  6, 17, 10, 23, 11, 15,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [15, 17,  9, 14, 16, 10, 11,  9, 23, 11, 15, 10, 14, 11, 21, 20,  0,  0,
          0,  0],
        [ 6, 15, 10, 14, 14, 20, 16, 10, 14, 11, 17,  9,  0,  0,  0,  0,  0,  0,
          0,  0],
        [13, 17, 16, 20, 22, 11, 12, 12,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0]])

In [135]:
train_count = 8000 # и еще 2000 на тест
X_train = X[:train_count]
X_test = X[train_count:]
Y_train = Y[:train_count]
Y_test = Y[train_count:]

In [136]:
class Network(torch.nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        self.embedding = torch.nn.Embedding(28, 30)
        self.rnn = torch.nn.RNN(30, 128)
        self.out = torch.nn.Linear(128, 28)

    def forward(self, sentences, state=None):
        x = self.embedding(sentences)
        x, s = self.rnn(x) # берём выход с последнего слоя для всех токенов, а не скрытое состояние
        return self.out(x)

In [137]:
model = Network()

In [138]:
criterion = torch.nn.CrossEntropyLoss()  # типичный лосс многоклассовой классификации
optimizer = torch.optim.SGD(model.parameters(), lr=.05)

In [139]:
for ep in range(20):
    start = time.time()
    train_loss = 0.
    train_passed = 0

    for i in range(int(len(X_train) / 100)):
        # берём батч в 100 элементов
        X_batch = X_train[i * 100:(i + 1) * 100]
        Y_batch = Y_train[i * 100:(i + 1) * 100]

        optimizer.zero_grad()
        answers = model.forward(X_batch)
        embeddings = torch.nn.Embedding(len(INDEX_TO_CHAR), 28)
        loss = criterion(answers, embeddings(Y_batch))
        train_loss += loss.item()

        loss.backward()
        optimizer.step()
        train_passed += 1

    print("Epoch {}. Time: {:.3f}, Train loss: {:.3f}".format(ep, time.time() - start, train_loss / train_passed))

Epoch 0. Time: 0.806, Train loss: 1.080
Epoch 1. Time: 0.784, Train loss: -0.059
Epoch 2. Time: 0.786, Train loss: 0.234
Epoch 3. Time: 0.788, Train loss: 0.369
Epoch 4. Time: 0.786, Train loss: 0.852
Epoch 5. Time: 0.789, Train loss: 0.272
Epoch 6. Time: 0.788, Train loss: 0.345
Epoch 7. Time: 0.796, Train loss: 0.608
Epoch 8. Time: 0.793, Train loss: -0.455
Epoch 9. Time: 0.788, Train loss: 0.383
Epoch 10. Time: 0.792, Train loss: 0.017
Epoch 11. Time: 0.806, Train loss: -0.496
Epoch 12. Time: 0.800, Train loss: 0.614
Epoch 13. Time: 0.795, Train loss: 0.352
Epoch 14. Time: 0.795, Train loss: 0.492
Epoch 15. Time: 0.803, Train loss: -0.155
Epoch 16. Time: 0.804, Train loss: -1.978
Epoch 17. Time: 0.803, Train loss: -5.320
Epoch 18. Time: 0.795, Train loss: 1.576
Epoch 19. Time: 0.795, Train loss: 5.787
