# Include 

In [1]:
import json
import numpy as np
import os
import random

from os.path import expanduser

# Definitions 

# Main course

## Load geography

In [2]:
base = expanduser("~/database/shad/generate_text/ru")

In [3]:
filenames = ["city", "hamlet", "town", "village"]

In [4]:
dataset_raw = [json.loads(line) for current_name in filenames for line in open(os.path.join(base, f"place-{current_name}.ndjson"))]

In [5]:
len(dataset_raw)

253192

In [6]:
sum(1 for r in dataset_raw if "name" not in r)

57

In [7]:
names = [r["name"] for r in dataset_raw if "name" in r]

In [8]:
len(names)

253135

In [9]:
uniq_names = list(set(names))

In [10]:
len(uniq_names)

77411

In [11]:
random.shuffle(uniq_names)

In [12]:
uniq_names[:10]

['Нижнее Кучуково',
 'Кораблёвка',
 'Пуляева',
 'Пиженькасы',
 'Приданцево',
 'Минулово',
 'Ажерово',
 'Пашичи',
 'Кизилово',
 'Орбельяновка']

In [13]:
alphabet=sorted(list(set(c for r in uniq_names for c in r)))

In [14]:
"".join(alphabet)

' "(),-./0123456789IVXc\xa0«»́ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЫЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяё№'

In [15]:
ss_alphabet = alphabet + ["<", ">"]

In [16]:
c2ind = {c:ind for ind, c in enumerate(ss_alphabet)}
ind2c = {c:ind for ind, c in enumerate(ss_alphabet)}

In [17]:
ss_names = ["<" + v + ">" for v in uniq_names]

In [18]:
ss_names[:10]

['<Нижнее Кучуково>',
 '<Кораблёвка>',
 '<Пуляева>',
 '<Пиженькасы>',
 '<Приданцево>',
 '<Минулово>',
 '<Ажерово>',
 '<Пашичи>',
 '<Кизилово>',
 '<Орбельяновка>']

# Create dataset

In [19]:
text = "".join(ss_names)
len(text)

892218

In [20]:
text[:4000]

'<Нижнее Кучуково><Кораблёвка><Пуляева><Пиженькасы><Приданцево><Минулово><Ажерово><Пашичи><Кизилово><Орбельяновка><Кошибеево><Тарасово><Желтино><Вторая Глушковская><Рукавишниково><Трудовой><Ояш><Льнозавод><Гертма><Долбачи><Янавыл><отделения № 3 совхоза "Степной"><Ильицыно><Коконогово><Борисовский><Погост Дмитрия Солунского><Усть-Баргузин><Бабахино><Погари><Ванченки><Малый Липовчик><Повышево><Карамас-Пельга><Воловчик><Серегиж><Помозово><Большая Омутная><Адышево><Колодье><Подусово><Павельцево><Матохино><Чахлово><Кислицкий><Аудио><Воротыня><Омогаево><Буккамахи><Малыновщина><Аммала><Скользихино><Госсортучасток><Даньшин Ручей><Арбузов><Бакаи><Седлистое><Нижнее Кожухово><Красная Веретья><Большемедведевский><Нонбург><Баян-Гол><Балеевка><Ознобищево><Поляки><Разбегаево><Лухнево><Лангерево><Тепелево><Нижние Карамалы><Дубровская><Верхний Бисер><Авати><Тюпкильды><Банщиково><Лигрон><Каталовское><Новосёлка><Железный Перебор><Бенек><Белгородка><Бурундуки><Переяславка><Онуфриево><Прошинская><Старое Ши

# Forward pass

## Create a model

In [31]:
hsz = 100 # hidden size
vsz = len(ss_alphabet) # vocabulary size
seq_length = 25 # the size of window
lr = 1e-3

In [32]:
Wxh = np.random.randn(hsz, vsz) * 0.01
Whh = np.random.randn(hsz, hsz) * 0.01
Why = np.random.randn(vsz, hsz) * 0.01
bh = np.zeros((hsz, 1))
by = np.zeros((vsz, 1))

In [33]:
def gen_portion(text, seq_len):
    for iter in range(len(text) // seq_len):
        yield text[iter * seq_len: (iter + 1) * seq_len]

In [34]:
def calcLoss(ind_input, ind_target, hprev):
    xs, hs, ys, ps = {}, {}, {}, {}
    hs[-1] = np.copy(hprev)
    loss = 0
    for t in range(len(ind_input)):
        xs[t] = np.zeros((vsz, 1))
        xs[t][ind_input[t]] = 1
        hs[t] = np.tanh(Wxh@xs[t] + Whh@hs[t-1] + bh)
        ys[t] = Why@hs[t] + by
        ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t]))
        loss -= np.log(ps[t][ind_target[t], 0])

    dWhh, dWxh, dWhy = np.zeros_like(Whh), np.zeros_like(Wxh), np.zeros_like(Why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    dhnext = np.zeros_like(hs[0])
    for t in reversed(range(len(ind_input))):
        dy = np.copy(ps[t])
        dy[ind_target[t]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
        dWhy += dy@hs[t].T
        dby += dy
        dh = Why.T @ dy + dhnext # backprop into h
        dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity
        dbh += dhraw
        dWxh += dhraw @ xs[t].T
        dWhh += dhraw @ hs[t-1].T
        dhnext = Whh.T @ dhraw
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[-1]

In [38]:
hprev = np.zeros((hsz, 1))
for ind, portion in enumerate(gen_portion(text, seq_length)):
    text_input, text_target = portion[:-1], portion[1:]
    ind_input = [c2ind[c] for c in text_input]
    ind_target = [c2ind[c] for c in text_target]
    loss, dWxh, dWhh, dWhy, dbh, dby, hprev = calcLoss(ind_input, ind_target, hprev)
    if ind % 10000 == 0:
        print("loss =", loss)

    for param, dparam in [[Wxh, dWxh], [Whh, dWhh], [Why, dWhy], [bh, dbh], [by, dby]]:
        param -= lr * dparam

loss = 42.89116707342369
loss = 47.112533795896745
loss = 49.0205968691343
loss = 57.02887554468034


In [33]:
portion = next(gen_portion(text, seq_length))
portion

'<Белая Глина><Напольные В'

In [38]:
text_input, text_target = portion[:-1], portion[1:]
ind_input = [c2ind[c] for c in text_input]
ind_target = [c2ind[c] for c in text_target]

In [39]:
t = len(ind_input) - 1

In [40]:
xs, hs, ys, ps = {}, {}, {}, {}

In [41]:
hs[-1] = np.copy(hprev)

In [42]:
xs[t] = np.zeros((vs, 1))

In [43]:
xs[t][ind_input[t]] = 1

In [45]:
Wxh@xs[t]

array([[ 6.46998760e-03],
       [ 7.08770909e-03],
       [-1.12802445e-02],
       [-1.02499101e-02],
       [-7.93905318e-05],
       [-2.67815815e-02],
       [ 7.17522060e-03],
       [-8.89663606e-03],
       [ 8.56868475e-03],
       [ 2.26787502e-03],
       [-1.07530893e-02],
       [ 6.13865942e-03],
       [-1.45111656e-03],
       [ 1.24106583e-02],
       [ 1.61592328e-02],
       [-1.69174022e-02],
       [ 8.51485273e-03],
       [-3.69515694e-03],
       [ 3.29967574e-03],
       [ 1.18893457e-03],
       [ 1.62581647e-02],
       [ 1.01745632e-02],
       [ 1.05941444e-02],
       [-6.12373812e-03],
       [-1.67165927e-03],
       [-8.21127770e-04],
       [ 3.59990998e-03],
       [ 1.33340931e-03],
       [ 5.59201696e-03],
       [ 1.36779960e-03],
       [-1.31541839e-02],
       [-6.63189396e-03],
       [ 8.73303025e-03],
       [-1.37820509e-02],
       [-5.86929152e-03],
       [-1.80181974e-02],
       [-5.78365577e-03],
       [ 1.07221777e-02],
       [-1.6