# Обучаем пословную LSTM в keras
([источник кода](https://machinelearningmastery.com/develop-word-based-neural-language-models-python-keras/))

In [66]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from nltk.tokenize import sent_tokenize, word_tokenize

import pickle
import pandas as pd
import numpy as np

In [12]:
df  = pd.read_csv("wiki_movie_plots_deduped.csv")
df.head(5)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


In [13]:
del df['Wiki Page']
del df['Director']
del df['Genre']

In [62]:
df.drop(df.index[1000:], inplace = True)

In [15]:
plots = list(df.Plot.apply(word_tokenize))

In [16]:
plots[0][:7]

['A', 'bartender', 'is', 'working', 'at', 'a', 'saloon']

In [17]:
len(plots)

1000

In [18]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(['\n'.join(' '.join(plot) for plot in plots)])
encoded = tokenizer.texts_to_sequences(['\n'.join(' '.join(plot) for plot in plots)])[0]

In [19]:
encoded[:7]

[4, 2973, 5, 419, 25, 4, 1480]

In [20]:
len(encoded)

225260

In [21]:
# determine the vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 16466


#### create word -> word sequences (создаём данные для обучения)

In [22]:
sequences = list()
for i in range(1, len(encoded)):
    sequence = encoded[i-1:i+1]
    sequences.append(sequence)
print('Total Sequences: %d' % len(sequences))

Total Sequences: 225259


In [23]:
# x
sequences[0]

[4, 2973]

In [24]:
# y
sequences[1]

[2973, 5]

#### split into X and y elements

In [25]:
sequences = array(sequences)
X, y = sequences[:,0],sequences[:,1]
y = to_categorical(y, num_classes=vocab_size) # one hot encode outputs

In [26]:
X[:7]

array([   4, 2973,    5,  419,   25,    4, 1480])

In [27]:
y[:7]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

#### Строим модель

In [28]:
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=1))
model.add(LSTM(50))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1, 10)             164660    
_________________________________________________________________
lstm_2 (LSTM)                (None, 50)                12200     
_________________________________________________________________
dense_2 (Dense)              (None, 16466)             839766    
Total params: 1,016,626
Trainable params: 1,016,626
Non-trainable params: 0
_________________________________________________________________
None


Собираем:

In [29]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

И обучаем:

In [30]:
model.fit(X, y, epochs=10, verbose=2)

Epoch 1/10
 - 560s - loss: 7.0585 - acc: 0.0654
Epoch 2/10
 - 560s - loss: 6.4654 - acc: 0.0976
Epoch 3/10


KeyboardInterrupt: 

### Генерируем последовательность

In [69]:
# множество всех слов, с которых начинаются сюжеты
SEEDS = set([text[0] for text in plots])
SEEDS = list(SEEDS)

In [70]:
SEEDS[:7]

['Looking', 'Bebe', 'Jason', 'World', 'Max', 'Pug', 'Buster']

In [40]:
# generate a sequence from the model
def generate_seq(model, tokenizer, seed_text, n_words):
    in_text, result = seed_text, seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        encoded = array(encoded)
        # predict a word in the vocabulary
        yhat = model.predict_classes(encoded, verbose=0)
        # map predicted word index to word
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break
        # append to input
        in_text, result = out_word, result + ' ' + out_word
    return result

In [54]:
generate_seq(loaded_model, tokenizer, seed_text='The', n_words=10)

'The film magazine of the film magazine of the film magazine'

In [55]:
generate_seq(loaded_model, tokenizer, seed_text='A', n_words=10)

'A young man and the film magazine of the film magazine'

In [61]:
generate_seq(loaded_model, tokenizer, seed_text='Looking', n_words=10)

'Looking by the film magazine of the film magazine of the'

#### Сохраняем обученную модель

In [31]:
# сохраняем веса
model.save_weights('lstm.weights')

In [41]:
# сохраняем токенизатор со словарём
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

In [71]:
# сохраняем начала сюжетов
with open('seeds.pkl', 'wb') as f:
    pickle.dump(SEEDS, f)

## Загружаем обученную модель с диска и используем её

#### Загружаем всё необходимое

In [72]:
with open('seeds.pkl', 'rb') as f:
    seeds = pickle.load(f)

In [42]:
with open('tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

In [43]:
vocab_size = len(tokenizer.word_index) + 1

In [44]:
# заново строим модель, чтобы загрузить веса
loaded_model = Sequential()
loaded_model.add(Embedding(vocab_size, 10, input_length=1))
loaded_model.add(LSTM(50))
loaded_model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1, 10)             164660    
_________________________________________________________________
lstm_2 (LSTM)                (None, 50)                12200     
_________________________________________________________________
dense_2 (Dense)              (None, 16466)             839766    
Total params: 1,016,626
Trainable params: 1,016,626
Non-trainable params: 0
_________________________________________________________________
None


In [45]:
loaded_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [46]:
loaded_model.load_weights('lstm.weights')

это та же функции generate seed, что и выше -- здесь для удобства:

In [47]:
def generate_seq(model, tokenizer, seed_text, n_words):
    in_text, result = seed_text, seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        encoded = array(encoded)
        # predict a word in the vocabulary
        yhat = model.predict_classes(encoded, verbose=0)
        # map predicted word index to word
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break
        # append to input
        in_text, result = out_word, result + ' ' + out_word
    return result

In [74]:
np.random.choice(seeds)

'Arrogant'

In [75]:
np.random.choice(seeds)

'Isadore'

In [76]:
def generate_some_plot():
    # она использует все глобальные переменные (модели, seeds и так далее), определённые выше
    # случайно выбирает начало и генерирует сюжет
    seed = np.random.choice(seeds)
    text = generate_seq(loaded_model, tokenizer, seed, n_words=100)
    return text

In [77]:
generate_some_plot()

'Angela is a young man and the film magazine of the film magazine of the film magazine of the film magazine of the film magazine of the film magazine of the film magazine of the film magazine of the film magazine of the film magazine of the film magazine of the film magazine of the film magazine of the film magazine of the film magazine of the film magazine of the film magazine of the film magazine of the film magazine of the film magazine of the film magazine of the film magazine of the film magazine of the film magazine'