#### IMPORTS

In [191]:
import numpy as np
from tensorflow.python.keras import Sequential
from tensorflow.python.keras.losses import CategoricalCrossentropy
from tensorflow.python.keras.layers import LSTM, Dense
from tensorflow.python.keras.activations import softmax
from nltk import tokenize

#### PARAMS

In [192]:
# Path of the plain-text corpus passed to get_data.
FILE_NAME = "data/pride-and-prejudice.txt"
# Sentence terminators; both text generators stop as soon as they emit one of these.
END_CHARS = ['.', '!', '?']
# Non-alphabetic characters that survive the character filter in get_data.
ALLOWED_CHARS = [' '] + END_CHARS
# Literal word removed from the raw text before it is lower-cased.
SKIP_WORD = "Chapter"
# Default window length for the training samples and length of the seed string.
SEQUENCE = 20
# Number of characters that make up one Markov state.
SUFFIX = 10
# Upper bound on the number of characters generated per run.
MAX_LENGTH = 100

# Training and model settings used by the LSTM cells.
EPOCHS = 2
ACTIVATION = softmax
LOSS = CategoricalCrossentropy()
OPTIMIZER = 'adam'

#### DATA

In [193]:
def encode_symbol(symbol, encoder):
    """Return the one-hot list that represents ``symbol``.

    Args:
        symbol: Key to look up in ``encoder``.
        encoder: Mapping from symbol to its position in the one-hot vector.

    Returns:
        A list of ``len(encoder)`` integers that is 0 everywhere except for a
        1 at index ``encoder[symbol]``.

    Raises:
        KeyError: If ``symbol`` is not a key of ``encoder``.
    """
    hot_index = encoder[symbol]
    one_hot = [0] * len(encoder)
    one_hot[hot_index] = 1
    return one_hot


def encode_string(string, encoder):
    """One-hot encode every symbol of ``string``.

    Args:
        string: Iterable of symbols, each of which must be a key of ``encoder``.
        encoder: Mapping from symbol to one-hot index.

    Returns:
        A list with one one-hot list (see ``encode_symbol``) per symbol, in order.
    """
    encoded = []
    for symbol in string:
        encoded.append(encode_symbol(symbol, encoder))
    return encoded


def get_data(filename: str):
    """Load a text corpus and turn it into one-hot training samples.

    The file is cleaned (SKIP_WORD removed, lower-cased, whitespace collapsed,
    only letters and ALLOWED_CHARS kept), split into sentences with NLTK and
    every sentence is one-hot encoded character by character.

    Args:
        filename: Path of the UTF-8 text file to read.

    Returns:
        A tuple ``((xs, ys), encoder, decoder, text)`` where

        * ``xs`` is an array of windows of SEQUENCE consecutive one-hot
          symbols taken from a single sentence,
        * ``ys`` is an array holding the one-hot symbol that follows each window,
        * ``encoder`` maps symbol -> index and ``decoder`` maps index -> symbol,
        * ``text`` is the cleaned corpus as one string.
    """
    def get_text():
        """Read the file and return the cleaned, lower-cased text."""
        with open(filename, "r", encoding="utf8") as file:
            raw = file.read()
        # SKIP_WORD is removed before lower-casing, so only its exact spelling is dropped.
        normalized = ' '.join(raw.replace(SKIP_WORD, '').lower().split())
        return ''.join(c for c in normalized if c.isalpha() or c in ALLOWED_CHARS)

    def split_data(sentences, length=SEQUENCE):
        """Slide a window of ``length`` symbols over each sentence.

        Sentences with at most ``length`` symbols produce no samples.
        """
        xs, ys = [], []
        for sentence in sentences:
            if len(sentence) <= length:
                continue
            for i in range(0, len(sentence) - length):
                xs.append(sentence[i:i + length])
                ys.append(sentence[i + length])
        return np.array(xs), np.array(ys)

    text = get_text()
    data = tokenize.sent_tokenize(text=text)
    # Preview of the first two sentences.
    print(data[:2])
    # Sort the vocabulary so the symbol <-> index mapping is identical on every
    # run; iterating a set of strings has no stable order across interpreter
    # sessions because string hashing is randomised.
    chars = sorted({symbol for string in data for symbol in string})
    encoder = {c: i for i, c in enumerate(chars)}
    decoder = dict(enumerate(chars))
    encoded_data = [encode_string(sentence, encoder) for sentence in data]
    return split_data(encoded_data), encoder, decoder, text

In [194]:
# Load the corpus once: one-hot training windows, symbol <-> index maps and the cleaned text.
(data_x_main, data_y_main), encoder_main, decoder_main, text_main = get_data(FILE_NAME)

# Seed position: the character right after the first '.' found at or after a random
# offset in the first half of the text (0 when no '.' is found, since find() returns -1).
# NOTE(review): no random seed is set in the visible cells, so the position varies per run.
START = text_main.find('.', np.random.randint(0, len(text_main) // 2)) + 1
# Both generators receive the same SEQUENCE-character slice as their default seed.
START_LSTM = text_main[START:START + SEQUENCE]
START_MARKOV = text_main[START:START + SEQUENCE]

[' it is a truth universally acknowledged that a single man in possession of a good fortune must be in want of a wife.', 'however little known the feelings or views of such a man may be on his first entering a neighbourhood this truth is so well fixed in the minds of the surrounding families that he is considered as the rightful property of some one or other of their daughters.']


#### LSTM

In [195]:
def build_lstm_model(data_x, data_y):
    """Create and compile the character-level LSTM network.

    The network is a single 128-unit LSTM layer followed by a Dense output
    layer that uses ACTIVATION; it is compiled with LOSS and OPTIMIZER.

    Args:
        data_x: Training inputs; ``shape[1]`` and ``shape[2]`` give the LSTM
            input shape (assumes a 3-D array of one-hot windows as produced by
            get_data -- TODO confirm for other callers).
        data_y: Training targets; ``shape[1]`` gives the number of output units.

    Returns:
        The compiled, untrained model.
    """
    recurrent_layer = LSTM(128, input_shape=(data_x.shape[1], data_x.shape[2]))
    output_layer = Dense(data_y.shape[1], activation=ACTIVATION)
    network = Sequential([recurrent_layer, output_layer])
    network.compile(loss=LOSS, optimizer=OPTIMIZER)
    return network


def solve_lstm(data_x, data_y, encoder, decoder, start_string=START_LSTM):
    """Train a new LSTM and print a greedy continuation of ``start_string``.

    The seed is printed followed by `` --> ``; then up to MAX_LENGTH symbols
    are generated one at a time, always taking the highest-scoring symbol.
    Generation stops early once a symbol from END_CHARS has been printed.

    Args:
        data_x: Training inputs passed to build_lstm_model and ``fit``.
        data_y: Training targets passed to build_lstm_model and ``fit``.
        encoder: Mapping symbol -> one-hot index.
        decoder: Mapping one-hot index -> symbol.
        start_string: Seed text; every symbol must be a key of ``encoder``.

    Returns:
        None. The generated text is written to standard output.
    """
    network = build_lstm_model(data_x, data_y)
    network.fit(data_x, data_y, epochs=EPOCHS, verbose=1)
    print(start_string, end=" --> ")
    window = encode_string(start_string, encoder)
    for _ in range(MAX_LENGTH):
        # The batch holds exactly one window, so exactly one score vector comes back.
        (scores, ) = network.predict([window], verbose=0)
        symbol = decoder[np.argmax(scores)]
        finished = symbol in END_CHARS
        print(symbol, end="\n" if finished else "")
        if finished:
            return
        # Slide the window: add the new symbol at the end, drop the oldest one.
        window = (window + [encode_symbol(symbol, encoder)])[1:]

In [196]:
# Train the LSTM on the prepared windows and print its continuation of the default seed.
solve_lstm(data_x_main, data_y_main, encoder_main, decoder_main)

Epoch 1/2
Epoch 2/2
 you used us abomina --> tion of the same and such a some had been she could not her and she could not her and she could not 

#### MARKOV

In [197]:
def build_markov_model(text, suffix=None):
    """Build a character-level Markov model over fixed-length windows.

    Every window of ``suffix`` characters is a state; its successor is the
    window shifted one character to the right. All transitions whose
    successor window fits completely inside ``text`` are counted, including
    the one that ends on the last character of the text.

    Args:
        text: Corpus to collect statistics from.
        suffix: State length in characters. Defaults to the module-level
            SUFFIX constant when omitted.

    Returns:
        A dict mapping each state to a tuple ``(probabilities, decoder)``:
        ``probabilities`` is a list with the relative frequency of every
        observed successor and ``decoder`` maps the position in that list to
        the successor window. Successors are listed in order of first
        occurrence. The result is empty when ``text`` has at most ``suffix``
        characters.
    """
    if suffix is None:
        suffix = SUFFIX
    stat = {}
    # The last usable start index is len(text) - suffix - 1: its successor
    # window text[i + 1:i + suffix + 1] then ends exactly at the end of text.
    for i in range(len(text) - suffix):
        cur_symbols = text[i:i + suffix]
        next_symbols = text[i + 1:i + suffix + 1]
        followers = stat.setdefault(cur_symbols, {})
        followers[next_symbols] = followers.get(next_symbols, 0) + 1
    probabilities = {}
    for cur, dict_next in stat.items():
        counts_sum = sum(dict_next.values())
        next_probabilities = [count / counts_sum for count in dict_next.values()]
        # Same iteration order as dict_next.values(), so indices line up.
        decoder = dict(enumerate(dict_next))
        probabilities[cur] = (next_probabilities, decoder)
    return probabilities


def predict_markov(model, string):
    """Return the most likely next character after the state ``string``.

    Args:
        model: Mapping state -> ``(probabilities, decoder)`` as returned by
            build_markov_model.
        string: Current state; must be a key of ``model``.

    Returns:
        The last character of the successor window with the highest probability.

    Raises:
        KeyError: If ``string`` is not a state of ``model``.
    """
    probabilities, successors = model[string]
    best_index = np.argmax(probabilities)
    best_successor = successors[best_index]
    return best_successor[-1]


def solve_markov(text, start_string=START_MARKOV):
    """Build a Markov model from ``text`` and print a continuation of ``start_string``.

    The seed is printed followed by `` --> ``; then up to MAX_LENGTH
    characters are generated, each being the most likely successor of the
    last SUFFIX characters. Generation stops early once a character from
    END_CHARS has been printed.

    Args:
        text: Corpus used to build the model.
        start_string: Seed text; only its last SUFFIX characters are used as
            the initial state.

    Returns:
        None. The generated text is written to standard output.

    Raises:
        KeyError: If the current state was never seen as a state in ``text``
            (predict_markov looks it up with ``model[string]``).
    """
    chain = build_markov_model(text)
    print(start_string, end=" --> ")
    state = start_string[-SUFFIX:]
    for _ in range(MAX_LENGTH):
        symbol = predict_markov(chain, state)
        finished = symbol in END_CHARS
        print(symbol, end="\n" if finished else "")
        if finished:
            return
        # Shift the state one character to the right.
        state = state[1:] + symbol

In [198]:
# Build the Markov model from the cleaned corpus and print its continuation of the default seed.
solve_markov(text_main)

 you used us abomina --> ble sort of conceited that the chimneypiece alone had cost eight hundred pounds she felt all the fel