# Input text genarators

## WikiStorage - class for working with wiki articles stored in MongoDB

In [117]:
from typing import Generator
import pymongo


class WikiStorage:
    """Class for working with MongoDB"""

    db: pymongo.database.Database
    col: pymongo.collection.Collection

    def __init__(self, db: pymongo.database.Database, col: pymongo.collection.Collection):
        self.db = db
        self.col = col

    @classmethod
    def connect(cls, host: str, port=27017, db_name='wiki', col_name='articles'):
        db = pymongo.MongoClient(host, port, unicode_decode_error_handler='ignore')[db_name]
        return cls(
            db=db,
            col=db[col_name])

    def get_articles(self, count=0) -> Generator:
        return self.col.find({}).limit(count)

    def get_article(self, title) -> dict:
        doc = self.col.find_one({'title': title})
        return doc if doc else {}

    def get_articles_headings_texts(self, count=0) -> list:
        for article in self.get_articles(count):
            yield article['text']['Заголовок']['text']

## Postgres storage - base class for all classes working with PostgreSQL

In [91]:
from typing import Generator
import psycopg2

class PostgresStorage:
    
    conn: psycopg2.extensions.connection
    
    def __init__(self, conn):
        self.conn = conn

    @classmethod
    def connect(cls, 
                host: str, 
                port: int = 5432,
                user: str = 'postgres',
                password: str = 'password',
                dbname: str = 'postgres'):
        return cls(conn=psycopg2.connect(
            host=host, port=port, user=user, password=password, dbname=dbname)
        )

## Habr storage - class for working posts from habr stored in PostgreSQL

In [109]:
class HabrStorage(PostgresStorage):

    def get_posts(self, 
                  count: int = 0, 
                  habs_list: list = None, 
                  tags_list: list = None) -> Generator:
        if not habs_list and not tags_list:
            cursor = self.conn.cursor()
            sql = 'SELECT * FROM posts'
            if count:
                sql += ' LIMIT %d' % count
            cursor.execute(sql)
            return (post for post in cursor.fetchall())
        elif habs_list:
            return self.__get_posts_by_habs(count, habs_list)
        elif tags_list:
            return self.__get_posts_by_tags(count, tags_list)

    def get_posts_texts(self,
                        count: int = 0,
                        habs_list: list = None, 
                        tags_list: list = None) -> Generator:
        posts_texts_gen = (post[2] for post in self.get_posts(count, habs_list, tags_list))
        return posts_texts_gen

    def __get_posts_by_habs(self, 
                            count: int,
                            habs_list: list) -> Generator:
        sql = '''SELECT P.* 
                   FROM posts P JOIN habs H ON P.post_id = H.post_id
                  WHERE H.hab in (%s)''' % ''.join(["'" + str(hab) + "', " for hab in habs_list])[:-2]
        sql = sql + " LIMIT %d" % count if count > 0 else sql
        cursor = self.conn.cursor()
        cursor.execute(sql)
        return (post for post in cursor.fetchall())

    def __get_posts_by_tags(self, 
                            count: int,
                            tags_list: list) -> Generator:
        sql = '''SELECT P.* 
                   FROM posts P JOIN tags T ON P.post_id = T.post_id
                  WHERE T.tag in (%s)''' % ''.join(["'" + str(tag) + "', " for tag in tags_list])[:-2]
        sql = sql + " LIMIT %d" % count if count > 0 else sql
        cursor = self.conn.cursor()
        cursor.execute(sql)
        return (post for post in cursor.fetchall())

## Text processors

In [233]:
from typing import Generator, Iterable
import re
import nltk


class Tokenizer:
    to_sentences = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s')
    remove_brackets = re.compile(r' \((.*?)\)')
    remove_punctuation = re.compile(r'[^a-zA-Zа-яА-Я ]')

    @classmethod
    def tokenize(cls, text: str, remove_punctuation=True, remove_brackets=True) -> Generator:
        buf = text.split('\n')
        buf = (item for item in buf if item)
        sentences = (sentence[:-1].lower().strip()
                     for sentence in cls.to_sentences.split(' '.join(buf))
                     if sentence[:-1])
        if remove_brackets:
            sentences = (cls.remove_brackets.sub('', sentence) for sentence in sentences)
        if remove_punctuation:
            return (cls.remove_punctuation.sub('', sentence) for sentence in sentences)
        return sentences


class TextProcessor:
    tokenizer = Tokenizer()

    @classmethod
    def get_sentences_gens(cls, texts: Iterable, remove_punctuation=True, remove_brackets=True) -> Generator:
        for text in texts:
            yield cls.tokenizer.tokenize(
                text=text,
                remove_punctuation=remove_punctuation,
                remove_brackets=remove_brackets)

    @classmethod
    def get_text_gen(cls, text_gens_gen: Iterable) -> Generator:
        for text_gen in text_gens_gen:
            for sentences_gen in cls.get_sentences_gens(text_gen):
                for sentence in sentences_gen:
                    yield sentence.split()

    @classmethod
    def get_ngram_gen(cls, text_gens_gen: Iterable, ngram_size: int = 3) -> Generator:
        for text_gen in text_gens_gen:
            for sentences_gen in cls.get_sentences_gens(text_gen):
                for sentence in sentences_gen:
                    yield [''.join(item) for item in nltk.ngrams(sentence, ngram_size)]

## Words encoder - class for encoding/decoding words stored as int nums

In [169]:
import json
from typing import Generator
        
class WordsEncoder:
    
    counter: int
    word2int: dict
    int2word: dict
    begin_word: int = 0
    end_word: int = -1
        
    def __init__(self, counter: int, word2int: dict, int2word: dict):
        self.counter = counter
        self.word2int = word2int
        self.int2word = int2word

    def fit(self, text_corpus):
        self.counter = 0
        self.word2int = {
            self.begin_word: self.begin_word,
            self.end_word: self.end_word
        }
        self.int2word = {
            self.begin_word: self.begin_word,
            self.end_word: self.end_word
        }
        for sentence in text_corpus:
            for word in sentence.split():
                if word not in self.word2int:
                    self.counter += 1
                    self.word2int[word] = self.counter
                    self.int2word[self.counter] = word
                    
    def fit_encode(self, text_corpus) -> list:
        corpus = list(text_corpus) if isinstance(text_corpus, Generator) else text_corpus
        self.fit(corpus)
        return self.encode_text_corpus_gen(corpus)

    def encode_words_list(self, words_list: list) -> list:
        return [self.word2int[word] for word in words_list]

    def encode_text_corpus(self, text_corpus: list) -> list:
        """List of lists of words"""
        return [self.encode_words_list(words_list) for words_list in text_corpus]

    def encode_text_corpus_gen(self, text_corpus_gen: Generator) -> Generator:
        """List of lists of words"""
        return (self.encode_words_list(sentence.split()) for sentence in text_corpus_gen)

    def decode_codes_list(self, codes_list: list) -> list:
        return [self.int2word[code] for code in codes_list]

    def to_dict(self):
        """
        Returns the underlying data as a Python dict.
        """
        return {
            "counter": self.counter,
            "word2int": self.word2int,
            "int2word": self.int2word
        }

    def to_json(self):
        """
        Returns the underlying data as a JSON string.
        """
        return json.dumps(self.to_dict())

    @classmethod
    def from_dict(cls, obj):

        int2word = obj["int2word"]
        for key in int2word:
            int2word[int(key)] = int2word.pop(key)

        int2word[cls.end_word] = cls.end_word
        int2word[cls.begin_word] = cls.begin_word

        word2int = obj["word2int"]
        word2int[cls.end_word] = int(word2int.pop(str(cls.end_word)))
        word2int[cls.begin_word] = int(word2int.pop(str(cls.begin_word)))

        return cls(
            counter=obj["counter"],
            word2int=word2int,
            int2word=int2word
        )

    @classmethod
    def from_json(cls, json_str):
        return cls.from_dict(json.loads(json_str))

## Encoder storage - class for working with words encoder stored in PostgreSQL

In [179]:
class EncoderStorage(PostgresStorage):
    
    model_name: str
    begin_word: int = 0
    end_word: int = -1
        
    def add_encoder(self, model_name: str, encoder: WordsEncoder):
        self.model_name = model_name
        
        cursor = self.conn.cursor()
        cursor.execute('CALL add_encoder(%s)', [model_name])
        self.conn.commit()
        
        for code, word in encoder.int2word.items():
            sql = f'''INSERT INTO {model_name}_encoder(code, word)
                      VALUES (%s, %s)'''
            cursor.execute(sql, [code, word])           
        self.conn.commit()
        self.__create_indexes(model_name)        
            
    def delete_encoder(self, model_name: str):
        cursor = self.conn.cursor()
        cursor.execute('CALL delete_encoder(%s)', [model_name])
        self.conn.commit()
        
    def load_encoder(self, model_name: str) -> WordsEncoder:
        cursor = self.conn.cursor()
        cursor.execute(f'SELECT code, word FROM {model_name}_encoder')
        int2word = {}
        word2int = {}
        for row in cursor.fetchall():
            code, word = row[0], row[1]
            int2word[code] = word
            word2int[word] = code
        word2int[self.end_word] = int(word2int.pop(str(self.end_word)))
        word2int[self.begin_word] = int(word2int.pop(str(self.begin_word)))
        counter = len(int2word) - 2 # except begin and end words
        return WordsEncoder(counter=counter,
                            int2word=int2word,
                            word2int=word2int)
        
    def __create_indexes(self, model_name: str):
        cursor = self.conn.cursor()
        cursor.execute('CALL create_encoder_indexes(%s)', [model_name]);
        self.conn.commit()
        
    def __drop_indexes(self, model_name: str):
        cursor = self.conn.cursor()
        cursor.execute('CALL drop_encoder_indexes(%s)', [model_name]);
        self.conn.commit()

# Chain storage - realization of markov chain stored in PostgreSQL

In [205]:
import random
import operator
import bisect
import json
import copy


def accumulate(iterable, func=operator.add):
    it = iter(iterable)
    total = next(it)
    yield total
    for element in it:
        total = func(total, element)
        yield total

def compile_next(next_dict):
    words = list(next_dict.keys())
    cff = list(accumulate(next_dict.values()))
    return [words, cff]


class ChainStorage(PostgresStorage):
    
    begin_word: int = 0
    end_word: int = -1
        
    def add_model(self, model_name: str, train_corpus: list, state_size: int):
        model_dict = self.__build_model(train_corpus, state_size)
        
        cursor = self.conn.cursor()
        cursor.execute('CALL add_model(%s, %s)', [model_name, self.end_word])
        self.conn.commit()
        
        for state_tuple in model_dict:
            buf = model_dict[state_tuple]
            choices_list, cumdist_list = buf[0], buf[1] 
            self.__add_state(cursor, model_name, state_tuple, choices_list, cumdist_list)            
        self.conn.commit()
        self.__create_index(model_name)
        
        del model_dict
            
    def delete_model(self, model_name: str):
        cursor = self.conn.cursor()
        cursor.execute('CALL delete_model(%s)', [model_name])
        self.conn.commit()
        
    def walk(self, model_name: str, init_state: list, phrase_len: int = 10):
        cursor = self.conn.cursor()
        cursor.execute(f'SELECT chain_walk_{model_name}(%s, %s)', [init_state, phrase_len])
        return cursor.fetchone()[0] or []
    
    def __build_model(self, train_corpus, state_size: int) -> dict:
        model = {}

        for run in train_corpus:
            items = ([ self.begin_word ] * state_size) + run + [ self.end_word ]
            for i in range(len(run) + 1):
                state = tuple(items[i:i+state_size])
                follow = items[i+state_size]
                if state not in model:
                    model[state] = {}

                if follow not in model[state]:
                    model[state][follow] = 0

                model[state][follow] += 1
                
        model = { state: compile_next(next_dict) for (state, next_dict) in model.items() }
        return model
    
    def __add_state(self,
            cursor,
            model_name: str,
            state: tuple, 
            choices: list, 
            cumdist: list):
        sql = f'''INSERT INTO {model_name}(state, choices, cumdist)
                  VALUES (%s, %s, %s)
                  ON CONFLICT DO NOTHING'''
        cursor.execute(sql, [list(state), choices, cumdist])
    
    def __create_index(self, model_name: str, hash_index: bool = True):
        cursor = self.conn.cursor()
        cursor.execute('CALL create_model_table_index(%s, %s)', [model_name, hash_index]);
        self.conn.commit()
        
    def __drop_index(self, model_name: str):
        cursor = self.conn.cursor()
        cursor.execute('CALL drop_model_table_index(%s)', [model_name]);
        self.conn.commit()

## Work example

In [7]:
encoded_corpus = [
    [1, 5, 6],
    [65, 4, 1, 54],
    [5, 65, 1, 324],
    [3, 6, 54]
]

pg_model = ChainStorage.connect('172.17.0.2', dbname='markov')
pg_model.add_model('test_sample', train_corpus=encoded_corpus, state_size=2)
pg_model.walk('test_sample', [0, 1])

[5, 6]

In [8]:
pg_model.delete_model('test_sample')

## Postgres chain usage with text processor & text encoder

In [53]:
pg_habs = HabrStorage.connect('172.17.0.3', dbname='habr')

texts_list = list(pg_habs.get_posts_texts(10))
train_corpus = list(TextProcessor.get_text_gen([texts_list,]))

encoder = WordsEncoder()
encoded_train_corpus = encoder.fit_encode(train_corpus)

pg_model = ChainStorage.connect('172.17.0.2', dbname='markov')
pg_model.add_model('another_test_sample', train_corpus=encoded_train_corpus, state_size=2)

encoder.decode_codes_list(pg_model.walk('another_test_sample', [0, 0]))

['возможности',
 'linkedin',
 'профиль',
 'теперь',
 'профиль',
 'возможности',
 'появившейся',
 'кнопки',
 'образовательного',
 'политики']

In [64]:
pg_model.delete_model('another_test_sample')

# Text generator model based on encoded markov chain

In [212]:
import re
from typing import Iterable

class TextGenerator:
    pg_chain: ChainStorage
    pg_encoder: EncoderStorage
    encoder: WordsEncoder
    model_name: str
    state_size: int
    re_process: re.Pattern = re.compile(r'[^a-zA-Zа-яА-Я ]')

    def __init__(self,
                 pg_chain: ChainStorage,
                 pg_encoder: EncoderStorage,
                 model_name: str,
                 state_size: int,
                 input_text: Iterable = None,
                 use_ngrams: bool = False,
                 ngram_size: int = None):
        self.pg_chain = pg_chain
        self.pg_encoder = pg_encoder
        self.model_name = model_name
        self.state_size = state_size

        if input_text:
            self.encoder = WordsEncoder()
            
            train_corpus = list(TextProcessor.get_text_gen(input_text))
            encoded_train_corpus = self.encoder.fit_encode(train_corpus)

            self.pg_encoder.add_encoder(model_name, self.encoder)
            self.pg_chain.add_model(model_name, encoded_train_corpus, state_size)
        else:
            self.encoder = self.pg_encoder.load_encoder(model_name)

    def words_split(self, sentence: str) -> list:
        words_list = []
        for word in sentence.split():
            processed_word = self.re_process.sub('', word.lower())
            if processed_word:
                words_list.append(processed_word)
        return words_list

    def res_join(self, words_list: list) -> str:
        return ' '.join(words_list)

    def make_sentence(self, init_state: list, **kwargs):
        tries = kwargs.get('tries', 10)
        max_words = kwargs.get('max_words', None)
        min_words = kwargs.get('min_words', None)

        if init_state is not None:
            init_state = self.encoder.encode_words_list(init_state)
            prefix = init_state
            for word in prefix:
                if word == self.encoder.begin_word:
                    prefix = prefix[1:]
                else:
                    break
        else:
            prefix = []

        for _ in range(tries):
            codes_list = prefix + self.pg_chain.walk(self.model_name, init_state)
            words_list = self.encoder.decode_codes_list(codes_list)
            if (max_words is not None and len(words_list) > max_words) or (
                    min_words is not None and len(words_list) < min_words):
                continue
            return self.res_join(words_list)
        return None

    def make_sentence_with_start(self, input_phrase: str, **kwargs):
        words_list = self.words_split(input_phrase)
        words_count = len(words_list)

        if words_count == self.state_size:
            init_state = words_list

        elif 0 < words_count < self.state_size:
            init_state = [self.encoder.begin_word] * (self.state_size - words_count) + words_list
        else:
            init_state = [self.encoder.begin_word] * self.state_size

        return self.make_sentence(init_state, **kwargs)

    def make_sentences_for_t9(self, beginning: str, first_words_count=1, count=20) -> list:
        phrases = set()
        for i in range(count):
            phrase = self.make_sentence_with_start(beginning)
            if phrase:
                words_list = phrase.split()
                if len(words_list) > 1:
                    phrases.add(" ".join(words_list[first_words_count:]))
        return list(phrases)


# Work example

In [114]:
def get_text_gen(
        mongo_wiki: WikiStorage,
        pg_habr: HabrStorage,
        wiki_articles_count=1000,
        habr_posts_count=1000,
        **kwargs
):
    habr_posts_gen = pg_habr.get_posts_texts(
        count=habr_posts_count, habs_list=kwargs.get('habs_list'), tags_list=kwargs.get('tags_list'))
    wiki_articles_gen = mongo_wiki.get_articles_headings_texts(count=wiki_articles_count)
    return (text_gen for text_gen in (habr_posts_gen, wiki_articles_gen))

## Establish connections to dbs

In [206]:
mongo_wiki = WikiStorage.connect(host='localhost')
pg_habr = HabrStorage.connect(host='172.17.0.3', dbname='habr')
pg_chain = ChainStorage.connect(host='172.17.0.2', dbname='markov')
pg_encoder = EncoderStorage.connect(host='172.17.0.2', dbname='markov')

## Train model

In [145]:
%%time
train_corpus_gen = get_text_gen(mongo_wiki=mongo_wiki,
                                pg_habr=pg_habr,
                                wiki_articles_count=100,
                                habr_posts_count=200,
                                habs_list=['Машинное обучение'])
model = TextGenerator(pg_chain=pg_chain,
                      pg_encoder=pg_encoder,
                      model_name='test_model',
                      state_size=3,
                      input_text=train_corpus_gen)

CPU times: user 21 s, sys: 13.7 s, total: 34.7 s
Wall time: 1min 23s


## Load model

In [213]:
model = TextGenerator(pg_chain=pg_chain,
                      pg_encoder=pg_encoder,
                      model_name='test_model',
                      state_size=3)

In [228]:
model.make_sentences_for_t9('привет хабр', first_words_count=2, count=10)

['',
 'за последние годы и самая большая за два десятилетия',
 'data engineering становится все более популярным методом и находит свое',
 'представляю вашему вниманию вторую часть статьи о поиске подозреваемых в',
 'за последние годы и самая большая по водности площади бассейна',
 'задача снижения размерности является одной из важнейших в машинном обучении',
 'мы уже говорили с theano но со временем города стали',
 'давайте вернемся к периодически затрагиваемой у нас теме машинного обучения']

# Ngrams

In [232]:
import nltk

ngram_size = 3
corpus = ['data engineering становится все более сложным',
 'за последние годы и самая большая по водности площади бассейна',
 'представляю вашему вниманию перевд статьи solving multiarmed bandits a comparison',
 'сегодня мы хотим поделиться с вами инструкцией по созданию бота',
 'задача снижения размерности является одной из земель федеративной республики германия',
 'мы уже говорили с theano но со временем города стали',
 'сегодня мы хотим поговорить о концепции insightdriven и о том']

for sentence in corpus[:2]:
    print([''.join(item) for item in nltk.ngrams(sentence, ngram_size)])

['dat', 'ata', 'ta ', 'a e', ' en', 'eng', 'ngi', 'gin', 'ine', 'nee', 'eer', 'eri', 'rin', 'ing', 'ng ', 'g с', ' ст', 'ста', 'тан', 'ано', 'нов', 'ови', 'вит', 'итс', 'тся', 'ся ', 'я в', ' вс', 'все', 'се ', 'е б', ' бо', 'бол', 'оле', 'лее', 'ее ', 'е с', ' сл', 'сло', 'лож', 'ожн', 'жны', 'ным']
['за ', 'а п', ' по', 'пос', 'осл', 'сле', 'лед', 'едн', 'дни', 'ние', 'ие ', 'е г', ' го', 'год', 'оды', 'ды ', 'ы и', ' и ', 'и с', ' са', 'сам', 'ама', 'мая', 'ая ', 'я б', ' бо', 'бол', 'оль', 'льш', 'ьша', 'шая', 'ая ', 'я п', ' по', 'по ', 'о в', ' во', 'вод', 'одн', 'дно', 'нос', 'ост', 'сти', 'ти ', 'и п', ' пл', 'пло', 'лощ', 'оща', 'щад', 'ади', 'ди ', 'и б', ' ба', 'бас', 'асс', 'ссе', 'сей', 'ейн', 'йна']
['пре', 'ред', 'едс', 'дст', 'ста', 'тав', 'авл', 'вля', 'ляю', 'яю ', 'ю в', ' ва', 'ваш', 'аше', 'шем', 'ему', 'му ', 'у в', ' вн', 'вни', 'ним', 'има', 'ман', 'ани', 'нию', 'ию ', 'ю п', ' пе', 'пер', 'ере', 'рев', 'евд', 'вд ', 'д с', ' ст', 'ста', 'тат', 'ать', 'тьи', 'ьи