In [1]:
from typing import Generator
import pymongo


class MongoStorage:
    """Read-only helper around one MongoDB collection of articles.

    Attributes:
        db: database handle the collection belongs to.
        col: collection that every query method of this class reads from.
    """

    db: pymongo.database.Database
    col: pymongo.collection.Collection

    def __init__(self, db: pymongo.database.Database, col: pymongo.collection.Collection):
        self.db = db
        self.col = col

    @classmethod
    def connect(cls, host: str, port=27017, db_name='wiki', col_name='articles'):
        """Open a client to ``host:port`` and wrap collection ``col_name`` of database ``db_name``."""
        # The client is created with unicode_decode_error_handler='ignore'.
        db = pymongo.MongoClient(host, port, unicode_decode_error_handler='ignore')[db_name]
        return cls(db=db, col=db[col_name])

    def get_articles(self, count=0) -> list:
        """Return up to ``count`` documents as a list.

        ``count`` is passed unchanged to ``Cursor.limit``; pymongo documents
        a limit of 0 as "no limit".
        """
        return list(self.get_articles_gen(count))

    def get_articles_gen(self, count=0) -> Generator:
        """Return the (lazily iterated) result of ``find({}).limit(count)``."""
        return self.col.find({}).limit(count)

    def get_article(self, title) -> dict:
        """Return the first document whose ``title`` equals ``title``, or ``{}`` if none matches."""
        return self.col.find_one({'title': title}) or {}

    def get_articles_headings_texts(self, count=0) -> list:
        """Return ``article['text']['Заголовок']['text']`` for up to ``count`` articles.

        Raises:
            KeyError: if a document lacks one of the nested keys.
        """
        return list(self.get_articles_headings_texts_gen(count))

    def get_articles_headings_texts_gen(self, count=0) -> Generator:
        """Yield ``article['text']['Заголовок']['text']`` one article at a time.

        Raises:
            KeyError: if a document lacks one of the nested keys.
        """
        # This is a generator function, so the return annotation is Generator, not list.
        for article in self.get_articles_gen(count):
            yield article['text']['Заголовок']['text']


In [2]:
import psycopg2


class PostgresStorage:
    """Read-only access to the ``posts`` table over a single DB-API connection."""

    def __init__(self, conn):
        self.conn = conn
        # One cursor is created here and reused by every query method.
        self.cursor = conn.cursor()

    @staticmethod
    def connect(host, port=5432, user='postgres', password='password', dbname='habr'):
        """Open a psycopg2 connection and wrap it in a ``PostgresStorage``.

        NOTE(review): the default password is a hard-coded placeholder;
        real credentials should come from the caller or configuration.
        """
        return PostgresStorage(conn=psycopg2.connect(
            host=host, port=port, user=user, password=password, dbname=dbname)
        )

    def get_posts(self, count=0) -> list:
        """Return rows of ``posts`` as a list.

        Args:
            count: maximum number of rows; any value ``<= 0`` fetches every row.
        """
        if count > 0:
            # Bind the limit as a query parameter instead of formatting it into the SQL text.
            self.cursor.execute('SELECT * FROM posts LIMIT %s', (count,))
        else:
            self.cursor.execute('SELECT * FROM posts')
        return list(self.cursor.fetchall())

    def get_posts_texts(self, count=0) -> list:
        """Return the third column of each fetched row.

        NOTE(review): index 2 is presumably the post body; it depends on the
        column order of ``SELECT *`` — confirm against the table schema.
        """
        return [post[2] for post in self.get_posts(count)]


In [3]:
# Connect to MongoDB on localhost; port, database and collection fall back to
# the defaults declared by MongoStorage.connect.
mdb = MongoStorage.connect(
            host='localhost')

In [4]:
# Connect to PostgreSQL at a fixed address; the remaining arguments use the
# defaults of PostgresStorage.connect.
# NOTE(review): the recorded output of this cell is an OperationalError
# (host unreachable), so `pg` was not assigned by this run.
pg = PostgresStorage.connect(
            host='172.17.0.2')

OperationalError: could not connect to server: No route to host
	Is the server running on host "172.17.0.2" and accepting
	TCP/IP connections on port 5432?


# Test

In [None]:
from typing import Generator

class PostgresStorage:
    """Read-only access to the ``posts`` table; query methods return generators.

    The list-returning ``get_posts`` / ``get_posts_texts`` that used to sit
    in this class body were overwritten by the generator versions defined
    after them, so only the generator versions are kept.
    """

    def __init__(self, conn):
        self.conn = conn
        # One cursor is created here and reused by every query method.
        self.cursor = conn.cursor()

    @staticmethod
    def connect(host, port=5432, user='postgres', password='password', dbname='habr'):
        """Open a psycopg2 connection and wrap it in a ``PostgresStorage``.

        NOTE(review): the default password is a hard-coded placeholder;
        real credentials should come from the caller or configuration.
        """
        return PostgresStorage(conn=psycopg2.connect(
            host=host, port=port, user=user, password=password, dbname=dbname)
        )

    def get_posts(self, count=0) -> Generator:
        """Run the query immediately and return a generator over the fetched rows.

        Args:
            count: maximum number of rows; any value ``<= 0`` fetches every row.

        The rows are loaded with ``fetchall()`` before the generator is
        returned, so only the iteration itself is lazy.
        """
        if count > 0:
            # Bind the limit as a query parameter instead of formatting it into the SQL text.
            self.cursor.execute('SELECT * FROM posts LIMIT %s', (count,))
        else:
            self.cursor.execute('SELECT * FROM posts')
        return (post for post in self.cursor.fetchall())

    def get_posts_texts(self, count=0) -> Generator:
        """Return a generator over the third column of each fetched row.

        NOTE(review): index 2 is presumably the post body; it depends on the
        column order of ``SELECT *`` — confirm against the table schema.
        """
        # get_posts(count) is the outermost iterable of the generator
        # expression, so the query still executes right here, not on first use.
        return (post[2] for post in self.get_posts(count))

In [None]:
# Reconnect so that `pg` is an instance of the PostgresStorage class as
# redefined in the cell above.
pg = PostgresStorage.connect(
            host='172.17.0.2')

In [None]:
# count=-1 fails the `count > 0` check, so the query runs without a LIMIT.
a = pg.get_posts_texts(count=-1)

In [None]:
import tqdm
from time import sleep
# Progress-bar smoke test: five iterations, sleeping one second in each.
for i in tqdm.tqdm([1, 2, 3, 4, 5]):
    sleep(1)

In [5]:
import re
from typing import Generator
from nltk import ngrams


class Tokenizer:
    """Splits raw text into lower-cased sentences, produced lazily."""

    # Split on whitespace that follows '.' or '?', except where one of the two
    # negative lookbehinds matches (a capital + lowercase letter + '.', or a
    # word-char '.' word-char pattern just before the whitespace).
    to_sentences = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s')
    remove_brackets = re.compile(r' \((.*?)\)')
    # 'ё'/'Ё' lie outside the а-я / А-Я ranges and must be listed explicitly,
    # otherwise they are deleted from the middle of words.
    remove_punctuation = re.compile(r'[^a-zA-Zа-яА-ЯёЁ ]')

    @staticmethod
    def _strip_terminator(sentence: str) -> str:
        """Drop one trailing '.', '?' or '!' if present; otherwise return the sentence unchanged."""
        return sentence[:-1] if sentence[-1:] in ('.', '?', '!') else sentence

    @classmethod
    def tokenize(cls, text: str, remove_punctuation=True, remove_brackets=True) -> Generator:
        """Return a generator of lower-cased sentences extracted from ``text``.

        Args:
            text: raw text; newlines are treated as spaces and blank lines dropped.
            remove_punctuation: delete every character except Latin/Cyrillic
                letters and spaces (digits are deleted too).
            remove_brackets: delete `` (...)`` groups, including the leading space.

        Only a trailing sentence terminator is removed, so a final sentence
        without punctuation keeps its last letter. Empty sentences are skipped.
        """
        joined = ' '.join(line for line in text.split('\n') if line)
        stripped = (cls._strip_terminator(piece).lower()
                    for piece in cls.to_sentences.split(joined))
        sentences = (sentence for sentence in stripped if sentence)
        if remove_brackets:
            sentences = (cls.remove_brackets.sub('', sentence) for sentence in sentences)
        if remove_punctuation:
            return (cls.remove_punctuation.sub('', sentence) for sentence in sentences)
        return sentences


class TextProcessor:
    """Turns collections of raw texts into training input (sentences / n-grams)."""

    tokenizer = Tokenizer()

    @classmethod
    def __get_sentences_list(cls, text_list: list, remove_punctuation=True, remove_brackets=True) -> list:
        """Tokenize every text in ``text_list`` and return all sentences in one flat list."""
        sentences_list = []
        for text in text_list:
            sentences_list.extend(cls.tokenizer.tokenize(
                text=text,
                remove_punctuation=remove_punctuation,
                remove_brackets=remove_brackets))
        return sentences_list

    @classmethod
    def get_sentences_gens(cls, text_gen: Generator, remove_punctuation=True, remove_brackets=True) -> Generator:
        """Yield, for each text in ``text_gen``, whatever ``tokenizer.tokenize`` returns for it."""
        for text in text_gen:
            yield cls.tokenizer.tokenize(
                text=text,
                remove_punctuation=remove_punctuation,
                remove_brackets=remove_brackets)

    @classmethod
    def process_text_list(cls, text_list: list, window_size=1) -> str:
        """Return a newline-separated string of sentence suffixes.

        For each sentence, ``window_size`` lines are emitted: the sentence
        with its first 0, 1, ..., ``window_size - 1`` words removed. A
        sentence shorter than the window therefore contributes empty lines.
        """
        lines = []
        for sentence in cls.__get_sentences_list(text_list):
            words = sentence.split()  # split once, reuse for every suffix
            lines.extend(' '.join(words[start:]) for start in range(window_size))
        # join() avoids the repeated string copying of `text += ...`.
        return '\n'.join(lines)

    @classmethod
    def get_text_gen(cls, text_gens_gen: Generator, window_size=1) -> Generator:
        """Yield space-joined word n-grams of size ``window_size``.

        Args:
            text_gens_gen: iterable of iterables of raw texts.
            window_size: n-gram length handed to ``ngrams``.
        """
        for text_gen in text_gens_gen:
            for sentences_gen in cls.get_sentences_gens(text_gen):
                for sentence in sentences_gen:
                    yield from (' '.join(ngram) for ngram in ngrams(sentence.split(), window_size))

    @classmethod
    def process_text_gen(cls, text_gens_gen: Generator, window_size=1) -> str:
        """Return every n-gram from ``get_text_gen`` joined by newlines (a ``str``, not a generator)."""
        return '\n'.join(cls.get_text_gen(
            text_gens_gen=text_gens_gen,
            window_size=window_size))


In [6]:
from models.utils import mongo, postgres

def get_ram_model_from_lists(
        mongo_storage: mongo.MongoStorage,
        postgres_storage: postgres.PostgresStorage,
        wiki_articles_count=1000,
        habr_posts_count=1000,
        model_state=3
):
    """Build and compile a MarkovModel from fully materialised text lists.

    This used to be a first ``get_ram_model`` definition that the second one
    silently overwrote; it is kept under its own name so both are callable.

    Args:
        mongo_storage: source of wiki article heading texts.
        postgres_storage: source of habr post texts.
        wiki_articles_count: ``count`` passed to ``get_articles_headings_texts``.
        habr_posts_count: ``count`` passed to ``get_posts_texts``.
        model_state: used both as the text window size and the model state size.
    """
    habr_posts = postgres_storage.get_posts_texts(
        count=habr_posts_count)
    wiki_articles = mongo_storage.get_articles_headings_texts(
        count=wiki_articles_count)
    input_text = TextProcessor.process_text_list(
        # list() keeps the concatenation valid if get_posts_texts returns a generator.
        text_list=list(habr_posts) + wiki_articles,
        window_size=model_state)
    model = MarkovModel(input_text, state_size=model_state)
    return model.compile()


def get_ram_model(
        mongo_storage: mongo.MongoStorage,
        postgres_storage: postgres.PostgresStorage,
        wiki_articles_count=1000,
        habr_posts_count=1000,
        model_state=3
):
    """Build and compile a MarkovModel by streaming n-grams from both storages.

    Args:
        mongo_storage: source of wiki article heading texts.
        postgres_storage: source of habr post texts.
        wiki_articles_count: ``count`` passed to ``get_articles_headings_texts_gen``.
        habr_posts_count: ``count`` passed to ``get_posts_texts``.
        model_state: used both as the n-gram size and the model state size.
    """
    habr_posts_gen = postgres_storage.get_posts_texts(
        count=habr_posts_count)
    wiki_articles_gen = mongo_storage.get_articles_headings_texts_gen(
        count=wiki_articles_count)
    # Posts are consumed first, then articles.
    text_gen = TextProcessor.get_text_gen(
        text_gens_gen=(source for source in (habr_posts_gen, wiki_articles_gen)),
        window_size=model_state)
    model = MarkovModel(text_gen, state_size=model_state)
    return model.compile()
 

In [7]:
from models.markov.train import get_ram_model, get_ram_model
from models.utils import mongo, postgres
from models.markov.markov_model import MarkovModel

# Module-level caches, all starting as None. The two storage slots are filled
# on first use by __get_wiki_storage() / get_habr_storage() below.
__mongo_storage: mongo.MongoStorage = None
__postgres_storage: postgres.PostgresStorage = None
__model: MarkovModel = None


def __get_wiki_storage():
    """Return the cached MongoStorage, connecting to localhost on first use."""
    global __mongo_storage
    if not __mongo_storage:
        # Nothing cached yet: connect once and remember the instance.
        __mongo_storage = mongo.MongoStorage.connect(
            host='localhost')
    return __mongo_storage


def get_habr_storage():
    """Return the cached PostgresStorage, connecting to 172.17.0.2 on first use."""
    global __postgres_storage
    if not __postgres_storage:
        # Nothing cached yet: connect once and remember the instance.
        __postgres_storage = postgres.PostgresStorage.connect(
            host='172.17.0.2')
    return __postgres_storage

In [8]:
import os
import time
from typing import Generator
import markovify


class MarkovModel:
    """Wrapper around a markovify text model: train, compile, save/load, sample."""

    model: markovify.Text

    def __init__(self, train_input=None, model=None, state_size=2):
        """Wrap an existing model or train a new one.

        Args:
            train_input: a newline-separated ``str`` (trained as
                ``markovify.NewlineText``) or a generator of text chunks
                (each chunk becomes a ``markovify.Text`` that is combined
                into the running model). Ignored when ``model`` is given.
            model: an already-built markovify model to wrap as is.
            state_size: state size handed to markovify when training.

        Raises:
            ValueError: if ``train_input`` is a generator that yields nothing.
            TypeError: if no ``model`` is given and ``train_input`` is neither
                a ``str`` nor a generator.
        """
        if model:
            self.model = model
        elif isinstance(train_input, str):
            start = time.time()
            print("Start training markovify.NewlineText")
            self.model = markovify.NewlineText(train_input, state_size=state_size)
            print("Finish training: ", time.time() - start)
        elif isinstance(train_input, Generator):
            try:
                first_chunk = next(train_input)
            except StopIteration:
                raise ValueError("train_input generator yielded no text") from None
            self.model = markovify.Text(first_chunk, state_size=state_size)
            # Imported here because the cell defining this class does not import tqdm.
            import tqdm
            for sent in tqdm.tqdm(train_input):
                self.model = markovify.combine([self.model, markovify.Text(sent, state_size=state_size)])
        else:
            # Fail here instead of leaving self.model unset for a later AttributeError.
            raise TypeError(
                "train_input must be a str or a generator when no model is given, "
                f"got {type(train_input).__name__}")

    def compile(self):
        """Compile the wrapped model in place and return ``self`` for chaining."""
        self.model.compile(inplace=True)
        return self

    @classmethod
    def load(cls, model_name='model1.0-habr-10000.json', models_path='models/markov/bin'):
        """Load a model from the JSON file ``models_path/model_name``."""
        with open(os.path.join(models_path, model_name), 'r') as f:
            model_json = f.read()
        model = markovify.Text.from_json(model_json)
        # train_input is passed explicitly: there is nothing to train on here.
        return cls(None, model=model)

    def save(self, model_name, models_path='models/markov/bin'):
        """Write the model as JSON to ``models_path/<model_name>.json``.

        ``models_path`` defaults to the directory that ``load`` reads from.
        """
        with open(os.path.join(models_path, f'{model_name}.json'), 'w') as f:
            f.write(self.model.to_json())

    def generate_sample(self, beginning: str) -> str:
        """Return ``model.make_sentence_with_start(beginning)``.

        The result may be ``None``. NOTE(review): the recorded notebook output
        shows this markovify call raising ``KeyError`` for some start words;
        callers presumably need to handle that.
        """
        return self.model.make_sentence_with_start(beginning)

    def make_sentences_for_t9(self, beginning: str, count=20) -> list:
        """Return distinct continuations for ``beginning``.

        Generates up to ``count`` samples; from each sample with more than one
        word it keeps words 2..6 (the first word is dropped). Duplicates are
        removed, so the list can be shorter than ``count`` and its order is
        not defined.
        """
        phrases = set()
        for _ in range(count):
            phrase = self.generate_sample(beginning)
            if not phrase:
                continue
            words_list = phrase.split()
            if len(words_list) > 1:
                phrases.add(" ".join(words_list[1:6]))
        return list(phrases)



In [9]:
import tqdm
# Build the model from 1000 wiki articles and 1000 habr posts with state size 3,
# using the lazily created storage singletons.
# NOTE(review): the recorded output of this cell is an OperationalError from
# the PostgreSQL connection, so `model` was not assigned by this run.
model = get_ram_model(
            mongo_storage=__get_wiki_storage(),
            postgres_storage=get_habr_storage(),
            model_state=3,
            wiki_articles_count=1000,
            habr_posts_count=1000
        )

OperationalError: could not connect to server: No route to host
	Is the server running on host "172.17.0.2" and accepting
	TCP/IP connections on port 5432?


In [None]:
# Parameters shared by the two timing cells below.
model_state = 3  # passed as both window_size and state_size
habr_posts_count = 1000  # `count` for pg.get_posts_texts
wiki_articles_count = 1000  # `count` for the mdb heading-text getters

## Old variant - without generators

In [84]:
# Time the list-based pipeline. Every printed figure is the time elapsed
# since `start` (cumulative), not the duration of the individual step.
start = time.time()
print("Start!")
habr_posts = pg.get_posts_texts(
    count=habr_posts_count)
print("Get posts: ", time.time() - start)
wiki_articles = mdb.get_articles_headings_texts(
    count=wiki_articles_count)
print("Get articles: ", time.time() - start)
input_text = TextProcessor.process_text_list(
    text_list=habr_posts + wiki_articles,
    window_size=model_state)
print("Processed text: ", time.time() - start)
model = MarkovModel(input_text, state_size=model_state).compile()
print("Finish: ", time.time() - start)

Start!
Get posts:  0.14567303657531738
Get articles:  0.40266871452331543
Processed text:  2.211005449295044
Start training markovify.NewlineText
Finish training:  7.211528778076172
Finish:  13.537837028503418


## New variant - with generators

In [85]:
# Time the generator-based pipeline. Every printed figure is the time elapsed
# since `start` (cumulative), not the duration of the individual step.
start = time.time()
print("Start!")
habr_posts_gen = pg.get_posts_texts(
    count=habr_posts_count)
print("Get posts gen: ", time.time() - start)
wiki_articles_gen = mdb.get_articles_headings_texts_gen(
    count=wiki_articles_count)
print("Get articles gen: ", time.time() - start)
# NOTE(review): despite its name, `text_gen` holds the return value of
# process_text_gen; the recorded output shows MarkovModel taking the
# markovify.NewlineText (str) path for it.
text_gen = TextProcessor.process_text_gen(
    text_gens_gen=(text_gen for text_gen in (habr_posts_gen, wiki_articles_gen)),
    window_size=model_state)
print("Processed text gens: ", time.time() - start)
model = MarkovModel(text_gen, state_size=model_state).compile()
print("Finish: ", time.time() - start)

Start!
Get posts gen:  0.14258933067321777
Get articles gen:  0.14279603958129883
Processed text gens:  2.6517739295959473
Start training markovify.NewlineText
Finish training:  12.632851839065552
Finish:  23.218796968460083


# End Test

In [5]:
# Fetch the raw inputs for the experiments below: post texts and article
# heading texts, each requested with count=1000.
habr_posts = pg.get_posts_texts(
        count=1000)
wiki_articles = mdb.get_articles_headings_texts(
    count=1000)

In [None]:
# Inspect the first fetched post text.
habr_posts[0]

In [188]:
# Lazy stream of lower-cased tokens from all posts: newlines and the listed
# punctuation marks become spaces, the posts are joined with single spaces,
# the result is split on spaces, and empty pieces are dropped.
src = (
    token.strip()
    for token in " ".join(
        post.translate(str.maketrans({mark: " " for mark in "\n.(),;:'\"?!»«-"})).lower()
        for post in habr_posts
    ).split(" ")
    if token != ""
)

In [189]:
def ngrams(src:list, N:int=3):
    """Return an iterator of sliding ``N``-tuples over ``src``.

    Args:
        src: annotated as ``list``, but any iterable works; generators are
            consumed lazily through ``itertools.tee``.
        N: window length.

    Returns:
        A ``zip`` iterator yielding tuples of ``N`` consecutive items. It is
        empty when ``src`` has fewer than ``N`` items.
    """
    # Imported locally: no cell of the notebook imports these names.
    from itertools import islice, tee

    # The copy at position `index` is advanced by `index` items; zipping the
    # shifted copies produces the sliding window.
    return zip(*(islice(seq, index, None) for index, seq in enumerate(tee(src, N))))

In [190]:
# Lazy stream of word 3-grams (the default N) over the token stream `src`.
gen = ngrams(src)


In [191]:
# train = "\n".join(" ".join(x) for x in gen)

In [192]:
# Train incrementally: seed the model with the first n-gram, then combine in
# one single-n-gram model per iteration, stopping once `it` exceeds max_it.
# NOTE(review): the recorded output of this cell ends in
# KeyError: ('___BEGIN__', '___BEGIN__') after about 2500 iterations.
sentence_gen = (" ".join(x) for x in gen)
model = markovify.Text(next(sentence_gen))
max_it = 10000
for it, sent in enumerate(tqdm.tqdm(sentence_gen)):
    if it > max_it:
        break
    model = markovify.combine([model, markovify.Text(sent)])

2523it [00:03, 640.12it/s]


KeyError: ('___BEGIN__', '___BEGIN__')

In [194]:
# Show the repr of the current `model` object.
model

<markovify.text.Text at 0x7f67f9438510>

In [187]:
# Show the value left in `it`, presumably the last index reached by the
# training loop.
it

895665

In [140]:
# Preview the first 100 characters of `train`.
# NOTE(review): the only visible assignment to `train` is commented out, so
# this cell depends on a value left in the kernel by an earlier run.
train[:100]

'уже 29 октября\n29 октября в\nоктября в олимпийском\nв олимпийском пройдут\nолимпийском пройдут бои\nпрой'

In [166]:
# Generate one sentence that starts with the given word.
model.make_sentence_with_start("новое")

'новое предложение для тестов новое предложение для тестов новое предложение для тестов'

In [201]:
# Interactive check: read a start phrase up to 100 times and print the
# generated sentence (None when the model returns nothing).
# NOTE(review): the recorded session ends with a KeyError for an unknown
# start word; the loop has no handling for that.
for i in range(100):
    text = input(">>>>").strip()
#     print(text)
    print(model.make_sentence_with_start(text))

>>>>несколько основных
несколько основных идей родившихся по результатам первых опытов
>>>>первых
None
>>>>первых
первых опытов боёв это независимость робота от
>>>>первых
первых опытов боёв это независимость робота от
>>>>первых
первых опытов боёв это независимость робота от
>>>>первых
первых опытов боёв это
>>>>первых
первых опытов боёв это независимость
>>>>первых
первых опытов боёв это
>>>>первых
None
>>>>первых
первых опытов боёв это независимость робота от переворота
>>>>первых
первых опытов боёв это
>>>>первых
первых опытов боёв это независимость
>>>>первых
первых опытов боёв это
>>>>первых
первых опытов боёв это независимость
>>>>первых
первых опытов боёв это
>>>>первых
первых опытов боёв это
>>>>первых
первых опытов боёв это
>>>>первых
первых опытов боёв это независимость
>>>>первых
первых опытов боёв это
>>>>первых
первых опытов боёв это
>>>>первых
None
>>>>первых
первых опытов боёв это независимость
>>>>это
это новый робот открытые колеса сверху 10 rip великобритания маленьк

KeyError: ('___BEGIN__', 'эти')

In [121]:
import tqdm
from time import sleep
# Progress-bar smoke test: five iterations, sleeping one second in each.
for i in tqdm.tqdm([1, 2, 3, 4, 5]):
    sleep(1)

100%|██████████| 5/5 [00:05<00:00,  1.00s/it]


In [None]:
import re


class Tokenizer:
    """Splits raw text into lower-cased sentences, returned as a list."""

    # Split on whitespace that follows '.' or '?', except where one of the two
    # negative lookbehinds matches (a capital + lowercase letter + '.', or a
    # word-char '.' word-char pattern just before the whitespace).
    to_sentences = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s')
    remove_brackets = re.compile(r' \((.*?)\)')
    # 'ё'/'Ё' lie outside the а-я / А-Я ranges and must be listed explicitly,
    # otherwise they are deleted from the middle of words.
    remove_punctuation = re.compile(r'[^a-zA-Zа-яА-ЯёЁ ]')

    @staticmethod
    def _strip_terminator(sentence: str) -> str:
        """Drop one trailing '.', '?' or '!' if present; otherwise return the sentence unchanged."""
        return sentence[:-1] if sentence[-1:] in ('.', '?', '!') else sentence

    @classmethod
    def tokenize(cls, text: str, remove_punctuation=True, remove_brackets=True, remove_endings=True) -> list:
        """Return the lower-cased sentences of ``text``.

        Args:
            text: raw text; newlines are treated as spaces and blank lines dropped.
            remove_punctuation: delete every character except Latin/Cyrillic
                letters and spaces (this also deletes kept sentence endings).
            remove_brackets: delete `` (...)`` groups, including the leading space.
            remove_endings: when true, strip the trailing sentence terminator
                and skip empty sentences; when false, additionally split on
                '!' and keep or restore an ending on every piece.
        """
        joined = ' '.join(line for line in text.split('\n') if line)
        pieces = cls.to_sentences.split(joined)
        if remove_endings:
            # Strip only an actual terminator so that an unterminated final
            # sentence keeps its last letter.
            stripped = (cls._strip_terminator(piece).lower() for piece in pieces)
            sentences = [sentence for sentence in stripped if sentence]
        else:
            sentences = []
            for piece in pieces:
                for item in piece.split('!'):
                    if not item:
                        continue
                    if item[-1] in ('.', '?'):
                        sentences.append(item.lower())
                    else:
                        # The piece lost its '!' in the split (or never had an ending).
                        sentences.append(item.lower() + '!')
        if remove_brackets:
            sentences = [cls.remove_brackets.sub('', sentence) for sentence in sentences]
        if remove_punctuation:
            return [cls.remove_punctuation.sub('', sentence) for sentence in sentences]
        return sentences


class TextProcessor:
    """Turns a list of raw texts into newline-separated training input."""

    tokenizer = Tokenizer()

    @classmethod
    def __get_sentences_list(cls, text_list: list, remove_brackets=True, remove_endings=True) -> list:
        """Tokenize every text in ``text_list`` and return all sentences in one flat list."""
        sentences_list = []
        for text in text_list:
            sentences_list.extend(cls.tokenizer.tokenize(
                text=text,
                remove_brackets=remove_brackets,
                remove_endings=remove_endings))
        return sentences_list

    @classmethod
    def process_text_list(cls, text_list: list, window_size=1, remove_brackets=True, remove_endings=True) -> str:
        """Return a newline-separated string of sentence suffixes.

        For each sentence, ``window_size`` lines are emitted: the sentence
        with its first 0, 1, ..., ``window_size - 1`` words removed. A
        sentence shorter than the window therefore contributes empty lines.

        Args:
            text_list: raw texts to tokenize.
            window_size: number of suffix lines produced per sentence.
            remove_brackets: forwarded to the tokenizer.
            remove_endings: forwarded to the tokenizer.
        """
        sentences_list = cls.__get_sentences_list(
            text_list=text_list,
            remove_brackets=remove_brackets,
            remove_endings=remove_endings)
        lines = []
        for sentence in sentences_list:
            words = sentence.split()  # split once, reuse for every suffix
            lines.extend(' '.join(words[start:]) for start in range(window_size))
        # join() avoids the repeated string copying of `text += ...`.
        return '\n'.join(lines)


In [11]:
# Build the newline-separated training text from posts followed by articles,
# emitting three suffix lines per sentence (window_size=3).
input_text = TextProcessor.process_text_list(
        text_list=habr_posts + wiki_articles,
        window_size=3)

In [None]:
# Display the training text.
# NOTE(review): this shows the whole string; a slice such as
# input_text[:500] would keep the output readable.
input_text