In [1]:
import re
from collections import Counter

In [2]:
with open('text.txt', 'r') as file:
    TEXT = file.read()

In [3]:
def get_words(text):
    return re.findall(r'[а-я]+', text.lower(), re.I)

In [4]:
WORDS = get_words(TEXT)
WORDS[:3]

['маленький', 'принц', 'антуан']

In [5]:
COUNTS = Counter(WORDS)

In [6]:
COUNTS.most_common(10)

[('и', 57476),
 ('в', 48831),
 ('не', 32675),
 ('на', 30950),
 ('я', 28936),
 ('что', 23876),
 ('с', 19826),
 ('как', 13116),
 ('он', 12796),
 ('а', 11883)]

In [7]:
TOTAL_WORDS = sum(COUNTS.values())

In [8]:
def pword(word):
    return COUNTS[word] / TOTAL_WORDS

In [9]:
def pwords(words):
    result = 1
    for w in words:
        result *= pword(w)
    return result

In [10]:
def split_word(word, start=0):
    return [(word[:i], word[i:]) for i in range(start, len(word) + 1)]

In [11]:
split_word('привет')

[('', 'привет'),
 ('п', 'ривет'),
 ('пр', 'ивет'),
 ('при', 'вет'),
 ('прив', 'ет'),
 ('приве', 'т'),
 ('привет', '')]

In [12]:
alph = ''.join([chr(i) for i in range(1072, 1104)])
alph

'абвгдежзийклмнопрстуфхцчшщъыьэюя'

In [13]:
goal = 'привет'

In [14]:
split_word(goal)

[('', 'привет'),
 ('п', 'ривет'),
 ('пр', 'ивет'),
 ('при', 'вет'),
 ('прив', 'ет'),
 ('приве', 'т'),
 ('привет', '')]

In [15]:
def split_word_segment(text, l = 35):
    return [(text[:i], text[i:]) for i in range(1, min(len(text) + 1, l))]

In [26]:
def segment(text):
    if not text:
        return []
    return max(([a] + segment(b) for (a,b) in split_word_segment(text)), key=pwords)

In [27]:
def variants(word):
    swap_char   = [a + b[1] + b[0] +b[2:] for (a,b) in split_word(word) if len(b) > 1]
    miss_char   = [a + c + b for (a,b) in split_word(word) for c in alph]
    incor_char = [a + c + b[1:] for (a,b) in split_word(word) for c in alph if len(b) > 1]
    ex_char     = [a + b[1:] for (a,b) in split_word(word) if len(b) > 0]
    return set(swap_char + miss_char + incor_char + ex_char)

In [28]:
def known(words):
    return {w for w in words if w in COUNTS}

def edits0(word):
    return {word}

def edits1(word):
    return variants(word)

def edits2(word):
    return set(v for w in edits1(word) for v in edits1(w))

def correct_word(word):
    candidates = (known(edits0(word)) or
                  known(edits1(word)) or
                  known(edits2(word)) or
                  {word})
    if len(candidates) == 1 and not known(candidates):
        return ' '.join(segment(list(candidates)[0]))
    return max(candidates, key=COUNTS.get)

In [29]:
def correct_match(match):
    return correct_word(match.group())

def correct(text):
    return re.sub(r'[а-я]+', correct_match, text, flags=re.I)

In [30]:
max(WORDS, key=len)

'четырехсоттридцатидевятилетнего'

In [31]:
correct('рпивет, кк втои делка? сегодняхорошийдень')

'привет, как твои дела? сегодня хороший день'

In [32]:
segment('приветкакдела')

['привет', 'как', 'дела']