In [33]:
from itertools import zip_longest
from scipy.stats import chisquare
import numpy as np
from collections import Counter

from typing import Final

In [38]:
ENG_LCASE: Final = "abcdefghijklmnopqrstuvwxyz"
RUS_LCASE: Final = "абвгдеёжзийклмнопрстуфхцчшщъыьэюя"

_ALPHABETS: Final = {
    "russian": RUS_LCASE,
    "english": ENG_LCASE
}

IC_TABLE: Final = {
    "russian": 0.0553,
    "english": 0.0644
}

ENG_LETTER_FREQ_COMMON = {
    "a": 8.12,  "b": 1.49,  "c": 2.71,  "d": 4.32,  "e": 12.02, "f": 2.30,  "g": 2.03,
    "h": 5.92,  "i": 7.31,  "j": 0.10,  "k": 0.69,  "l": 3.98,  "m": 2.61,  "n": 6.95,
    "o": 7.68,  "p": 1.82,  "q": 0.11,  "r": 6.02,  "s": 6.28,  "t": 9.10,  "u": 2.88,
    "v": 1.11,  "w": 2.09,  "x": 0.17,  "y": 2.11,  "z": 0.07
}

RUS_LETTER_FREQ_COMMON = {
    "а": 8.01,  "б": 1.59,  "в": 4.54,  "г": 1.70,  "д": 2.98,  "е": 8.45,  "ё": 0.04,
    "ж": 0.94,  "з": 1.65,  "и": 7.35,  "й": 1.21,  "к": 3.49,  "л": 4.40,  "м": 3.21,
    "н": 6.70,  "о": 10.96, "п": 2.81,  "р": 4.73,  "с": 5.47,  "т": 6.26,  "у": 2.62,
    "ф": 0.26,  "х": 0.97,  "ц": 0.48,  "ч": 1.44,  "ш": 0.73,  "щ": 0.36,  "ъ": 0.04,
    "ы": 1.90,  "ь": 1.74,  "э": 0.32,  "ю": 0.64,  "я": 2.01
}


FREQ_TABLES: Final = {
    "russian": {
        "common": RUS_LETTER_FREQ_COMMON
    },
    "english": {
        "common": ENG_LETTER_FREQ_COMMON
    }
}

In [39]:
# Sample text (lowercase Russian letters, presumably a ciphertext) that is
# passed to Autocorrelation with lang="russian" in a later cell.
m = "влцдутжбюцхъяррмшбрхцэооэцгбрьцмйфктъъюьмшэсяцпунуящэйтаьэдкцибрьцгбрпачкъуцпъбьсэгкцъгуущарцёэвърюуоюэкааэбрняфукабъарпяъафкъиьжяффнйояфывбнэнфуюгбрьсшьжэтбэёчюъюръегофкбьчябашвёэуъъюаднчжчужцёэвлрнчулбюпцуруньъшсэюъзкцхъяррнрювяспэмасчкпэужьжыатуфуярюравртубурьпэщлафоуфбюацмнубсюкйтаьэдйюнооэгюожбгкбрънцэпотчмёодзцвбцшщвщепчдчдръюьскасэгъппэгюкдойрсрэвоопчщшоказръббнэугнялёкьсрбёуыэбдэулбюасшоуэтъшкрсдугэфлбубуъчнчтртпэгюкиугюэмэгюккъъпэгяапуфуэзьрадзьжчюрмфцхраююанчёчюъыхьъцомэфъцпоирькнщпэтэузуябащущбаыэйчдфрпэцъьрьцъцпоилуфэдцойэдятррачкубуфнйтаьэдкцкрннцюабугюуубурьпйюэъжтгюркующоъуфъэгясуоичщщчдцсфырэдщэъуяфшёчцюйрщвяхвмкршрпгюопэуцчйтаьэдкцибрьцыяжтюрбуэтэбдуящэубъибрювъежагибрбагбрымпуноцшяжцечкфодщоъчжшйуъцхчщвуэбдлдъэгясуахзцэбдэулькнъщбжяцэьрёдъьвювлрнуяфуоухфекьгцчччгэъжтанопчынажпачкъуъмэнкйрэфщэъьбудэндадъярьеюэлэтчоубъцэфэвлнёэгфдсэвэёкбсчоукгаутэыпуббцчкпэгючсаъбэнэфъркацхёваетуфяепьрювържадфёжбьфутощоявьъгупчршуитеачйчирамчюфчоуяюонкяжыкгсцбрясшчйотъъжрсщчл"

In [42]:
class AutocorrError(Exception):
    """Raised by ``Autocorrelation`` when it rejects its input.

    For example, when the text contains characters outside the selected alphabet.
    """


class Autocorrelation:
    """Estimate the key length and key of a periodic letter-shift cipher.

    The text is treated as a sequence of letters from one alphabet, each
    shifted cyclically by a repeating key (Vigenère-style). The key length is
    guessed from the rate of letter coincidences between the text and a
    shifted copy of itself; the key is guessed column by column by comparing
    letter frequencies with a reference table using a chi-square statistic.

    Attributes:
        text: The lower-cased input text.
        delta: Tolerance subtracted from ``threshold`` when testing a shift.
        max_len: Largest key length that ``find_possible_key_length`` tries.
        lang: Name of the selected language.
        alphabet: Alphabet string of the selected language.
        threshold: Reference coincidence value taken from ``IC_TABLE``.
        freq_table: Reference letter frequencies, ordered like ``alphabet``.
    """

    # Stand-in count for letters that never occur in a column.
    _ABSENT_LETTER_COUNT = 0.0000001

    def __init__(self, text, delta: float = 0.001, max_len: int = 20, lang: str = "english"):
        """Validate the inputs and load the tables of the chosen language.

        Args:
            text: Text to analyse; it is lower-cased before validation.
            delta: Tolerance subtracted from the coincidence threshold.
            max_len: Largest key length to try when guessing the key length.
            lang: Language name; must be present in every lookup table.

        Raises:
            AutocorrError: If ``lang`` is not supported, or if ``text``
                contains characters outside the language's alphabet.
        """
        self.text = text.lower()
        self.delta = delta
        self.max_len = max_len

        # A language is usable only when all three lookup tables know it.
        supported = [name for name in _ALPHABETS if name in FREQ_TABLES and name in IC_TABLE]
        if lang not in supported:
            raise AutocorrError(f"The selected language must be from the list -> {supported}")

        self.lang = lang
        self.alphabet = _ALPHABETS[lang]
        self.threshold = IC_TABLE[lang]
        # Order the reference frequencies by the alphabet explicitly instead of
        # relying on the insertion order of the frequency dict.
        letter_freqs = FREQ_TABLES[lang]["common"]
        self.freq_table = [letter_freqs[letter] for letter in self.alphabet]

        if not set(self.text).issubset(self.alphabet):
            raise AutocorrError("The text you entered contains invalid characters.")

    def find_possible_key_length(self):
        """Return the smallest shift whose coincidence rate passes the threshold.

        For every shift ``t`` from 1 up to ``max_len`` (inclusive, and never
        more than ``len(text) - 1``) the share of positions ``i`` with
        ``text[i] == text[i + t]`` is computed. The first shift whose share
        exceeds ``threshold - delta`` is returned.

        Returns:
            The shift as an ``int``, or ``None`` when no shift qualifies.
        """
        size = len(self.text)
        cutoff = self.threshold - self.delta
        # A shift must leave at least one pair to compare, hence ``size - 1``.
        for shift in range(1, min(size - 1, self.max_len) + 1):
            matches = sum(a == b for a, b in zip(self.text, self.text[shift:]))
            if matches / (size - shift) > cutoff:
                return shift
        return None

    def find_possible_key(self, key_len: int):
        """Guess the key, one letter per column, for a given key length.

        The text is split into ``key_len`` columns (every ``key_len``-th
        letter). For each column, every cyclic shift of its letter-frequency
        distribution is compared with the reference frequencies; the shift
        with the smallest chi-square statistic is taken as that key letter.

        Args:
            key_len: Assumed key length; must be at least 1.

        Returns:
            The guessed key as a string of alphabet letters. Its length is
            ``min(key_len, len(text))``.

        Raises:
            ValueError: If ``key_len`` is smaller than 1.
        """
        if key_len < 1:
            raise ValueError("key_len must be a positive integer.")

        # chisquare requires observed and expected totals to agree, so both
        # distributions are scaled to percentages (the reference table may not
        # sum to exactly 100).
        expected = np.asarray(self.freq_table, dtype=float)
        expected = expected / expected.sum() * 100

        key_indices = []
        for start in range(min(key_len, len(self.text))):
            # Start from a tiny count for every letter so the dict keeps the
            # alphabet order; real counts overwrite it for letters that occur.
            counts = dict.fromkeys(self.alphabet, self._ABSENT_LETTER_COUNT)
            counts.update(Counter(self.text[start::key_len]))
            observed = np.array(list(counts.values()), dtype=float)
            observed = observed / observed.sum() * 100

            # np.roll(observed, -shift) == observed[shift:] + observed[:shift]
            chi2_by_shift = [
                chisquare(np.roll(observed, -shift), expected).statistic
                for shift in range(len(self.alphabet))
            ]
            key_indices.append(int(np.argmin(chi2_by_shift)))
        return "".join(self.alphabet[index] for index in key_indices)


In [43]:
# Guess the key for text `m` using the Russian tables and an assumed key length of 5.
Autocorrelation(m, lang="russian").find_possible_key(5)

'слово'