In [1]:
import string
from matplotlib import pyplot as plt
import re
import numpy as np

In [2]:
# Read the whole corpus into memory, lower-cased so that it only contains
# the lowercase letters the cipher alphabet is built from.
filename = 'D:/Workspace/BSU-NLP/corpus/Asimov Isaac. Foundation.txt'
with open(filename) as file:  # read-only text mode is open()'s default
    text = file.read().lower()
# Sanity check: total character count, then a short excerpt.
print(len(text), text[500:800], sep='\n')

ch i was hastening.

i was 21 years old, a graduate student in chemistry at columbia university, and i had been writing science fiction professionally for three years. in that time, i had sold five stories to john campbell, editor of astounding, and the fifth story, "nightfall," was about to appear 


In [3]:
# The 26 lowercase Latin letters in order; the cipher helpers rotate this list.
alphabet = list(string.ascii_lowercase)

# Reference letter frequencies keyed by uppercase letter, stored in A..Z order
# (presumably typical English-text frequencies -- confirm the source).
letters_freq = dict(zip(string.ascii_uppercase, (
    0.08167, 0.01492, 0.02782, 0.04253, 0.12702, 0.0228, 0.02015,   # A-G
    0.06094, 0.06966, 0.00153, 0.00772, 0.04025, 0.02406, 0.06749,  # H-N
    0.07507, 0.01929, 0.00095, 0.05987, 0.06327, 0.09056, 0.02758,  # O-U
    0.00978, 0.0236, 0.0015, 0.01974, 0.00074,                      # V-Z
)))

In [4]:
def make_dicts(permuted):
    """Build the substitution tables for a permuted alphabet.

    Each letter of the module-level ``alphabet`` is paired with the item at
    the same position in ``permuted``.

    Args:
        permuted: sequence of replacement letters, aligned with ``alphabet``.

    Returns:
        A ``(dct, inv)`` tuple: ``dct`` maps alphabet letter -> permuted
        letter, ``inv`` maps permuted letter -> alphabet letter.
    """
    forward = {plain: cipher for plain, cipher in zip(alphabet, permuted)}
    backward = {cipher: plain for plain, cipher in zip(alphabet, permuted)}
    return forward, backward

def encrypt_caesar(text, mod=3, inverse=False):
    """Apply a Caesar shift to the letters of ``text``.

    Every character found in the module-level ``alphabet`` is replaced by the
    letter ``mod`` positions further along, wrapping around at the end; any
    other character is copied through unchanged.

    Args:
        text: string to transform.
        mod: shift amount. Any integer is accepted; it is reduced modulo the
            alphabet length, so e.g. 29 behaves like 3 and -1 like 25.
        inverse: when true, undo the shift instead of applying it.

    Returns:
        The transformed string.
    """
    # Reduce the shift up front: with plain slicing, |mod| >= len(alphabet)
    # is clamped and the "rotation" silently degenerates into the identity.
    mod %= len(alphabet)
    permuted = alphabet[mod:] + alphabet[:mod]
    dct, inv = make_dicts(permuted)
    table = inv if inverse else dct
    return ''.join(table.get(c, c) for c in text)

def encrypt_vigenere(text, keyword='mouse', inverse=False):
    """Apply a Vigenere cipher to ``text`` using ``keyword``.

    Each keyword character ``k`` selects a rotation of the module-level
    ``alphabet`` by ``ord(k) - ord('a')`` positions. Character ``i`` of
    ``text`` is substituted with the table of keyword character
    ``i % len(keyword)``; the position counter advances on every character,
    including those that are not in ``alphabet`` (which pass through
    unchanged).

    Args:
        text: string to transform.
        keyword: non-empty key string.
        inverse: when true, decrypt instead of encrypt.

    Returns:
        The transformed string.

    Raises:
        ValueError: if ``keyword`` is empty.
    """
    if not keyword:
        # Without this guard the failure is an opaque ZeroDivisionError from i % 0.
        raise ValueError('keyword must contain at least one character')

    # One substitution table per keyword character, already oriented for the
    # requested direction.
    tables = []
    for keychar in keyword:
        mod = ord(keychar) - ord('a')
        permuted = alphabet[mod:] + alphabet[:mod]
        dct, inv = make_dicts(permuted)
        tables.append(inv if inverse else dct)

    m = len(tables)
    # join() instead of repeated += keeps this linear on whole-book inputs.
    return ''.join(tables[i % m].get(c, c) for i, c in enumerate(text))


In [5]:
# Round-trip a short excerpt of the corpus through the Caesar cipher.
encr_c = encrypt_caesar(text[100:200])
print('Encrypted with caesar:\n', encr_c)
decr_c = encrypt_caesar(encr_c, inverse=True)
print('Decrypted with caesar:\n', decr_c)
# Check the round trip explicitly instead of relying on reading the printout.
assert decr_c == text[100:200], 'caesar round trip did not restore the excerpt'

Encrypted with caesar:
 w 1, 1941. zruog zdu ll kdg ehhq udjlqj iru wzr bhduv. iudqfh kdg idoohq, wkh edwwoh ri eulwdlq kdg 
Decrypted with caesar:
 t 1, 1941. world war ii had been raging for two years. france had fallen, the battle of britain had 


In [6]:
# Round-trip a short excerpt of the corpus through the Vigenere cipher
# (default keyword).
encr_v = encrypt_vigenere(text[100:200])
print('Encrypted with vigenere:\n', encr_v)
decr_v = encrypt_vigenere(encr_v, inverse=True)
print('Decrypted with vigenere:\n', decr_v)
# Check the round trip explicitly instead of relying on reading the printout.
assert decr_v == text[100:200], 'vigenere round trip did not restore the excerpt'

Encrypted with vigenere:
 f 1, 1941. kijpp qsv wc lmr tiqb jeswhy rcl xic qimfm. rfufgq bsh tudpqb, xts tefhfw at tvuhuar vuv 
Decrypted with vigenere:
 t 1, 1941. world war ii had been raging for two years. france had fallen, the battle of britain had 


In [7]:
def analyze_kasiski(text, threshold=1000, step=100, ngram_lengths=range(3, 5)):
    """Estimate a repeating-key length from distances between repeated n-grams.

    For every all-alphabetic n-gram of the requested lengths, the gaps between
    consecutive non-overlapping occurrences are tallied. The estimate is the
    GCD of all gap values whose tally exceeds ``threshold``; while that GCD is
    below 2 the threshold is raised by ``step`` and the GCD recomputed.

    Args:
        text: ciphertext to analyse.
        threshold: initial minimum tally a gap needs in order to be used.
        step: amount added to ``threshold`` after each unsuccessful attempt;
            must be positive.
        ngram_lengths: iterable of n-gram lengths to scan.

    Returns:
        ``(pairs, predicted_length)`` where ``pairs`` is an integer array of
        shape ``(k, 2)`` holding ``(gap, tally)`` rows for every gap seen at
        least once, and ``predicted_length`` is the estimated key length.
        ``predicted_length`` is 1 when no gap tally exceeds the threshold
        (for example on short or empty input), i.e. no period was detected.

    Raises:
        ValueError: if ``step`` is not positive.
    """
    if step <= 0:
        raise ValueError('step must be positive')

    t = len(text)
    dist = np.zeros(t, dtype=int)
    for l in ngram_lengths:
        windows = (text[i:i + l] for i in range(t - l + 1))
        lgrams = [w for w in windows if w.isalpha()]
        for first, lgram in enumerate(lgrams):
            # NOTE(review): `first` is the n-gram's index in the *filtered*
            # list, not its position in `text`; the two coincide only while
            # every earlier window is alphabetic. Kept as-is so the tallies
            # (and the default threshold tuned to them) stay unchanged.
            # Searching from an offset avoids copying text[first:] each time.
            prev = text.find(lgram, first)
            while prev != -1:
                # Resume after the previous hit: occurrences do not overlap.
                nxt = text.find(lgram, prev + l)
                if nxt != -1:
                    dist[nxt - prev] += 1
                prev = nxt

    pairs = np.column_stack((np.arange(t)[dist > 0], dist[dist > 0]))
    distances, counts = pairs[:, 0], pairs[:, 1]

    predicted_length = 1
    while predicted_length < 2:
        candidates = distances[counts > threshold]
        if candidates.size == 0:
            # np.gcd.reduce of an empty selection is 0, and raising the
            # threshold can never add candidates back, so looping on would
            # never terminate. Report "no period found" instead.
            return pairs, 1
        predicted_length = np.gcd.reduce(candidates)
        threshold += step
    return pairs, predicted_length

In [8]:
# Encrypt the entire corpus with the default Vigenere keyword; the cells
# below analyse and decrypt this ciphertext. Print a short excerpt of it.
encrypted_text = encrypt_vigenere(text)
print(encrypted_text[500:800])

ov a iom lmgnwruba.

w oee 21 ksujw cfv, o yvmrosxq mlypshl ub ulqackxdm sx qidyypcs gbcnidgclc, ufh w zep vwiz qjmfwhy eqcwros xmohcgr dlgjqgmaszofdc tij fvlwi mysve. ar hbsx hcei, c lmr ksxr xmhs kxafcww hi navh gmajtixz, ipwngv cz eehimrpwhy, ohv fvy jutnz ehijc, "fmsvnxexz," iom encol fc stbsuj 


In [9]:
# Run the repeated-n-gram analysis on the ciphertext and show the key length
# it estimates.
pairs, predicted_length = analyze_kasiski(encrypted_text)
# print(pairs)  # uncomment to inspect the (gap, tally) table
print(predicted_length)

5


In [10]:
def get_freqs(text):
    """Relative frequency of each ``alphabet`` letter within ``text``.

    Only characters present in the module-level ``alphabet`` are counted; the
    frequencies are normalised by the number of such characters, not by
    ``len(text)``.

    Args:
        text: string to analyse.

    Returns:
        Dict mapping every letter of ``alphabet`` (in alphabet order) to its
        share of the counted letters as a float. All values are 0.0 when
        ``text`` contains none of the letters.
    """
    counts = {a: text.count(a) for a in alphabet}
    total = sum(counts.values())
    if total == 0:
        # Nothing to normalise by; avoid ZeroDivisionError on empty or
        # letter-free input.
        return {a: 0.0 for a in counts}
    return {a: n / total for a, n in counts.items()}

def find_letter(freq_list):
    """Pick the key letter whose shift best aligns ``freq_list`` with the reference.

    For each candidate shift, ``freq_list`` is rotated left by that many
    positions and compared with the values of the module-level
    ``letters_freq`` using the Euclidean norm of the difference.

    Args:
        freq_list: observed letter frequencies in alphabet order; must have
            as many entries as ``letters_freq``.

    Returns:
        The lowercase letter ``chr(ord('a') + shift)`` for the shift with the
        smallest distance.
    """
    reference = np.array(list(letters_freq.values()))
    mismatch = np.array([
        np.linalg.norm(np.roll(freq_list, -shift) - reference)
        for shift in range(len(reference))
    ])
    best_shift = np.argmin(mismatch)
    return chr(ord('a') + best_shift)

def decrypt(text, keyword_length):
    """Recover the keyword by frequency analysis and decrypt ``text`` with it.

    The text is split into ``keyword_length`` interleaved columns (every
    ``keyword_length``-th character starting at each offset); each column's
    letter frequencies are passed to ``find_letter`` to get one key letter.
    The recovered keyword is printed.

    Args:
        text: Vigenere ciphertext.
        keyword_length: assumed length of the keyword.

    Returns:
        ``text`` decrypted with the recovered keyword.
    """
    keyword = ''.join(
        find_letter(list(get_freqs(text[offset::keyword_length]).values()))
        for offset in range(keyword_length)
    )
    print(keyword)
    return encrypt_vigenere(text, keyword, inverse=True)

In [11]:
# Recover the keyword and the plaintext using the key length estimated by
# analyze_kasiski above, rather than repeating its value as a literal.
decrypted_text = decrypt(encrypted_text, predicted_length)
print(decrypted_text[500:800])

mouse
ch i was hastening.

i was 21 years old, a graduate student in chemistry at columbia university, and i had been writing science fiction professionally for three years. in that time, i had sold five stories to john campbell, editor of astounding, and the fifth story, "nightfall," was about to appear 


In [None]:
def get_prob_text_len(text, l):
    chunks = text.split(text, len(text)//l)
    encr_chunks = [encrypt_vigenere(chunk) for chunk in chunks]
    
    decr_chunks = []