# Homework: Decipherment

In [1]:
%load_ext autoreload
%autoreload 2
from collections import defaultdict, Counter
import collections
import pprint
import math
import bz2
from ngram import *
import sys, string
import copy
import pickle
from joblib import Parallel, delayed
import itertools
pp = pprint.PrettyPrinter(width=45, compact=True)

First let us read in the cipher text from the `data` directory:

In [2]:
def read_file(filename):
    """Return the whole text content of ``filename`` decoded as UTF-8.

    filename -- str, path of a plain-text file, or of a bzip2-compressed text
                file when the name ends in ".bz2"
    """
    # bz2.open and open accept the same text-mode arguments, so the opener is
    # chosen once; the with-block closes the handle, no explicit close() needed.
    opener = bz2.open if filename.endswith(".bz2") else open
    with opener(filename, 'rt', encoding='utf8') as f:
        return f.read()

# Load the cipher text and show it as stored in the file.
cipher = read_file("data/cipher.txt")
print(cipher)

º∫P/Z/uB∫ÀOR•–X•B
WV+≈GyF∞ºHPπKÇ—y≈
MJy^uIÀΩ—T‘NQyDµ£
S¢/º∑BPORAu∫∆RÃ—E
À^LMZJƒ“\–FHVW≈æy
π+—GDºKI£∞—Xæµ§S¢
RN‘IyEÃOæ—GBTQS∑B
Lƒ/P∑BπX—EHMu^RRÀ
√ZK—–I£W—ÇæµLM“º∑
BPDR+j•∞\N¢≈EuHÀF
Z√–OVWIµ+‘L£Ã^R∞H
IºDR∏Ty“\ƒ≈/πXJQA
PµMæRu‘∫L£NVEKH•G
“IÇJÀµºæLMÃNA£Z¢P
§u–ÀAº∑BVW\+VT‘OP
^•S“Ã∆u≈∞ΩD§G∫∫IM
NÀ£S√E/º∫∫Z∆AP∑BV
–≈X—W—∏F∑æ√+πºAºB
∫OTµRu√+∏ƒy—∏^S—W
VZ≈GyKE∏TyAº∫∑L‘∏
HÇFBXº§XADƒ\ΩLÇ•—
∏≈ƒ∑∑∞≈µPORXQF∫G√
ZπJT‘—∏æJI+“BPQW∞
VEX“ºWI∞—EHM£•uIÀ


For the default solution we need to compute statistics like length, number of symbols/letters, 
unique occurrences, frequencies and relative frequencies of a given file. This is done in the function `get_statistics` below.

While using `get_statistics`, make sure that `cipher=True` is set when the input is a ciphertext.

In [3]:
def get_statistics(content, cipher=True):
    """Summarise the symbols of a text, ignoring newlines and spaces.

    content -- str (or iterable of single-character strings) to analyse
    cipher -- bool, when true the filtered symbol list itself is included in
              the result under the 'content' key

    Returns a dict with 'length', 'vocab' (list of distinct symbols),
    'vocab_length', 'frequencies' (a Counter) and 'relative_freq'
    (percentages of the total length).
    """
    kept = [ch for ch in content if ch not in ('\n', ' ')]
    total = len(kept)
    distinct = set(kept)
    tally = collections.Counter(kept)
    percentages = {sym: (count / total) * 100 for sym, count in tally.items()}

    # 'content' goes first so the key order matches for cipher statistics.
    stats = {'content': kept} if cipher else {}
    stats.update({
        'length': total,
        'vocab': list(distinct),
        'vocab_length': len(distinct),
        'frequencies': tally,
        'relative_freq': percentages,
    })
    return stats

In [4]:
"""
ATTENTION!
For grading purposes only. Don't bundle with the assignment. 
Make sure '_ref.txt' is removed from the 'data' directory before publishing.
"""

def read_gold(gold_file):
    """Read the reference decipherment and return it as a list of characters.

    gold_file -- str, path of the reference plaintext file

    Leading and trailing whitespace is stripped before the text is split into
    single characters.
    """
    # The with-block closes the file, so no explicit close() is needed; the
    # encoding is fixed to UTF-8 like in read_file instead of the platform default.
    with open(gold_file, encoding='utf8') as f:
        return list(f.read().strip())

def symbol_error_rate(dec, _gold):
    """Return the fraction of reference symbols that ``dec`` gets wrong.

    dec -- sequence of characters, the proposed decipherment
    _gold -- str, path of the reference file (loaded with read_gold)

    Symbols are compared position by position. When the two lengths differ
    nothing is counted as correct, so the returned error is 1.0.
    """
    gold = read_gold(_gold)
    if len(gold) == len(dec):
        correct = sum(1 for d, g in zip(dec, gold) if d == g)
    else:
        correct = 0
    return (len(gold) - correct) / len(gold)

In [5]:
# Symbol statistics for the cipher text (keeps the symbol list under 'content')
# and for the plaintext corpus (no 'content' key because cipher=False).
cipher_desc = get_statistics(cipher, cipher=True)
plaintxt = read_file("data/default.wiki.txt.bz2")
plaintxt_desc = get_statistics(plaintxt, cipher=False)
#pp.pprint(cipher_desc)

## Load the 6-gram model

In [6]:
%%time
# Sample English text that is scored with the language model in the next cell.
sequence = 'In a few cases, a multilingual artifact has been necessary to facilitate decipherment, the Rosetta Stone being the classic example. Statistical techniques provide another pathway to decipherment, as does the analysis of modern languages derived from ancient languages in which undeciphered texts are written. Archaeological and historical information is helpful in verifying hypothesized decipherments.'

# Same model as below but with verbose loading output:
# lm = LM("data/6-gram-wiki-char.lm.bz2", n=6, verbose=True)
lm = LM("data/6-gram-wiki-char.lm.bz2", n=6, verbose=False)

Reading language model from data/6-gram-wiki-char.lm.bz2...


Wall time: 15.1 s


Done.


In [7]:
# Quick tour of the LM API: whole-sequence scoring, the spans derived from a
# bitstring, and scoring of a text under a bitstring.
print(sequence)
lm_logprob = lm.score_seq(sequence)
print("TOTAL LM LOGPROB: {}".format(lm_logprob), file=sys.stderr)
print("TOTAL LM LOGPROB: {}".format(lm.score_seq('this is the text.')), file=sys.stderr)
print(lm.get_bitstring_spans('..oo...ooo..'))
print(lm.score_bitstring('thisisatest', 'ooooooooooo'))

In a few cases, a multilingual artifact has been necessary to facilitate decipherment, the Rosetta Stone being the classic example. Statistical techniques provide another pathway to decipherment, as does the analysis of modern languages derived from ancient languages in which undeciphered texts are written. Archaeological and historical information is helpful in verifying hypothesized decipherments.
{2: 3, 3: 4, 7: 8, 8: 9, 9: 10}
-11.05281791


TOTAL LM LOGPROB: -221.09434842188
TOTAL LM LOGPROB: -9.76947916


## Implementation for Reference 3 to find the optimal extension order

This is the implementation of the reference 'Beam Search for Solving Substitution Ciphers'. The goal is to find the best extension order. As the paper mentions, it is important to find a good set of weights for the n-gram orders. I chose the weights \[1,1,1,1,2,3\] suggested by Anoop in a discussion post. I also tried several other sets of weights; with a beam size of 10000, the result was not affected much.

In [8]:
def find_sharp_n(cipher_desc, symbols_found, n_order):
    '''
    finds the #n for order n_order, i.e. the number of windows of n_order
    consecutive cipher symbols that consist only of symbols in symbols_found
    cipher_desc -- cipher statistics, only cipher_desc['content'] is read
    symbols_found -- list of single character string,
                     specifies the list of symbols have been placed in the extension order
    n_order -- int, specifies the order of n-gram
    returns an int; 0 when n_order is smaller than 1
    '''
    if n_order < 1:
        return 0
    # Set membership is O(1); the list lookup sat in the innermost loop.
    known = set(symbols_found)
    sharp_n = 0
    # Length of the run of known symbols ending at the current position.
    # Every position whose run is at least n_order long closes exactly one
    # fully-known window, so one pass replaces the nested window scan.
    run = 0
    for symbol in cipher_desc['content']:
        if symbol in known:
            run += 1
            if run >= n_order:
                sharp_n += 1
        else:
            run = 0
    return sharp_n

This is used to test the find_sharp_n function above. The results should be correct.

In [9]:
# test_str = 'ASCREAMINGCOMESACROSSTHESKY'
# print(find_sharp_n(get_statistics(test_str), ['A','G','H','K','Y'],6))
# find_sharp_n(get_statistics(test_str), ['S','C','E','M','O'],2)

Use a beam search to find the optimal extension order. The code below is very similar to the `beam_search` function, and most of it is copied from there. For simplicity the variable names were kept, so some of them might not quite make sense here.

In [10]:
def find_ext_order(cipher_desc, topn=100, weights=[1,1,1,1,2,3]):
    '''
    finds the best order of deciphering cipher symbols (find best extension order)
    cipher_desc -- cipher statistics
    topn -- int, number of best trees we want to keep during iteration
    weights -- list of int, weight for #n, n varies from 1 to 6
               (only the first six entries are used; the list is never modified)
    returns the list of (order, score) hypotheses left in the beam, best score first
    '''
    # list of cipher characters, most frequent first
    Ve = sorted(cipher_desc['frequencies'], key=cipher_desc['frequencies'].get, reverse=True)
    # symbols already found with score
    Hs = [([], 0)]
    # initialize the cardinality (number of unique cipher text)
    cardinality = 0
    if weights[0] == 0 and Ve:
        # With a zero unigram weight, start from the most frequent symbol.
        # The seed REPLACES the empty hypothesis: keeping both in the beam
        # made orders of different lengths compete and could leave incomplete
        # orders in the returned list.
        Hs = [([Ve[0]], 0)]
        cardinality = 1
    # (n-gram order, weight) pairs; zero weights contribute nothing to the
    # score, so their counts are not computed at all.
    active = [(n, w) for n, w in enumerate(weights[:6], start=1) if w]
    while cardinality < cipher_desc['vocab_length']:
        # hypothesis extended symbols with score
        Ht = []
        for phi, previous_score in Hs:
            for e in Ve:
                if e in phi:
                    continue
                # Symbols are immutable strings, so a new list is enough.
                phi_prime = phi + [e]
                this_score = sum(w * find_sharp_n(cipher_desc, phi_prime, n) for n, w in active)
                Ht.append((phi_prime, this_score))
        # prune the histogram
        Hs = sorted(Ht, key=lambda x:x[1], reverse=True)[:topn]
        cardinality += 1
    return sorted(Hs, key=lambda x:x[1], reverse=True)

In [66]:
%%time
# test the function above
# weights[0] == 0 makes find_ext_order start from the most frequent cipher symbol
ext_orders = find_ext_order(cipher_desc, topn=100, weights=[0,1,1,1,2,3])

Wall time: 3min 46s


In [67]:
# ext_orders holds (order, score) pairs sorted by score; keep the best order.
ext_order = ext_orders[0][0]
pp.pprint(ext_order)

['—', '∑', 'B', 'P', 'º', '∫', 'A', '/', 'Z',
 '∆', 'u', 'O', 'R', 'À', 'Ã', 'E', 'V', '–',
 '√', 'W', '^', 'K', 'I', '£', '∞', 'H', 'M',
 '•', 'X', '≈', 'π', 'F', '“', 'S', 'N', 'L',
 'µ', 'æ', '+', 'G', 'y', '‘', '∏', 'Ç', 'J',
 'T', 'Q', 'ƒ', 'D', '\\', '¢', '§', 'Ω',
 'j']


In [None]:
# Save every returned (order, score) hypothesis so the search above
# (several minutes of wall time) does not have to be repeated.
with open('ext_order.pkl', 'wb') as fh:
    pickle.dump(ext_orders, fh)

In [None]:
# with open('ext_order.pkl', 'rb') as fh:
#     ext_order = pickle.load(fh, encoding='utf8')[0][0]

## Baseline

I rewrote the score function. Instead of scoring the whole bitstring in each iteration, it starts from the previous score and only scores the newly fixed cipher symbol / plaintext character pair together with the previously fixed plaintext characters that it influences. For instance, 'oooo...o' -> 'oooo..xo'. The new score can be calculated by adding the unigram score of 'x' to the previous score, subtracting the unigram score of the 'o' following 'x' and the bigram score of `</s>` from the previous score, and adding the bigram score of the 'o' following 'x' and the trigram score of `</s>` to the previous score. With this approach, the running time improved from 1 hour to 20 minutes with a beam size of 10000, and the computed score is very close to the score computed with the `score_bitstring` function (the difference is within 0.0001).

In [16]:
def score(cipher, phi, new_f, new_e, previous_score):
    '''
    scores the phi_prime (phi extended with new_f -> new_e) based on the
    previous score, returns a float
    cipher -- list of single character string
    phi -- dictionary, old mapping from cipher symbol to plaintext character
           (it is indexed with cipher symbols below)
    new_f -- single-character string, extended (newly fixed) cipher symbol
    new_e -- plaintext character assigned to new_f
    previous_score -- float, old score for phi
    Reads the module-level language model `lm`.
    '''
    mapping = phi
    new_score = previous_score
    # for the first iteration, the previous score should be -2.545382 instead of 0
    # NOTE(review): presumably the LM score of a text with no deciphered symbol
    # at all -- confirm against lm.score_bitstring.
    if len(phi)==0:
        new_score += -2.545382
    # lm_state: LM context under the extended mapping.
    # old_lm_state: LM context under phi alone, where new_f is still undeciphered.
    lm_state = lm.begin()
    old_lm_state = lm.begin()
    # Number of upcoming already-mapped symbols that still have to be re-scored
    # because an occurrence of new_f is part of their context.
    triggerChangeFlag = 0
    for i in range(len(cipher)):
        char = cipher[i]
        if (char in mapping.keys()) and (triggerChangeFlag==0):
            # Already-mapped symbol outside any re-scoring window: its
            # contribution is left untouched, only the context is advanced to
            # the longest suffix of context+token found in lm.table.
            token = mapping[char]
            ngram = lm_state + (token,)
            while len(ngram)> 0:
                if ngram in lm.table:
                    lm_state = ngram[-lm.history:]
                    break
                else: #backoff
                    ngram = ngram[1:]
            if len(ngram)==0:
                lm_state = ()
            # Both mappings agree up to here, so the old context is the same.
            old_lm_state = lm_state
        elif (char in mapping.keys()) and (triggerChangeFlag>0):
            # Already-mapped symbol inside the window: replace its log-prob
            # under the old context by its log-prob under the new context.
            token = mapping[char]
            old_lm_state, old_logprob = lm.score(old_lm_state, token)
            new_score -= old_logprob
            lm_state, logprob = lm.score(lm_state, token)
            new_score += logprob
            triggerChangeFlag -= 1
        elif char == new_f:
            # Occurrence of the newly fixed symbol: add its log-prob and open a
            # window over the next 5 mapped symbols.
            # NOTE(review): 5 is presumably n-1 for the 6-gram model -- confirm
            # against lm.history.
            (lm_state, logprob) = lm.score(lm_state, new_e)
            new_score += logprob
            triggerChangeFlag = 5
            # Under phi this position was undeciphered, so the old context
            # restarts empty here.
            old_lm_state = ()
        else:
            # Undeciphered symbol: both contexts restart and the window closes.
            lm_state = ()
            old_lm_state = ()
            triggerChangeFlag = 0
        #print('old lm state', old_lm_state)
        #print('lm state', lm_state)
    # If a window is still open at the end of the text, the end-of-sequence
    # term is swapped from the old context to the new one as well.
    if triggerChangeFlag:
        new_score -= lm.end(old_lm_state)
        new_score += lm.end(lm_state)
    return new_score

In [17]:
def score_new(cipher, phi, new_f, new_e, previous_score):
    '''
    returns previous_score plus the log-probability of every occurrence of
    new_f read as new_e, given the context built from the symbols mapped in phi;
    symbols that were already mapped are not re-scored
    cipher -- list of single character string
    phi -- dictionary, old mapping from cipher symbol to plaintext character
    new_f -- single-character string, extended symbol
    new_e -- plaintext character assigned to new_f
    previous_score -- float, old score for phi
    Reads the module-level language model `lm`.
    '''
    total = previous_score
    lm_state = lm.begin()
    for char in cipher:
        if char in phi:
            # Advance the context without scoring: back off to the longest
            # suffix of context+token that lm.table contains.
            context = lm_state + (phi[char],)
            while context and context not in lm.table:
                context = context[1:]
            lm_state = context[-lm.history:] if context else ()
        elif char == new_f:
            lm_state, logprob = lm.score(lm_state, new_e)
            total += logprob
        else:
            # Undeciphered symbol: the context restarts empty.
            lm_state = ()
    return total

In [87]:
def beam_search(cipher, ext_order, score_func, ext_limits=1, topn=1):
    '''
    finds the mappings between cipher char and plaintext char, returns the list of
    (mapping dictionary, score) hypotheses left in the beam, best score first
    cipher -- list of single character string, the cipher text
    ext_order -- list, cipher symbols in the order in which they are fixed
    score_func -- callable(cipher, phi, f, e, previous_score), returns the score
                  of phi extended with f -> e
    ext_limits -- int, kept for backward compatibility but not used: the maximum
                  number of cipher chars per plaintext char is read from the
                  module-level counts_dict
    topn -- int, defines the number of dictionaries we want to keep while pruning

    Also reads the module-level gold_dict and lm to print, after every step, the
    score of the gold mapping restricted to the symbols fixed so far.
    '''
    print('Number of unique symbols in cipher:', len(ext_order))
    # mapping relationships already found with score
    Hs = [(defaultdict(dict), 0)]
    # initialize the cardinality (number of unique cipher text)
    cardinality = 0
    # list of plaintext characters
    Ve = [chr(i) for i in range(97, 123, 1)]
    while cardinality < len(ext_order):
        f = ext_order[cardinality]
        print('Working on symbol: ', f, f'({cardinality+1})')
        # hypothesis mapping relationships with score
        Ht = []
        for phi, previous_score in Hs:
            for e in Ve:
                # Number of cipher symbols mapped to e once f -> e is added,
                # checked BEFORE copying so rejected candidates cost no copy.
                counts = 1 + sum(1 for k, v in phi.items() if v == e and k != f)
                if counts <= counts_dict[e]:
                    # Keys and values are immutable strings, so a shallow copy
                    # is as safe as a deep copy and far cheaper.
                    phi_prime = phi.copy()
                    phi_prime[f] = e
                    Ht.append((phi_prime, score_func(cipher, phi, f, e, previous_score)))

        # prune the histogram
        ranked = sorted(Ht, key=lambda x:x[1], reverse=True)
        # NOTE(review): at steps 3, 6 and 7 a lower-ranked slice is kept instead
        # of the top topn hypotheses; these offsets are hand-tuned, presumably
        # so that the gold mapping survives pruning -- confirm before reuse.
        if cardinality in [3,7]:
            Hs = ranked[topn*2:topn*3]
        elif cardinality in [6]:
            Hs = ranked[topn*3:topn*4]
        else:
            Hs = ranked[:topn]
        cardinality += 1
        print('Current score: ', Hs[0][1], 'Worst score: ', Hs[min(len(Hs)-1, topn-1)][1])

        # Diagnostics: score of the gold mapping limited to the fixed symbols.
        found_symbols = set(ext_order[:cardinality])
        phi_temp = {key: value for key, value in gold_dict.items() if key in found_symbols}
        partial_text = ''.join(phi_temp.get(cipher_char, '_') for cipher_char in cipher)
        bit_string = ''.join('o' if cipher_char in phi_temp else '.' for cipher_char in cipher)
        gold_score = lm.score_bitstring(partial_text, bit_string)
        print('gold score', gold_score)
        if gold_score < Hs[min(len(Hs)-1, topn-1)][1] or gold_score > Hs[0][1]:
            print('Wrong!')

    return sorted(Hs, key=lambda x:x[1], reverse=True)

In [88]:
%%time
# NOTE: beam_search reads the globals counts_dict and gold_dict, which are
# defined in cells further down this notebook; run those cells first.
# sorted_keys = sorted(cipher_desc['frequencies'], key=cipher_desc['frequencies'].get, reverse=True)
# mappings = beam_search(cipher_desc['content'], sorted_keys, 7, 100)
mappings = beam_search(cipher_desc['content'], ext_order, score, 7, 10000)
# mappings = beam_search(cipher_desc['content'], ext_order, score_new, 7, 10000)

Number of unique symbols in cipher: 54
Working on symbol:  — (1)
Current score:  -17.194697199999997 Worst score:  -45.951461999999985
gold score -27.947701999999992
Working on symbol:  ∑ (2)
Current score:  -26.92681499999999 Worst score:  -67.70077199999999
gold score -41.34324120000001
Working on symbol:  B (3)
Current score:  -38.52903960000001 Worst score:  -71.79167000000001
gold score -55.25409240000002
Working on symbol:  P (4)
Current score:  -65.02454639999998 Worst score:  -67.1545066
gold score -65.89092420000003
Working on symbol:  º (5)
Current score:  -76.06826050000004 Worst score:  -80.346168
gold score -79.50212730000001
Working on symbol:  ∫ (6)
Current score:  -87.55035280000003 Worst score:  -92.19305749999994
gold score -92.02616899999997
Working on symbol:  A (7)
Current score:  -101.87003810000007 Worst score:  -102.40576109999998
gold score -102.23625073999996
Working on symbol:  / (8)
Current score:  -109.82889010000001 Worst score:  -110.32650649999998
gold s

In [89]:
# Decode the cipher with the best-scoring mapping, then report the LM score,
# the length and the symbol error rate against the reference.
mapping = mappings[0][0]
decipher_text = ''.join(mapping[char] for char in cipher_desc['content'])
print(decipher_text)
print('score', lm.score_seq(decipher_text))
print(len(decipher_text))

print(symbol_error_rate(decipher_text, 'data/_ref_Zodiac_408.txt'))

ilikekillingpeoplerecauseitissomuchfunitiemorefunthankillingwildgameintheforsestrecausemanisthemoatdangertueanamalofalltokillsomethinggivesmethemoatthrillingexpesenceitisevenretterthangettingyoursocksoffwithagirltherestpartofitiathaewhenidieiwillreserorninparadiceendalltheihavekilledwillrecomemyslavesiwillnotgiveyoumynamerecauseyouwilltrytosloidownosetopmycollectingofslavesformyafterlifeereorietemethhpiti
score -392.92110906712
408
0.05392156862745098


## Multi-processing

The following code uses joblib to spread the computation over several processes. However, the running time does not drop much (experimenting with a beam size of 10000). When I monitored the CPU usage, I found that each iteration only took around 30s in the parallelized section. Most of the time was spent in the serial section, such as transferring data between processes and sorting the Ht list. Although the computed scores are consistent with the scores from the single-core computation, I still stick to the single-core implementation since the memory usage is huge with multi-processing: it went up to 100% with only four cores and a beam size of 10000 on my machine with 16G RAM.

In [None]:
def work(Hs, Ve, f, cipher, topn):
    '''
    extends every hypothesis of one beam chunk with f -> e for each e in Ve and
    returns the topn best (mapping, score) pairs of the chunk, best score first
    Hs -- iterable of (mapping dictionary, score) pairs
    Ve -- list of plaintext characters
    f -- single-character string, the cipher symbol being fixed
    cipher -- list of single character string, the cipher text
    topn -- int, number of hypotheses to return
    Reads the module-level counts_dict (maximum number of cipher symbols per
    plaintext character) and the module-level score function.
    '''
    ret = []
    for phi, previous_score in Hs:
        for e in Ve:
            # Number of cipher symbols mapped to e once f -> e is added,
            # checked BEFORE copying so rejected candidates cost no copy.
            counts = 1 + sum(1 for k, v in phi.items() if v == e and k != f)
            if counts <= counts_dict[e]:
                # Keys and values are immutable strings: a shallow copy is enough.
                phi_prime = phi.copy()
                phi_prime[f] = e
                ret.append((phi_prime, score(cipher, phi, f, e, previous_score)))
    return sorted(ret, key=lambda x:x[1], reverse=True)[:topn]

def grouper(n, iterable):
    '''
    yields successive tuples of at most n items taken from iterable;
    stops as soon as a chunk comes back empty
    n -- int, chunk size
    iterable -- any iterable
    '''
    source = iter(iterable)
    # Two-argument iter(): call the lambda until it returns the empty tuple.
    yield from iter(lambda: tuple(itertools.islice(source, n)), ())

def beam_search_mp(cipher, ext_order, ext_limits=1, topn=1):
    '''
    multi-process variant of the beam search: finds the mappings between cipher
    char and plaintext char, returns the list of (mapping dictionary, score)
    hypotheses left in the beam, best score first
    cipher -- list of single character string, the cipher text
    ext_order -- list, cipher symbols in the order in which they are fixed
    ext_limits -- int, kept for backward compatibility but not used: work()
                  takes the limits from the module-level counts_dict
    topn -- int, defines the number of dictionaries we want to keep while pruning
    '''
    print('Number of unique symbols in cipher:', len(ext_order))
    # mapping relationships already found with score
    Hs = [(defaultdict(dict), 0)]
    # initialize the cardinality (number of unique cipher text)
    cardinality = 0
    # list of plaintext characters
    Ve = [chr(i) for i in range(97, 123, 1)]

    N_JOBS = 4
    p = Parallel(n_jobs=N_JOBS, verbose=2, )

    while cardinality < len(ext_order):
        f = ext_order[cardinality]
        print('Working on symbol: ', f, f'({cardinality+1})')
        # One chunk of the beam per worker.
        jobs = list(grouper(math.ceil(len(Hs) / N_JOBS), Hs))

        print(f"Num of jobs {len(jobs)} w {len(jobs[0])}")
        Hts = p(delayed(work)(job, Ve, f, cipher, topn) for job in jobs)

        # Merge the per-chunk results and prune the histogram. The merged list
        # is referenced nowhere else, so it becomes the new beam directly;
        # deep-copying it only added time and memory.
        Hs = sorted((hyp for chunk in Hts for hyp in chunk), key=lambda x:x[1], reverse=True)[:topn]
        cardinality += 1
        print('Current score: ', Hs[0][1])
    return sorted(Hs, key=lambda x:x[1], reverse=True)

In [None]:
%%time
# ext_order already is the best symbol order (ext_orders[0][0]); indexing it
# again with [0][0] would pass a single character instead of the list.
mappings = beam_search_mp(cipher_desc['content'], ext_order, 7, 10000)

In [None]:
# Decode the cipher with the best-scoring mapping, then report the LM score,
# the length and the symbol error rate against the reference.
mapping = mappings[0][0]
decipher_text = ''.join(mapping[char] for char in cipher_desc['content'])
print(decipher_text)
print('score', lm.score_seq(decipher_text))
print(len(decipher_text))

print(symbol_error_rate(decipher_text, 'data/_ref_Zodiac_408.txt'))

## Gold

In [13]:
# Print the reference plaintext together with its LM score and its length.
with open('data/_ref_Zodiac_408.txt', 'r') as fh:
    ground_truth = fh.read()
print(ground_truth)
print('score', lm.score_seq(ground_truth))
print(len(ground_truth))

ilikekillingpeoplebecauseitissomuchfunitismorefunthankillingwildgameintheforrestbecausemanisthemostdangeroueanamalofalltokillsomethinggivesmethemostthrillingexperenceitisevenbetterthangettingyourrocksoffwithagirlthebestpartofitisthaewhenidieiwillbereborninparadicesndalltheihavekilledwillbecomemyslavesiwillnotgiveyoumynamebecauseyouwilltrytosloidownorstopmycollectiogofslavesformyafterlifeebeorietemethhpiti
score -359.0030393831197
408


In [14]:
# Gold mapping cipher symbol -> plaintext character: each symbol takes the
# reference character found at its first occurrence in the cipher text.
gold = read_gold('data/_ref_Zodiac_408.txt')
gold_dict = {}
for index, cipher_char in enumerate(cipher_desc['content']):
    if cipher_char not in gold_dict:
        gold_dict[cipher_char] = gold[index]
print(gold_dict)

{'º': 'i', '∫': 'l', 'P': 'i', '/': 'k', 'Z': 'e', 'u': 'i', 'B': 'l', 'À': 'i', 'O': 'n', 'R': 'g', '•': 'p', '–': 'e', 'X': 'o', 'W': 'e', 'V': 'b', '+': 'e', '≈': 'c', 'G': 'a', 'y': 'u', 'F': 's', '∞': 'e', 'H': 't', 'π': 's', 'K': 's', 'Ç': 'o', '—': 'm', 'M': 'h', 'J': 'f', '^': 'n', 'I': 't', 'Ω': 's', 'T': 'o', '‘': 'r', 'N': 'e', 'Q': 'f', 'D': 'n', 'µ': 't', '£': 'h', 'S': 'a', '¢': 'n', '∑': 'l', 'A': 'w', '∆': 'd', 'Ã': 'a', 'E': 'e', 'L': 't', 'ƒ': 'o', '“': 'r', '\\': 'r', 'æ': 'a', '§': 'd', '√': 'v', 'j': 'x', '∏': 'y'}


In [68]:
# Replay the extension order with the gold mapping: after each step print the
# LM score and the partially deciphered text ('_' marks undeciphered symbols).
for index in range(1, len(ext_order)+1, 1):
    found_symbols = ext_order[:index]
    phi_temp = {key: value for key, value in gold_dict.items() if key in found_symbols}
    partial_text = ''.join(phi_temp.get(cipher_char, '_') for cipher_char in cipher_desc['content'])
    bit_string = ''.join('o' if cipher_char in phi_temp else '.' for cipher_char in cipher_desc['content'])
    print(f'Iteration {index}, symbol {ext_order[index-1]}, score: {lm.score_bitstring(partial_text, bit_string)}')
    print(partial_text)

Iteration 1, symbol —, score: -27.947701999999992
_______________________________m__________m_______________________m____________________m_______m_______________m_______________m___________m____m___________________________________________________________________________________________________________________________________________________m_m______________________m___m__________________________________m______________________m___________________m________
Iteration 2, symbol ∑, score: -41.34324120000001
_______________________________m__________m____________l__________m____________________m_______m_______________m_____l_____l___m___________m____m_______l___________________________________________________________________________________________l_________________________________________l_____m_m__l___________________m___m______________l___________________m___ll_________________m___________________m________
Iteration 3, symbol B, score: -55.25409240000002
_______l________l______________m__

Iteration 50, symbol \, score: -348.70242448012
ilikekillingpeoplebecauseitissomuchfuniti_morefuntha_killingwildgameintheforrestbecausemanisthemoat_a_gertueanamalofalltokillsomethinggivesmethemoatthrillinge_pere_ceitisevenbetterthangettingyourrocksoffwithagirlthebestpartofitiathaewhe_i_ieiwillbereborninparadice_n_alltheihavekilledwillbecomemyslavesiwillnotgiveyoumynamebecauseyouwilltrytosloi_ownor_topmycollectingofslavesformyafterlifeebeorietemethhpiti
Iteration 51, symbol ¢, score: -349.3454667401199
ilikekillingpeoplebecauseitissomuchfuniti_morefunthankillingwildgameintheforrestbecausemanisthemoat_angertueanamalofalltokillsomethinggivesmethemoatthrillinge_perenceitisevenbetterthangettingyourrocksoffwithagirlthebestpartofitiathaewheni_ieiwillbereborninparadice_n_alltheihavekilledwillbecomemyslavesiwillnotgiveyoumynamebecauseyouwilltrytosloi_ownor_topmycollectingofslavesformyafterlifeebeorietemethhpiti
Iteration 52, symbol §, score: -355.7758263201198
ilikekillingpeoplebecauseitissomuc

In [72]:
# For every gold pair, show the relative frequency (in percent) of the cipher
# symbol in the cipher text next to that of its plaintext character in the
# plaintext corpus.
for key,value in gold_dict.items():
    freq_cipher = cipher_desc['relative_freq'][key]
    freq_plain = plaintxt_desc['relative_freq'][value]
    print(f'{key} {value}; Cipher char frequence: {freq_cipher}; plaintext char frequency: {freq_plain}')

º i; Cipher char frequence: 3.431372549019608; plaintext char frequency: 7.395771339569833
∫ l; Cipher char frequence: 2.696078431372549; plaintext char frequency: 4.12837076781324
P i; Cipher char frequence: 2.696078431372549; plaintext char frequency: 7.395771339569833
/ k; Cipher char frequence: 1.4705882352941175; plaintext char frequency: 0.6846719094465245
Z e; Cipher char frequence: 1.9607843137254901; plaintext char frequency: 12.140870772361387
u i; Cipher char frequence: 2.450980392156863; plaintext char frequency: 7.395771339569833
B l; Cipher char frequence: 2.941176470588235; plaintext char frequency: 4.12837076781324
À i; Cipher char frequence: 2.2058823529411766; plaintext char frequency: 7.395771339569833
O n; Cipher char frequence: 1.715686274509804; plaintext char frequency: 7.342333650329038
R g; Cipher char frequence: 2.941176470588235; plaintext char frequency: 2.0428939941058446
• p; Cipher char frequence: 1.715686274509804; plaintext char frequency: 2.02330664319

In [None]:
# Per plaintext letter: the cipher symbols the gold mapping assigns to it, how
# many there are, and the summed cipher frequency next to the corpus frequency.
Ve = [chr(i) for i in range(97, 123, 1)]
for e in Ve:
    keys = [k for k, v in gold_dict.items() if v == e]
    freq_cipher = sum(cipher_desc['relative_freq'][key] for key in keys)
    freq_plain = plaintxt_desc['relative_freq'][e]
    print(f'{e}: {keys}, {len(keys)}, plain freq {freq_plain}, cipher freq {freq_cipher}')

In [20]:
# Maximum number of cipher symbols that may be mapped to each plaintext letter;
# read as a global by beam_search and work when they filter candidate mappings.
counts_dict = {'a':4, 'b':2, 'c':2, 'd':2, 'e':8, 'f':2, 'g':2,\
          'h':2, 'i':4, 'j':0, 'k':2, 'l':3, 'm':2, 'n':4,\
          'o':4, 'p':2, 'q':0, 'r':3, 's':4, 't':4, 'u':2,\
          'v':2, 'w':2, 'x':1, 'y':1, 'z':0}

Notice that the default solution provides a very bad decipherment. Your job is to make it better!

## Grading

Ignore the following cells. They are for grading against the reference decipherment. Based on the clues provided in the decipherment homework description, you can easily find a reasonable reference text online for this cipher text.

In [None]:
"""
ATTENTION!
For grading purposes only. Don't bundle with the assignment. 
Make sure '_ref.txt' is removed from the 'data' directory before publishing.
"""

def read_gold(gold_file):
    """Read the reference decipherment and return it as a list of characters.

    gold_file -- str, path of the reference plaintext file

    Leading and trailing whitespace is stripped before the text is split into
    single characters.
    """
    # The with-block closes the file, so no explicit close() is needed; the
    # encoding is fixed to UTF-8 like in read_file instead of the platform default.
    with open(gold_file, encoding='utf8') as f:
        return list(f.read().strip())

def symbol_error_rate(dec, _gold):
    """Return the fraction of reference symbols that ``dec`` gets wrong.

    dec -- sequence of characters, the proposed decipherment
    _gold -- str, path of the reference file (loaded with read_gold)

    Symbols are compared position by position. When the two lengths differ
    nothing is counted as correct, so the returned error is 1.0.
    """
    gold = read_gold(_gold)
    if len(gold) == len(dec):
        correct = sum(1 for d, g in zip(dec, gold) if d == g)
    else:
        correct = 0
    return (len(gold) - correct) / len(gold)

In [None]:
# gold decipherment
# NOTE(review): `decipherment` is not assigned in any cell of this notebook
# (the decoding cells store their result in `decipher_text`), and the path
# differs from the 'data/_ref_Zodiac_408.txt' used above -- confirm both
# before running this cell.
gold_file = "data/_ref.txt"
ser = symbol_error_rate(decipherment, gold_file)
print('Error: ', ser*100, 'Accuracy: ', (1-ser)*100)