# Homework: Decipherment

In [1]:
%load_ext autoreload
%autoreload 2
from collections import defaultdict, Counter
import collections
import pprint
import math
import bz2
from ngram import *
import sys, string, os
import copy
import pickle
#from joblib import Parallel, delayed
import itertools
from multiprocessing import Process,Pool, cpu_count
import datetime, time, random
pp = pprint.PrettyPrinter(width=45, compact=True)

First let us read in the cipher text from the `data` directory:

In [2]:
def read_file(filename):
    """Return the whole content of a text file, decoded as UTF-8.

    filename -- path of the file to read; names ending in ".bz2" are
                decompressed on the fly with bz2, anything else is read as
                plain text.
    """
    # Pick the opener once instead of duplicating the read logic per branch.
    opener = bz2.open if str(filename).endswith(".bz2") else open
    # The `with` block closes the handle on exit; no explicit close() needed.
    with opener(filename, 'rt', encoding='utf8') as f:
        return f.read()

# Cipher text to decipher, loaded as a single string (line breaks included).
cipher = read_file("data/cipher.txt")
# print(cipher)

For the default solution we need to compute statistics such as the length, the set of symbols/letters,
the number of unique symbols, and the absolute and relative frequencies of a given file. This is done in the function `get_statistics` below.

While using `get_statistics`, make sure that `cipher=True` is set when the input is a ciphertext.

In [3]:
def get_statistics(content, cipher=True):
    """Summarise the symbols of a text, ignoring newlines and spaces.

    content -- string (or iterable of single-character strings) to analyse
    cipher -- when True the filtered symbol sequence is also returned under
              the 'content' key

    Returns a dict with 'length', 'vocab', 'vocab_length', 'frequencies'
    (a Counter) and 'relative_freq' (percentages), plus 'content' for ciphers.
    """
    kept = [ch for ch in content if ch not in ('\n', ' ')]
    total = len(kept)
    alphabet = set(kept)
    counts = collections.Counter(kept)
    # relative frequency expressed as a percentage of all kept symbols
    percentages = {sym: (n / total) * 100 for sym, n in counts.items()}

    summary = {
        'length': total,
        'vocab': list(alphabet),
        'vocab_length': len(alphabet),
        'frequencies': counts,
        'relative_freq': percentages,
    }
    if cipher:
        # ciphers additionally carry the symbol sequence itself, as first key
        summary = {'content': kept, **summary}
    return summary

In [4]:
def read_gold(gold_file):
    """Read the reference plaintext and return it as a list of characters.

    gold_file -- path of the reference file; it is decoded as UTF-8 (the same
                 encoding read_file uses) and surrounding whitespace is
                 stripped before splitting into characters.
    """
    # The `with` block closes the file; the old trailing f.close() was a no-op.
    with open(gold_file, encoding='utf8') as f:
        return list(f.read().strip())

def symbol_error_rate(dec, _gold):
    """Fraction of reference symbols that the deciphered text gets wrong.

    dec -- deciphered text (sequence of characters)
    _gold -- path of the reference file, loaded with read_gold

    When the two sequences differ in length nothing is counted as correct,
    so the returned rate is 1.0.
    """
    reference = read_gold(_gold)
    total = len(reference)
    matches = 0
    if total == len(dec):
        matches = sum(1 for hyp, ref in zip(dec, reference) if hyp == ref)
    return (total - matches) / total

In [5]:
# Statistics of the cipher text; cipher=True keeps the symbol sequence under 'content'.
cipher_desc = get_statistics(cipher, cipher=True)
# Reference plaintext corpus; only its frequency statistics are kept (cipher=False).
plaintxt = read_file("data/default.wiki.txt.bz2")
plaintxt_desc = get_statistics(plaintxt, cipher=False)
#pp.pprint(cipher_desc)

## Load the 6-gram model

In [6]:
%%time
# Sample English text, only referenced by the commented-out lm.score_seq check in the next cell.
sequence = 'In a few cases, a multilingual artifact has been necessary to facilitate decipherment, the Rosetta Stone being the classic example. Statistical techniques provide another pathway to decipherment, as does the analysis of modern languages derived from ancient languages in which undeciphered texts are written. Archaeological and historical information is helpful in verifying hypothesized decipherments.'

# Character-level 6-gram language model; LM comes from the star import of `ngram`.
# lm = LM("data/6-gram-wiki-char.lm.bz2", n=6, verbose=True)
lm = LM("data/6-gram-wiki-char.lm.bz2", n=6, verbose=False)

Reading language model from data/6-gram-wiki-char.lm.bz2...


Wall time: 14.6 s


Done.


In [7]:
# Optional sanity checks of the language model API (kept disabled):
# print(sequence)
# lm_logprob = lm.score_seq(sequence)
# print("TOTAL LM LOGPROB: {}".format(lm_logprob), file=sys.stderr)
# print("TOTAL LM LOGPROB: {}".format(lm.score_seq('this is the text.')), file=sys.stderr)
# print(lm.get_bitstring_spans('..oo...ooo..'))
# print(lm.score_bitstring('thisisatest', 'ooooooooooo'))
# Score of a bitstring with no fixed position ('.' everywhere); the printed value
# is the constant that score() adds on the first extension.
print(lm.score_bitstring('thisisatest', '...........'))

-2.545382


## Implementation for Reference 3 to find the optimal extension order

This section implements the extension-order search from the reference 'Beam Search for Solving Substitution Ciphers'. The goal is to find the best extension order, i.e. the order in which the cipher symbols are fixed. As the paper mentions, it is important to find a good set of weights for the n-gram orders. I chose the weights \[1,1,1,1,2,3\] suggested by Anoop in a discussion post. I also tried several other sets of weights; with a beam size of 10000 the choice of weights did not influence the result much.

In [8]:
def find_sharp_n(cipher_desc, symbols_found, n_order):
    '''
    finds the #n for order n_order, i.e. the number of windows of n_order
    consecutive cipher positions whose symbols are all in symbols_found
    cipher_desc -- cipher statistics (only cipher_desc['content'] is read)
    symbols_found -- list of single character string,
                     specifies the list of symbols have been placed in the extension order
    n_order -- int, specifies the order of n-gram

    returns an int; 0 when n_order is not positive or exceeds the text length
    '''
    if n_order <= 0:
        # no window of non-positive length is ever counted
        return 0
    # set membership is O(1); the list lookup was repeated for every position
    known = set(symbols_found)
    sharp_n = 0
    run = 0  # length of the current streak of consecutive known symbols
    for symbol in cipher_desc['content']:
        if symbol in known:
            run += 1
            # a streak of length >= n_order means the window ending here is fully known
            if run >= n_order:
                sharp_n += 1
        else:
            run = 0
    return sharp_n

This is used to test the find_sharp_n function above. The results should be correct.

In [9]:
# test_str = 'ASCREAMINGCOMESACROSSTHESKY'
# print(find_sharp_n(get_statistics(test_str), ['A','G','H','K','Y'],6))
# find_sharp_n(get_statistics(test_str), ['S','C','E','M','O'],2)

Use a beam search to find the optimal extension order. The code below is pretty similar to the beam_search function. Most of the code is copied from it. For simplicity, the variable name might not quite make sense.

In [39]:
def find_ext_order(cipher_desc, topn=100, weights=(1,1,1,1,2,3)):
    '''
    finds the best order of deciphering cipher symbols (find best extension order)
    cipher_desc -- cipher statistics ('content', 'frequencies' and 'vocab_length' are read)
    topn -- int, number of best partial orders we want to keep during iteration
    weights -- sequence of numbers, weight for #n, n varies from 1 to len(weights)

    returns a list of (order, score) tuples sorted by score DESC, where order is a
    list of cipher symbols and score is the weighted sum of the #n counts
    '''
    # list of cipher symbols, most frequent first
    Ve = sorted(cipher_desc['frequencies'], key=cipher_desc['frequencies'].get, reverse=True)
    # partial orders already found, with score
    Hs = [([], 0)]
    # number of symbols placed so far in every hypothesis of Hs
    cardinality = 0
    # if no weight is specified for unigram, use the most frequent symbol as the starting point.
    # The seed REPLACES the empty hypothesis: keeping both would let orders that are one
    # symbol short survive into the final result.
    if weights[0] == 0 and Ve:
        Hs = [([Ve[0]], 0)]
        cardinality = 1
    while cardinality < cipher_desc['vocab_length']:
        # hypotheses extended by one symbol, with score
        Ht = []
        for phi, _previous_score in Hs:
            for e in Ve:
                if e in phi:
                    continue
                # symbols are immutable strings, so a new list is a sufficient copy
                phi_prime = phi + [e]
                this_score = sum(w * find_sharp_n(cipher_desc, phi_prime, n)
                                 for n, w in enumerate(weights, start=1))
                Ht.append((phi_prime, this_score))
        # prune the histogram (sorted() is stable, so ties keep generation order)
        Hs = sorted(Ht, key=lambda x:x[1], reverse=True)[:topn]
        cardinality += 1
    return sorted(Hs, key=lambda x:x[1], reverse=True)

In [40]:
%%time
# Beam of the 100 best extension orders for the cipher, best score first.
ext_orders = find_ext_order(cipher_desc, topn=100, weights=[1,1,1,1,2,3])

Wall time: 3min 17s


In [44]:
# Keep, for every distinct starting symbol, the best-scoring extension order
# (ext_orders is already sorted best first).
seen_first_symbols = []
candidate_orders = []
for item in ext_orders:
    first_symbol = item[0][0]
    if first_symbol in seen_first_symbols:
        continue
    seen_first_symbols.append(first_symbol)
    candidate_orders.append(item[0])

In [45]:
candidate_orders

[['B',
  '∑',
  'º',
  '∫',
  'P',
  'A',
  'O',
  'R',
  'u',
  'À',
  '/',
  'Z',
  '∆',
  'V',
  '–',
  'W',
  '√',
  'I',
  '•',
  'X',
  '≈',
  '—',
  'µ',
  '∞',
  'E',
  'π',
  'H',
  'F',
  '+',
  '“',
  'Q',
  'G',
  'y',
  'æ',
  'K',
  'D',
  '£',
  'M',
  'Ç',
  'J',
  '∏',
  'T',
  'ƒ',
  '\\',
  'L',
  '‘',
  '^',
  'Ã',
  'S',
  'N',
  '¢',
  '§',
  'Ω',
  'j'],
 ['∑',
  'B',
  'º',
  '∫',
  'P',
  'A',
  'O',
  'R',
  'u',
  'À',
  '/',
  'Z',
  '∆',
  'V',
  '–',
  'W',
  '√',
  'I',
  '•',
  'X',
  '≈',
  '—',
  'µ',
  '∞',
  'E',
  'π',
  'H',
  'F',
  '+',
  '“',
  'Q',
  'G',
  'y',
  'æ',
  'K',
  'D',
  '£',
  'M',
  'Ç',
  'J',
  '∏',
  'T',
  'ƒ',
  '\\',
  'L',
  '‘',
  '^',
  'Ã',
  'S',
  'N',
  '¢',
  '§',
  'Ω',
  'j'],
 ['º',
  '∑',
  'B',
  '∫',
  'P',
  'A',
  'O',
  'R',
  'u',
  'À',
  '/',
  'Z',
  '∆',
  'V',
  '–',
  'W',
  '√',
  'I',
  '•',
  'X',
  '≈',
  '—',
  'µ',
  '∞',
  'E',
  'π',
  'H',
  'F',
  '+',
  '“',
  'Q',
  'G',
  'y',
  'æ',
  

In [37]:
# Hand-pinned extension order used by the "Gold" analysis below instead of the
# top result of find_ext_order (alternative kept for reference):
#ext_order = ext_orders[0][0]
# NOTE(review): presumably produced by an earlier run with different weights -- confirm.
ext_order = ['P', 'B', '∑', 'º', '∫', '/', 'Z', 'u', 'A',
 '∆', 'O', 'R', 'À', 'V', '–', '•', 'X', 'W',
 '§', 'π', '≈', '—', '+', 'D', 'G', '√', 'E',
 'K', 'y', 'æ', 'Ã', 'I', 'H', 'F', '∞', '“',
 'µ', 'Q', '£', 'M', 'Ç', 'J', '^', 'L', '∏',
 'ƒ', 'S', 'T', '‘', 'N', '¢', '\\', 'Ω',
 'j']
pp.pprint(ext_order)

['P', 'B', '∑', 'º', '∫', '/', 'Z', 'u', 'A',
 '∆', 'O', 'R', 'À', 'V', '–', '•', 'X', 'W',
 '§', 'π', '≈', '—', '+', 'D', 'G', '√', 'E',
 'K', 'y', 'æ', 'Ã', 'I', 'H', 'F', '∞', '“',
 'µ', 'Q', '£', 'M', 'Ç', 'J', '^', 'L', '∏',
 'ƒ', 'S', 'T', '‘', 'N', '¢', '\\', 'Ω',
 'j']


In [13]:
# with open('ext_order.pkl', 'wb') as fh:
#     pickle.dump(ext_orders, fh)

In [14]:
# with open('ext_order.pkl', 'rb') as fh:
#     ext_order = pickle.load(fh, encoding='utf8')[0][0]

## Gold

In this section, we look at the ground-truth mapping of the Zodiac cipher. The goal is to get a sense of how good or bad our extension order is. If the correct answer is pruned at a very early stage, it is very unlikely that the text can be fully deciphered, since every later score is computed from a wrong partially deciphered text.

We tried several sets of weights to get different extension orders. An interesting finding is that the score of the correct mapping is always quite bad at the beginning. This significantly increases the possibility of pruning the correct mapping. When we tried different extension orders, what we were looking for was an order in which the score of the correct answer is reasonably good during the first 5 or 6 iterations.

In [15]:
# Reference plaintext: print it together with its language-model score and its length.
with open('data/_ref_Zodiac_408.txt', 'r') as fh:
    ground_truth = fh.read()
print(ground_truth)
print('score', lm.score_seq(ground_truth))
print(len(ground_truth))

ilikekillingpeoplebecauseitissomuchfunitismorefunthankillingwildgameintheforrestbecausemanisthemostdangeroueanamalofalltokillsomethinggivesmethemostthrillingexperenceitisevenbetterthangettingyourrocksoffwithagirlthebestpartofitisthaewhenidieiwillbereborninparadicesndalltheihavekilledwillbecomemyslavesiwillnotgiveyoumynamebecauseyouwilltrytosloidownorstopmycollectiogofslavesformyafterlifeebeorietemethhpiti
score -359.0030393831197
408


In [16]:
# Build the reference mapping cipher symbol -> plaintext letter by aligning the
# cipher with the gold text position by position.
gold = read_gold('data/_ref_Zodiac_408.txt')
gold_dict = {}
for index, cipher_char in enumerate(cipher_desc['content']):
    # only the first plaintext letter seen for a cipher symbol is kept
    if cipher_char in gold_dict:
        continue
    gold_dict[cipher_char] = gold[index]

In [17]:
# For every prefix of the extension order, decipher only the symbols fixed so far
# with the gold mapping and print the language-model score of that partial text.
for index in range(1, len(ext_order)+1, 1):
    found_symbols = ext_order[:index]
    # gold mapping restricted to the symbols fixed so far
    phi_temp = dict()
    for key,value in gold_dict.items():
        if key in found_symbols:
            phi_temp[key] = value
    # partial_text holds the plaintext ('_' where unknown); bit_string marks
    # fixed positions with 'o' and unknown positions with '.'
    partial_text = ''
    bit_string = ''
    for cipher_char in cipher_desc['content']:
        if cipher_char in phi_temp.keys():
            partial_text += phi_temp[cipher_char]
            bit_string += 'o'
        else:
            partial_text += '_'
            bit_string += '.'
    print(f'Iteration {index}, symbol {ext_order[index-1]}, score: {lm.score_bitstring(partial_text, bit_string)}')
    print(partial_text)

Iteration 1, symbol P, score: -15.017390999999996
__i________________________i_____________________________i________________________________________________________________i_______________________________i_________________________________________________i________________________________i________________i______________________________i_______________________________________________________________________________i_____________________i____________________
Iteration 2, symbol B, score: -30.9290574
__i____l________l__________i____________________________li_______________________________________________________l____l___i_l____________________________li_________________________________________________i________________________________i_______l________i______________________________i_l_________________l_____________________________________l_____________________i____________________li____________________
Iteration 3, symbol ∑, score: -40.48860420000001
__i____l________l__________i_____________

In [18]:
# for key,value in gold_dict.items():
#     freq_cipher = cipher_desc['relative_freq'][key]
#     freq_plain = plaintxt_desc['relative_freq'][value]
#     print(f'{key} {value}; Cipher char frequence: {freq_cipher}; plaintext char frequency: {freq_plain}')

In [19]:
# Ve = [chr(i) for i in range(97, 123, 1)]
# for e in Ve:
#     keys = [k for k, v in gold_dict.items() if v == e]
#     freq_cipher = 0
#     for key in keys:
#         freq_cipher += cipher_desc['relative_freq'][key]
#     freq_plain = plaintxt_desc['relative_freq'][e]
#     print(f'{e}: {keys}, {len(keys)}, plain freq {freq_plain}, cipher freq {freq_cipher}')

## Baseline with better extension order

The first change I made to the baseline was to rewrite the score function to optimize the running speed. Instead of scoring the whole bitstring in each iteration, the new function starts from the previous score and only rescores the newly fixed symbol/plaintext-character pair and the previously fixed plaintext characters it influences. For instance, 'oooo...o' -> 'oooo..xo'. The new score can be calculated by adding the unigram score of 'x' to the previous score, subtracting the unigram score of the 'o' following 'x' and the bigram score of `</s>` from the previous score, and adding the bigram score of the 'o' following 'x' and the trigram score of `</s>` to the previous score. With this approach, the running time improved from 1 hour to 20 minutes with a beam size of 10000 on my machine with an i7 7700k CPU. The computed score is almost the same as the score computed with the `score_bitstring` function (the difference is within 0.0000001). Another approach we tried to speed up the whole process is multiprocessing, which is discussed later in this notebook.

In [20]:
def score(cipher, phi, new_f, new_e, previous_score):
    '''
    scores the phi_prime (phi extended with new_f -> new_e) based on the previous score,
    returns a float. Only positions touched by the newly fixed symbol are rescored.
    Reads the module-level language model `lm` (begin, score, end, table, history).
    cipher -- list of single character string
    phi -- dictionary, old mapping cipher symbol f -> plaintext character e (without new_f)
    new_f -- single-character string, extended symbol
    new_e -- single-character string, plaintext character assigned to new_f
    previous_score -- float, old score for phi
    '''
    mapping = phi
    new_score = previous_score
    # for the first iteration, the previous score should be -2.545382 instead of 0
    # this is because the score of an empty string is not 0 while scoring with bitstring
    # the value can be obtained by calling lm.score_bitstring('thisisatest', '...........')
    if len(phi)==0:
        new_score += -2.545382
    # lm_state follows the text WITH the new symbol fixed,
    # old_lm_state follows the text as it was scored before (new symbol unknown)
    lm_state = lm.begin()
    old_lm_state = lm.begin()
    # this Flag is used to track if a newly-fixed character affects the previously-fixed character
    # (it counts how many following fixed characters still have to be rescored)
    triggerChangeFlag = 0
    for i in range(len(cipher)):
        char = cipher[i]
        # if this is a previously fixed character and not influenced by the newly-fixed character
        # we only need to track the lm_state and old lm_state, no need to compute the score
        if (char in mapping.keys()) and (triggerChangeFlag==0):
            token = mapping[char]
            ngram = lm_state + (token,)
            # find the longest suffix of the n-gram known to the model (backoff)
            while len(ngram)> 0:
                if ngram in lm.table:
                    lm_state = ngram[-lm.history:]
                    break
                else: #backoff
                    ngram = ngram[1:]
            if len(ngram)==0:
                lm_state = ()
            old_lm_state = lm_state
        # if this is a previously fixed character and influenced by the newly-fixed character
        # subtract the old score and add the new score to the previous score.
        elif (char in mapping.keys()) and (triggerChangeFlag>0):
            token = mapping[char]
            old_lm_state, old_logprob = lm.score(old_lm_state, token)
            new_score -= old_logprob
            lm_state, logprob = lm.score(lm_state, token)
            new_score += logprob
            triggerChangeFlag -= 1
        # if this is a newly-fixed character, simply add the new score to the previous score
        elif char == new_f:
            (lm_state, logprob) = lm.score(lm_state, new_e)
            new_score += logprob
            # the next 5 fixed characters are rescored
            # NOTE(review): presumably 5 == n-1 for the 6-gram model (lm.history) -- confirm
            triggerChangeFlag = 5
            # in the old text this position was unknown, so the old context restarts here
            old_lm_state = ()
        # if this is a unknown character, there is no influence on the previous score
        else:
            lm_state = ()
            old_lm_state = ()
            triggerChangeFlag = 0
    # treat the end tag '</s>' as previously fixed character
    if triggerChangeFlag:
        new_score -= lm.end(old_lm_state)
        new_score += lm.end(lm_state)
    return new_score

The beam_search we implemented is based on the pseudo code mentioned in the assignment. The main change I made was to use customized ext_limit for each plaintext character instead of a general value. The details can be found in the following notebook.

In [46]:
def beam_search(cipher, ext_order, score_func, ext_limits, topn=1):
    '''
    finds the mappings between cipher char and plaintext char, returns a list of
    (mapping, score) tuples sorted by score DESC; the list is empty when every
    extension of the beam violates ext_limits
    cipher -- list of single character string, the cipher text
    ext_order -- list, the cipher symbols in the order in which they are fixed
    score_func -- callable (cipher, phi, f, e, previous_score) -> float, returns the score
                  of phi extended with f -> e; it receives the mapping BEFORE the extension
    ext_limits -- dictionary plaintext char -> int, maximum number of cipher symbols
                  that can be mapped to that plaintext char
    topn -- int, defines the number of dictionaries we want to keep while pruning
    '''
    print('Number of unique symbols in cipher:', len(ext_order))
    # mapping relationships already found with score
    Hs = [(defaultdict(dict), 0)]
    # number of cipher symbols fixed so far
    cardinality = 0
    # list of plaintext characters
    Ve = [chr(i) for i in range(97, 123, 1)]
    while cardinality < len(ext_order):
        f = ext_order[cardinality]
        print('Working on symbol: ', f, f'({cardinality+1})')
        # hypothesis mapping relationships with score
        Ht = []
        for phi, previous_score in Hs:
            # how many OTHER cipher symbols already use each plaintext char; computed once
            # per hypothesis instead of rescanning the mapping for all 26 candidates
            used = Counter(v for k, v in phi.items() if k != f)
            for e in Ve:
                # check the limit first so rejected candidates are never copied
                if used[e] + 1 <= ext_limits[e]:
                    # keys and values are immutable strings: a shallow copy is enough
                    phi_prime = phi.copy()
                    phi_prime[f] = e
                    Ht.append((phi_prime, score_func(cipher, phi, f, e, previous_score)))
        # prune the histogram; every kept mapping is already a fresh copy, so the
        # beam can be handed over without another deep copy
        Hs = sorted(Ht, key=lambda x:x[1], reverse=True)[:topn]
        cardinality += 1
    return sorted(Hs, key=lambda x:x[1], reverse=True)

## Test case

Before deciphering the Zodiac Killer cipher, we test the algorithm on a simple test case.

The first test case is a simple one to one mapping cipher. <br>
Plaintext: `defendtheeastwallofthecastle` <br>
Cipher: `giuifgceiiprctpnnduceiqprcni` <br>

In [22]:
# One-to-one substitution test: every plaintext letter may be used by at most one symbol.
one_to_one_cipher = 'giuifgceiiprctpnnduceiqprcni'
one_to_one_cipher_desc = get_statistics(one_to_one_cipher, cipher=True)
one_to_one_ext_order = find_ext_order(one_to_one_cipher_desc)[0][0]
one_to_one_ext_limits = {chr(code): 1 for code in range(97, 123, 1)}
one_to_one_mappings = beam_search(
    one_to_one_cipher_desc['content'],
    one_to_one_ext_order,
    score,
    one_to_one_ext_limits,
    50,
)
# best hypothesis = first entry of the score-sorted beam
one_to_one_mapping = one_to_one_mappings[0][0]
one_to_one_decipher_text = ''.join(
    one_to_one_mapping[char] for char in one_to_one_cipher_desc['content']
)
print('Deciphered result: ', one_to_one_decipher_text)

Number of unique symbols in cipher: 12
Working on symbol:  i (1)
Current score:  -8.570688 Worst score:  -21.082497999999998
gold score -2.545382
Wrong!
Working on symbol:  p (2)
Current score:  -11.379961100000001 Worst score:  -13.823382
gold score -2.545382
Wrong!
Working on symbol:  r (3)
Current score:  -12.61742569 Worst score:  -14.178162599999997
gold score -2.545382
Wrong!
Working on symbol:  c (4)
Current score:  -15.489765799999997 Worst score:  -17.629812
gold score -2.545382
Wrong!
Working on symbol:  t (5)
Current score:  -16.03725842 Worst score:  -17.907934541
gold score -2.545382
Wrong!
Working on symbol:  n (6)
Current score:  -18.078716399999998 Worst score:  -22.135148320000003
gold score -2.545382
Wrong!
Working on symbol:  e (7)
Current score:  -16.847339719999997 Worst score:  -23.640595200000007
gold score -2.545382
Wrong!
Working on symbol:  q (8)
Current score:  -15.808824288999997 Worst score:  -20.684350920000004
gold score -2.545382
Wrong!
Working on symbol

We did not construct a separate homophonic test cipher, since our algorithm is able to decipher the Zodiac cipher with a beam size of **(TODO: fill in)**. The best symbol error rate we can achieve is **(TODO: fill in)**. So we believe our algorithm is correct.

## Multi-processing

In [23]:
def parallel_fn(Ve, phi, f, cipher, previous_score, ext_limits):
    """Extend one hypothesis with every admissible plaintext character.

    Ve -- list of candidate plaintext characters
    phi -- dictionary, current mapping cipher symbol -> plaintext character
    f -- cipher symbol being fixed in this step
    cipher -- list of single character string, the cipher text
    previous_score -- float, score of phi
    ext_limits -- dictionary plaintext char -> int, maximum number of cipher
                  symbols that can be mapped to that plaintext char

    Returns a list of (extended mapping, score) tuples; phi itself is not modified.
    """
    # how many OTHER cipher symbols already use each plaintext character
    used = Counter(v for k, v in phi.items() if k != f)
    ret = []
    for e in Ve:
        # check the limit before copying so rejected candidates cost nothing
        if used[e] + 1 > ext_limits[e]:
            continue
        # keys and values are immutable strings: a shallow copy is enough
        phi_prime = phi.copy()
        phi_prime[f] = e
        ret.append((phi_prime, score(cipher, phi, f, e, previous_score)))
    return ret

def beam_search_mp(cipher, ext_order, ext_limits, topn=1):
    """Multi-process variant of beam_search; hypotheses are extended by parallel_fn.

    cipher -- list of single character string, the cipher text
    ext_order -- list, the cipher symbols in the order in which they are fixed
    ext_limits -- dictionary plaintext char -> int, maximum number of cipher
                  symbols that can be mapped to that plaintext char
    topn -- int, number of hypotheses kept after each pruning step

    Returns a list of (mapping, score) tuples sorted by score DESC; the list is
    empty when every extension of the beam violates ext_limits.
    """
    # initialization
    Hs = [(defaultdict(dict), 0)]
    cardinality = 0
    Ve = [chr(i) for i in range(97, 123, 1)]

    # One pool serves the whole search instead of a new pool per symbol; the
    # context manager tears the workers down even if a task raises.
    # NOTE(review): with the "spawn" start method, functions defined in a notebook
    # cell may not be importable by the workers -- confirm on the target platform.
    with Pool(cpu_count()) as p:
        while cardinality < len(ext_order):
            f = ext_order[cardinality]
            print('Working on symbol: ', f, f'({cardinality+1})')

            mainStart = time.time()
            # one task per hypothesis of the current beam
            result = [p.apply_async(parallel_fn, args=(Ve, phi, f, cipher, previous_score, ext_limits))
                      for phi, previous_score in Hs]

            # get() blocks until the task is done and re-raises worker exceptions
            Ht = []
            for subp in result:
                Ht += subp.get()

            mainEnd = time.time()
            print ('Running Time for this symbol: %0.2f seconds.' % (mainEnd-mainStart))

            # prune the histogram; the mappings are fresh objects received from the
            # workers, so no deep copy is needed
            Hs = sorted(Ht, key=lambda x:x[1], reverse=True)[:topn]
            cardinality += 1

            if not Hs:
                # nothing satisfied ext_limits: an empty beam cannot be extended further
                break
            print('Current score: ', Hs[0][1])

    return sorted(Hs, key=lambda x:x[1], reverse=True)

## Decipher Zodiac Killer cipher

As mentioned above, we use a different `ext_limit` for each plaintext symbol. We use the following `ext_limits` to restrict how the mapping can be extended. The goal is to customize the `ext_limit` for each plaintext character so that more cipher symbols can be mapped to the more frequent plaintext characters. The `ext_limit` is calculated by multiplying the relative frequency of the plaintext character in the wiki text by the number of unique cipher symbols. We believe this can dramatically improve the running time, since the amount of computation required in each iteration is reduced. We use the ceiling instead of the floor to ensure the `ext_limit` is large enough.

In [24]:
# Per-letter cap on homophones: the letter's share of the reference text (in percent)
# times the number of distinct cipher symbols, rounded up.
ext_limits = {
    e: math.ceil(plaintxt_desc['relative_freq'][e]*cipher_desc['vocab_length']/100)
    for e in map(chr, range(97, 123, 1))
}
print(ext_limits)

{'a': 5, 'b': 1, 'c': 2, 'd': 3, 'e': 7, 'f': 2, 'g': 2, 'h': 3, 'i': 4, 'j': 1, 'k': 1, 'l': 3, 'm': 2, 'n': 4, 'o': 4, 'p': 2, 'q': 1, 'r': 4, 's': 4, 't': 5, 'u': 2, 'v': 1, 'w': 1, 'x': 1, 'y': 1, 'z': 1}


In [None]:
%%time
# Run the beam search with the fourth candidate extension order and decode the
# cipher with the best-scoring mapping.
ext_order = candidate_orders[3]
mappings = beam_search(cipher_desc['content'], ext_order, score, ext_limits, 100000)
mapping = mappings[0][0]
decipher_text = ''.join(mapping[char] for char in cipher_desc['content'])
print(decipher_text)
#print('score', lm.score_seq(decipher_text))
print(symbol_error_rate(decipher_text, 'data/_ref_Zodiac_408.txt'))

In [26]:
# %%time
# mappings = beam_search_mp(cipher_desc['content'], ext_order, ext_limits, 5000)

In [None]:
# Decode the cipher with the best mapping currently stored in `mappings`
# and report the symbol error rate against the reference text.
mapping = mappings[0][0]
decipher_text = ''.join([mapping[char] for char in cipher_desc['content']])
print(decipher_text)
#print('score', lm.score_seq(decipher_text))
print(symbol_error_rate(decipher_text, 'data/_ref_Zodiac_408.txt'))