# Homework: Decipherment

Load the packages needed.

In [1]:
%load_ext autoreload
%autoreload 2
from collections import defaultdict, Counter
import collections
import pprint
import math
import bz2
from ngram import *
import sys, string, os
import copy
import pickle
#from joblib import Parallel, delayed
import itertools
from multiprocessing import Process,Pool, cpu_count
import datetime, time, random
pp = pprint.PrettyPrinter(width=45, compact=True)

First let us read in the cipher text from the `data` directory:

In [2]:
def read_file(filename):
    '''
    reads a UTF-8 encoded text file, returns its whole content as a string
    filename -- string (or path-like), path of the file to read;
                a name ending in ".bz2" is decompressed through bz2,
                anything else is opened as a plain text file
    '''
    # bz2.open and the built-in open accept the same (name, mode, encoding)
    # arguments in text mode, so only the opener differs between the two cases.
    opener = bz2.open if str(filename).endswith(".bz2") else open
    # The with-statement closes the file; no explicit close() is needed.
    with opener(filename, 'rt', encoding='utf8') as f:
        return f.read()

# Raw ciphertext as one string (newlines included); get_statistics filters
# out spaces and newlines when it is summarised.
cipher = read_file("data/cipher.txt")
# print(cipher)

While using `get_statistics`, make sure that `cipher=True` is set when the input is a ciphertext.

In [3]:
def get_statistics(content, cipher=True):
    '''
    summarises the symbols of a text, returns a dictionary of statistics
    content -- string (or iterable of single-character strings);
               spaces and newlines are ignored
    cipher -- bool, when True the filtered symbol list is additionally
              returned under the 'content' key
    returned keys: 'length' (number of symbols), 'vocab' (list of distinct
    symbols), 'vocab_length', 'frequencies' (Counter of symbols) and
    'relative_freq' (symbol -> percentage of all symbols)
    '''
    kept = [ch for ch in list(content) if ch not in ('\n', ' ')]
    total = len(kept)
    distinct = set(kept)
    counts = collections.Counter(kept)
    # percentage of the text taken up by each symbol
    percentages = {sym: (cnt/total)*100 for sym, cnt in counts.items()}

    summary = {
        'length': total,
        'vocab': list(distinct),
        'vocab_length': len(distinct),
        'frequencies': counts,
        'relative_freq': percentages,
    }
    if not cipher:
        return summary
    # ciphertexts also carry the symbol sequence itself, as the first key
    return {'content': kept, **summary}

In [4]:
# Symbol statistics of the ciphertext; cipher=True also keeps the filtered
# symbol sequence under 'content'.
cipher_desc = get_statistics(cipher, cipher=True)
# Reference plaintext corpus and its character statistics (cipher=False, so
# no 'content' entry is stored for it).
plaintxt = read_file("data/default.wiki.txt.bz2")
plaintxt_desc = get_statistics(plaintxt, cipher=False)

## Load the 6-gram model

In [5]:
%%time
# NOTE(review): `sequence` is not referenced by any other cell visible in this
# notebook -- presumably a leftover sample sentence for trying out the language
# model; confirm before removing.
sequence = 'In a few cases, a multilingual artifact has been necessary to facilitate decipherment, the Rosetta Stone being the classic example. Statistical techniques provide another pathway to decipherment, as does the analysis of modern languages derived from ancient languages in which undeciphered texts are written. Archaeological and historical information is helpful in verifying hypothesized decipherments.'

# lm = LM("data/6-gram-wiki-char.lm.bz2", n=6, verbose=True)
# `lm` is the module-level language model that score() and the beam-search
# monitors read. LM presumably comes from `from ngram import *` -- TODO confirm.
lm = LM("data/6-gram-wiki-char.lm.bz2", n=6, verbose=False)

Reading language model from data/6-gram-wiki-char.lm.bz2...


Wall time: 1min 22s


Done.


## Implementation for Reference 3 to find the optimal extension order

This is an implementation of the method from the reference 'Beam Search for Solving Substitution Ciphers'. The goal is to find the best extension order. As the paper mentions, it is important to choose a good set of weights for the n-gram orders. I chose the weights \[1,1,1,1,2,3\] suggested by Anoop in a discussion post. I also tried several other sets of weights; for a beam size of 10000, the choice of weights did not affect the result much.

If the correct answer is pruned at a very early stage, the text is unlikely to be fully deciphered, because every subsequent score is computed from an incorrect partial decipherment. We tried several sets of weights to obtain different extension orders, but the resulting orders were very similar. To try different orders, we pick all the ext_orders with different starting symbols. In the case of Zodiac, there are 5 candidate ext_orders.

In [6]:
def find_sharp_n(cipher_desc, symbols_found, n_order):
    '''
    finds the #n for order n_order: the number of windows of n_order
    consecutive cipher symbols that consist only of symbols in symbols_found
    cipher_desc -- cipher statistics, only cipher_desc['content'] is read
    symbols_found -- list of single character string,
                     specifies the list of symbols have been placed in the extention order
    n_order -- int, specifies the order of n-gram
    returns an int; 0 when n_order < 1 or when the text is shorter than n_order
    '''
    if n_order < 1:
        return 0
    # set membership is O(1); the list passed by the caller would make every
    # lookup linear in the number of symbols found so far
    found = set(symbols_found)
    sharp_n = 0
    # length of the current run of consecutive "found" symbols ending here
    run_length = 0
    for symbol in cipher_desc['content']:
        run_length = run_length + 1 if symbol in found else 0
        # a window ending at this position is fully covered exactly when the
        # run reaching this position is at least n_order long
        if run_length >= n_order:
            sharp_n += 1
    return sharp_n

Use a beam search to find the optimal extension order. The code below is very similar to the `beam_search` function, and most of it is copied from there. For simplicity the variable names were kept, so some of them may not make much sense in this context.

In [7]:
def find_ext_order(cipher_desc, topn=100, weights=[1,1,1,1,2,3]):
    '''
    finds the best order of deciphering cipher symbols (find best extention order)
    cipher_desc -- cipher statistics; 'frequencies', 'vocab_length' and
                   (through find_sharp_n) 'content' are read
    topn -- int, number of best trees we want to keep during iteration
    weights -- list of int, weight for #n, n varies from 1 to len(weights)
               (only read, never modified, so the shared default is safe)
    returns the surviving (order, score) pairs sorted by score DESC,
    where order is a list of cipher symbols
    '''
    # list of cipher symbols, most frequent first
    frequencies = cipher_desc['frequencies']
    Ve = sorted(frequencies, key=frequencies.get, reverse=True)
    # symbols already found with score
    Hs = [([], 0)]
    # number of symbols placed so far in every hypothesis of Hs
    cardinality = 0
    # if no weight is specified for unigram, use the most frequent symbol as the starting point.
    # The seeded order REPLACES the empty hypothesis: keeping both would mix
    # orders of different lengths in the beam, and the shorter ones would be
    # returned with one symbol missing.
    if weights[0] == 0 and Ve:
        Hs = [([Ve[0]], 0)]
        cardinality = 1
    while cardinality < cipher_desc['vocab_length']:
        # hypothesis extended symbols with score
        Ht = []
        for phi, _ in Hs:
            for e in Ve:
                if e in phi:
                    continue
                # phi holds immutable strings, so a new list is a sufficient copy
                phi_prime = phi + [e]
                this_score = sum(
                    weight * find_sharp_n(cipher_desc, phi_prime, order)
                    for order, weight in enumerate(weights, start=1)
                )
                Ht.append((phi_prime, this_score))
        # prune the histogram (sorted() is stable, ties keep generation order)
        Hs = sorted(Ht, key=lambda x:x[1], reverse=True)[:topn]
        cardinality += 1
    return sorted(Hs, key=lambda x:x[1], reverse=True)

In [8]:
# %%time
# ext_orders = find_ext_order(cipher_desc, topn=100, weights=[1,1,1,1,2,3])

In [25]:
# Hard-coded extension order for the Zodiac cipher (54 symbols, one entry per
# distinct cipher symbol). Presumably one of the orders produced by
# find_ext_order, whose call is commented out in an earlier cell -- TODO confirm.
ext_order = ['E', '∑', 'B', 'P', 'º', '∫', '/', 'Z', 'A',
 '∆', 'u', 'O', 'R', 'À', 'V', '–', '•', 'X',
 'W', '√', '§', 'F', 'H', '≈', '—', 'π', '+',
 'G', 'D', 'K', 'y', '∞', 'I', 'æ', 'µ', '∏',
 'T', 'ƒ', '“', '\\', 'Q', 'J', 'Ç', 'M',
 '£', 'L', '‘', '^', 'S', 'Ã', 'N', '¢', 'Ω',
 'j']

In [10]:
# with open('ext_order.pkl', 'wb') as fh:
#     pickle.dump(ext_orders, fh)

In [11]:
# with open('ext_order.pkl', 'rb') as fh:
#     ext_order = pickle.load(fh, encoding='utf8')[0][0]

## Baseline with better extension order

The first change I made to the baseline was to rewrite the score function to improve the running speed. Instead of scoring the whole bitstring in each iteration, the new function starts from the previous score and only rescores the newly fixed (cipher symbol, plaintext character) pair and the previously fixed plaintext characters that it influences. For instance, 'oooo...o' -> 'oooo..xo'. The new score is obtained from the previous score by adding the unigram score of 'x', subtracting the unigram score of the 'o' following 'x' and the bigram score of `</s>`, and adding the bigram score of the 'o' following 'x' and the trigram score of `</s>`. With this approach, the running time improved from 1 hour to 20 minutes with a beam size of 10000 on my machine with an i7-7700K CPU. The computed score is almost the same as the score computed with the `score_bitstring` function (the difference is within 0.0000001). Another approach we tried to speed up the whole process is multiprocessing; it is described later in this notebook.

In [12]:
def score(cipher, phi, new_f, new_e, previous_score):
    '''
    scores the extended mapping (phi plus new_f -> new_e) incrementally,
    starting from the score of phi; returns a float
    cipher -- list of single character string, the cipher text
    phi -- dictionary, old mapping from cipher symbol (f) to plaintext
           character (e), i.e. the mapping before new_f is added
    new_f -- single-character string, the cipher symbol being newly fixed
    new_e -- single-character string, the plaintext character assigned to new_f
    previous_score -- float, old score for phi
    reads the module-level language model `lm` (begin, score, end, table, history)
    '''
    mapping = phi
    new_score = previous_score
    # for the first iteration, the previous score should be -2.545382 instead of 0
    # this is because the score of an empty string is not 0 while scoring with bitstring
    # the value can be obtained by calling lm.score_bitstring('thisisatest', '...........')
    if len(phi)==0:
        new_score += -2.545382
    # lm_state follows the text decoded with the NEW mapping, old_lm_state follows
    # the text decoded with the OLD mapping (where new_f is still unknown)
    lm_state = lm.begin()
    old_lm_state = lm.begin()
    # this flag counts how many upcoming previously-fixed characters are still
    # affected by the most recent newly-fixed character (0 means none)
    triggerChangeFlag = 0
    for i in range(len(cipher)):
        char = cipher[i]
        # if this is a previously fixed character and not influenced by the newly-fixed character
        # we only need to track the lm_state and old lm_state, no need to compute the score
        if (char in mapping.keys()) and (triggerChangeFlag==0):
            token = mapping[char]
            # find the longest suffix of (state + token) present in lm.table and keep
            # its last lm.history items as the new state; presumably this mirrors the
            # state update done inside lm.score -- TODO confirm against ngram.py
            ngram = lm_state + (token,)
            while len(ngram)> 0:
                if ngram in lm.table:
                    lm_state = ngram[-lm.history:]
                    break
                else: #backoff
                    ngram = ngram[1:]
            # nothing matched, not even the single token: start from an empty state
            if len(ngram)==0:
                lm_state = ()
            # old and new decodings agree here, so both states are the same
            old_lm_state = lm_state
        # if this is a previously fixed character and influenced by the newly-fixed character
        # subtract the old score and add the new score to the previous score.
        elif (char in mapping.keys()) and (triggerChangeFlag>0):
            token = mapping[char]
            old_lm_state, old_logprob = lm.score(old_lm_state, token)
            new_score -= old_logprob
            lm_state, logprob = lm.score(lm_state, token)
            new_score += logprob
            triggerChangeFlag -= 1
        # if this is a newly-fixed character, simply add the new score to the previous score
        elif char == new_f:
            (lm_state, logprob) = lm.score(lm_state, new_e)
            new_score += logprob
            # the next 5 fixed characters are rescored; presumably 5 is the history
            # length (n-1) of the 6-gram model -- TODO confirm it equals lm.history
            triggerChangeFlag = 5
            # under the old mapping this position was unknown, so the old state restarts
            old_lm_state = ()
        # if this is an unknown character, there is no influence on the previous score
        else:
            lm_state = ()
            old_lm_state = ()
            triggerChangeFlag = 0
    # treat the end tag '</s>' as a previously fixed character: it is rescored only
    # when it is still within reach of a newly-fixed character
    if triggerChangeFlag:
        new_score -= lm.end(old_lm_state)
        new_score += lm.end(lm_state)
    return new_score

The `beam_search` function we implemented is based on the pseudocode given in the assignment. The main change I made was to use a customized `ext_limit` for each plaintext character instead of one general value. The details can be found later in this notebook, in the section on deciphering the Zodiac Killer cipher.

In [24]:
def read_gold(gold_file):
    '''
    reads the gold (reference) plaintext, returns it as a list of characters
    gold_file -- string, path of the UTF-8 encoded reference file
    leading and trailing whitespace is stripped before the text is split
    '''
    # explicit encoding, consistent with read_file; the with-statement closes
    # the file, so no close() call is needed afterwards
    with open(gold_file, encoding='utf8') as f:
        return list(f.read().strip())

def symbol_error_rate(dec, _gold):
    '''
    computes the fraction of symbols of the gold text that dec gets wrong
    dec -- sequence of single characters, the deciphered text
    _gold -- string, path of the gold file (loaded with read_gold)
    when dec and the gold text differ in length nothing counts as correct,
    so the returned rate is 1.0
    '''
    reference = read_gold(_gold)
    total = len(reference)
    matches = 0
    if total == len(dec):
        matches = sum(1 for d, g in zip(dec, reference) if d == g)
    return (total - matches)/total
gold = read_gold('data/_ref_Zodiac_408.txt')
# gold mapping: cipher symbol -> plaintext letter at the symbol's first occurrence
gold_dict = dict()
for index, cipher_char in enumerate(cipher_desc['content']):
    if cipher_char in gold_dict:
        continue
    gold_dict[cipher_char] = gold[index]

# per plaintext letter: the cipher symbols that encode it, and how its frequency
# in the reference corpus compares with the summed frequency of those symbols
Ve = [chr(code) for code in range(ord('a'), ord('z') + 1)]
for e in Ve:
    keys = [symbol for symbol, letter in gold_dict.items() if letter == e]
    freq_cipher = sum(cipher_desc['relative_freq'][key] for key in keys)
    freq_plain = plaintxt_desc['relative_freq'][e]
    print(f'{e}: {keys}, {len(keys)}, plain freq {freq_plain}, cipher freq {freq_cipher}')

a: ['G', 'S', 'Ã', 'æ'], 4, plain freq 8.69449153965917, cipher freq 6.372549019607844
b: ['V'], 1, plain freq 1.5775397729346958, cipher freq 2.2058823529411766
c: ['≈'], 1, plain freq 3.296629993243273, cipher freq 2.450980392156863
d: ['∆', '§'], 2, plain freq 4.111572946751393, cipher freq 1.7156862745098038
e: ['Z', '–', 'W', '+', '∞', 'N', 'E'], 7, plain freq 12.140870772361387, cipher freq 13.235294117647058
f: ['J', 'Q'], 2, plain freq 2.2396407473659865, cipher freq 2.696078431372549
g: ['R'], 1, plain freq 2.0428939941058446, cipher freq 2.941176470588235
h: ['M', '£'], 2, plain freq 4.905679325108425, cipher freq 3.9215686274509802
i: ['º', 'P', 'u', 'À'], 4, plain freq 7.395771339569833, cipher freq 10.784313725490197
j: [], 0, plain freq 0.21989985102697754, cipher freq 0
k: ['/'], 1, plain freq 0.6846719094465245, cipher freq 1.4705882352941175
l: ['∫', 'B', '∑'], 3, plain freq 4.12837076781324, cipher freq 8.088235294117647
m: ['—'], 1, plain freq 2.5998296931383753, cip

In [15]:
def beam_search(cipher, ext_order, score_func, ext_limits, topn=1):
    '''
    finds the mappings between cipher char and plaintext char,
    returns the surviving (mapping, score) pairs sorted by score DESC,
    where mapping is a dictionary cipher symbol -> plaintext char
    cipher -- list of single character string, the cipher text
    ext_order -- list, the order in which the cipher symbols are fixed
    score_func -- function(cipher, phi, f, e, previous_score), returns the
                  score of the old mapping phi extended with f -> e
    ext_limits -- dictionary, plaintext char -> maximum number of cipher
                  symbols that can be mapped to that plaintext char
    topn -- int, defines the number of dictionaries we want to keep while pruning

    After every symbol a monitor prints the score of the gold partial
    decipherment; it reads the module-level `gold_dict` and `lm`.
    '''
    print('Number of unique symbols in cipher:', len(ext_order))
    # mapping relationships already found with score
    Hs = [(defaultdict(dict), 0)]
    # list of plaintext characters
    Ve = [chr(i) for i in range(97, 123, 1)]
    # cardinality = number of cipher symbols fixed once this round is done
    for cardinality, f in enumerate(ext_order, start=1):
        print('Working on symbol: ', f, f'({cardinality})')
        # hypothesis mapping relationships with score
        Ht = []
        for phi, previous_score in Hs:
            # how many OTHER cipher symbols each plaintext char already has;
            # computed once per hypothesis instead of once per candidate char
            usage = Counter(e for sym, e in phi.items() if sym != f)
            for e in Ve:
                if usage[e] + 1 <= ext_limits[e]:
                    # values are immutable strings: a shallow copy is enough, and
                    # it is only made for candidates that respect the limit
                    phi_prime = phi.copy()
                    phi_prime[f] = e
                    Ht.append((phi_prime, score_func(cipher, phi, f, e, previous_score)))
        # prune the histogram; the sorted slice is already a fresh list
        Hs = sorted(Ht, key=lambda x:x[1], reverse=True)[:topn]

        #######################    monitor if the correct result has been pruned out #######################
        best_score, worst_score = Hs[0][1], Hs[-1][1]
        print('Best score: ', best_score, 'Worst score: ', worst_score)
        found_symbols = set(ext_order[:cardinality])
        phi_temp = {key: value for key, value in gold_dict.items() if key in found_symbols}
        # decode the text that is actually being searched (the `cipher` argument),
        # not the global Zodiac cipher, so the monitor matches the beam scores
        partial_text = ''.join(phi_temp.get(cipher_char, '_') for cipher_char in cipher)
        bit_string = ''.join('o' if cipher_char in phi_temp else '.' for cipher_char in cipher)
        gold_score = lm.score_bitstring(partial_text, bit_string)
        print('gold score', gold_score)
        # the gold hypothesis can only still be in the beam if its score lies
        # between the worst and the best surviving score
        if gold_score < worst_score or gold_score > best_score:
            print('Wrong Wrong Wrong Wrong Wrong Wrong Wrong Wrong!')

    return sorted(Hs, key=lambda x:x[1], reverse=True)

## Test case

Before deciphering the Zodiac Killer cipher, test the algorithm with some simple test cases

The first test case is a simple one to one mapping. <br>
Plaintext: `defendtheeastwallofthecastle` <br>
Cipher: `giuifgceiiprctpnnduceiqprcni` <br>

In [None]:
one_to_one_cipher = 'giuifgceiiprctpnnduceiqprcni'
one_to_one_cipher_desc = get_statistics(one_to_one_cipher, cipher=True)
# best-scoring extension order for the toy cipher
one_to_one_ext_order = find_ext_order(one_to_one_cipher_desc)[0][0]
# strict 1:1 substitution: every plaintext letter may be used at most once
one_to_one_ext_limits = {chr(code): 1 for code in range(97, 123, 1)}
one_to_one_mappings = beam_search(
    one_to_one_cipher_desc['content'],
    one_to_one_ext_order,
    score,
    one_to_one_ext_limits,
    5000,
)
# decode with the highest-scoring mapping of the final beam
one_to_one_mapping = one_to_one_mappings[0][0]
one_to_one_decipher_text = ''.join(
    one_to_one_mapping[char] for char in one_to_one_cipher_desc['content']
)
print('Deciphered result: ', one_to_one_decipher_text)

We didn't come up with a homophonic cipher test case. The reason is that our algorithm is able to decipher the Zodiac cipher with a beam size of 10000 after fixing three ground-truth mappings (result not shown); the symbol error rate in that case is ~5%. With a beam size of 10000, the algorithm cannot correctly decipher the text entirely on its own, but this is enough to show that the baseline algorithm works.

## Multi-processing

In [17]:
def parallel_fn(Ve, phi, f, cipher, previous_score, ext_limits):
    '''
    extends one hypothesis with every allowed plaintext char (worker task of
    beam_search_mp), returns a list of (extended mapping, score) pairs
    Ve -- list of plaintext characters to try
    phi -- dictionary, old mapping cipher symbol -> plaintext char
    f -- single-character string, the cipher symbol being fixed
    cipher -- list of single character string, the cipher text
    previous_score -- float, score of phi
    ext_limits -- dictionary, plaintext char -> maximum number of cipher
                  symbols that can be mapped to that plaintext char
    '''
    # how many OTHER cipher symbols each plaintext char already has; computed
    # once here instead of once per candidate char
    usage = Counter(e for sym, e in phi.items() if sym != f)
    ret = []
    for e in Ve:
        if usage[e] + 1 <= ext_limits[e]:
            # values are immutable strings: a shallow copy is enough, and it is
            # only made for candidates that respect the limit
            phi_prime = phi.copy()
            phi_prime[f] = e
            ret.append((phi_prime, score(cipher, phi, f, e, previous_score)))
    return ret

def beam_search_mp(cipher, ext_order, ext_limits, topn=1):
    '''
    multiprocessing version of beam_search: the hypotheses of each round are
    extended by parallel_fn in a pool of worker processes;
    returns the surviving (mapping, score) pairs sorted by score DESC
    cipher -- list of single character string, the cipher text
    ext_order -- list, the order in which the cipher symbols are fixed
    ext_limits -- dictionary, plaintext char -> maximum number of cipher
                  symbols that can be mapped to that plaintext char
    topn -- int, beam size. The full beam is kept for the 11th to the 40th
            symbol; for the first 10 symbols and from the 41st on the beam
            is narrowed to topn // 10 (at least 1). With topn=1000000 this
            is the 100000 / 1000000 schedule that used to be hard-coded
            here and that overrode the argument.

    After every symbol a monitor prints the score of the gold partial
    decipherment; it reads the module-level `gold_dict` and `lm`.
    '''
    # initialization
    Hs = [(defaultdict(dict), 0)]
    Ve = [chr(i) for i in range(97, 123, 1)]
    wide_beam = topn
    narrow_beam = max(1, topn // 10)

    for index, f in enumerate(ext_order):
        cardinality = index + 1
        print('Working on symbol: ', f, f'({cardinality})')

        mainStart = time.time()
        p = Pool(cpu_count())
        # one asynchronous task per hypothesis of the current beam
        result = [
            p.apply_async(parallel_fn, args=(Ve, phi, f, cipher, previous_score, ext_limits))
            for phi, previous_score in Hs
        ]
        p.close()
        p.join()

        Ht = []
        for subp in result:
            Ht.extend(subp.get())

        # beam size for this round (index is 0-based)
        if index < 10 or index >= 40:
            beam_size = narrow_beam
        else:
            beam_size = wide_beam

        # the reported time covers the extension step only, not the pruning
        mainEnd = time.time()
        print ('Running Time for this symbol: %0.2f seconds.' % (mainEnd-mainStart))
        # prune the histogram; the sorted slice is already a fresh list
        Hs = sorted(Ht, key=lambda x:x[1], reverse=True)[:beam_size]

        #######################    monitor if the correct result has been pruned out #######################
        best_score, worst_score = Hs[0][1], Hs[-1][1]
        print('Best score: ', best_score, 'Worst score: ', worst_score)
        found_symbols = set(ext_order[:cardinality])
        phi_temp = {key: value for key, value in gold_dict.items() if key in found_symbols}
        # decode the text that is actually being searched (the `cipher` argument),
        # not the global Zodiac cipher, so the monitor matches the beam scores
        partial_text = ''.join(phi_temp.get(cipher_char, '_') for cipher_char in cipher)
        bit_string = ''.join('o' if cipher_char in phi_temp else '.' for cipher_char in cipher)
        gold_score = lm.score_bitstring(partial_text, bit_string)
        print('gold score', gold_score)
        # the gold hypothesis can only still be in the beam if its score lies
        # between the worst and the best surviving score
        if gold_score < worst_score or gold_score > best_score:
            print('Wrong Wrong Wrong Wrong Wrong Wrong Wrong Wrong!')

    return sorted(Hs, key=lambda x:x[1], reverse=True)

## Decipher Zodiac Killer cipher

As mentioned above, we use a different `ext_limit` for each plaintext symbol. We use the following `ext_limits` to constrain the search. The goal is to customize the `ext_limit` for each plaintext character so that more cipher symbols can be mapped to the more frequent plaintext characters. The `ext_limit` is calculated by multiplying the relative frequency of the plaintext character in the wiki text by the number of unique cipher symbols. We believe this can dramatically improve the running time, since the amount of computation required in each iteration is minimized. We use the ceiling instead of the floor to ensure the `ext_limit` is large enough. Of course, this approach is not perfect if a less frequent plaintext character is mapped to more cipher symbols than its limit allows. But even if that is the case, the error rate should not increase significantly, since the frequency of those less frequent plaintext characters should not be large (if the character distribution of the plaintext in the cipher task is similar to that of the text used to train the language model).

In [21]:
# per plaintext letter: its share (in percent) of the reference corpus scaled to
# the number of distinct cipher symbols, rounded up
ext_limits = {
    letter: math.ceil(plaintxt_desc['relative_freq'][letter]*cipher_desc['vocab_length']/100)
    for letter in map(chr, range(97, 123, 1))
}
print(ext_limits)

{'a': 5, 'b': 1, 'c': 2, 'd': 3, 'e': 7, 'f': 2, 'g': 2, 'h': 3, 'i': 4, 'j': 1, 'k': 1, 'l': 3, 'm': 2, 'n': 4, 'o': 4, 'p': 2, 'q': 1, 'r': 4, 's': 4, 't': 5, 'u': 2, 'v': 1, 'w': 1, 'x': 1, 'y': 1, 'z': 1}


In [None]:
# %%time
# # single core version
# mappings = beam_search(cipher_desc['content'], ext_order, score, ext_limits, 10000)

Number of unique symbols in cipher: 54
Working on symbol:  E (1)
Best score:  -10.7856218 Worst score:  -29.749483999999995
gold score -10.785621800000001
Working on symbol:  ∑ (2)
Best score:  -20.517739599999995 Worst score:  -59.14061499999999
gold score -24.181161000000003
Working on symbol:  B (3)
Best score:  -32.11996419999999 Worst score:  -57.93098
gold score -38.09201220000001
Working on symbol:  P (4)
Best score:  -40.82754000000006 Worst score:  -53.75988140000001
gold score -48.728844000000024
Working on symbol:  º (5)
Best score:  -54.011017099999975 Worst score:  -64.64988409999995
gold score -62.340047100000035
Working on symbol:  ∫ (6)
Best score:  -66.89316288000005 Worst score:  -75.60772610000008
gold score -74.86408880000002
Working on symbol:  / (7)
Best score:  -72.68006079000003 Worst score:  -80.948034
gold score -82.52081120000001
Wrong Wrong Wrong Wrong Wrong Wrong Wrong Wrong!
Working on symbol:  Z (8)
Best score:  -81.55473529000004 Worst score:  -88.530469

In [None]:
# mapping = mappings[0][0]
# decipher_text = ''
# for char in cipher_desc['content']:
#     decipher_text += mapping[char]
# print(decipher_text)

In [None]:
%%time
# multiprocessing version: beam search over the Zodiac cipher symbols in the
# order given by ext_order; the last argument is passed as topn (beam size)
mappings = beam_search_mp(cipher_desc['content'], ext_order, ext_limits, 1000000)

In [None]:
# decode the cipher with the highest-scoring mapping of the final beam
mapping = mappings[0][0]
decipher_text = ''.join(mapping[char] for char in cipher_desc['content'])
print(decipher_text)