# Homework: Decipherment

In [1]:
%load_ext autoreload
%autoreload 2
from collections import defaultdict, Counter
import collections
import pprint
import math
import bz2
from ngram import *
import sys, string
import copy
import pickle
#from joblib import Parallel, delayed
import itertools
pp = pprint.PrettyPrinter(width=45, compact=True)

First let us read in the cipher text from the `data` directory:

In [2]:
def read_file(filename):
    """Return the whole content of `filename` as a UTF-8 decoded string.

    A path ending in ".bz2" is opened through `bz2` in text mode; every
    other path is opened as a regular text file.
    """
    compressed = filename.endswith(".bz2")
    opener, mode = (bz2.open, 'rt') if compressed else (open, 'r')
    # The context manager closes the handle; no explicit close() is needed.
    with opener(filename, mode, encoding='utf8') as handle:
        return handle.read()

# Load the raw ciphertext (line breaks included) and print it for inspection.
cipher = read_file("data/cipher.txt")
print(cipher)

º∫P/Z/uB∫ÀOR•–X•B
WV+≈GyF∞ºHPπKÇ—y≈
MJy^uIÀΩ—T‘NQyDµ£
S¢/º∑BPORAu∫∆RÃ—E
À^LMZJƒ“\–FHVW≈æy
π+—GDºKI£∞—Xæµ§S¢
RN‘IyEÃOæ—GBTQS∑B
Lƒ/P∑BπX—EHMu^RRÀ
√ZK—–I£W—ÇæµLM“º∑
BPDR+j•∞\N¢≈EuHÀF
Z√–OVWIµ+‘L£Ã^R∞H
IºDR∏Ty“\ƒ≈/πXJQA
PµMæRu‘∫L£NVEKH•G
“IÇJÀµºæLMÃNA£Z¢P
§u–ÀAº∑BVW\+VT‘OP
^•S“Ã∆u≈∞ΩD§G∫∫IM
NÀ£S√E/º∫∫Z∆AP∑BV
–≈X—W—∏F∑æ√+πºAºB
∫OTµRu√+∏ƒy—∏^S—W
VZ≈GyKE∏TyAº∫∑L‘∏
HÇFBXº§XADƒ\ΩLÇ•—
∏≈ƒ∑∑∞≈µPORXQF∫G√
ZπJT‘—∏æJI+“BPQW∞
VEX“ºWI∞—EHM£•uIÀ


For the default solution we need to compute statistics such as the length, the number of symbols/letters, 
the unique occurrences, and the frequencies and relative frequencies of a given file. This is done in the function `get_statistics` below.

While using `get_statistics`, make sure that `cipher=True` is set when the input is a ciphertext.

In [3]:
def get_statistics(content, cipher=True):
    """Compute symbol statistics for `content`.

    Newlines and spaces are dropped before counting. The returned dict holds:
      'length'        -- number of remaining symbols
      'vocab'         -- list of distinct symbols (order not specified)
      'vocab_length'  -- number of distinct symbols
      'frequencies'   -- collections.Counter of symbol counts
      'relative_freq' -- dict symbol -> percentage of all symbols
    When `cipher` is true, the filtered symbol list is also stored under
    'content' (as the first key).
    """
    kept = [ch for ch in list(content) if ch not in ('\n', ' ')]
    total = len(kept)
    distinct = set(kept)
    counts = collections.Counter(kept)
    percentages = {sym: (n / total) * 100 for sym, n in counts.items()}

    stats = {'content': kept} if cipher else {}
    stats.update({
        'length': total,
        'vocab': list(distinct),
        'vocab_length': len(distinct),
        'frequencies': counts,
        'relative_freq': percentages,
    })
    return stats

In [4]:
"""
ATTENTION!
For grading purposes only. Don't bundle with the assignment. 
Make sure '_ref.txt' is removed from the 'data' directory before publishing.
"""

def read_gold(gold_file):
    """Read the reference decipherment and return it as a list of characters.

    gold_file -- path of the reference text file

    The file is decoded as UTF-8 explicitly, so the result does not depend on
    the platform's default encoding. Leading and trailing whitespace is
    stripped before the text is split into single characters.
    """
    # The context manager closes the file; no separate close() call is needed.
    with open(gold_file, 'r', encoding='utf8') as f:
        gold = f.read()
    return list(gold.strip())

def symbol_error_rate(dec, _gold):
    """Return the fraction of reference symbols that `dec` gets wrong.

    dec   -- deciphered text (string or list of single characters)
    _gold -- path of the reference file, loaded with `read_gold`

    Returns a float between 0.0 (all symbols match) and 1.0.
    When `dec` and the reference differ in length, no position is compared and
    the result is 1.0; a warning is issued so that a formatting problem is not
    mistaken for a completely wrong decipherment.

    Raises ValueError when the reference is empty (the rate is undefined).
    """
    import warnings  # local import so the cell does not depend on other cells

    gold = read_gold(_gold)
    if not gold:
        raise ValueError("reference decipherment {!r} is empty".format(_gold))

    if len(gold) != len(dec):
        warnings.warn(
            "decipherment has {} symbols but the reference has {}; "
            "reporting 100% error".format(len(dec), len(gold))
        )
        return 1.0

    correct = sum(1 for d, g in zip(dec, gold) if d == g)
    return (len(gold) - correct) / len(gold)

In [5]:
# Statistics of the ciphertext; cipher=True also stores the filtered symbol
# sequence under 'content', which the later cells iterate over.
cipher_desc = get_statistics(cipher, cipher=True)
#pp.pprint(cipher_desc)

## Load the 6-gram model

In [6]:
%%time
# Sample English text, scored in the next cell as a sanity check of the language model.
sequence = 'In a few cases, a multilingual artifact has been necessary to facilitate decipherment, the Rosetta Stone being the classic example. Statistical techniques provide another pathway to decipherment, as does the analysis of modern languages derived from ancient languages in which undeciphered texts are written. Archaeological and historical information is helpful in verifying hypothesized decipherments.'

# Load the character 6-gram language model (LM comes from the ngram module).
# The commented-out line is the same call with verbose=True.
# lm = LM("data/6-gram-wiki-char.lm.bz2", n=6, verbose=True)
lm = LM("data/6-gram-wiki-char.lm.bz2", n=6, verbose=False)

Reading language model from data/6-gram-wiki-char.lm.bz2...


Wall time: 15.2 s


Done.


In [None]:
# Smoke-test the language model on the sample text and on a short sentence.
print(sequence)
lm_logprob = lm.score_seq(sequence)
# The scores are written to stderr, the printed text to stdout.
print("TOTAL LM LOGPROB: {}".format(lm_logprob), file=sys.stderr)
print("TOTAL LM LOGPROB: {}".format(lm.score_seq('this is the text.')), file=sys.stderr)
# Bitstring helpers of the LM class.
# NOTE(review): presumably 'o' marks positions to score and '.' positions to skip — confirm in ngram.py.
print(lm.get_bitstring_spans('..oo...ooo..'))
print(lm.score_bitstring('thisisatest', 'oo...oo.ooo'))

## Implementation for Reference 3

In [7]:
def find_sharp_n(cipher_desc, symbols_found, n_order):
    '''
    finds the #n for order n_order: the number of windows of n_order
    consecutive cipher symbols that consist only of symbols in symbols_found

    cipher_desc -- cipher statistics; only cipher_desc['content'] (the symbol
                   sequence) is read
    symbols_found -- iterable of single character strings, the symbols that
                     have been placed in the extension order so far
    n_order -- int, specifies the order of n-gram

    returns an int; 0 when n_order is not positive or exceeds the cipher length
    '''
    if n_order <= 0:
        return 0
    # Set membership is O(1); this function sits in the innermost loop of
    # find_ext_order, so a list lookup per symbol adds up quickly.
    known = set(symbols_found)
    sharp_n = 0
    run = 0  # length of the current streak of known symbols
    for symbol in cipher_desc['content']:
        run = run + 1 if symbol in known else 0
        # A streak of length >= n_order means the window ending here is fully
        # known; each qualifying window is counted exactly once, at its end.
        if run >= n_order:
            sharp_n += 1
    return sharp_n

In [8]:
def find_ext_order(cipher_desc, topn=100, weights=[1,1,1,1,2,3]):
    '''
    finds the best order of deciphering cipher symbols (find best extension order)
    using a beam search over partial orders

    cipher_desc -- cipher statistics; reads 'frequencies', 'vocab_length' and
                   (through find_sharp_n) 'content'
    topn -- int, number of best partial orders kept after each step
    weights -- sequence of numbers, weights[n-1] is the weight of #n; every
               entry is used, so the n-gram orders scored are 1..len(weights)
               (the default list is only read, never modified)

    returns the surviving hypotheses as a list of (symbol_list, score) tuples,
    best score first
    '''
    # list of cipher characters, most frequent first
    Ve = sorted(cipher_desc['frequencies'], key=cipher_desc['frequencies'].get, reverse=True)
    # (order, weight) pairs; a zero weight cannot change the score, so those
    # orders are skipped instead of being counted and multiplied by 0
    weighted_orders = [(n, w) for n, w in enumerate(weights, start=1) if w != 0]

    # symbols already found with score
    Hs = [([], 0)]
    # initialize the cardinality (number of unique cipher text)
    cardinality = 0
    if weights[0] == 0:
        # With unigrams unweighted every first symbol scores the same, so the
        # most frequent symbol is offered as an additional starting hypothesis.
        cardinality += 1
        Hs.append(([Ve[0]], 0))

    while cardinality < cipher_desc['vocab_length']:
        # hypothesis extended symbols with score
        Ht = []
        for phi, previous_score in Hs:
            for e in Ve:
                if e in phi:
                    continue
                # phi holds plain strings, so list concatenation gives an
                # independent hypothesis without the cost of copy.deepcopy
                phi_prime = phi + [e]
                this_score = sum(
                    w * find_sharp_n(cipher_desc, phi_prime, n)
                    for n, w in weighted_orders
                )
                Ht.append((phi_prime, this_score))
        if not Ht:
            # every hypothesis already contains all symbols; nothing to extend
            break
        # prune the histogram (sorted is stable, so ties keep insertion order)
        Hs = sorted(Ht, key=lambda x:x[1], reverse=True)[:topn]
        cardinality += 1
        print('Done with symbol number', cardinality, '; Current best score: ', Hs[0][1])
    return sorted(Hs, key=lambda x:x[1], reverse=True)

In [10]:
%%time
# test the function above
# Beam of 100 partial orders; #n is weighted by n for n = 1..6.
ext_order = find_ext_order(cipher_desc, topn=100, weights=[1,2,3,4,5,6])

Done with symbol number 1 ; Current best score:  16
Done with symbol number 2 ; Current best score:  38
Done with symbol number 3 ; Current best score:  69
Done with symbol number 4 ; Current best score:  108
Done with symbol number 5 ; Current best score:  152
Done with symbol number 6 ; Current best score:  209
Done with symbol number 7 ; Current best score:  256
Done with symbol number 8 ; Current best score:  312
Done with symbol number 9 ; Current best score:  401
Done with symbol number 10 ; Current best score:  485
Done with symbol number 11 ; Current best score:  537
Done with symbol number 12 ; Current best score:  665
Done with symbol number 13 ; Current best score:  751
Done with symbol number 14 ; Current best score:  804
Done with symbol number 15 ; Current best score:  875
Done with symbol number 16 ; Current best score:  934
Done with symbol number 17 ; Current best score:  999
Done with symbol number 18 ; Current best score:  1058
Done with symbol number 19 ; Current be

In [12]:
# Symbol list of the best-scoring hypothesis (find_ext_order returns them best first).
ext_order[0][0]

['B',
 '∑',
 'P',
 'º',
 'A',
 '∫',
 '/',
 'Z',
 '∆',
 'u',
 'R',
 'O',
 'À',
 'V',
 '–',
 '√',
 'W',
 'I',
 '•',
 'X',
 '§',
 'D',
 'π',
 '+',
 '≈',
 '—',
 'G',
 'K',
 'y',
 'æ',
 'µ',
 'T',
 '∏',
 'F',
 'E',
 'H',
 '∞',
 'ƒ',
 '“',
 '\\',
 'Q',
 'J',
 'Ç',
 'M',
 '£',
 'L',
 '‘',
 '^',
 'S',
 'Ã',
 'N',
 '¢',
 'Ω',
 'j']

In [None]:
# with open('ext_order.pkl', 'wb') as fh:
#     pickle.dump(ext_order, fh)

In [None]:
# with open('ext_order.pkl', 'rb') as fh:
#     ext_order = pickle.load(fh, encoding='utf8')

## Baseline

In [13]:
def score(cipher, phi, new_f, new_e, previous_score):
    '''
    incrementally scores the mapping phi extended with new_f -> new_e, returns a float

    Starts from previous_score and walks the cipher once, changing the score
    only at occurrences of new_f and at the mapped symbols that follow them.

    cipher -- list of single character string
    phi -- dictionary, old mapping from cipher symbol to plaintext letter
           (it is indexed by cipher symbols below); if new_f were already a
           key of phi, the new_f branch would never be reached
    new_f -- single-character string, the cipher symbol being added
    new_e -- single-character string, the plaintext letter proposed for new_f
    previous_score -- float, old score for phi

    Uses the notebook-level language model `lm` (begin, score, end, table, history).
    '''
    mapping = phi
    new_score = previous_score
    # for the first iteration, the previous score should be -2.545382 instead of 0
    # NOTE(review): the origin of this constant is not visible here (possibly a
    # log-probability taken from the language model) — confirm.
    if len(phi)==0:
        new_score += -2.545382
    # lm_state follows the context under the extended mapping,
    # old_lm_state the context under phi alone.
    lm_state = lm.begin()
    old_lm_state = lm.begin()
    # Number of upcoming mapped symbols that still have to be re-scored after
    # an occurrence of new_f; 0 means the two contexts are treated as identical.
    triggerChangeFlag = 0
    for i in range(len(cipher)):
        char = cipher[i]
        if (char in mapping.keys()) and (triggerChangeFlag==0):
            # Mapped symbol outside the re-scoring window: its contribution is
            # left as it is in previous_score, only the context is advanced.
            token = mapping[char]
            ngram = lm_state + (token,)
            # Drop the oldest context symbols until the n-gram is in lm.table.
            while len(ngram)> 0:
                if ngram in lm.table:
                    lm_state = ngram[-lm.history:]
                    break
                else: #backoff
                    ngram = ngram[1:]
            if len(ngram)==0:
                lm_state = ()
            old_lm_state = lm_state
        elif (char in mapping.keys()) and (triggerChangeFlag>0):
            # Mapped symbol inside the window: replace the log-prob it had
            # under the old context by the one under the new context.
            token = mapping[char]
            old_lm_state, old_logprob = lm.score(old_lm_state, token)
            new_score -= old_logprob
            lm_state, logprob = lm.score(lm_state, token)
            new_score += logprob
            triggerChangeFlag -= 1
        elif char == new_f:
            # Occurrence of the new symbol: add its log-prob and open the window.
            (lm_state, logprob) = lm.score(lm_state, new_e)
            new_score += logprob
            # NOTE(review): 5 is presumably n-1 for the 6-gram model loaded above — confirm.
            triggerChangeFlag = 5
            # Under phi this symbol was unmapped, so the old context restarts here.
            old_lm_state = ()
        else:
            # Symbol not mapped yet: both contexts restart and the window closes.
            lm_state = ()
            old_lm_state = ()
            triggerChangeFlag = 0
        #print('old lm state', old_lm_state)
        #print('lm state', lm_state)
    # The cipher ended inside a re-scoring window: swap the old lm.end term for the new one.
    # NOTE(review): lm.end presumably returns the end-of-sequence log-prob for a state — confirm.
    if triggerChangeFlag:
        new_score -= lm.end(old_lm_state)
        new_score += lm.end(lm_state)
    return new_score

In [14]:
def beam_search(cipher, ext_order, ext_limits=1, topn=1):
    '''
    finds the mappings between cipher char and plaintext char with a beam search

    cipher -- list of single character strings, the cipher symbol sequence
    ext_order -- list of cipher symbols in the order in which they are mapped
    ext_limits -- int, defines maximum number of cipher char can be mapped to a plaintext char
    topn -- int, defines the number of dictionaries we want to keep while pruning

    returns the surviving hypotheses as a list of (mapping, score) tuples,
    best score first; each mapping goes from cipher symbol to plaintext letter

    raises ValueError when no plaintext letter can be assigned to a symbol
    without exceeding ext_limits (the beam would be empty)
    '''
    print('Number of unique symbols in cipher:', len(ext_order))
    # mapping relationships already found with score
    Hs = [(defaultdict(dict), 0)]
    # initialize the cardinality (number of unique cipher text)
    cardinality = 0
    # list of plaintext characters
    Ve = [chr(i) for i in range(97, 123, 1)]
    while cardinality < len(ext_order):
        f = ext_order[cardinality]
        print('Working on symbol: ', f, f'({cardinality+1})')
        # hypothesis mapping relationships with score
        Ht = []
        for phi, previous_score in Hs:
            # How often each plaintext letter is already used by the other
            # symbols; computed once per hypothesis instead of once per letter.
            usage = Counter(v for k, v in phi.items() if k != f)
            for e in Ve:
                if usage[e] + 1 > ext_limits:
                    continue
                # Keys and values are plain strings, so a shallow copy is an
                # independent mapping; it is made only for candidates that
                # pass the limit check.
                phi_prime = phi.copy()
                phi_prime[f] = e
                Ht.append((phi_prime, score(cipher, phi, f, e, previous_score)))
        if not Ht:
            raise ValueError(
                'no plaintext letter left for symbol {!r} with ext_limits={}'.format(f, ext_limits)
            )
        # prune the histogram (sorted is stable, so ties keep insertion order)
        Hs = sorted(Ht, key=lambda x:x[1], reverse=True)[:topn]
        cardinality += 1
        print('Current score: ', Hs[0][1])
    return sorted(Hs, key=lambda x:x[1], reverse=True)

In [None]:
%%time
# Beam search with at most 6 cipher symbols per plaintext letter and a beam of 1000 hypotheses.
mappings = beam_search(cipher_desc['content'], ext_order[0][0], 6, 1000)

Number of unique symbols in cipher: 54
Working on symbol:  B (1)
Current score:  -13.532368399999998
Working on symbol:  ∑ (2)
Current score:  -23.8797244
Working on symbol:  P (3)
Current score:  -32.587300200000016
Working on symbol:  º (4)
Current score:  -45.770777300000006
Working on symbol:  A (5)
Current score:  -54.33727280000004
Working on symbol:  ∫ (6)
Current score:  -67.97915410000006
Working on symbol:  / (7)
Current score:  -73.70008026800001
Working on symbol:  Z (8)
Current score:  -81.31696976800004
Working on symbol:  ∆ (9)
Current score:  -83.74518066999998
Working on symbol:  u (10)
Current score:  -95.12777883000005
Working on symbol:  R (11)
Current score:  -105.95487933000007
Working on symbol:  O (12)
Current score:  -113.61255463000008
Working on symbol:  À (13)
Current score:  -122.6735097300001
Working on symbol:  V (14)
Current score:  -129.7293219300001
Working on symbol:  – (15)
Current score:  -135.9829850300001
Working on symbol:  √ (16)
Current score: 

In [None]:
# Decode the cipher with the best-scoring mapping (hypotheses are returned best first).
mapping = mappings[0][0]
decipher_text = ''
for char in cipher_desc['content']:
    decipher_text += mapping[char]
print(decipher_text)
# Language-model score of the decipherment and its length.
print('score', lm.score_seq(decipher_text))
print(len(decipher_text))

# Fraction of symbols that differ from the reference decipherment.
print(symbol_error_rate(decipher_text, 'data/_ref_Zodiac_408.txt'))

## Multi-processing

In [None]:
def work(Hs, f=None, Ve=None, ext_limits=1, cipher=None):
    '''
    extends every hypothesis in Hs with one more cipher symbol; this is the
    unit of work handed to a parallel worker

    Hs -- iterable of (mapping, score) tuples
    f -- single-character string, the cipher symbol to map (required)
    Ve -- list of plaintext letters to try, defaults to 'a'..'z'
    ext_limits -- int, maximum number of cipher symbols per plaintext letter
    cipher -- list of single character strings, the cipher sequence (required)

    returns a list of (extended_mapping, score) tuples

    The symbol, alphabet, limit and cipher are explicit arguments because
    names local to beam_search_mp are not visible from this function.
    '''
    if f is None or cipher is None:
        raise ValueError('work() needs the cipher symbol `f` and the `cipher` sequence')
    if Ve is None:
        Ve = [chr(i) for i in range(97, 123, 1)]
    ret = []
    for phi, previous_score in Hs:
        # letters already used by the other symbols of this hypothesis
        usage = Counter(v for k, v in phi.items() if k != f)
        for e in Ve:
            if usage[e] + 1 > ext_limits:
                continue
            phi_prime = phi.copy()
            phi_prime[f] = e
            # same incremental scoring call as in beam_search
            ret.append((phi_prime, score(cipher, phi, f, e, previous_score)))
    return ret

def grouper(n, iterable):
    '''
    yields consecutive chunks of `iterable` as tuples of at most n items;
    the last chunk may be shorter, and nothing is yielded for empty input
    '''
    it = iter(iterable)
    while True:
        chunk = tuple(itertools.islice(it, n))
        if not chunk:
            return
        yield chunk

def beam_search_mp(cipher, ext_order, ext_limits=1, topn=1):
    '''
    finds the mappings between cipher char and plaintext char, like
    beam_search, but spreads the extension of the beam over several jobs

    cipher -- list of single character strings, the cipher symbol sequence
    ext_order -- list of cipher symbols in the order in which they are mapped
    ext_limits -- int, defines maximum number of cipher char can be mapped to a plaintext char
    topn -- int, defines the number of dictionaries we want to keep while pruning

    returns the surviving hypotheses as a list of (mapping, score) tuples,
    best score first

    raises ValueError when no plaintext letter can be assigned to a symbol
    without exceeding ext_limits

    Parallel execution uses joblib when it is installed; otherwise the chunks
    are processed one after the other in this process.
    '''
    # The notebook's top-level joblib import is commented out, so the names
    # are brought into scope here.
    try:
        from joblib import Parallel, delayed
    except ImportError:
        Parallel = delayed = None
        print('joblib is not available; running the jobs sequentially')

    print('Number of unique symbols in cipher:', len(ext_order))
    # mapping relationships already found with score
    Hs = [(defaultdict(dict), 0)]
    # initialize the cardinality (number of unique cipher text)
    cardinality = 0
    # list of plaintext characters
    Ve = [chr(i) for i in range(97, 123, 1)]

    N_JOBS = 3
    # NOTE(review): score() reads the notebook-level `lm`; with process-based
    # workers it has to be available in every worker — confirm the cost.
    p = Parallel(n_jobs=N_JOBS, verbose=2) if Parallel is not None else None

    while cardinality < len(ext_order):
        f = ext_order[cardinality]
        print('Working on symbol: ', f, f'({cardinality+1})')
        # split the beam into at most N_JOBS chunks of (almost) equal size
        jobs = list(grouper(math.ceil(len(Hs) / N_JOBS), Hs))

        print(f"Num of jobs {len(jobs)} w {len(jobs[0])}")
        if p is None:
            Hts = [work(job, f=f, Ve=Ve, ext_limits=ext_limits, cipher=cipher) for job in jobs]
        else:
            Hts = p(
                delayed(work)(job, f=f, Ve=Ve, ext_limits=ext_limits, cipher=cipher)
                for job in jobs
            )
        # flatten the per-job results, keeping the job order
        Ht = [hyp for chunk in Hts for hyp in chunk]
        if not Ht:
            raise ValueError(
                'no plaintext letter left for symbol {!r} with ext_limits={}'.format(f, ext_limits)
            )

        # prune the histogram (sorted is stable, so ties keep insertion order)
        Hs = sorted(Ht, key=lambda x:x[1], reverse=True)[:topn]
        cardinality += 1
        print('Current score: ', Hs[0][1])
    return sorted(Hs, key=lambda x:x[1], reverse=True)

In [None]:
%%time
# Parallel beam search: at most 8 cipher symbols per plaintext letter, beam of 5000 hypotheses.
mappings = beam_search_mp(cipher_desc['content'], ext_order[0][0], 8, 5000)

In [None]:
# Decode the cipher with the best-scoring mapping of the latest search
# (`mappings` is overwritten by whichever search cell ran last).
mapping = mappings[0][0]
decipher_text = ''
for char in cipher_desc['content']:
    decipher_text += mapping[char]
print(decipher_text)
# Language-model score of the decipherment and its length.
print('score', lm.score_seq(decipher_text))
print(len(decipher_text))

# Fraction of symbols that differ from the reference decipherment.
print(symbol_error_rate(decipher_text, 'data/_ref_Zodiac_408.txt'))

In [None]:
# Score the reference decipherment itself, for comparison with the scores above.
# NOTE(review): the file is opened with the platform default encoding (read_file
# uses utf8) and the text is scored unstripped, so a trailing newline would be
# included in both the score and the length — confirm this is intended.
with open('data/_ref_Zodiac_408.txt', 'r') as fh:
    ground_truth = fh.read()
print(ground_truth)
print('score', lm.score_seq(ground_truth))
print(len(ground_truth))

Notice that the default solution provides a very bad decipherment. Your job is to make it better!

## Grading

Ignore the following cells. They are for grading against the reference decipherment. Based on the clues provided in the decipherment homework description, you can easily find a reasonable reference text online for this cipher text.

In [None]:
"""
ATTENTION!
For grading purposes only. Don't bundle with the assignment. 
Make sure '_ref.txt' is removed from the 'data' directory before publishing.
"""

def read_gold(gold_file):
    """Read the reference decipherment and return it as a list of characters.

    gold_file -- path of the reference text file

    The file is decoded as UTF-8 explicitly, so the result does not depend on
    the platform's default encoding. Leading and trailing whitespace is
    stripped before the text is split into single characters.
    """
    # The context manager closes the file; no separate close() call is needed.
    with open(gold_file, 'r', encoding='utf8') as f:
        gold = f.read()
    return list(gold.strip())

def symbol_error_rate(dec, _gold):
    """Return the fraction of reference symbols that `dec` gets wrong.

    dec   -- deciphered text (string or list of single characters)
    _gold -- path of the reference file, loaded with `read_gold`

    Returns a float between 0.0 (all symbols match) and 1.0.
    When `dec` and the reference differ in length, no position is compared and
    the result is 1.0; a warning is issued so that a formatting problem is not
    mistaken for a completely wrong decipherment.

    Raises ValueError when the reference is empty (the rate is undefined).
    """
    import warnings  # local import so the cell does not depend on other cells

    gold = read_gold(_gold)
    if not gold:
        raise ValueError("reference decipherment {!r} is empty".format(_gold))

    if len(gold) != len(dec):
        warnings.warn(
            "decipherment has {} symbols but the reference has {}; "
            "reporting 100% error".format(len(dec), len(gold))
        )
        return 1.0

    correct = sum(1 for d, g in zip(dec, gold) if d == g)
    return (len(gold) - correct) / len(gold)

In [None]:
# gold decipherment
# The decoding cells above store their result in `decipher_text`; the name
# `decipherment` is never defined in this notebook and raised a NameError here.
# NOTE(review): this cell reads "data/_ref.txt" while the decoding cells use
# 'data/_ref_Zodiac_408.txt' — confirm which reference file is intended.
gold_file = "data/_ref.txt"
ser = symbol_error_rate(decipher_text, gold_file)
print('Error: ', ser*100, 'Accuracy: ', (1-ser)*100)