# Homework: Decipherment

In [1]:
%load_ext autoreload
%autoreload 2
from collections import defaultdict, Counter
import collections
import pprint
import math
import bz2
from ngram import *
import sys, string
import copy
import pickle
from joblib import Parallel, delayed
import itertools
pp = pprint.PrettyPrinter(width=45, compact=True)

First let us read in the cipher text from the `data` directory:

In [2]:
def read_file(filename):
    '''
    reads a text file and returns its whole content as one string
    filename -- str (or path-like), file to read; names ending in ".bz2" are
                decompressed with bz2, everything else is opened as plain text
    Both kinds of file are decoded as UTF-8.
    '''
    # bz2.open and the builtin open share the same text-mode interface,
    # so only the opener has to be selected.
    opener = bz2.open if str(filename).endswith(".bz2") else open
    # The with-block closes the handle on exit; no explicit close() is needed.
    with opener(filename, 'rt', encoding='utf8') as f:
        return f.read()

# Raw cipher text, line breaks included; get_statistics() strips them later.
cipher = read_file("data/cipher.txt")
print(cipher)

º∫P/Z/uB∫ÀOR•–X•B
WV+≈GyF∞ºHPπKÇ—y≈
MJy^uIÀΩ—T‘NQyDµ£
S¢/º∑BPORAu∫∆RÃ—E
À^LMZJƒ“\–FHVW≈æy
π+—GDºKI£∞—Xæµ§S¢
RN‘IyEÃOæ—GBTQS∑B
Lƒ/P∑BπX—EHMu^RRÀ
√ZK—–I£W—ÇæµLM“º∑
BPDR+j•∞\N¢≈EuHÀF
Z√–OVWIµ+‘L£Ã^R∞H
IºDR∏Ty“\ƒ≈/πXJQA
PµMæRu‘∫L£NVEKH•G
“IÇJÀµºæLMÃNA£Z¢P
§u–ÀAº∑BVW\+VT‘OP
^•S“Ã∆u≈∞ΩD§G∫∫IM
NÀ£S√E/º∫∫Z∆AP∑BV
–≈X—W—∏F∑æ√+πºAºB
∫OTµRu√+∏ƒy—∏^S—W
VZ≈GyKE∏TyAº∫∑L‘∏
HÇFBXº§XADƒ\ΩLÇ•—
∏≈ƒ∑∑∞≈µPORXQF∫G√
ZπJT‘—∏æJI+“BPQW∞
VEX“ºWI∞—EHM£•uIÀ


For the default solution we need to compute statistics like length, number of symbols/letters, 
unique occurrences, frequencies and relative frequencies of a given file. This is done in the function `get_statistics` below.

While using `get_statistics`, make sure that `cipher=True` is set when the input is a ciphertext.

In [3]:
def get_statistics(content, cipher=True):
    '''
    collects basic symbol statistics of a text, ignoring newlines and spaces
    content -- str (or iterable of single character strings), text to analyse
    cipher -- bool, when True the filtered symbol list itself is also returned
              under the key 'content'
    returns a dict with the keys 'length', 'vocab', 'vocab_length',
    'frequencies' (a Counter) and 'relative_freq' (percentages), preceded by
    'content' when cipher is True
    '''
    kept = [ch for ch in list(content) if not (ch == '\n' or ch == ' ')]
    total = len(kept)
    distinct = set(kept)
    counts = collections.Counter(kept)
    # share of every symbol in percent of the filtered text
    percentages = {sym: (n/total)*100 for sym, n in counts.items()}

    summary = {'content': kept} if cipher else {}
    summary.update({
        'length': total,
        'vocab': list(distinct),
        'vocab_length': len(distinct),
        'frequencies': counts,
        'relative_freq': percentages,
    })
    return summary

In [4]:
"""
ATTENTION!
For grading purposes only. Don't bundle with the assignment. 
Make sure '_ref.txt' is removed from the 'data' directory before publishing.
"""

def read_gold(gold_file):
    '''
    reads the reference decipherment and returns it as a list of characters
    gold_file -- str, path of the reference (gold) plaintext file
    Leading and trailing whitespace is stripped before splitting.
    '''
    # Decode explicitly as UTF-8 (as read_file does) instead of relying on
    # the platform default encoding.
    with open(gold_file, encoding='utf8') as f:
        # The with-block closes the file; no trailing close() is required.
        return list(f.read().strip())

def symbol_error_rate(dec, _gold):
    '''
    returns the fraction of positions where dec disagrees with the reference
    dec -- sequence of characters, the proposed decipherment
    _gold -- str, path of the reference file (loaded through read_gold)
    When dec and the reference differ in length no position is counted as
    correct, so the returned rate is 1.0.
    '''
    reference = read_gold(_gold)
    total = len(reference)
    matches = 0
    if total == len(dec):
        matches = sum(1 for d, g in zip(dec, reference) if d == g)
    return (total - matches) / total

In [5]:
# Symbol statistics of the cipher text; cipher=True also keeps the filtered
# symbol list under the 'content' key, which the search functions iterate over.
cipher_desc = get_statistics(cipher, cipher=True)
#pp.pprint(cipher_desc)

## Load the 6-gram model

In [6]:
%%time
# Sample English text that a later cell scores to sanity-check the language model.
sequence = 'In a few cases, a multilingual artifact has been necessary to facilitate decipherment, the Rosetta Stone being the classic example. Statistical techniques provide another pathway to decipherment, as does the analysis of modern languages derived from ancient languages in which undeciphered texts are written. Archaeological and historical information is helpful in verifying hypothesized decipherments.'

# Load the language model with n=6; the commented-out line is the same call with verbose=True.
# lm = LM("data/6-gram-wiki-char.lm.bz2", n=6, verbose=True)
lm = LM("data/6-gram-wiki-char.lm.bz2", n=6, verbose=False)

Reading language model from data/6-gram-wiki-char.lm.bz2...


Wall time: 1min 28s


Done.


In [7]:
# sys.getsizeof is shallow: this measures only the LM object itself,
# not the tables it references.
from sys import getsizeof
getsizeof(lm)

56

In [12]:
# Shallow size of the lm.table container in MiB (bytes / 1024 / 1024); its contents are not included.
getsizeof(lm.table) / 1024 / 1024

320.0000915527344

In [13]:
# Check that the language model object can be deep-copied
# (presumably a probe for handing it to worker processes — TODO confirm).
from copy import deepcopy
deepcopy(lm)

<ngram.LM at 0x3880c358>

In [7]:
# Score the sample text, then compare an English phrase with a random string
# of the same length; the log-probabilities go to stderr.
print(sequence)
lm_logprob = lm.score_seq(sequence)
print("TOTAL LM LOGPROB: {}".format(lm_logprob), file=sys.stderr)

print("TOTAL LM LOGPROB: {}".format(lm.score_seq('this is the text.')), file=sys.stderr)
print("TOTAL LM LOGPROB: {}".format(lm.score_seq('jasbklfhthejkldhf')), file=sys.stderr)

# Bitstring helpers: 'o' marks positions to score and '.' positions to skip
# (this is how score() below builds its bitstrings).
print(lm.get_bitstring_spans('..oo...ooo..'))
print(lm.score_bitstring('thisisatest', 'oo...oo.ooo'))

In a few cases, a multilingual artifact has been necessary to facilitate decipherment, the Rosetta Stone being the classic example. Statistical techniques provide another pathway to decipherment, as does the analysis of modern languages derived from ancient languages in which undeciphered texts are written. Archaeological and historical information is helpful in verifying hypothesized decipherments.
{2: 3, 3: 4, 7: 8, 8: 9, 9: 10}
-8.10905897


TOTAL LM LOGPROB: -221.09434842188
TOTAL LM LOGPROB: -9.76947916
TOTAL LM LOGPROB: -40.57683077


## Implementation for Reference 3

In [8]:
def find_sharp_n(cipher_desc, symbols_found, n_order):
    '''
    finds the #n for order n_order, i.e. the number of windows of n_order
    consecutive cipher symbols that consist only of symbols already placed
    cipher_desc -- cipher statistics (only cipher_desc['content'] is read)
    symbols_found -- list of single character string,
                     specifies the list of symbols have been placed in the extension order
    n_order -- int, specifies the order of n-gram
    returns -- int, the window count; 0 when n_order is smaller than 1 or
               larger than the length of the text
    '''
    if n_order < 1:
        return 0
    # set membership is O(1); the list passed in grows to the full vocabulary
    found = set(symbols_found)
    sharp_n = 0
    # length of the current run of consecutive already-placed symbols
    run_length = 0
    for symbol in cipher_desc['content']:
        run_length = run_length + 1 if symbol in found else 0
        # a run of at least n_order symbols means the window ending here is fully placed
        if run_length >= n_order:
            sharp_n += 1
    return sharp_n

In [9]:
# test the function above: count the bigrams made up only of '—' and 'º'
sharp_n = find_sharp_n(cipher_desc, ['—', 'º'], 2)
print(sharp_n)

0


In [32]:
def find_ext_order(cipher_desc, topn=100, weights=[1,1,1,1,2,3]):
    '''
    finds the best order of deciphering cipher symbols (find best extension order)
    by beam search: every kept ordering is extended by each unused symbol and
    the topn best-scoring extensions survive to the next round
    cipher_desc -- cipher statistics ('content', 'frequencies' and 'vocab_length' are read)
    topn -- int, number of best trees we want to keep during iteration
    weights -- list of int, weight for #n; weights[0] applies to #1, weights[1]
               to #2 and so on (the default covers n from 1 to 6); never modified
    returns -- list of (ordering, score) tuples sorted by score DESC, where
               ordering is a list of cipher symbols
    raises ValueError when no ordering survives a round (e.g. topn < 1)
    '''
    # candidate symbols, most frequent first; ties keep the order of the frequency table
    Ve = sorted(cipher_desc['frequencies'], key=cipher_desc['frequencies'].get, reverse=True)
    # orderings found so far, with their score
    Hs = [([], 0)]
    # number of symbols placed so far
    cardinality = 0
    while cardinality < cipher_desc['vocab_length']:
        # extended orderings with their score
        Ht = []
        # the score of an ordering is recomputed from its symbols only, so the
        # score of the shorter ordering is not needed
        for phi, _ in Hs:
            for e in Ve:
                if e in phi:
                    continue
                # a flat list of strings: concatenation already yields an independent copy
                phi_prime = phi + [e]
                this_score = sum(
                    weight * find_sharp_n(cipher_desc, phi_prime, order)
                    for order, weight in enumerate(weights, start=1)
                )
                Ht.append((phi_prime, this_score))
        # prune the histogram (stable sort: ties keep their generation order)
        Hs = sorted(Ht, key=lambda x:x[1], reverse=True)[:topn]
        cardinality += 1
        if not Hs:
            raise ValueError(
                'no extension order left after {} symbols (topn={}, {} candidate symbols)'.format(
                    cardinality, topn, len(Ve)))
        print('Done with symbol number', cardinality, '; Current best score: ', Hs[0][1])
    return sorted(Hs, key=lambda x:x[1], reverse=True)

In [33]:
%%time
# test the function above; ext_order[0][0] (the best ordering) is what the
# beam-search cells below pass in as their extension order
ext_order = find_ext_order(cipher_desc, topn=100, weights=[1,1,1,1,2,3])

Done with symbol number 1 ; Current best score:  16
Done with symbol number 2 ; Current best score:  31
Done with symbol number 3 ; Current best score:  50
Done with symbol number 4 ; Current best score:  72
Done with symbol number 5 ; Current best score:  96
Done with symbol number 6 ; Current best score:  122
Done with symbol number 7 ; Current best score:  144
Done with symbol number 8 ; Current best score:  176
Done with symbol number 9 ; Current best score:  206
Done with symbol number 10 ; Current best score:  237
Done with symbol number 11 ; Current best score:  268
Done with symbol number 12 ; Current best score:  315
Done with symbol number 13 ; Current best score:  369
Done with symbol number 14 ; Current best score:  396
Done with symbol number 15 ; Current best score:  428
Done with symbol number 16 ; Current best score:  455
Done with symbol number 17 ; Current best score:  485
Done with symbol number 18 ; Current best score:  516
Done with symbol number 19 ; Current best 

## Baseline

In [7]:
# def reverse_mapping(reversed_mapping):
#     mapping = dict()
#     for key, values in reversed_mapping.items():
#         for value in values:
#             mapping[value] = key
#     return mapping

In [9]:
def score(phi, cipher, lm):
    '''
    scores a partial decipherment of cipher with the language model
    phi -- dictionary, mapping cipher symbol -> plaintext letter
    cipher -- list of single character string, the cipher text
    lm -- language model, must provide score_bitstring(sequence, bitstring)
    Mapped symbols are replaced by their plaintext letter and flagged 'o' in
    the bitstring; unmapped symbols are kept as they are and flagged '.'.
    returns whatever lm.score_bitstring returns for that pair
    '''
    rendered = []
    mask = []
    for symbol in cipher:
        known = symbol in phi
        rendered.append(phi[symbol] if known else symbol)
        mask.append('o' if known else '.')
    return lm.score_bitstring(''.join(rendered), ''.join(mask))
def beam_search_old(cipher, ext_order, ext_limits=1, topn=1):
    '''
    finds the mappings between cipher char and plaintext char by beam search,
    re-scoring every candidate mapping from scratch with score() and the
    module-level language model lm
    cipher -- list of single character string, the cipher text
    ext_order -- list, the cipher symbols in the order in which they get fixed
    ext_limits -- int, defines maximum number of cipher char can be mapped to a plaintext char
    topn -- int, defines the number of dictionaries we want to keep while pruning
    returns -- list of (mapping, score) tuples sorted by score DESC, where
               mapping maps cipher symbol -> plaintext letter
    raises ValueError when no admissible mapping is left, e.g. because
    ext_limits * 26 is smaller than the number of cipher symbols
    '''
    print('Number of unique symbols in cipher:', len(ext_order))
    # mapping relationships already found with score
    Hs = [(defaultdict(dict), 0)]
    # list of plaintext characters ('a' .. 'z')
    Ve = [chr(i) for i in range(97, 123, 1)]
    for cardinality, f in enumerate(ext_order):
        print('Working on symbol: ', f, f'({cardinality+1})')
        # hypothesis mapping relationships with score
        Ht = []
        for phi, previous_score in Hs:
            for e in Ve:
                # number of cipher symbols mapped to e once f -> e is added
                counts = 1 + sum(1 for k, v in phi.items() if v == e and k != f)
                if counts <= ext_limits:
                    # values are immutable strings, so a shallow copy is enough;
                    # copy() keeps the defaultdict type
                    phi_prime = phi.copy()
                    phi_prime[f] = e
                    Ht.append((phi_prime, score(phi_prime, cipher, lm)))
        # prune the histogram (stable sort: ties keep their generation order)
        Hs = sorted(Ht, key=lambda x:x[1], reverse=True)[:topn]
        if not Hs:
            raise ValueError(
                'no mapping left after symbol {!r} ({}): ext_limits={} and topn={} '
                'admit no extension'.format(f, cardinality + 1, ext_limits, topn))
        print('Current score: ', Hs[0][1])
    return sorted(Hs, key=lambda x:x[1], reverse=True)

In [10]:
def work(Hs):
    '''
    extends every hypothesis in Hs by mapping the current symbol f to each
    plaintext letter and scores the admissible extensions
    Hs -- iterable of (mapping, score) tuples
    returns -- list of (extended mapping, score) tuples

    NOTE(review): Ve, f, ext_limits, cipher, lm and score are not parameters;
    they are looked up in the module namespace. In the cells visible here Ve,
    f and ext_limits are only bound as locals of the beam-search functions,
    and the module-level cipher is the raw text rather than the symbol list
    -- confirm these names exist with the intended values before calling.
    '''
    extended = []
    for hypothesis, _unused_score in Hs:
        for e in Ve:
            candidate = copy.deepcopy(hypothesis)
            candidate.update({f: e})
            # number of cipher symbols that map to e after the extension
            uses_of_e = sum(1 for value in candidate.values() if value == e)
            if uses_of_e <= ext_limits:
                extended.append((candidate, score(candidate, cipher, lm)))
    return extended

In [13]:
def grouper(n, iterable):
    '''
    yields successive tuples of at most n items taken from iterable
    n -- int, maximum chunk size
    iterable -- any iterable
    The last tuple may be shorter than n; nothing is yielded for an empty
    iterable or for n == 0.
    '''
    source = iter(iterable)
    # iter(callable, sentinel) keeps slicing until an empty tuple comes back
    for chunk in iter(lambda: tuple(itertools.islice(source, n)), ()):
        yield chunk

In [19]:
def _extend_chunk(chunk, f, Ve, ext_limits, cipher, lm):
    # Worker task: extend every (mapping, score) pair of one chunk with f -> e
    # for each plaintext letter e and score the admissible results. All state
    # is passed in explicitly so the task does not depend on module globals.
    ret = []
    for phi, previous_score in chunk:
        for e in Ve:
            # number of cipher symbols mapped to e once f -> e is added
            counts = 1 + sum(1 for k, v in phi.items() if v == e and k != f)
            if counts <= ext_limits:
                # values are immutable strings, so a shallow copy is enough
                phi_prime = phi.copy()
                phi_prime[f] = e
                ret.append((phi_prime, score(phi_prime, cipher, lm)))
    return ret


def beam_search_mp(cipher, ext_order, ext_limits=1, topn=1):
    '''
    finds the mappings between cipher char and plaintext char by beam search,
    splitting the beam into chunks that are extended and scored in parallel
    with joblib
    cipher -- list of single character string, the cipher text
    ext_order -- list, the cipher symbols in the order in which they get fixed
    ext_limits -- int, defines maximum number of cipher char can be mapped to a plaintext char
    topn -- int, defines the number of dictionaries we want to keep while pruning
    returns -- list of (mapping, score) tuples sorted by score DESC, where
               mapping maps cipher symbol -> plaintext letter
    raises ValueError when no admissible mapping is left, e.g. because
    ext_limits * 26 is smaller than the number of cipher symbols
    '''
    print('Number of unique symbols in cipher:', len(ext_order))
    # mapping relationships already found with score
    Hs = [(defaultdict(dict), 0)]
    # list of plaintext characters ('a' .. 'z')
    Ve = [chr(i) for i in range(97, 123, 1)]

    N_JOBS = 3
    p = Parallel(n_jobs=N_JOBS, verbose=2, )

    for cardinality, f in enumerate(ext_order):
        print('Working on symbol: ', f, f'({cardinality+1})')
        # at most N_JOBS chunks of (almost) equal size
        jobs = list(grouper(math.ceil(len(Hs) / N_JOBS), Hs))

        print(f"Num of jobs {len(jobs)} w {len(jobs[0])}")
        # NOTE(review): the module-level lm is handed to every task; for a
        # large model the transfer to the workers is presumably expensive
        # -- confirm before relying on the parallel version.
        Hts = p(delayed(_extend_chunk)(job, f, Ve, ext_limits, cipher, lm) for job in jobs)
        # hypothesis mapping relationships with score, in chunk order
        Ht = [e for l in Hts for e in l]

        # prune the histogram (stable sort: ties keep their generation order)
        Hs = sorted(Ht, key=lambda x:x[1], reverse=True)[:topn]
        if not Hs:
            raise ValueError(
                'no mapping left after symbol {!r} ({}): ext_limits={} and topn={} '
                'admit no extension'.format(f, cardinality + 1, ext_limits, topn))
        print('Current score: ', Hs[0][1])
    return sorted(Hs, key=lambda x:x[1], reverse=True)

In [None]:
%%time
# Parallel beam search over the best extension order: ext_limits=8, topn=5000.
mappings = beam_search_mp(cipher_desc['content'], ext_order[0][0], 8, 5000)

Number of unique symbols in cipher: 54
Working on symbol:  — (1)
Num of jobs 1 w 1


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


In [None]:
%%time
# Sequential beam search with full re-scoring: ext_limits=8, topn=5000.
mappings = beam_search_old(cipher_desc['content'], ext_order[0][0], 8, 5000)

In [49]:
# Decode the cipher with the best-scoring mapping (first entry of the sorted result).
mapping = mappings[0][0]
decipher_text = ''
for char in cipher_desc['content']:
    decipher_text += mapping[char]
print(decipher_text)
# Score of the complete decipherment under the language model, and its length.
print('score', lm.score_seq(decipher_text))
print(len(decipher_text))

teenintheareseashoreesisattentitieaairtraittogsinsearnttheresteneatearraiaitbestroesinetsnttreatassaaregoriearstshtsathrinethnatetatreeasittereotissratttheneedsabgreettasiserrorseoreareatrtneititbiennaassesasetoeregrettsstriaastsraagseireateastthrobertorersatanteainaseeragaeasenteeinsethreeatotistssentsthertsetseiiitiratoriesiteitistetroitishataasnibiristieittaesereassessinatotisarethesoareattoratetaestra
score -499.1032521029998
408


In [None]:
# Same decoding steps as the previous cell, applied to whatever `mappings` currently holds.
mapping = mappings[0][0]
decipher_text = ''
for char in cipher_desc['content']:
    decipher_text += mapping[char]
print(decipher_text)
# Score of the complete decipherment under the language model, and its length.
print('score', lm.score_seq(decipher_text))
print(len(decipher_text))

In [50]:
# Fraction of wrongly deciphered symbols compared with the reference text.
symbol_error_rate(decipher_text, 'data/_ref_Zodiac_408.txt')

0.8602941176470589

## new score function (should be more efficient)

In [85]:
def score_new(cipher, phi, new_f, new_e, previous_score):
    '''
    scores the extension of phi by new_f -> new_e incrementally: starts from
    previous_score and adds the log-probability of every occurrence of new_f
    read as new_e; uses the module-level language model lm
    cipher -- list of single character string
    phi -- dictionary, old mapping cipher symbol -> plaintext letter (without new_f)
    new_f -- single-character string, extended symbol
    new_e -- single-character string, plaintext letter proposed for new_f
    previous_score -- float, old score for phi
    returns -- previous_score plus the log-probabilities added for new_f
    Symbols already in phi are not re-scored even when their context changes.
    '''
    mapping = phi
    new_score = previous_score
    lm_state = lm.begin()
    for i in range(len(cipher)):
        char = cipher[i]
        if char in mapping.keys():
            # already-mapped symbol: only advance the LM state, add nothing to the score
            token = mapping[char]
            ngram = lm_state + (token,)
            while len(ngram)> 0:
                if ngram in lm.table:
                    # keep the last lm.history tokens as the new context
                    lm_state = ngram[-lm.history:]
                    break
                else: # back off: drop the oldest token and retry with the shorter n-gram
                    ngram = ngram[1:]
            if len(ngram)==0:
                # not even the single token is in the table: continue without context
                lm_state = ()
        elif char == new_f:
            # occurrence of the newly extended symbol: score new_e in the current context
            (lm_state, logprob) = lm.score(lm_state, new_e)
            new_score += logprob
        else:
            # still undeciphered symbol: it breaks the context
            lm_state = ()  
    return new_score

In [86]:
# Debugging cell: scores a sequence token by token the way a bitstring is
# scored, printing the LM state before and after every token.
# Note that this rebinds `sequence`, which the LM loading cell set to the sample text.
sequence = 'thisisatest'
bitstring = 'ooooooooooo'
# alternative mask; not used anywhere in this cell
bitstring_ = 'oo..ooooooo'

spans = lm.get_bitstring_spans(bitstring)
# positions outside the spans are replaced by the placeholder '\t'
seq_by_bits = [ sequence[i] if i in spans else '\t' for i in range(len(sequence)) ]
lm_state = lm.begin()
lm_logprob = 0.0 
for token in list(seq_by_bits):
    if token == '\t': # should we skip this token?
        # a skipped position resets the context
        lm_state = ()
        continue
    print('previous lm_state', lm_state)
    (lm_state, logprob) = lm.score(lm_state, token)
    lm_logprob += logprob

    print('lm_state', lm_state)
    print('logrob2222', logprob)
lm_logprob += lm.end(lm_state)
print(lm_logprob)
# values noted down by the author (inputs not recorded):
# -11.05281791
# -18.7161607
# -18.7161607

previous lm_state ('<s>',)
lm_state ('<s>', 't')
logrob2222 -0.7213777
previous lm_state ('<s>', 't')
lm_state ('<s>', 't', 'h')
logrob2222 -0.06716067
previous lm_state ('<s>', 't', 'h')
lm_state ('<s>', 't', 'h', 'i')
logrob2222 -1.644994
previous lm_state ('<s>', 't', 'h', 'i')
lm_state ('<s>', 't', 'h', 'i', 's')
logrob2222 -0.08672674
previous lm_state ('<s>', 't', 'h', 'i', 's')
lm_state ('t', 'h', 'i', 's', 'i')
logrob2222 -0.9207457
previous lm_state ('t', 'h', 'i', 's', 'i')
lm_state ('h', 'i', 's', 'i', 's')
logrob2222 -0.2430651
previous lm_state ('h', 'i', 's', 'i', 's')
lm_state ('i', 's', 'i', 's', 'a')
logrob2222 -0.7108436
previous lm_state ('i', 's', 'i', 's', 'a')
lm_state ('s', 'i', 's', 'a', 't')
logrob2222 -1.269513
previous lm_state ('s', 'i', 's', 'a', 't')
lm_state ('i', 's', 'a', 't', 'e')
logrob2222 -1.071304
previous lm_state ('i', 's', 'a', 't', 'e')
lm_state ('s', 'a', 't', 'e', 's')
logrob2222 -1.071436
previous lm_state ('s', 'a', 't', 'e', 's')
lm_state 

In [87]:
def beam_search(cipher, ext_order, ext_limits=1, topn=1):
    '''
    finds the mappings between cipher char and plaintext char by beam search,
    scoring every extension incrementally with score_new()
    cipher -- list of single character string, the cipher text
    ext_order -- list, the cipher symbols in the order in which they get fixed
    ext_limits -- int, defines maximum number of cipher char can be mapped to a plaintext char
    topn -- int, defines the number of dictionaries we want to keep while pruning
    returns -- list of (mapping, score) tuples sorted by score DESC, where
               mapping maps cipher symbol -> plaintext letter
    raises ValueError when no admissible mapping is left, e.g. because
    ext_limits * 26 is smaller than the number of cipher symbols
    '''
    print('Number of unique symbols in cipher:', len(ext_order))
    # mapping relationships already found with score
    Hs = [(defaultdict(dict), 0)]
    # list of plaintext characters ('a' .. 'z')
    Ve = [chr(i) for i in range(97, 123, 1)]
    for cardinality, f in enumerate(ext_order):
        print('Working on symbol: ', f, f'({cardinality+1})')
        # hypothesis mapping relationships with score
        Ht = []
        for phi, previous_score in Hs:
            for e in Ve:
                # number of cipher symbols mapped to e once f -> e is added
                counts = 1 + sum(1 for k, v in phi.items() if v == e and k != f)
                if counts <= ext_limits:
                    # values are immutable strings, so a shallow copy is enough;
                    # copy() keeps the defaultdict type
                    phi_prime = phi.copy()
                    phi_prime[f] = e
                    # score_new takes the OLD mapping plus the new pair f -> e
                    Ht.append((phi_prime, score_new(cipher, phi, f, e, previous_score)))
        # prune the histogram (stable sort: ties keep their generation order)
        Hs = sorted(Ht, key=lambda x:x[1], reverse=True)[:topn]
        if not Hs:
            raise ValueError(
                'no mapping left after symbol {!r} ({}): ext_limits={} and topn={} '
                'admit no extension'.format(f, cardinality + 1, ext_limits, topn))
        print('Current score: ', Hs[0][1])
    return sorted(Hs, key=lambda x:x[1], reverse=True)

In [89]:
# Alternative extension order: the distinct cipher symbols ordered by their
# last occurrence, starting from the end of the text. It is only referenced
# by a commented-out beam_search call further down.
right_most_cipher_vocab = list()
for cipher_char in reversed(cipher_desc['content']):
    if cipher_char not in right_most_cipher_vocab:
        right_most_cipher_vocab.append(cipher_char)

In [90]:
# with open('ext_order.pkl', 'wb') as fh:
#     pickle.dump(ext_order, fh)

In [12]:
# Restore a previously saved extension order instead of recomputing it
# (the cell that writes ext_order.pkl is commented out above).
# NOTE(review): pickle.load can execute arbitrary code; only load a file you created yourself.
with open('ext_order.pkl', 'rb') as fh:
    ext_order = pickle.load(fh, encoding='utf8')

In [None]:
%%time
# Frequency-sorted symbols; only used by the first commented-out call below.
sorted_keys = sorted(cipher_desc['frequencies'], key=cipher_desc['frequencies'].get, reverse=True)
#reversed_mappings = beam_search(cipher_desc['content'], sorted_keys, 3, 100)
#reversed_mappings = beam_search(cipher_desc['content'], right_most_cipher_vocab, 8, 1000)
# Run actually used: best extension order, ext_limits=8, topn=100000.
mappings = beam_search(cipher_desc['content'], ext_order[0][0], 8, 100000)

Number of unique symbols in cipher: 54
Working on symbol:  — (1)
Current score:  -14.649315199999997
Working on symbol:  ∑ (2)
Current score:  -24.38143299999999
Working on symbol:  B (3)
Current score:  -35.98365760000001
Working on symbol:  P (4)
Current score:  -44.349759600000034
Working on symbol:  º (5)
Current score:  -57.81347020000007
Working on symbol:  ∫ (6)
Current score:  -69.70347510000002
Working on symbol:  A (7)
Current score:  -76.9784770000001
Working on symbol:  / (8)
Current score:  -82.84680000000007
Working on symbol:  Z (9)
Current score:  -91.55581070000004
Working on symbol:  ∆ (10)
Current score:  -94.33437120000005
Working on symbol:  u (11)
Current score:  -104.13269230000006
Working on symbol:  O (12)
Current score:  -111.65146800000007
Working on symbol:  R (13)
Current score:  -122.1242165800001
Working on symbol:  À (14)
Current score:  -131.59201208000007
Working on symbol:  Ã (15)
Current score:  -136.28468161299998
Working on symbol:  E (16)
Current 

In [77]:
# Best-scoring mapping: first entry of the score-sorted result list.
mapping = mappings[0][0]

In [78]:
# Number of cipher symbols that received a plaintext letter.
len(mapping)

54

In [79]:
# Decode the cipher symbol by symbol with the selected mapping.
decipher_text = ''
for char in cipher_desc['content']:
    decipher_text += mapping[char]
print(decipher_text)
# Score of the complete decipherment under the language model, and its length.
print('score', lm.score_seq(decipher_text))
print(len(decipher_text))

seehihaheareastahseatsaiosteesheattaanatareaioratrrinhsthereeaeneieaanttiaoeusitesthaeaestsstroethroineoitaairhesharithtohetheteattaneearisestrsehhrttestheteamaouontaataiirsrestraitrineottstenaaeuothetareertheaietroeastasethaarshttioerineoasaesthesuaeairenaieinatortoseettoarirahseeineethesttesenithraesesherarearanoaennieseitsasanaaesettinthihtsotetourthaentottotreretriesrieaaienhataehersoeatesstoeattraata
score -597.1548610020002
408


In [80]:
# Score the reference plaintext with the same language model, for comparison
# with the score of the decipherment above.
with open('data/_ref_Zodiac_408.txt', 'r') as fh:
    ground_truth = fh.read()
print(ground_truth)
print('score', lm.score_seq(ground_truth))
print(len(ground_truth))

ilikekillingpeoplebecauseitissomuchfunitismorefunthankillingwildgameintheforrestbecausemanisthemostdangeroueanamalofalltokillsomethinggivesmethemostthrillingexperenceitisevenbetterthangettingyourrocksoffwithagirlthebestpartofitisthaewhenidieiwillbereborninparadicesndalltheihavekilledwillbecomemyslavesiwillnotgiveyoumynamebecauseyouwilltrytosloidownorstopmycollectiogofslavesformyafterlifeebeorietemethhpiti
score -359.0030393831197
408


In [81]:
# Fraction of wrongly deciphered symbols compared with the reference text.
symbol_error_rate(decipher_text, 'data/_ref_Zodiac_408.txt')

0.8848039215686274

Notice that the default solution provides a very bad decipherment. Your job is to make it better!

## Grading

Ignore the following cells. They are for grading against the reference decipherment. Based on the clues provided in the decipherment homework description, you can easily find a reasonable reference text online for this cipher text.

In [34]:
"""
ATTENTION!
For grading purposes only. Don't bundle with the assignment. 
Make sure '_ref.txt' is removed from the 'data' directory before publishing.
"""

def read_gold(gold_file):
    '''
    reads the reference decipherment and returns it as a list of characters
    gold_file -- str, path of the reference (gold) plaintext file
    Leading and trailing whitespace is stripped before splitting.
    '''
    # Decode explicitly as UTF-8 (as read_file does) instead of relying on
    # the platform default encoding.
    with open(gold_file, encoding='utf8') as f:
        # The with-block closes the file; no trailing close() is required.
        return list(f.read().strip())

def symbol_error_rate(dec, _gold):
    '''
    returns the fraction of positions where dec disagrees with the reference
    dec -- sequence of characters, the proposed decipherment
    _gold -- str, path of the reference file (loaded through read_gold)
    When dec and the reference differ in length no position is counted as
    correct, so the returned rate is 1.0.
    '''
    reference = read_gold(_gold)
    total = len(reference)
    matches = 0
    if total == len(dec):
        matches = sum(1 for d, g in zip(dec, reference) if d == g)
    return (total - matches) / total

In [None]:
# gold decipherment
# NOTE(review): the earlier cells read 'data/_ref_Zodiac_408.txt'; confirm that
# data/_ref.txt is the file present when grading.
gold_file = "data/_ref.txt"
# The decipherment is built in `decipher_text` by the cells above; the name
# `decipherment` used here before is not assigned in any of them.
ser = symbol_error_rate(decipher_text, gold_file)
print('Error: ', ser*100, 'Accuracy: ', (1-ser)*100)