# Homework: Decipherment

In [1]:
%load_ext autoreload
%autoreload 2
from collections import defaultdict, Counter
import collections
import pprint
import math
import bz2
from ngram import *
import sys, string
import copy
pp = pprint.PrettyPrinter(width=45, compact=True)

First let us read in the cipher text from the `data` directory:

In [2]:
def read_file(filename):
    '''
    reads a whole text file, decoded as UTF-8, and returns its content as one string
    filename -- str or path-like, path of the file to read;
                names ending in ".bz2" are decompressed transparently
    '''
    # bz2.open and the builtin open take the same text-mode arguments,
    # so only the opener has to be chosen from the extension.
    opener = bz2.open if str(filename).endswith(".bz2") else open
    # The with-statement closes the handle; no explicit close() is needed.
    with opener(filename, 'rt', encoding='utf8') as f:
        return f.read()

# Load the ciphertext from the data directory and echo it for inspection.
cipher = read_file("data/cipher.txt")
print(cipher)

º∫P/Z/uB∫ÀOR•–X•B
WV+≈GyF∞ºHPπKÇ—y≈
MJy^uIÀΩ—T‘NQyDµ£
S¢/º∑BPORAu∫∆RÃ—E
À^LMZJƒ“\–FHVW≈æy
π+—GDºKI£∞—Xæµ§S¢
RN‘IyEÃOæ—GBTQS∑B
Lƒ/P∑BπX—EHMu^RRÀ
√ZK—–I£W—ÇæµLM“º∑
BPDR+j•∞\N¢≈EuHÀF
Z√–OVWIµ+‘L£Ã^R∞H
IºDR∏Ty“\ƒ≈/πXJQA
PµMæRu‘∫L£NVEKH•G
“IÇJÀµºæLMÃNA£Z¢P
§u–ÀAº∑BVW\+VT‘OP
^•S“Ã∆u≈∞ΩD§G∫∫IM
NÀ£S√E/º∫∫Z∆AP∑BV
–≈X—W—∏F∑æ√+πºAºB
∫OTµRu√+∏ƒy—∏^S—W
VZ≈GyKE∏TyAº∫∑L‘∏
HÇFBXº§XADƒ\ΩLÇ•—
∏≈ƒ∑∑∞≈µPORXQF∫G√
ZπJT‘—∏æJI+“BPQW∞
VEX“ºWI∞—EHM£•uIÀ


For the default solution we need to compute statistics such as the length, the number of symbols/letters, 
the unique occurrences, and the frequencies and relative frequencies of a given file. This is done in the function `get_statistics` below.

While using `get_statistics`, make sure that `cipher=True` is set when the input is a ciphertext.

In [3]:
def get_statistics(content, cipher=True):
    '''
    summarises the symbols of a text; newlines and spaces are ignored
    content -- str, the text to analyse
    cipher -- bool, when True the filtered symbol sequence itself is also
              returned under the key 'content'
    returns a dict with 'length', 'vocab', 'vocab_length', 'frequencies'
    (a Counter) and 'relative_freq' (percentages of 'length')
    '''
    kept = [ch for ch in list(content) if ch not in ('\n', ' ')]
    total = len(kept)
    alphabet = set(kept)
    counts = collections.Counter(kept)
    # share of each symbol in percent; empty input yields an empty dict
    percentages = {sym: (n/total)*100 for sym, n in counts.items()}

    # 'content' goes first so the key order matches for cipher texts
    stats = {'content': kept} if cipher else {}
    stats.update({
        'length': total,
        'vocab': list(alphabet),
        'vocab_length': len(alphabet),
        'frequencies': counts,
        'relative_freq': percentages,
    })
    return stats

In [4]:
# Compute the ciphertext statistics (keeping the symbol sequence) and pretty-print them.
cipher_desc = get_statistics(cipher, cipher=True)
pp.pprint(cipher_desc)

{'content': ['º', '∫', 'P', '/', 'Z', '/',
             'u', 'B', '∫', 'À', 'O', 'R',
             '•', '–', 'X', '•', 'B', 'W',
             'V', '+', '≈', 'G', 'y', 'F',
             '∞', 'º', 'H', 'P', 'π', 'K',
             'Ç', '—', 'y', '≈', 'M', 'J',
             'y', '^', 'u', 'I', 'À', 'Ω',
             '—', 'T', '‘', 'N', 'Q', 'y',
             'D', 'µ', '£', 'S', '¢', '/',
             'º', '∑', 'B', 'P', 'O', 'R',
             'A', 'u', '∫', '∆', 'R', 'Ã',
             '—', 'E', 'À', '^', 'L', 'M',
             'Z', 'J', 'ƒ', '“', '\\', '–',
             'F', 'H', 'V', 'W', '≈', 'æ',
             'y', 'π', '+', '—', 'G', 'D',
             'º', 'K', 'I', '£', '∞', '—',
             'X', 'æ', 'µ', '§', 'S', '¢',
             'R', 'N', '‘', 'I', 'y', 'E',
             'Ã', 'O', 'æ', '—', 'G', 'B',
             'T', 'Q', 'S', '∑', 'B', 'L',
             'ƒ', '/', 'P', '∑', 'B', 'π',
             'X', '—', 'E', 'H', 'M', 'u',
             '^', 'R', 'R', 'À', '√', 'Z',
          

## Baseline

In [9]:
%%time
# Sample English passage that is scored with the language model in the next cell.
sequence = 'In a few cases, a multilingual artifact has been necessary to facilitate decipherment, the Rosetta Stone being the classic example. Statistical techniques provide another pathway to decipherment, as does the analysis of modern languages derived from ancient languages in which undeciphered texts are written. Archaeological and historical information is helpful in verifying hypothesized decipherments.'

# Load the language model with n=6; switch to the commented line for verbose output.
# lm = LM("data/6-gram-wiki-char.lm.bz2", n=6, verbose=True)
lm = LM("data/6-gram-wiki-char.lm.bz2", n=6, verbose=False)

Reading language model from data/6-gram-wiki-char.lm.bz2...



Wall time: 58.9 s


In [10]:
print(sequence)
# Score the whole sample passage; the totals are reported on stderr.
lm_logprob = lm.score_seq(sequence)
print("TOTAL LM LOGPROB: {}".format(lm_logprob), file=sys.stderr)

# Compare a readable English string against a random-looking one.
print("TOTAL LM LOGPROB: {}".format(lm.score_seq('this is the text.')), file=sys.stderr)
print("TOTAL LM LOGPROB: {}".format(lm.score_seq('jasbklfhthejkldhf')), file=sys.stderr)

# Exercise the bitstring helpers of the language model
# (presumably 'o' marks positions to score and '.' positions to skip -- see the ngram module).
print(lm.get_bitstring_spans('..oo...ooo..'))
print(lm.score_bitstring('thisisatest', 'oo...oo.ooo'))

In a few cases, a multilingual artifact has been necessary to facilitate decipherment, the Rosetta Stone being the classic example. Statistical techniques provide another pathway to decipherment, as does the analysis of modern languages derived from ancient languages in which undeciphered texts are written. Archaeological and historical information is helpful in verifying hypothesized decipherments.
{2: 3, 3: 4, 7: 8, 8: 9, 9: 10}
-8.10905897


TOTAL LM LOGPROB: -221.09434842188
TOTAL LM LOGPROB: -9.76947916
TOTAL LM LOGPROB: -40.57683077


In [24]:
# sequence = 'thisisatest'
# bitstring = 'ooooooooooo'
# bitstring_ = '.oooooo.oo.'
# this_symbol = 't'
# mapping = {'h':'h', 'i':'i', 's':'s', 'a':'a', 'e':'e'}
# #previous_score = lm.score_bitstring('thisisatest', bitstring_)
# previous_score = 0
# print(previous_score)
# temp_dict = dict()
# lm_state_dict = dict()
# Ve = [chr(i) for i in range(97, 123, 1)]
# for char in Ve:
#     lm_state_dict[char] = lm.begin()
#     temp_dict[char] = previous_score
# for token in sequence:
#     if token == this_symbol:
#         for char in Ve:
#             lm_state = lm_state_dict[char]
#             (lm_state, logprob) = lm.score(lm_state, char)
#             lm_state_dict[char] = lm_state
#             temp_dict[char] += logprob
#             if char == 't':
#                 print('lm_state', lm_state)
#     else:
#         for char in Ve:
#             lm_state = lm_state_dict[char]
#             (lm_state, logprob) = lm.score(lm_state, mapping[token])
#             lm_state_dict[char] = lm_state
#             temp_dict[char] += logprob
#             if char == 't':
#                 print('lm_state', lm_state)
# for char in Ve:
#     lm_state = lm_state_dict[char]
#     temp_dict[char] += lm.end(lm_state)
# print(temp_dict['t'], temp_dict['d'])

# spans = lm.get_bitstring_spans(bitstring)
# seq_by_bits = [ sequence[i] if i in spans else '\t' for i in range(len(sequence)) ]
# lm_state = lm.begin()
# lm_logprob = 0.0 
# for token in list(seq_by_bits):
#     if token == '\t': # should we skip this token?
#         lm_state = ()
#         continue
#     lm.maybe_write("state: {}".format(lm_state + (token,)))
#     (lm_state, logprob) = lm.score(lm_state, token)
#     lm_logprob += logprob
#     lm.maybe_write("logprob={}".format(logprob))
#     print('logrob2222', logprob)
# lm_logprob += lm.end(lm_state)
# print(lm_logprob)
# -11.05281791
# -18.7161607

In [36]:
def find_sharp_n(cipher_desc, symbols_found, n_order):
    '''
    finds the #n for order n_order, i.e. the number of windows of n_order
    consecutive cipher symbols that consist only of symbols in symbols_found
    cipher_desc -- cipher statistics (only cipher_desc['content'] is read)
    symbols_found -- list of single character string,
                     specifies the list of symbols have been placed in the extention order
    n_order -- int, specifies the order of n-gram
    returns an int; 0 when n_order < 1 or n_order exceeds the cipher length
    '''
    if n_order < 1:
        # an empty window never counts
        return 0
    # set lookup instead of scanning the list for every cipher position
    found = set(symbols_found)
    sharp_n = 0
    # length of the current run of consecutive symbols that are all in `found`
    run_length = 0
    for symbol in cipher_desc['content']:
        run_length = run_length + 1 if symbol in found else 0
        # a run of at least n_order symbols means the window ending here qualifies
        if run_length >= n_order:
            sharp_n += 1
    return sharp_n

In [64]:
# test the function above
sharp_n = find_sharp_n(cipher_desc, ['—', 'º'], 2)
print(sharp_n)

0


In [81]:
def find_ext_order(cipher_desc, topn=100, weights=(0,1,1,1,2,3)):
    '''
    finds the best order of deciphering cipher symbols (find best extention order)
    using a beam search that starts from the most frequent cipher symbol
    cipher_desc -- cipher statistics
    topn -- int, number of best trees we want to keep during iteration
    weights -- sequence of int of length >= 6; weights[i] is the weight of #(i+1),
               so weights[1] weighs bigrams and weights[5] weighs 6-grams
               (weights[0] is not used)
    returns a list of (order, score) tuples sorted by score, best first,
    where order is a list of cipher symbols
    '''
    # cipher symbols sorted by frequency, most frequent first
    by_frequency = sorted(cipher_desc['frequencies'], key=cipher_desc['frequencies'].get, reverse=True)
    if not by_frequency:
        # empty cipher: a single empty order keeps the return shape usable
        return [([], 0)]
    # symbols already found with score
    Hs = [([by_frequency[0]], 0)]
    # initialize the cardinality (number of unique cipher text)
    cardinality = 1
    # list of cipher characters that can still be appended
    Ve = by_frequency[1:]
    while cardinality < cipher_desc['vocab_length']:
        # hypothesis extended symbols with score
        Ht = []
        for phi, _ in Hs:
            placed = set(phi)
            for e in Ve:
                if e in placed:
                    continue
                # a fresh list per hypothesis; the symbols are immutable strings,
                # so no deep copy is required
                phi_prime = phi + [e]
                this_score = 0
                for i in range(1,6,1):
                    this_score += weights[i]*find_sharp_n(cipher_desc, phi_prime, i+1)
                Ht.append((phi_prime, this_score))
        # prune the histogram
        Hs = sorted(Ht, key=lambda x:x[1], reverse=True)[:topn]
        cardinality += 1
        print('Working on symbol number', cardinality, '; Current score: ', Hs[0][1])
    return sorted(Hs, key=lambda x:x[1], reverse=True)

In [82]:
%%time
# Search for the extension order of the cipher symbols with the default beam size and weights.
ext_order = find_ext_order(cipher_desc)

Working on symbol number 2 ; Current score:  5
Working on symbol number 3 ; Current score:  11
Working on symbol number 4 ; Current score:  18
Working on symbol number 5 ; Current score:  29
Working on symbol number 6 ; Current score:  41
Working on symbol number 7 ; Current score:  56
Working on symbol number 8 ; Current score:  78
Working on symbol number 9 ; Current score:  96
Working on symbol number 10 ; Current score:  123
Working on symbol number 11 ; Current score:  152
Working on symbol number 12 ; Current score:  170
Working on symbol number 13 ; Current score:  218
Working on symbol number 14 ; Current score:  248
Working on symbol number 15 ; Current score:  267
Working on symbol number 16 ; Current score:  296
Working on symbol number 17 ; Current score:  318
Working on symbol number 18 ; Current score:  346
Working on symbol number 19 ; Current score:  368
Working on symbol number 20 ; Current score:  400
Working on symbol number 21 ; Current score:  430
Working on symbol

In [101]:
# ext_order

In [45]:
def reverse_mapping(reversed_mapping):
    '''
    inverts a plaintext -> [cipher symbols] dictionary
    reversed_mapping -- dict, maps each plaintext char to the list of cipher chars assigned to it
    returns a dict mapping each cipher char to its plaintext char; if a cipher
    char is listed more than once, the last plaintext char seen wins
    '''
    return {
        cipher_char: plain_char
        for plain_char, cipher_chars in reversed_mapping.items()
        for cipher_char in cipher_chars
    }

In [46]:
def score(phi, cipher, lm):
    '''
    scores a partial decipherment with the language model
    phi -- dict, maps each plaintext char to the list of cipher chars assigned to it
    cipher -- sequence of cipher chars
    lm -- language model object providing score_bitstring
    returns the value of lm.score_bitstring for the partially deciphered text,
    with 'o' marking deciphered positions and '.' marking the remaining ones
    '''
    mapping = reverse_mapping(phi)
    pieces = []
    flags = []
    for symbol in cipher:
        known = symbol in mapping
        # deciphered symbols are replaced, unknown ones are passed through unchanged
        pieces.append(mapping[symbol] if known else symbol)
        flags.append('o' if known else '.')
    return lm.score_bitstring(''.join(pieces), ''.join(flags))

In [47]:
# test the two functions above
reversed_mapping = {'i': [cipher_desc['content'][0]], 'l': [cipher_desc['content'][1]]}
print('plaintext char to cipher text char', reversed_mapping)
print('cipher text char to plaintext char', reverse_mapping(reversed_mapping))
print('score: ', score(reversed_mapping, cipher, lm))

plaintext char to cipher text char {'i': ['º'], 'l': ['∫']}
cipher text char to plaintext char {'º': 'i', '∫': 'l'}
score:  -32.930710299999994


In [48]:
def beam_search(cipher, ext_order, ext_limits=1, topn=1):
    '''
    finds the mappings between cipher char and plaintext char, returns the mapping dictionary
    cipher -- sequence of cipher chars that is scored after every extension
    ext_order -- list, the cipher chars in the order in which they are deciphered
    ext_limits -- int, defines maximum number of cipher char can be mapped to a plaintext char
                  (a plaintext char without any cipher char can always receive one)
    topn -- int, defines the number of dictionaries we want to keep while pruning
    returns a list of (mapping, score) tuples sorted by score, best first, where
    mapping maps each plaintext char to the list of cipher chars assigned to it
    raises ValueError if the cipher chars cannot all be placed under ext_limits,
    or if pruning leaves no hypothesis (e.g. topn=0)
    NOTE: hypotheses are scored with the notebook-level language model `lm`
    '''
    print('Number of unique symbols in cipher:', len(ext_order))
    # list of plaintext characters
    Ve = [chr(i) for i in range(97, 123, 1)]
    # Fail fast: once every plaintext char is full no hypothesis can be extended,
    # which previously surfaced as an IndexError on an empty beam after many rounds.
    capacity = len(Ve) * max(ext_limits, 1)
    if len(ext_order) > capacity:
        raise ValueError(
            'cannot place {} cipher symbols on {} plaintext letters with ext_limits={}'.format(
                len(ext_order), len(Ve), ext_limits))
    # mapping relationships already found with score
    Hs = [(defaultdict(dict), 0)]
    # initialize the cardinality (number of unique cipher text)
    cardinality = 0
    while cardinality < len(ext_order):
        f = ext_order[cardinality]
        print('Working on symbol: ', f, f'({cardinality})')
        # hypothesis mapping relationships with score
        Ht = []
        for phi, _ in Hs:
            for e in Ve:
                # skip full letters before paying for a deep copy of the hypothesis
                if e in phi and len(phi[e]) >= ext_limits:
                    continue
                phi_prime = copy.deepcopy(phi)
                if e in phi_prime:
                    phi_prime[e].append(f)
                else:
                    phi_prime[e] = [f]
                Ht.append((phi_prime, score(phi_prime, cipher, lm)))
        # prune the histogram; every entry is a fresh copy, so the slice can be reused directly
        Hs = sorted(Ht, key=lambda x:x[1], reverse=True)[:topn]
        if not Hs:
            raise ValueError('no hypothesis survived pruning; topn must be at least 1')
        cardinality += 1
        print('Current score: ', Hs[0][1])
    return sorted(Hs, key=lambda x:x[1], reverse=True)

In [49]:
# cipher = read_file("data/cipher.txt")
# cipher_desc = get_statistics(cipher, cipher=True)
# plaintxt = read_file("data/default.wiki.txt.bz2")
# plaintxt_desc = get_statistics(plaintxt, cipher=False)

In [25]:
# Unique cipher symbols in order of their last occurrence, scanning the cipher from right to left.
# dict.fromkeys keeps the first occurrence of each key in insertion order.
right_most_cipher_vocab = list(dict.fromkeys(reversed(cipher_desc['content'])))

In [115]:
import pickle
# with open('ext_order.pkl', 'wb') as fh:
#     pickle.dump(ext_order, fh)

In [117]:
# Restore the extension order from ext_order.pkl instead of recomputing it.
# NOTE: unpickling can execute arbitrary code; only load pickle files you created yourself.
with open('ext_order.pkl', 'rb') as fh:
    ext_order = pickle.load(fh, encoding='utf8')

In [100]:
%%time
# Cipher symbols sorted by descending frequency (only used by the first commented-out experiment).
sorted_keys = sorted(cipher_desc['frequencies'], key=cipher_desc['frequencies'].get, reverse=True)
#reversed_mappings = beam_search(cipher_desc['content'], sorted_keys, 3, 100)
#reversed_mappings = beam_search(cipher_desc['content'], right_most_cipher_vocab, 8, 10000)
# Beam search along the best extension order: up to 54 cipher symbols per plaintext letter,
# keeping the 500 best hypotheses after every symbol.
reversed_mappings = beam_search(cipher_desc['content'], ext_order[0][0], 54, 500)

Number of unique symbols in cipher: 54
Working on symbol:  — (0)
Current score:  -17.194697199999997
Working on symbol:  ∑ (1)
Current score:  -26.92681499999999
Working on symbol:  B (2)
Current score:  -38.529039600000004
Working on symbol:  P (3)
Current score:  -47.236615400000005
Working on symbol:  º (4)
Current score:  -60.42009250000001
Working on symbol:  ∫ (5)
Current score:  -73.30223828000003
Working on symbol:  A (6)
Current score:  -82.62846930000006
Working on symbol:  / (7)
Current score:  -88.34939546800003
Working on symbol:  Z (8)
Current score:  -95.96628496800005
Working on symbol:  ∆ (9)
Current score:  -98.39449587000003
Working on symbol:  u (10)
Current score:  -109.77709403000004
Working on symbol:  O (11)
Current score:  -116.85669908200009
Working on symbol:  R (12)
Current score:  -128.26186983000005
Working on symbol:  À (13)
Current score:  -137.32282493000008
Working on symbol:  Ã (14)
Current score:  -142.3589917200001
Working on symbol:  E (15)
Current

In [102]:
# Invert the best-scoring hypothesis into a cipher symbol -> plaintext letter dictionary.
mapping = reverse_mapping(reversed_mappings[0][0])

In [103]:
# Number of cipher symbols that received a plaintext letter.
len(mapping)

54

In [104]:
# Substitute every cipher symbol with its mapped plaintext letter, then report
# the text, its language-model score and its length.
decipher_text = ''.join(mapping[symbol] for symbol in cipher_desc['content'])
print(decipher_text)
print('score', lm.score_seq(decipher_text))
print(len(decipher_text))

esentnessetttrhtsrerainthesetisenatenseseleineananthanessettlesotheresetteseartseraintreiaeisthehinwhatensnrhtieisiahssesnesstherstestteetierstresineteesseatrathaeaaresettertersnrnethsthsseatsineasanthealentitenseteeristiesseeneiethelttaewerelesserareintesthehoeahlawisssteethernesstolesseraherestsiertelesstinteerssnessheretainirsinlessenssstshewhlasalestesassshanetthatsietteinesiesresearherheersherstttese
score -489.34895032200046
408


In [91]:
# Load the reference plaintext and score it with the same language model for comparison.
with open('data/_ref_Zodiac_408.txt', 'r') as fh:
    ground_truth = fh.read()
print(ground_truth)
print('score', lm.score_seq(ground_truth))
print(len(ground_truth))

ilikekillingpeoplebecauseitissomuchfunitismorefunthankillingwildgameintheforrestbecausemanisthemostdangeroueanamalofalltokillsomethinggivesmethemostthrillingexperenceitisevenbetterthangettingyourrocksoffwithagirlthebestpartofitisthaewhenidieiwillbereborninparadicesndalltheihavekilledwillbecomemyslavesiwillnotgiveyoumynamebecauseyouwilltrytosloidownorstopmycollectiogofslavesformyafterlifeebeorietemethhpiti
score -359.0030393831197
408


In [105]:
# Fraction of positions where the decipherment differs from the reference.
# NOTE: symbol_error_rate is defined in the Grading section further down; run that cell first.
symbol_error_rate(decipher_text, 'data/_ref_Zodiac_408.txt')

0.9852941176470589

Notice that the default solution provides a very bad decipherment. Your job is to make it better!

## Grading

Ignore the following cells. They are for grading against the reference decipherment. Based on the clues provided in the decipherment homework description, you can easily find a reasonable reference text online for this cipher text.

In [93]:
"""
ATTENTION!
For grading purposes only. Don't bundle with the assignment. 
Make sure '_ref.txt' is removed from the 'data' directory before publishing.
"""

def read_gold(gold_file):
    '''
    reads the reference decipherment and returns it as a list of single characters
    gold_file -- str, path of the reference plaintext file
    leading and trailing whitespace (e.g. the final newline) is stripped first
    '''
    # Decode explicitly as UTF-8, as read_file does, instead of relying on the
    # platform default; the with-statement closes the file on exit.
    with open(gold_file, encoding='utf8') as f:
        gold = f.read()
    return list(gold.strip())

def symbol_error_rate(dec, _gold):
    '''
    computes the fraction of reference symbols that were deciphered wrongly
    dec -- sequence of deciphered chars
    _gold -- str, path of the reference plaintext file
    returns a float in [0, 1]; when dec and the reference differ in length
    nothing is compared and the error is 1.0
    '''
    gold = read_gold(_gold)
    total = len(gold)
    if total == len(dec):
        matches = sum(1 for d, g in zip(dec, gold) if d == g)
    else:
        # length mismatch: no position counts as correct
        matches = 0
    return (total - matches) / total

In [None]:
# gold decipherment
gold_file = "data/_ref.txt"
# This notebook stores its decipherment in `decipher_text`; `decipherment` is
# not assigned in any cell, so the call would fail with a NameError.
ser = symbol_error_rate(decipher_text, gold_file)
print('Error: ', ser*100, 'Accuracy: ', (1-ser)*100)