# Homework: Decipherment

In [11]:
%load_ext autoreload
%autoreload 2
from collections import defaultdict, Counter
import collections
import pprint
import math
import bz2
from ngram import *
import sys, string
import copy
# Shared pretty-printer: a narrow 45-column width with compact=True, so several
# short items are packed onto each output line when dumping the statistics dicts.
pp = pprint.PrettyPrinter(width=45, compact=True)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


First let us read in the cipher text from the `data` directory:

In [12]:
def read_file(filename):
    """Return the entire contents of a UTF-8 text file as one string.

    Files whose name ends in ``.bz2`` are decompressed on the fly with
    ``bz2.open``; every other file is read with the built-in ``open``.

    Args:
        filename: Path of the file to read, given as a ``str`` or a
            path-like object such as ``pathlib.Path``.

    Returns:
        The decoded text of the file.
    """
    # str() lets path-like objects through (slicing a Path would fail).
    opener = bz2.open if str(filename).endswith(".bz2") else open
    # The context manager closes the handle; no explicit close() is needed.
    with opener(filename, 'rt', encoding='utf8') as f:
        return f.read()

# Load the ciphertext once; later cells reuse `cipher` for statistics and scoring.
cipher = read_file("data/cipher.txt")
print(cipher)

º∫P/Z/uB∫ÀOR•–X•B
WV+≈GyF∞ºHPπKÇ—y≈
MJy^uIÀΩ—T‘NQyDµ£
S¢/º∑BPORAu∫∆RÃ—E
À^LMZJƒ“\–FHVW≈æy
π+—GDºKI£∞—Xæµ§S¢
RN‘IyEÃOæ—GBTQS∑B
Lƒ/P∑BπX—EHMu^RRÀ
√ZK—–I£W—ÇæµLM“º∑
BPDR+j•∞\N¢≈EuHÀF
Z√–OVWIµ+‘L£Ã^R∞H
IºDR∏Ty“\ƒ≈/πXJQA
PµMæRu‘∫L£NVEKH•G
“IÇJÀµºæLMÃNA£Z¢P
§u–ÀAº∑BVW\+VT‘OP
^•S“Ã∆u≈∞ΩD§G∫∫IM
NÀ£S√E/º∫∫Z∆AP∑BV
–≈X—W—∏F∑æ√+πºAºB
∫OTµRu√+∏ƒy—∏^S—W
VZ≈GyKE∏TyAº∫∑L‘∏
HÇFBXº§XADƒ\ΩLÇ•—
∏≈ƒ∑∑∞≈µPORXQF∫G√
ZπJT‘—∏æJI+“BPQW∞
VEX“ºWI∞—EHM£•uIÀ


## Default Solution

For the default solution we need to compute statistics such as the length, the set of distinct symbols/letters, 
the number of unique occurrences, and the absolute and relative frequencies of a given text. This is done in the function `get_statistics` below.

While using `get_statistics`, make sure that `cipher=True` is set when the input is a ciphertext.

In [13]:
def get_statistics(content, cipher=True):
    """Compute symbol-level statistics for a text.

    Newline and space characters are ignored; every other character counts
    as one symbol.

    Args:
        content: The text to describe (iterated character by character).
        cipher: When true, the filtered symbol sequence itself is included
            in the result under the ``'content'`` key.

    Returns:
        A dict with the keys ``'length'`` (number of symbols), ``'vocab'``
        (sorted list of distinct symbols), ``'vocab_length'``,
        ``'frequencies'`` (a ``collections.Counter``) and ``'relative_freq'``
        (symbol -> percentage of ``length``), plus ``'content'`` (list of
        symbols) when ``cipher`` is true.
    """
    symbols = [ch for ch in content if ch != '\n' and ch != ' ']
    length = len(symbols)
    freq = collections.Counter(symbols)
    # sorted() keeps the vocabulary order reproducible across kernel restarts;
    # iterating a set of strings can change order from run to run.
    vocab = sorted(freq)
    # An empty text yields an empty Counter, so the division is never reached
    # with length == 0.
    rel_freq = {sym: (count/length)*100 for sym, count in freq.items()}

    stats = {'content': symbols} if cipher else {}
    stats.update({
        'length': length,
        'vocab': vocab,
        'vocab_length': len(vocab),
        'frequencies': freq,
        'relative_freq': rel_freq,
    })
    return stats

In [14]:
# Describe the ciphertext; cipher=True keeps the symbol sequence under 'content',
# which the later decoding cells iterate over.
cipher_desc = get_statistics(cipher, cipher=True)
pp.pprint(cipher_desc)

{'content': ['º', '∫', 'P', '/', 'Z', '/',
             'u', 'B', '∫', 'À', 'O', 'R',
             '•', '–', 'X', '•', 'B', 'W',
             'V', '+', '≈', 'G', 'y', 'F',
             '∞', 'º', 'H', 'P', 'π', 'K',
             'Ç', '—', 'y', '≈', 'M', 'J',
             'y', '^', 'u', 'I', 'À', 'Ω',
             '—', 'T', '‘', 'N', 'Q', 'y',
             'D', 'µ', '£', 'S', '¢', '/',
             'º', '∑', 'B', 'P', 'O', 'R',
             'A', 'u', '∫', '∆', 'R', 'Ã',
             '—', 'E', 'À', '^', 'L', 'M',
             'Z', 'J', 'ƒ', '“', '\\', '–',
             'F', 'H', 'V', 'W', '≈', 'æ',
             'y', 'π', '+', '—', 'G', 'D',
             'º', 'K', 'I', '£', '∞', '—',
             'X', 'æ', 'µ', '§', 'S', '¢',
             'R', 'N', '‘', 'I', 'y', 'E',
             'Ã', 'O', 'æ', '—', 'G', 'B',
             'T', 'Q', 'S', '∑', 'B', 'L',
             'ƒ', '/', 'P', '∑', 'B', 'π',
             'X', '—', 'E', 'H', 'M', 'u',
             '^', 'R', 'R', 'À', '√', 'Z',
          

The default solution matches the frequency of symbols in the cipher text with frequency of letters in the plaintext language (in this case, English). Note that this is just some text in English used to compute letter frequencies. We do not have access to the real plaintext in this homework. 

To compute the plaintext frequencies, we use an English dataset that has no punctuation or spaces and in which all characters are lowercase.

In [5]:
# plaintext description
# The English reference text is only used for its letter statistics, so
# cipher=False leaves the raw symbol sequence out of the returned dict.
plaintxt = read_file("data/default.wiki.txt.bz2")
plaintxt_desc = get_statistics(plaintxt, cipher=False)
# pp.pprint(plaintxt_desc)

We have all the tools we need to describe the default solution to this homework.

We use a simple frequency matching heuristic to map cipher symbols to English letters.

We compare the relative frequency $f(\cdot)$ of each cipher symbol $c$ with that of each English letter $e$:

$$h_{c,e} = \left| \log\left(\frac{f(c)}{f(e)}\right) \right|$$

For each cipher text symbol $c$ we then compute the most likely plain text symbol $e$ by sorting based on the above score.

In [6]:
"""
default : frequency matching heuristic

Notice how the candidate mappings, a.k.a hypotheses, are first scored with a measure of quality and, 
then, the best scoring hypothesis is chosen as the winner. 

The plaintext letters from the winner are then mapped to the respective ciphertext symbols.
"""

# (Disabled) default solution: for every cipher symbol, score each plaintext
# letter by |log(relative_freq(symbol) / relative_freq(letter))| and pick a letter.
# def find_mappings(ciphertext, plaintext):
#     mappings = defaultdict(dict)
#     hypotheses = defaultdict(dict)
#     # calculate alignment scores
#     for symbol in ciphertext['vocab']:
#         for letter in plaintext['vocab']:
#             hypotheses[symbol][letter] = abs(math.log((ciphertext['relative_freq'][symbol]/plaintext['relative_freq'][letter])))
    
#     # find winner
#     for sym in hypotheses.keys():
#         #mappings[sym] = min(lemma_alignment[sym], key=lemma_alignment[sym].get)
#         winner = sorted(hypotheses[sym].items(), key=lambda kv: kv[1])
#         # NOTE(review): `winner` is sorted ascending by score, so winner[0] is
#         # the closest frequency match and winner[1][0] picks the runner-up.
#         # Confirm whether index 1 is intentional before re-enabling this cell.
#         mappings[sym] = winner[1][0]
    
#     return mappings

'\ndefault : frequency matching heuristic\n\nNotice how the candidate mappings, a.k.a hypotheses, are first scored with a measure of quality and, \nthen, the best scoring hypothesis is chosen as the winner. \n\nThe plaintext letters from the winner are then mapped to the respective ciphertext symbols.\n'

Using this scoring function we map the cipher symbol `∆` to `v` in English

In [7]:
# mapping = find_mappings(cipher_desc, plaintxt_desc)
# print("∆ maps to {}\n".format(mapping['∆']))
# print(mapping)

The default solution to this decipherment problem is to take each cipher symbol and map it to the most likely English letter as provided by the `find_mappings` function above.

In [8]:
# english_text = []
# for symbol in cipher_desc['content']:
#     english_text.append(mapping[symbol])
# decipherment = ('').join(english_text)
# print(decipherment)

## Baseline

In [15]:
%%time
# Sample English passage; the next cell scores it with the language model.
sequence = 'In a few cases, a multilingual artifact has been necessary to facilitate decipherment, the Rosetta Stone being the classic example. Statistical techniques provide another pathway to decipherment, as does the analysis of modern languages derived from ancient languages in which undeciphered texts are written. Archaeological and historical information is helpful in verifying hypothesized decipherments.'

# Load the 6-gram language model from the bz2 file. This is the slow step of
# the notebook (the recorded wall time for this cell is over a minute).
# lm = LM("data/6-gram-wiki-char.lm.bz2", n=6, verbose=True)
lm = LM("data/6-gram-wiki-char.lm.bz2", n=6, verbose=False)

Reading language model from data/6-gram-wiki-char.lm.bz2...



Wall time: 1min 22s


In [16]:
# Sanity checks for the language model: score the sample passage first.
print(sequence)
lm_logprob = lm.score_seq(sequence)
print("TOTAL LM LOGPROB: {}".format(lm_logprob), file=sys.stderr)

# Compare a short English string with a random-looking string of the same length.
print("TOTAL LM LOGPROB: {}".format(lm.score_seq('this is the text.')), file=sys.stderr)
print("TOTAL LM LOGPROB: {}".format(lm.score_seq('jasbklfhthejkldhf')), file=sys.stderr)

# Bitstring helpers: in this notebook's score(), 'o' marks an already
# deciphered position and '.' a position that is still a cipher symbol.
print(lm.get_bitstring_spans('..oo...ooo..'))
print(lm.score_bitstring('thisisatest', 'oo...oo.ooo'))

In a few cases, a multilingual artifact has been necessary to facilitate decipherment, the Rosetta Stone being the classic example. Statistical techniques provide another pathway to decipherment, as does the analysis of modern languages derived from ancient languages in which undeciphered texts are written. Archaeological and historical information is helpful in verifying hypothesized decipherments.
{2: 3, 3: 4, 7: 8, 8: 9, 9: 10}
-8.10905897


TOTAL LM LOGPROB: -221.09434842188
TOTAL LM LOGPROB: -9.76947916
TOTAL LM LOGPROB: -40.57683077


In [17]:
def reverse_mapping(reversed_mapping):
    """Invert a plaintext-letter -> cipher-symbols mapping.

    Args:
        reversed_mapping: dict whose keys are plaintext letters and whose
            values are iterables of the cipher symbols assigned to them.

    Returns:
        dict mapping each cipher symbol to its plaintext letter. If a symbol
        is listed under several letters, the letter visited last wins.
    """
    return {
        symbol: letter
        for letter, symbols in reversed_mapping.items()
        for symbol in symbols
    }

In [18]:
def score(phi, cipher, lm):
    """Score a partial decipherment of ``cipher`` with the language model.

    Args:
        phi: dict mapping each plaintext letter to the list of cipher symbols
            assigned to it.
        cipher: iterable of cipher symbols.
        lm: object providing ``score_bitstring(text, bitstring)``.

    Returns:
        Whatever ``lm.score_bitstring`` returns for the partially decoded
        text and its mask.
    """
    symbol_to_letter = reverse_mapping(phi)
    decoded = []
    mask = []
    for symbol in cipher:
        if symbol in symbol_to_letter:
            # Deciphered position: emit the plaintext letter, flag it with 'o'.
            decoded.append(symbol_to_letter[symbol])
            mask.append('o')
        else:
            # Still unknown: keep the cipher symbol, flag it with '.'.
            decoded.append(symbol)
            mask.append('.')
    return lm.score_bitstring(''.join(decoded), ''.join(mask))

In [19]:
# test the two functions above
reversed_mapping = {'i': [cipher_desc['content'][0]], 'l': [cipher_desc['content'][1]]}
print('plaintext char to cipher text char', reversed_mapping)
print('cipher text char to plaintext char', reverse_mapping(reversed_mapping))
print('score: ', score(reversed_mapping, cipher, lm))

plaintext char to cipher text char {'i': ['º'], 'l': ['∫']}
cipher text char to plaintext char {'º': 'i', '∫': 'l'}
score:  -32.930710299999994


In [20]:
def beam_search(cipher, ext_order, ext_limits=1, topn=1, language_model=None):
    '''
    Beam search over assignments of cipher symbols to plaintext letters.

    Symbols are processed in the order given by ``ext_order``. For each
    symbol, every surviving hypothesis is extended with each of the 26
    lowercase letters that still has capacity, the extensions are scored with
    ``score``, and only the ``topn`` best are kept.

    cipher -- sequence of cipher symbols, passed unchanged to ``score``
    ext_order -- list, the cipher symbols in the order they should be fixed
    ext_limits -- int, maximum number of cipher symbols that may be added to a
                  letter that already has at least one symbol
    topn -- int, number of hypotheses kept after each pruning step
    language_model -- object with ``score_bitstring``; when omitted, the
                      notebook-level ``lm`` is used

    Returns a list of ``(mapping, score)`` pairs sorted by score, best first,
    where ``mapping`` maps each plaintext letter to its list of cipher
    symbols. If no letter has remaining capacity for some symbol, the beam
    becomes empty and an empty list is returned.
    '''
    # Fall back to the notebook-level model so existing calls keep working.
    model = lm if language_model is None else language_model
    print('Number of unique symbols in cipher:', len(ext_order))
    # Surviving hypotheses as (mapping, score); start from the empty mapping.
    # Mapping values are lists of cipher symbols, hence defaultdict(list).
    Hs = [(defaultdict(list), 0)]
    # list of plaintext characters (a-z)
    Ve = [chr(i) for i in range(97, 123)]
    for cardinality, f in enumerate(ext_order):
        print('Working on symbol: ', f, f'({cardinality})')
        # candidate extensions for this symbol
        Ht = []
        for phi, _ in Hs:
            for e in Ve:
                # .get() does not trigger the default factory, so phi is not
                # modified by this capacity check.
                assigned = phi.get(e)
                if assigned is not None and len(assigned) >= ext_limits:
                    continue
                # Copy only when the extension is actually going to be used.
                phi_prime = copy.deepcopy(phi)
                phi_prime[e].append(f)
                Ht.append((phi_prime, score(phi_prime, cipher, model)))
        # Prune to the topn best hypotheses. Each phi_prime is a fresh copy
        # that is never mutated again, so no further deep copy is needed.
        Hs = sorted(Ht, key=lambda x: x[1], reverse=True)[:topn]
    return sorted(Hs, key=lambda x: x[1], reverse=True)

In [21]:
# cipher = read_file("data/cipher.txt")
# cipher_desc = get_statistics(cipher, cipher=True)
# plaintxt = read_file("data/default.wiki.txt.bz2")
# plaintxt_desc = get_statistics(plaintxt, cipher=False)

In [22]:
# Distinct cipher symbols in the order they are first met when scanning the
# ciphertext from its end; dict.fromkeys keeps first-seen order and drops repeats.
right_most_cipher_vocab = list(dict.fromkeys(reversed(cipher_desc['content'])))

In [24]:
%%time
# Frequency-descending symbol order; only used by the commented-out call below.
sorted_keys = sorted(cipher_desc['frequencies'], key=cipher_desc['frequencies'].get, reverse=True)
#reversed_mappings = beam_search(cipher_desc['content'], sorted_keys, 3, 100)
# Search in right-to-left first-appearance order, with ext_limits=3 and a
# beam of the 100 best hypotheses per step.
reversed_mappings = beam_search(cipher_desc['content'], right_most_cipher_vocab, 3, 100)

Number of unique symbols in cipher: 54
Working on symbol:  À (0)
Working on symbol:  I (1)
Working on symbol:  u (2)
Working on symbol:  • (3)
Working on symbol:  £ (4)
Working on symbol:  M (5)
Working on symbol:  H (6)
Working on symbol:  E (7)
Working on symbol:  — (8)
Working on symbol:  ∞ (9)
Working on symbol:  W (10)
Working on symbol:  º (11)
Working on symbol:  “ (12)
Working on symbol:  X (13)
Working on symbol:  V (14)
Working on symbol:  Q (15)
Working on symbol:  P (16)
Working on symbol:  B (17)
Working on symbol:  + (18)
Working on symbol:  J (19)
Working on symbol:  æ (20)
Working on symbol:  ∏ (21)
Working on symbol:  ‘ (22)
Working on symbol:  T (23)
Working on symbol:  π (24)
Working on symbol:  Z (25)
Working on symbol:  √ (26)
Working on symbol:  G (27)
Working on symbol:  ∫ (28)
Working on symbol:  F (29)
Working on symbol:  R (30)
Working on symbol:  O (31)
Working on symbol:  µ (32)
Working on symbol:  ≈ (33)
Working on symbol:  ∑ (34)
Working on symbol:  ƒ (35)

In [25]:
# beam_search returns hypotheses sorted best-first; take the top one and
# invert it to a cipher symbol -> plaintext letter lookup.
mapping = reverse_mapping(reversed_mappings[0][0])

In [26]:
# Number of cipher symbols that received a plaintext letter.
len(mapping)

54

In [27]:
# Substitute every cipher symbol with its mapped letter, then report the
# decoded text, its language-model score and its length.
decipher_text = ''.join(mapping[char] for char in cipher_desc['content'])
print(decipher_text)
print('score', lm.score_seq(decipher_text))
print(len(decipher_text))

isapoperstllrherritoldmuainamgbnmltsmpestwnonydmwgemkpicrallhesglknitpctosurchuntilammondwigseaneagymklynsmiklandrodmcrcupacrmenintepllthognhseinbagctricrawlouracyklientuohhltisgoncekplansiwldomrculpmesdhagtalensceytignrdrsbstgiactkyheokayehthicrticotonlaprmrkgelawwydssstytemhipissoghacrthleninducahomihirsloglehodumndpmnitoldmgidomhisccndnbureiyehwucwcbrndluccalgalledusdhomsonndassorradiatieriisaninterest
score -643.7452808190006
408


In [28]:
# Read the reference plaintext through the shared helper so it is decoded as
# UTF-8 like every other data file, instead of the platform default encoding.
ground_truth = read_file('data/_ref_Zodiac_408.txt')
print(ground_truth)
print('score', lm.score_seq(ground_truth))
print(len(ground_truth))

ilikekillingpeoplebecauseitissomuchfunitismorefunthankillingwildgameintheforrestbecausemanisthemostdangeroueanamalofalltokillsomethinggivesmethemostthrillingexperenceitisevenbetterthangettingyourrocksoffwithagirlthebestpartofitisthaewhenidieiwillbereborninparadicesndalltheihavekilledwillbecomemyslavesiwillnotgiveyoumynamebecauseyouwilltrytosloidownorstopmycollectiogofslavesformyafterlifeebeorietemethhpiti
score -359.0030393831197
408


In [29]:
# Position-by-position comparison of the decoded text with the reference.
# ground_truth is indexed over decipher_text's length, so a shorter reference
# raises IndexError.
n_total = len(decipher_text)
n_correct = sum(
    1 for i in range(n_total) if ground_truth[i] == decipher_text[i]
)
print('Length', n_total)
print('Correct', n_correct)
Length 408
Correct 33


Notice that the default solution provides a very bad decipherment. Your job is to make it better!

## Grading

Ignore the following cells. They are for grading against the reference decipherment. Based on the clues provided in the decipherment homework description, you can easily find a reasonable reference text online for this cipher text.

In [None]:
"""
ATTENTION!
For grading purposes only. Don't bundle with the assignment. 
Make sure '_ref.txt' is removed from the 'data' directory before publishing.
"""

def read_gold(gold_file):
    """Read the reference decipherment as a list of characters.

    Args:
        gold_file: Path of the reference text file (decoded as UTF-8).

    Returns:
        The file's characters as a list, with leading and trailing
        whitespace stripped.
    """
    # The context manager closes the file; no separate close() is required.
    with open(gold_file, encoding='utf8') as f:
        return list(f.read().strip())

def symbol_error_rate(dec, _gold):
    """Fraction of reference symbols that the decipherment gets wrong.

    Args:
        dec: The decipherment, as a sized sequence of symbols.
        _gold: Path of the reference file, loaded with ``read_gold``.

    Returns:
        ``(len(gold) - matches) / len(gold)``. Positions are compared only
        when both sequences have the same length; otherwise no match is
        counted and the result is 1.0.

    Raises:
        ZeroDivisionError: If the reference is empty.
    """
    gold = read_gold(_gold)
    total = len(gold)
    matches = 0
    if len(dec) == total:
        matches = sum(1 for d, g in zip(dec, gold) if d == g)
    return (total - matches) / total

In [None]:
# gold decipherment
gold_file = "data/_ref.txt"
# Grade the text decoded by the beam search. `decipherment` is only assigned
# in the commented-out default-solution cell, so using it here raises NameError.
ser = symbol_error_rate(decipher_text, gold_file)
print('Error: ', ser*100, 'Accuracy: ', (1-ser)*100)