# Homework: Decipherment

In [1]:
%load_ext autoreload
%autoreload 2
from collections import defaultdict, Counter
import collections
import pprint
import math
import bz2
from ngram import *
import sys, string
import copy
import pickle
#from joblib import Parallel, delayed
import itertools
pp = pprint.PrettyPrinter(width=45, compact=True)

In [2]:
import numpy as np

First let us read in the cipher text from the `data` directory:

In [3]:
def read_file(filename):
    if filename[-4:] == ".bz2":
        with bz2.open(filename, 'rt', encoding='utf8') as f:
            content = f.read()
            f.close()
    else:
        with open(filename, 'r', encoding='utf8') as f:
            content = f.read()
            f.close()
    return content

cipher = read_file("data/cipher.txt")
print(cipher)

º∫P/Z/uB∫ÀOR•–X•B
WV+≈GyF∞ºHPπKÇ—y≈
MJy^uIÀΩ—T‘NQyDµ£
S¢/º∑BPORAu∫∆RÃ—E
À^LMZJƒ“\–FHVW≈æy
π+—GDºKI£∞—Xæµ§S¢
RN‘IyEÃOæ—GBTQS∑B
Lƒ/P∑BπX—EHMu^RRÀ
√ZK—–I£W—ÇæµLM“º∑
BPDR+j•∞\N¢≈EuHÀF
Z√–OVWIµ+‘L£Ã^R∞H
IºDR∏Ty“\ƒ≈/πXJQA
PµMæRu‘∫L£NVEKH•G
“IÇJÀµºæLMÃNA£Z¢P
§u–ÀAº∑BVW\+VT‘OP
^•S“Ã∆u≈∞ΩD§G∫∫IM
NÀ£S√E/º∫∫Z∆AP∑BV
–≈X—W—∏F∑æ√+πºAºB
∫OTµRu√+∏ƒy—∏^S—W
VZ≈GyKE∏TyAº∫∑L‘∏
HÇFBXº§XADƒ\ΩLÇ•—
∏≈ƒ∑∑∞≈µPORXQF∫G√
ZπJT‘—∏æJI+“BPQW∞
VEX“ºWI∞—EHM£•uIÀ


For the default solution we need to compute statistics like length, number of symbols/letters, 
unique occurences, frequencies and relative frequencies of a given file. This is done in the function `get_statistics` below.

While using `get_statistics`, make sure that `cipher=True` is set when the input is a ciphertext.

In [4]:
def get_statistics(content, cipher=True):
    stats = {}
    content = list(content)
    split_content = [x for x in content if x != '\n' and x!=' ']
    length = len(split_content)
    symbols = set(split_content)
    uniq_sym = len(list(symbols))
    freq = collections.Counter(split_content)
    rel_freq = {}
    for sym, frequency in freq.items():
        rel_freq[sym] = (frequency/length)*100
        
    if cipher:
        stats = {'content':split_content, 'length':length, 'vocab':list(symbols), 'vocab_length':uniq_sym, 'frequencies':freq, 'relative_freq':rel_freq}
    else:
        stats = {'length':length, 'vocab':list(symbols), 'vocab_length':uniq_sym, 'frequencies':freq, 'relative_freq':rel_freq}
    return stats

In [5]:
"""
ATTENTION!
For grading purposes only. Don't bundle with the assignment. 
Make sure '_ref.txt' is removed from the 'data' directory before publishing.
"""

def read_gold(gold_file):
    with open(gold_file) as f:
        gold = f.read()
    f.close()
    gold = list(gold.strip())
    return gold

def symbol_error_rate(dec, _gold):
    gold = read_gold(_gold)
    correct = 0
    if len(gold) == len(dec):
        for (d,g) in zip(dec, gold):
            if d==g:
                correct += 1
    wrong = len(gold)-correct
    error = wrong/len(gold)
    
    return error

In [6]:
cipher_desc = get_statistics(cipher, cipher=True)
#pp.pprint(cipher_desc)

## Load the 6-gram model

In [7]:
%%time
sequence = 'In a few cases, a multilingual artifact has been necessary to facilitate decipherment, the Rosetta Stone being the classic example. Statistical techniques provide another pathway to decipherment, as does the analysis of modern languages derived from ancient languages in which undeciphered texts are written. Archaeological and historical information is helpful in verifying hypothesized decipherments.'

# lm = LM("data/6-gram-wiki-char.lm.bz2", n=6, verbose=True)
lm = LM("data/6-gram-wiki-char.lm.bz2", n=6, verbose=False)

Reading language model from data/6-gram-wiki-char.lm.bz2...


CPU times: user 30 s, sys: 879 ms, total: 30.9 s
Wall time: 31.1 s


Done.


In [8]:
print(sequence)
lm_logprob = lm.score_seq(sequence)
print("TOTAL LM LOGPROB: {}".format(lm_logprob), file=sys.stderr)
print("TOTAL LM LOGPROB: {}".format(lm.score_seq('this is the text.')), file=sys.stderr)
print("TOTAL LM LOGPROB: {}".format(lm.score_seq('jasbklfhthejkldhf')), file=sys.stderr)
print(lm.get_bitstring_spans('..oo...ooo..'))
print(lm.score_bitstring('thisisatest', 'oo...oo.ooo'))

In a few cases, a multilingual artifact has been necessary to facilitate decipherment, the Rosetta Stone being the classic example. Statistical techniques provide another pathway to decipherment, as does the analysis of modern languages derived from ancient languages in which undeciphered texts are written. Archaeological and historical information is helpful in verifying hypothesized decipherments.
{2: 3, 3: 4, 7: 8, 8: 9, 9: 10}
-8.10905897


TOTAL LM LOGPROB: -221.09434842188
TOTAL LM LOGPROB: -9.76947916
TOTAL LM LOGPROB: -40.57683077


# From Yabin: new score function (should be more efficient) 

In [9]:
def score_new(cipher, phi, new_f, new_e, previous_score):
    '''
    scores the phi_prime based on the previous score, returns a dict
    cipher -- list of single character string
    phi -- dictionary, old mapping e->[f]
    new_f -- single-character string, extended symbol
    previous_score -- float, old score for phi
    '''
    mapping = phi
    new_score = previous_score
    lm_state = lm.begin()
    for i in range(len(cipher)):
        char = cipher[i]
        if char in mapping.keys():
            token = mapping[char]
            ngram = lm_state + (token,)
            while len(ngram)> 0:
                if ngram in lm.table:
                    lm_state = ngram[-lm.history:]
                    break
                else: #backoff
                    ngram = ngram[1:]
            if len(ngram)==0:
                lm_state = ()
        elif char == new_f:
            (lm_state, logprob) = lm.score(lm_state, new_e)
            new_score += logprob
        else:
            lm_state = ()  
    return new_score

In [10]:
def beam_search_new(cipher, ext_order, ext_limits=1, topn=1):
    '''
    finds the mappings between cipher char and plaintext char, returns the mapping dictionary
    ext_order -- list, the unigram char list sorted by their count DESC
    ext_limits -- int, defines maximum number of cipher char can be mapped to a plaintext char
    topn -- int, defines the number of dictionaries we want to keep while pruning
    '''
    print('Number of unique symbols in cipher:', len(ext_order))
    # mapping relationships already found with score
    Hs = [(defaultdict(dict), 0)]
    # hypothesis mapping relationships with score
    Ht = []
    # initialize the cardinality (number of unique cipher text)
    cardinality = 0
    # list of plaintext characters
    Ve = [chr(i) for i in range(97, 123, 1)]
    while cardinality < len(ext_order):
    #while cardinality < 2:
        f = ext_order[cardinality]
        print('Working on symbol: ', f, f'({cardinality+1})')
        for phi, previous_score in Hs:
            for e in Ve:
                phi_prime = copy.deepcopy(phi)
                new_map = {f: e}
                phi_prime.update(new_map)
                counts = len([v for k, v in phi_prime.items() if v == e])
                if counts <= ext_limits:
                    Ht.append((phi_prime, score_new(cipher, phi, f, e, previous_score)))
        # prune the histogram
        Ht = sorted(Ht, key=lambda x:x[1], reverse=True)[:topn]                    
        cardinality += 1
        Hs = copy.deepcopy(Ht)
        Ht.clear()
        print('Current score: ', Hs[0][1])
        #print('Hs', Hs)
    return sorted(Hs, key=lambda x:x[1], reverse=True)

# new extension order

In [11]:
def find(string, char):
    return [i for i, letter in enumerate(string) if letter == char]

In [12]:
def gen_bit_str(cipher, deciphered_symbols, voc):
    # generate bit string
    bit_string = ''
    for char in cipher:   
        if (char in deciphered_symbols) or (char == voc):
            bit_string += 'x'
        else: 
            bit_string += '.'
    return bit_string

In [None]:
def get_num_pos(ngram, new_bit_str):
    num_pos = np.zeros(ngram)
    
    # count unigram
    count_uni = 0
    for j in new_bit_str.split('.'):
        if len(j) != 0:
            count_uni += 1
    num_pos[0]=count_uni   

    # count multigram
    to_find = 'xx'
    for i in range(1, ngram):
        s = copy.deepcopy(new_bit_str)
        count = 0

        while to_find in s:
            count += 1 
            s = s[(s.find(to_find) + 1):]
        to_find += 'x'
        num_pos[i]=count
    
    return num_pos

In [None]:
def find_ext_order(cipher, topn=100, weights=(1.0, 1.0, 1.0, 1.0, 2.0, 3.0), ngram = 6):

    # initial
    cipher_desc = get_statistics(cipher, cipher=True)
    weights = np.array(weights)
    cipher_vocab = cipher_desc['vocab']

    #first_symbol = cipher_desc['frequencies'].most_common(1)[0][0] # use the most frequent symbol as first symbol
    
    #cardinality = 1
    cardinality = 0
    
    # symbols already found with score
    # Hs = [([first_symbol], cipher_desc['frequencies'][first_symbol])]
    Hs = [([], 0)]
    # hypothesis extended symbols with score
    Ht = []
    
    print('Done with symbol number', cardinality, '; Current best score: ', Hs[0][1])
    
    while cardinality < cipher_desc['vocab_length']:
        for phi, previous_score in Hs:
            
            deciphered_symbols = phi
            
            
            for voc in cipher_vocab:
                phi_prime = copy.deepcopy(phi)
                if voc in phi_prime:
                    continue
                else:
                    weighted_sum = 0.0
                    num_pos = np.zeros(ngram)
                    
                    bit_string = gen_bit_str(cipher, deciphered_symbols, voc)
                   
                    num_pos = get_num_pos(ngram, bit_string)

                    weighted_sum += np.sum(np.multiply(weights, num_pos))
                    #print("weighted_sum: " + str(weighted_sum))
                    
                    phi_prime.append(voc)
                    #this_score = previous_score + weighted_sum
                    Ht.append((phi_prime, weighted_sum))
        
        # prune the histogram
        Ht = sorted(Ht, key=lambda x:x[1], reverse=True)[:topn]
        cardinality += 1
        Hs = copy.deepcopy(Ht)
        Ht.clear()
        
        print('Done with symbol number', cardinality, '; Current best score: ', Hs[0][1], '; Current worst score: ', Hs[-1][1])
    return sorted(Hs, key=lambda x:x[1], reverse=True)

In [None]:
%%time
# test the function above
ext_order = find_ext_order(cipher, 100)

In [None]:
%%time
mappings = beam_search_new(cipher_desc['content'], ext_order[0][0], 8, 1000)

In [None]:
mapping = mappings[0][0]

In [None]:
decipher_text = ''
for char in cipher_desc['content']:
    decipher_text += mapping[char]
print(decipher_text)
print('score', lm.score_seq(decipher_text))
print(len(decipher_text))

# OLD entension order

In [13]:
def case1(ngram):
    #print("case 1")
    num_pos = np.zeros(ngram)
    num_pos[0] += 1
    return num_pos
def case2(ngram, str_after):
    #print("case 2")
    num_pos = np.zeros(ngram)
    if str_after.find('.') != -1:
        str_after = str_after[:str_after.find('.')]
    #print("after1: "+str_after)
    for i in range(len(str_after)):
        idx = i+1
        num_pos[idx] += 1
        if idx+1 == ngram:
            break    
    return num_pos
def case3(ngram, str_before):
    #print("case 3")
    num_pos = np.zeros(ngram)
    str_before = str_before[str_before.rfind('.')+1:]
    #print("before1: "+str_before)
    for i in range(len(str_before)):
        idx = i+1
        num_pos[idx] += 1
        if idx+1 == ngram:
            break
    return num_pos
def case4(ngram, str_before, str_after):  
    #print("case 4")
    num_pos = np.zeros(ngram)
    str_before = str_before[str_before.rfind('.')+1:]
    if str_after.find('.') != -1:
        str_after = str_after[:str_after.find('.')]
    #print("before1: "+str_before)
    #print("after1: "+str_after)
    num_pos[0] -= 1
    for i in range(len(str_before)):
        idx = i+1
        num_pos[idx] += 1
        if idx+1 == ngram:
            break    
    for i in range(len(str_after)):
        idx = i+1
        num_pos[idx] += 1
        if idx+1 == ngram:
            break    
            
    # ********* to be complete ********* #
    
    return num_pos

In [14]:
def find_ext_order(cipher, topn=100, weights=(1.0, 1.0, 1.0, 1.0, 2.0, 3.0), ngram = 6):

    # initial
    cipher_desc = get_statistics(cipher, cipher=True)
    weights = np.array(weights)
    cipher_vocab = cipher_desc['vocab']

    first_symbol = cipher_desc['frequencies'].most_common(1)[0][0] # use the most frequent symbol as first symbol
    
    cardinality = 1
    
    # symbols already found with score
    Hs = [([first_symbol], 0)]
    # hypothesis extended symbols with score
    Ht = []
    
    print('Done with symbol number', cardinality, '; Current best score: ', Hs[0][1])
    
    while cardinality < cipher_desc['vocab_length']:
        for phi, previous_score in Hs:
            
            deciphered_symbols = phi
            bit_string = gen_bit_str(cipher, deciphered_symbols)
            
            for voc in cipher_vocab:
                phi_prime = copy.deepcopy(phi)
                if voc in phi_prime:
                    continue
                else:
                    weighted_sum = 0.0
                    num_pos = np.zeros(ngram)
                    # for every occurance, 
                    for pos in find(cipher, voc):
                        str_before = bit_string[0:pos]
                        str_after = bit_string[pos+1:]
                        #print("before: "+str_before) 
                        #print("after: "+str_after) 

                        if len(str_before) == 0:
                            if str_after[0] == '.':
                                num_pos += case1(ngram)
                            else:
                                num_pos += case2(ngram, str_after)
                        elif len(str_after) == 0:
                            if str_before[-1] == '.':
                                num_pos += case1(ngram)
                            else:
                                num_pos += case3(ngram, str_before)
                        elif (str_before[-1] == '.') and (str_after[0] == '.'):
                            num_pos += case1(ngram)
                        elif (str_before[-1] == '.') and (str_after[0] == 'x'):
                            num_pos += case2(ngram, str_after)
                        elif (str_before[-1] == 'x') and (str_after[0] == '.'):
                            num_pos += case3(ngram, str_before)
                        else:
                            num_pos += case4(ngram, str_before, str_after)

                    weighted_sum += np.sum(np.multiply(weights, num_pos))
                    #print("weighted_sum: " + str(weighted_sum))
                    
                    phi_prime.append(voc)
                    this_score = previous_score + weighted_sum
                    Ht.append((phi_prime, this_score))
        
        # prune the histogram
        Ht = sorted(Ht, key=lambda x:x[1], reverse=True)[:topn]
        cardinality += 1
        Hs = copy.deepcopy(Ht)
        Ht.clear()
        
        print('Done with symbol number', cardinality, '; Current best score: ', Hs[0][1], '; Current worst score: ', Hs[-1][1])
    return sorted(Hs, key=lambda x:x[1], reverse=True)

In [15]:
%%time
# test the function above
ext_order = find_ext_order(cipher, 100)

Done with symbol number 1 ; Current best score:  0
Done with symbol number 2 ; Current best score:  14.0 ; Current worst score:  1.0
Done with symbol number 3 ; Current best score:  26.0 ; Current worst score:  21.0
Done with symbol number 4 ; Current best score:  38.0 ; Current worst score:  36.0
Done with symbol number 5 ; Current best score:  53.0 ; Current worst score:  49.0
Done with symbol number 6 ; Current best score:  67.0 ; Current worst score:  63.0
Done with symbol number 7 ; Current best score:  84.0 ; Current worst score:  79.0
Done with symbol number 8 ; Current best score:  104.0 ; Current worst score:  97.0
Done with symbol number 9 ; Current best score:  121.0 ; Current worst score:  115.0
Done with symbol number 10 ; Current best score:  141.0 ; Current worst score:  134.0
Done with symbol number 11 ; Current best score:  162.0 ; Current worst score:  156.0
Done with symbol number 12 ; Current best score:  184.0 ; Current worst score:  178.0
Done with symbol number 1

In [16]:
%%time
mappings = beam_search_new(cipher_desc['content'], ext_order[0][0], 8, 1000)

Number of unique symbols in cipher: 54
Working on symbol:  — (1)
Current score:  -14.649315199999997
Working on symbol:  º (2)
Current score:  -28.11302579999999
Working on symbol:  ∑ (3)
Current score:  -38.823369300000024
Working on symbol:  B (4)
Current score:  -47.54285170000002
Working on symbol:  P (5)
Current score:  -58.23973588
Working on symbol:  O (6)
Current score:  -65.43686280000004
Working on symbol:  R (7)
Current score:  -76.55245618000004
Working on symbol:  A (8)
Current score:  -84.98652718000005
Working on symbol:  u (9)
Current score:  -94.97365218000007
Working on symbol:  ∫ (10)
Current score:  -105.59759818000008
Working on symbol:  À (11)
Current score:  -114.83868258000008
Working on symbol:  / (12)
Current score:  -120.66467068000011
Working on symbol:  Z (13)
Current score:  -129.37368138000005
Working on symbol:  ∆ (14)
Current score:  -132.03980188000006
Working on symbol:  V (15)
Current score:  -138.64843168
Working on symbol:  W (16)
Current score:  -

In [17]:
mapping = mappings[0][0]

In [18]:
decipher_text = ''
for char in cipher_desc['content']:
    decipher_text += mapping[char]
print(decipher_text)
print('score', lm.score_seq(decipher_text))
print(len(decipher_text))

teeanarherresreshseatstohteenorettintirtrueassdttestiatthereareaeaerriiinnitaroeestatnaesttotsheeaertiessttraraeshadtthiiaethneereirieersnoertsseraeiitttheteamshasitrreronsrresteasisaiehetttesattaitanendaeeiaerseisseroessttrnretaiiasasnierrrratthesaaeasreisttaarthutrseetisrstsrateenaaetherteesesotasantatheraeersasitesitesentstorsatatetisserohetreatiauirsestitthteereedoessnnnasesantathedsherettsthereissrtr
score -598.56311446898
408


In [19]:
with open('data/_ref_Zodiac_408.txt', 'r') as fh:
    ground_truth = fh.read()
print(ground_truth)
print('score', lm.score_seq(ground_truth))
print(len(ground_truth))

ilikekillingpeoplebecauseitissomuchfunitismorefunthankillingwildgameintheforrestbecausemanisthemostdangeroueanamalofalltokillsomethinggivesmethemostthrillingexperenceitisevenbetterthangettingyourrocksoffwithagirlthebestpartofitisthaewhenidieiwillbereborninparadicesndalltheihavekilledwillbecomemyslavesiwillnotgiveyoumynamebecauseyouwilltrytosloidownorstopmycollectiogofslavesformyafterlifeebeorietemethhpiti
score -359.0030393831197
408


In [20]:
symbol_error_rate(decipher_text, 'data/_ref_Zodiac_408.txt')

0.9509803921568627