# Get Plaintext Dataset

In [293]:
import nltk

In [294]:
nltk.download('genesis')

[nltk_data] Downloading package genesis to
[nltk_data]     C:\Users\rosha\AppData\Roaming\nltk_data...
[nltk_data]   Package genesis is already up-to-date!


True

In [295]:
from nltk.corpus import genesis

In [296]:
genesis_words = genesis.words()
filtered_words = [word for word in genesis_words if word.isalpha()]

print(len(filtered_words))
print(filtered_words[:10])

260396
['In', 'the', 'beginning', 'God', 'created', 'the', 'heaven', 'and', 'the', 'earth']


In [297]:
def make_blocks(word_list):
    word_str = "".join(word_list)
    blocked_arr = []
    
    for i in range(0, len(word_str), 5):
        block = word_str[i:i+5].lower()

        if len(block) == 5:
            blocked_arr.append(block)
        else:
            diff = 5 - len(block)
            padded_block = block + "x" * diff
            blocked_arr.append(padded_block)

    return blocked_arr

In [298]:
pt_blocks = make_blocks(filtered_words)

print(pt_blocks[:10])

['Inthe', 'begin', 'ningG', 'odcre', 'atedt', 'hehea', 'venan', 'dthee', 'arthA', 'ndthe']


# Create Plaintext-Ciphertext dataset

In [299]:
import random

In [300]:
def caesar_encrypt(text, key):
    result = ""

    # traverse text
    for i in range(len(text)):
        char = text[i]

        result += chr((ord(char) + key - 97) % 26 + 97)

    return result

In [301]:
cipher = caesar_encrypt("Eesha ne mera dil Charaya", key=11)
print(cipher)

PpdslyypyxpclyotwyNslcljl


In [302]:
#This function now returns [[pt, ct, key], [pt, ct, key]]
#instead of [[pt, pt, pt], [ct, ct, ct], key] which is big yikes when training

#The function now also generates a random key for each word

def pt_ct_dataset(pt_list):
    final_list = []

    for pt in pt_list:
        key = random.randint(1, 25)
        ct = caesar_encrypt(pt, key)
        final_list.append([pt, ct, key])

    return final_list

In [303]:
def let2num(letter):
    return ord(letter)

def encode(pt_ct_key_list):
    encoded_list = []

    for pairs in pt_ct_key_list:
        encoded_pt_list = []
        encoded_ct_list = []
        
        for char in pairs[0]:
            encoded_pt_list.append(ord(char))

        for char in pairs[1]:
            encoded_ct_list.append(ord(char))

        encoded_list.append([encoded_pt_list, encoded_ct_list, pairs[2]])
    
    return encoded_list

In [304]:
dataset = pt_ct_dataset(pt_blocks)

print(dataset[:5])

[['Inthe', 'Otznk', 6], ['begin', 'ortva', 13], ['ningG', 'bwbuU', 14], ['odcre', 'ncbqd', 25], ['atedt', 'unyxn', 20]]


In [305]:
encoded_dataset = encode(dataset)

print(encoded_dataset[:2])

[[[73, 110, 116, 104, 101], [79, 116, 122, 110, 107], 6], [[98, 101, 103, 105, 110], [111, 114, 116, 118, 97], 13]]


## Re-formatting the pt and ct arrays

In [306]:
pt = []
ct = []

for item in encoded_dataset:
    pt.append(item[0] + [item[2]])
    ct.append(item[1])

print(pt[:2])
print(ct[:2])


[[73, 110, 116, 104, 101, 6], [98, 101, 103, 105, 110, 13]]
[[79, 116, 122, 110, 107], [111, 114, 116, 118, 97]]


In [307]:
print(encoded_dataset[0])

[[73, 110, 116, 104, 101], [79, 116, 122, 110, 107], 6]


In [308]:
print(encoded_dataset[1])

[[98, 101, 103, 105, 110], [111, 114, 116, 118, 97], 13]


# Model