# Get Plaintext Dataset

In [6]:
import nltk

In [7]:
nltk.download('genesis')

[nltk_data] Downloading package genesis to
[nltk_data]     C:\Users\rosha\AppData\Roaming\nltk_data...
[nltk_data]   Package genesis is already up-to-date!


True

In [8]:
from nltk.corpus import genesis

In [9]:
genesis_words = genesis.words()
filtered_words = [word for word in genesis_words if word.isalpha()]

print(len(filtered_words))
print(filtered_words[:10])

260396
['In', 'the', 'beginning', 'God', 'created', 'the', 'heaven', 'and', 'the', 'earth']


In [10]:
def make_blocks(word_list):
    word_str = "".join(word_list)
    blocked_arr = []
    
    for i in range(0, len(word_str), 5):
        block = word_str[i:i+5].lower()

        if len(block) == 5:
            blocked_arr.append(block)
        else:
            diff = 5 - len(block)
            padded_block = block + "x" * diff
            blocked_arr.append(padded_block)

    return blocked_arr

In [11]:
pt_blocks = make_blocks(filtered_words)

print(pt_blocks[:10])

['inthe', 'begin', 'ningg', 'odcre', 'atedt', 'hehea', 'venan', 'dthee', 'artha', 'ndthe']


# Create Plaintext-Ciphertext dataset

In [12]:
import random
import numpy as np

In [13]:
def caesar_encrypt(text, key):
    result = ""

    # traverse text
    for i in range(len(text)):
        char = text[i]

        result += chr((ord(char) + key - 97) % 26 + 97)

    return result

In [14]:
cipher = caesar_encrypt("Eesha ne mera dil Charaya", key=11)
print(cipher)

jpdslyypyxpclyotwyhslcljl


In [15]:
#This function now returns [[pt, ct, key], [pt, ct, key]]
#instead of [[pt, pt, pt], [ct, ct, ct], key] which is big yikes when training

#The function now also generates a random key for each word

def pt_ct_dataset(pt_list):
    final_list = []

    for pt in pt_list:
        key = random.randint(1, 25)
        ct = caesar_encrypt(pt, key)
        final_list.append([pt, ct, key])

    return final_list

In [16]:
def let2num(letter):
    return ord(letter)

def encode(pt_ct_key_list):
    encoded_list = []

    for pairs in pt_ct_key_list:
        encoded_pt_list = []
        encoded_ct_list = []
        
        for char in pairs[0]:
            encoded_pt_list.append(ord(char))

        for char in pairs[1]:
            encoded_ct_list.append(ord(char))

        encoded_list.append([encoded_pt_list, encoded_ct_list, pairs[2]])
    
    return encoded_list

In [17]:
dataset = pt_ct_dataset(pt_blocks)

print(dataset[:5])

[['inthe', 'nsymj', 5], ['begin', 'nqsuz', 12], ['ningg', 'upunn', 7], ['odcre', 'pedsf', 1], ['atedt', 'ngrqg', 13]]


In [18]:
encoded_dataset = encode(dataset)

print(encoded_dataset[:2])

[[[105, 110, 116, 104, 101], [110, 115, 121, 109, 106], 5], [[98, 101, 103, 105, 110], [110, 113, 115, 117, 122], 12]]


## Re-formatting the pt and ct arrays

In [19]:
pt = []
ct = []

for item in encoded_dataset:
    pt.append(item[0] + [item[2]])
    ct.append(item[1])

print(pt[:2])
print(ct[:2])


[[105, 110, 116, 104, 101, 5], [98, 101, 103, 105, 110, 12]]
[[110, 115, 121, 109, 106], [110, 113, 115, 117, 122]]


In [20]:
pt = np.array(pt)
print(pt.shape)
print(pt[0])

ct = np.array(ct)
print(ct.shape)
print(ct[0])

(220730, 6)
[105 110 116 104 101   5]
(220730, 5)
[110 115 121 109 106]


# Model

In [21]:
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense

In [23]:
model = Sequential()
model.add(Dense(10, activation="relu", input_dim=6))
model.add(Dense(100, activation="relu"))
model.add(Dense(5, activation="softmax"))

In [26]:
model.compile(optimizer="adam", loss="mean_squared_error", metrics = ["accuracy"])

In [27]:
model.fit(pt[:1000], ct[:1000], epochs=100, batch_size= 10,verbose=2)

Epoch 1/100


100/100 - 1s - loss: 12053.9961 - accuracy: 0.2150 - 912ms/epoch - 9ms/step
Epoch 2/100
100/100 - 0s - loss: 12053.9883 - accuracy: 0.2150 - 85ms/epoch - 853us/step
Epoch 3/100
100/100 - 0s - loss: 12053.9902 - accuracy: 0.2150 - 89ms/epoch - 895us/step
Epoch 4/100
100/100 - 0s - loss: 12053.9902 - accuracy: 0.2150 - 90ms/epoch - 901us/step
Epoch 5/100
100/100 - 0s - loss: 12053.9873 - accuracy: 0.2150 - 86ms/epoch - 860us/step
Epoch 6/100
100/100 - 0s - loss: 12053.9873 - accuracy: 0.2150 - 90ms/epoch - 902us/step
Epoch 7/100
100/100 - 0s - loss: 12053.9893 - accuracy: 0.2150 - 87ms/epoch - 869us/step
Epoch 8/100
100/100 - 0s - loss: 12053.9863 - accuracy: 0.2150 - 88ms/epoch - 882us/step
Epoch 9/100
100/100 - 0s - loss: 12053.9893 - accuracy: 0.2150 - 86ms/epoch - 858us/step
Epoch 10/100
100/100 - 0s - loss: 12053.9902 - accuracy: 0.2150 - 85ms/epoch - 848us/step
Epoch 11/100
100/100 - 0s - loss: 12053.9873 - accuracy: 0.2150 - 88ms/epoch - 877us/step
Epoch 12/100
100/1

<keras.src.callbacks.History at 0x2acc83daaa0>