### RNN: Caesar cipher

In [1]:
import time
import random
import re
import numpy as np
import pandas as pd
import torch
import torch.nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [2]:
# Training hyperparameters.
batch_size = 32        # number of sequences per DataLoader batch
num_epochs = 10        # number of passes over the DataLoader
learning_rate = 0.05   # learning rate handed to the optimizer
# Plain-text file that is read and split into sentences.
file_name = './data/article.txt'
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Number of alphabet positions each character is shifted by.
caesar_offset = 3
# Cipher alphabet: lowercase letters, uppercase letters, then a single space.
alphabet = ('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ ')
# The same characters rotated left by `caesar_offset`, so alphabet[i] lines up
# with alphabet[(i + caesar_offset) % len(alphabet)].
shifted_alphabet = alphabet[caesar_offset:] + alphabet[:caesar_offset]

# Lookup tables between characters and their positions in `alphabet`.
char_to_index = {c: i for i, c in enumerate(alphabet)}
index_to_char = list(alphabet)

In [4]:
def load_and_preprocess(txt_path):
    """Read a text file and split it into cleaned sentences.

    Every character other than an ASCII letter, a period or a space is
    replaced by a space, whitespace runs are collapsed to one space, and a
    stand-alone ``s`` surrounded by spaces is glued onto the preceding word.
    The text is then split on periods.

    Args:
        txt_path: Path of the UTF-8 encoded text file to read.

    Returns:
        list[str]: One entry per period-delimited chunk with surrounding
        spaces stripped. Empty chunks (such as the one after a trailing
        period) are kept as empty strings.
    """
    with open(txt_path, encoding='utf-8') as txt_file:
        text = txt_file.read()

    # Raw strings keep the regex escapes from being parsed as (invalid)
    # Python string escapes.
    text = re.sub(r'[^a-zA-Z\. ]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'( s )', 's ', text)

    # Newlines were already turned into spaces by the first substitution,
    # so the text can be split on periods directly.
    data = [chunk.strip(' ') for chunk in text.split('.')]
    print(f'Number of strings in data list: {len(data)}')
    return data

def encrypt_text(text, offset, alphabet=alphabet,
                 shifted_alphabet=shifted_alphabet):
    """Encrypt each item of ``text`` by character substitution.

    Every character found in ``alphabet`` is replaced by the character at the
    same position in ``shifted_alphabet``; other characters pass through
    unchanged.

    Args:
        text: Sequence of strings (a list of sentences, or a single string,
            which is then processed character by character).
        offset: Not read by the body; the shift is fully determined by
            ``shifted_alphabet``.
        alphabet: Source characters of the substitution.
        shifted_alphabet: Replacement characters, aligned with ``alphabet``.

    Returns:
        list[str]: The translated items, in input order.
    """
    substitution = str.maketrans(alphabet, shifted_alphabet)
    return [item.translate(substitution) for item in text]


def decrypt_text(encrypted_text, alphabet=alphabet,
                 shifted_alphabet=shifted_alphabet):
    """Reverse the substitution on each item of ``encrypted_text``.

    Every character found in ``shifted_alphabet`` is replaced by the character
    at the same position in ``alphabet``; other characters pass through
    unchanged.

    Args:
        encrypted_text: Sequence of encrypted strings.
        alphabet: Plain-text characters to map back to.
        shifted_alphabet: Cipher characters, aligned with ``alphabet``.

    Returns:
        list[str]: The translated items, in input order.
    """
    inverse = str.maketrans(shifted_alphabet, alphabet)
    return [item.translate(inverse) for item in encrypted_text]

In [5]:
# Load the sentences, encrypt them, then decrypt the result again so both
# forms are available side by side.
data = load_and_preprocess(file_name)
encrypted = encrypt_text(data, caesar_offset)
decrypted = decrypt_text(encrypted)

Number of strings in data list: 418


In [6]:
# Put each decrypted sentence next to its encrypted form for inspection.
df = pd.DataFrame({'Decrypted_text': decrypted,'Encrypted_text': encrypted})
df.head()

Unnamed: 0,Decrypted_text,Encrypted_text
0,Childrens experiences and propaganda,FkloguhqvchAshulhqfhvcdqgcsursdjdqgd
1,Curator Ian Cooke discusses the ways in which ...,FxudwrucLdqcFrrnhcglvfxvvhvcwkhczdBvclqczklfkc...
2,Children were affected by the First World War ...,FkloguhqczhuhcdiihfwhgceBcwkhcIluvwcZruogcZduc...
3,For the first time war impacted on whole popul...,Irucwkhciluvwcwlphczduclpsdfwhgcrqczkrohcsrsxo...
4,Technological innovations brought the threat o...,Whfkqrorjlfdoclqqrydwlrqvceurxjkwcwkhcwkuhdwcr...


In [7]:
# Save the sentence pairs to CSV without writing the DataFrame index.
df.to_csv('./data/article_decr_encr.csv', index=False)

In [8]:
# Tokenization: turn every sentence into integer sequences.
# `text` keeps each sentence as a list of its characters.
text = [list(phrase) for phrase in data]
# Targets: alphabet index of every plain-text character.
Y = [[char_to_index[symbol] for symbol in phrase] for phrase in data]
# Inputs: target index plus the offset, without wrapping around, so values
# can reach len(alphabet) + caesar_offset - 1.
X = [[index + caesar_offset for index in target_ids] for target_ids in Y]

In [9]:
# Padding: convert every index list to a tensor, then pad all sequences to
# the length of the longest one (pad_sequence's default fill value is used).
Y_tensor = list(map(torch.as_tensor, Y))
X_tensor = list(map(torch.as_tensor, X))

Y_padded = pad_sequence(Y_tensor, batch_first=True)
X_padded = pad_sequence(X_tensor, batch_first=True)

print(f"Y shape: {Y_padded.shape}")
print(f"X shape: {X_padded.shape}")

Y shape: torch.Size([418, 374])
X shape: torch.Size([418, 374])


In [10]:
# Pair each shifted index sequence with its plain-text target sequence.
dataset = TensorDataset(X_padded, Y_padded)
# NOTE(review): this rebinds `data`, which until now held the list of
# sentences, to the DataLoader -- cells that read the sentence list cannot be
# re-run after this one.
data = DataLoader(dataset, batch_size, shuffle=True)

#### Model

In [11]:
class BasicRNN(torch.nn.Module):
    """Per-position classifier: embedding -> RNN -> linear layer.

    Produces one vector of class scores for every position of the input.
    """

    def __init__(self, dictionary_size, embedding_size,
                 num_hiddens, num_classes):
        """Create the layers.

        Args:
            dictionary_size: Number of rows in the embedding table.
            embedding_size: Width of each embedding vector.
            num_hiddens: Size of the RNN hidden state.
            num_classes: Number of scores emitted per position.
        """
        super().__init__()
        self.num_hiddens = num_hiddens
        self.embedding = torch.nn.Embedding(dictionary_size, embedding_size)
        self.rnn = torch.nn.RNN(embedding_size, num_hiddens, batch_first=True)
        self.output = torch.nn.Linear(num_hiddens, num_classes)

    def forward(self, X):
        """Return class scores for every position of index tensor ``X``.

        The RNN is built with ``batch_first=True``; its final hidden state is
        discarded and only the per-step outputs are fed to the linear layer.
        """
        hidden_states, _ = self.rnn(self.embedding(X))
        return self.output(hidden_states)

In [12]:
# Both the embedding table and the output layer get len(char_to_index) +
# caesar_offset entries; the embedding width is 28 and the RNN hidden size 64.
model = BasicRNN(len(char_to_index) + caesar_offset, 28, 64,
                 len(char_to_index) + caesar_offset).to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [13]:
# Training loop: one optimizer step per batch, mean batch loss reported per epoch.
for epoch in range(num_epochs):
    start = time.time()
    train_loss = 0.
    train_passed = 0

    model.train()
    for X_b, y_b in data:
        # The model lives on `device`, so every batch has to be moved there
        # as well; otherwise the forward pass fails when CUDA is in use.
        X_b = X_b.to(device)
        y_b = y_b.to(device)

        optimizer.zero_grad()
        answers = model(X_b)
        # Flatten scores to (positions, classes) and targets to (positions,)
        # so the loss sees one row per character position. The class count is
        # taken from the scores themselves. Padded positions are included.
        answers = answers.reshape(-1, answers.shape[-1])
        y_b = y_b.reshape(-1)
        loss = criterion(answers, y_b)
        train_loss += loss.item()

        loss.backward()
        optimizer.step()
        train_passed += 1

    print("Epoch {}   Time {:.3f}    Train Loss: {:.3f}".format(epoch, time.time() - start,
                                                                train_loss / train_passed))

Epoch 0   Time 4.595    Train Loss: 0.840
Epoch 1   Time 3.222    Train Loss: 0.038
Epoch 2   Time 1.995    Train Loss: 0.011
Epoch 3   Time 2.338    Train Loss: 0.004
Epoch 4   Time 2.127    Train Loss: 0.002
Epoch 5   Time 2.290    Train Loss: 0.001
Epoch 6   Time 2.446    Train Loss: 0.001
Epoch 7   Time 2.603    Train Loss: 0.001
Epoch 8   Time 2.655    Train Loss: 0.000
Epoch 9   Time 3.003    Train Loss: 0.000


In [14]:
# Pick one random training sentence to show next to its prediction.
idx = random.randint(0, len(X_padded) - 1)

# Inference only: switch to eval mode and disable autograd so the forward
# pass over the whole dataset does not build a computation graph.
model.eval()
with torch.no_grad():
    results = model(X_padded.to(device)).argmax(dim=2)

# NOTE: the comparison runs over every position of the padded tensors, so
# padding positions are counted in this accuracy as well.
acc = (results == Y_padded.to(device)).flatten()
acc = (acc.sum() / acc.shape[0])
# `.tolist()` yields plain Python ints for indexing the lookup list.
out_sentence = "".join([index_to_char[i] for i in results[idx].tolist()])
true_sentence = "".join([index_to_char[i] for i in Y_padded[idx].tolist()])

print(f"Train accuracy: {acc:.3f}")
print(f"Validation sentence: '{out_sentence[:77]}'")
print(f"True sentence: '{true_sentence[:77]}'")

Train accuracy: 1.000
Validation sentence: 'The hope that love could bring to the soldier on the front lines is captured '
True sentence: 'The hope that love could bring to the soldier on the front lines is captured '


_Taking a phrase from outside the dataset:_

In [15]:
# Passage used to try the model on text from outside the dataset.
sentence = """Language changes very subtly whenever speakers come into
contact with each other. No two individuals speak identically: people from
different geographical places clearly speak differently, but even within
the same small community there are variations according to a speaker's age,
gender, ethnicity and social and educational background. Through our interactions
with these different speakers, we encounter new words, expressions and
pronunciations and integrate them into our own speech."""

In [16]:
def preprocess(sentence):
    """Reduce free text to ASCII letters separated by single spaces.

    Every character other than an ASCII letter or a space is replaced by a
    space, whitespace runs are collapsed to one space, and a stand-alone
    ``s`` surrounded by spaces is glued onto the preceding word.

    Args:
        sentence: The text to clean.

    Returns:
        str: The cleaned text. Leading or trailing single spaces are kept.
    """
    # Raw strings keep `\s` from being parsed as an invalid Python escape.
    sentence = re.sub(r'[^a-zA-Z ]', ' ', sentence)
    sentence = re.sub(r'\s+', ' ', sentence)
    sentence = re.sub(r'( s )', 's ', sentence)
    return sentence

# Clean the passage, then build its encrypted text (for display) and the
# shifted index sequence that is fed to the model.
sentence = preprocess(sentence)
encrypted_sentence = encrypt_text(sentence, caesar_offset, alphabet=alphabet)
encrypted_idx = [caesar_offset + char_to_index[symbol] for symbol in sentence]

# Run the model on a batch of one and keep the best-scoring class per position.
result = model(torch.tensor([encrypted_idx]).to(device)).argmax(dim=2)
# Expected plain-text indices, one single-element row per character.
Y = torch.tensor([[char_to_index[symbol]] for symbol in sentence])
# Fraction of positions where the prediction equals the expected index.
acc = (Y.reshape(1, -1).to(device) == result).flatten()
acc = acc.sum() / acc.shape[0]
decrypted_sentence = "".join(alphabet[i] for i in result.flatten())

In [17]:
# Report the accuracy, then both texts wrapped at `length` characters per line.
print(f"Accuracy: {acc:.3f}")
print("_" * 17)
length = 133
print("Encrypted sentence:")
for i in range(0, len(encrypted_sentence), length):
    # `encrypted_sentence` is a list of characters, so each slice is joined.
    print("".join(encrypted_sentence[i:i + length]))
print("-" * 17)
print("Decrypted sentence:")
for i in range(0, len(decrypted_sentence), length):
    print(decrypted_sentence[i:i + length])

Accuracy: 1.000
_________________
Encrypted sentence:
OdqjxdjhcfkdqjhvcyhuBcvxewoBczkhqhyhucvshdnhuvcfrphclqwrcfrqwdfwczlwkchdfkcrwkhucQrcwzrclqglylgxdovcvshdnclghqwlfdooBcshrsohciurpcgli
ihuhqwcjhrjudsklfdocsodfhvcfohduoBcvshdncgliihuhqwoBcexwchyhqczlwklqcwkhcvdphcvpdoocfrppxqlwBcwkhuhcduhcyduldwlrqvcdffruglqjcwrcdcvsh
dnhuvcdjhcjhqghuchwkqlflwBcdqgcvrfldocdqgchgxfdwlrqdocedfnjurxqgcWkurxjkcrxuclqwhudfwlrqvczlwkcwkhvhcgliihuhqwcvshdnhuvczhchqfrxqwhuc
qhzczrugvchAsuhvvlrqvcdqgcsurqxqfldwlrqvcdqgclqwhjudwhcwkhpclqwrcrxucrzqcvshhfkc
-----------------
Decrypted sentence:
Language changes very subtly whenever speakers come into contact with each other No two individuals speak identically people from dif
ferent geographical places clearly speak differently but even within the same small community there are variations according to a spe
akers age gender ethnicity and social and educational background Through our interactions with these different speakers we encounter 
new words expressions a