In [20]:
# Importing neccessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import Adam

In [2]:
# reading excel file containing training dataset
xls = pd.ExcelFile('CryptoGrams.xlsx')
xls.sheet_names

['Vigenere', 'SS']

In [3]:
# extracting simple substitution and vignere substitution encyption training data in different dataframes
ss = pd.read_excel(xls, "SS", header=None, names=["cipher"])                    # simple substitution
vig = pd.read_excel(xls, "Vigenere", header=None, names=["cipher"])             # vignere substituion

# Data Preprocessing

In this classification task, a cipher is considered as a bag of words (letters to be exact) and it is represented by a vector whose elements correspond to the frequency of occurrence of different characters in the cipher. The dimension of the vector is the same as the size of the dictionary built by including all the distinct words that occur in a corpus of ciphers. Let N be the size of the dictionary. Let ti be the ith word or term in the dictionary, and tf(ti,d) be the frequency of occurrence of ti in a given cipher d.

A dictionary is constructed using a number of cipher texts in a corpus. We consider two methods for constructing the dictionary. In the first method, cipher texts generated using different encryption methods are included in a single corpus. This method is called the common dictionary method.

In [4]:
# concatinating the 2 dataframes to create a one large training set
# This is created so that we can use common "dictionary scheme" 
df = pd.concat([ss, vig])

In [5]:
# Creating training labels
# 0 -> simple substitution
# 1-> vignere substitution
y =  [0]*50 + [1]*50

In [6]:
# Now we find tfidf for every character in a cipher
v = TfidfVectorizer(analyzer='char')      # Here we create a tfidf object to find the tfidf of a cipher with focus on every character rather than every word
x = v.fit_transform(df['cipher'])         # we convert out text data into numbers that represent their tfidf representation
len(v.get_feature_names())                # it gives us number of charcters in our dictionary

27

In [7]:
# processing train data to feed into neural network
import numpy as np
x = x.todense()
x = np.array(x)
y = np.array(y)
y = y.reshape((100,1))
print("x shape : ", x.shape)
print("y shape : ", y.shape)

x shape :  (100, 27)
y shape :  (100, 1)


In [None]:
# following code converts the test dataset into a test.csv file for further predicting tasks.
import docx
doc = docx.Document("dataset_cryptosystem.docx")

text = []
for p in doc.paragraphs:
    text.append(p.text)
    
len(text)

d = [0,1,2,3,4,5,6,7,8,9,10,11, 12, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 34, 35, 36, 38]
d = sorted(d, reverse=True)
for i in d:
    del text[i]
test = pd.DataFrame(text, columns=["cipher"])
# test.to_csv("test.csv", index_label=False, header=False)

In [8]:
# preprocessing test data
#test = pd.read_csv("test.csv", header=None, names=["cipher"])

x_test = v.fit_transform(test['cipher'])

x_test = x_test.todense()
x_test = np.array(x_test)

# Model

In [31]:
# Neural Network
model = Sequential()                                          # sequential network
model.add(Dense(64, input_dim=27, activation='relu'))         # adding 5 dense layer having 64 hidden units
model.add(Dense(64, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.4))                                        # dropout
model.add(Dense(1, activation='sigmoid'))                      # output layer

model.compile(loss='binary_crossentropy',                      # compiling the model with objective function-binary crossentropy, and optimizer-SGD
              optimizer='SGD',
              metrics=['accuracy'])

model.fit(x, y,                                                # fitting the model
          epochs=20,
          batch_size=128)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fe3fcee9b70>

In [32]:
# Predictions
y_test = model.predict(x_test)
pred = []
for i in range(len(y_test)):
    if y_test[i]>=0.5:
        pred.append("vignere")
    else:
        pred.append("simple substitution")
pred

['vignere',
 'vignere',
 'vignere',
 'vignere',
 'vignere',
 'vignere',
 'vignere',
 'vignere',
 'vignere',
 'vignere',
 'vignere',
 'simple substitution',
 'simple substitution',
 'vignere',
 'vignere',
 'vignere',
 'vignere',
 'vignere',
 'vignere',
 'simple substitution']