In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import string
import re
import nltk
import random

In [72]:
# Load the whole training corpus into memory as a single string.
# Read-only text mode is open()'s default, so the mode argument is omitted.
with open('moby_dick.txt') as file:
    text = file.read()

In [248]:
class Cipher:
    """Substitution-cipher demo cracked with a genetic algorithm.

    The plaintext is used to fit a first-order letter model (start-letter
    distribution ``pi`` and letter-to-letter transition matrix ``markov``),
    is encrypted with a random substitution key, and the ciphertext is then
    decoded by evolving candidate keys ("DNA") that maximise the
    log-likelihood of the decoded text under that model.

    Note: constructing an instance runs the complete pipeline (model
    fitting, encryption, every training epoch and the final report).

    Attributes:
        text: plaintext; expected to contain only ``a-z`` and spaces
            (``Cipher.begin`` guarantees this).
        text_to_encode: the encrypted version of ``text`` (ciphertext).
        cheat_mapper: the secret key, mapping plaintext -> cipher letter.
        markov: 26x26 matrix; row ``i`` holds P(next letter | letter ``i``).
        pi: length-26 vector of first-letter probabilities.
        dna_pool: current survivors; each DNA is a list of 26 letters where
            entry ``i`` is the plaintext guess for cipher letter ``i``.
        offspring: candidates (parents + mutated children) awaiting scoring.
        dna_size: number of survivors kept per generation.
        n_child: number of mutated children bred per survivor.
        epochs: number of generations to run.
    """

    _ALPHABET = string.ascii_lowercase

    def __init__(self, text, dna_size, n_child, epochs):
        self.text = text
        self.text_to_encode = None

        self.cheat_mapper = None

        # Counts start at one (add-one smoothing) so that no probability is
        # ever zero and log() stays finite for unseen letter pairs.
        n_letters = len(self._ALPHABET)
        self.markov = np.ones(shape=(n_letters, n_letters))
        self.pi = np.ones(shape=n_letters)

        self.offspring = None
        self.dna_size = dna_size
        self.n_child = n_child
        self.dna_pool = None
        self.epochs = epochs
        self.control_function()

    def control_function(self):
        """Run the full pipeline: fit model, encrypt, evolve, report."""
        self.cheat_mapper = self.mix_letters()
        self.text_reading()
        self.text_to_encode = self.encode_text(self.text)

        self.dna_pool = self.creat_dna_pool(self.dna_size)
        self.offspring = self.evolve_offspring(self.dna_pool, self.n_child)

        for i in range(self.epochs):
            self.training()
            print('Epochs #{0} is done'.format(i))

        # A perfect DNA is the inverse of the secret key: for every cipher
        # letter it names the plaintext letter that produced it.
        decode_key = {cipher: plain for plain, cipher in self.cheat_mapper.items()}
        target = [decode_key[letter] for letter in self._ALPHABET]
        for num, dna in enumerate(self.dna_pool):
            Cipher.dna_score(dna, num, target)

    def mix_letters(self):
        """Return a random substitution key {plain letter: cipher letter}."""
        shuffled = list(self._ALPHABET)
        random.shuffle(shuffled)
        return dict(zip(self._ALPHABET, shuffled))

    @staticmethod
    def _letter_index(char):
        """Map 'a'..'z' to 0..25; reject anything else.

        Raises:
            IndexError: if ``char`` is not a lowercase ASCII letter. Without
                this check a negative offset would silently wrap around and
                update or read the wrong cell.
        """
        index = ord(char) - ord('a')
        if not 0 <= index < 26:
            raise IndexError('not a lowercase ASCII letter: {0!r}'.format(char))
        return index

    def text_reading(self):
        """Fit ``pi`` and ``markov`` from ``self.text`` and normalise them.

        Tokens containing anything other than ``a-z`` are skipped. The
        counts are accumulated on top of the current arrays, so this is
        meant to be called once per instance.
        """
        letters = set(self._ALPHABET)

        for word in nltk.word_tokenize(self.text):
            if not word or not set(word) <= letters:
                continue

            self.pi_update(word[0])
            for char1, char2 in zip(word, word[1:]):
                self.markov_update(char1, char2)

        self.pi = self.pi / self.pi.sum()
        # keepdims=True makes the row sums a column vector so every ROW is
        # divided by its own total; a flat vector would broadcast across
        # columns and leave rows that do not sum to 1.
        self.markov = self.markov / self.markov.sum(axis=1, keepdims=True)

    def pi_update(self, char):
        """Count one occurrence of ``char`` as a word-initial letter."""
        self.pi[self._letter_index(char)] += 1

    def markov_update(self, char1, char2):
        """Count one ``char1`` -> ``char2`` transition."""
        self.markov[self._letter_index(char1), self._letter_index(char2)] += 1

    def word_propability(self, word):
        """Return the log-probability of ``word`` under the letter model.

        log P(word) = log pi[first] + sum over ALL consecutive letter pairs
        of log markov[prev, next]. An empty word contributes 0.0.
        """
        if not word:
            return 0.0

        logp = np.log(self.pi[self._letter_index(word[0])])
        for prev_char, next_char in zip(word, word[1:]):
            logp += np.log(
                self.markov[self._letter_index(prev_char), self._letter_index(next_char)]
            )
        return logp

    def sequence_propability(self, sequence):
        """Return the summed word log-probabilities of ``sequence``.

        The sequence is normalised with ``text_preprocessing`` first, so it
        may contain arbitrary punctuation and casing.
        """
        sequence = Cipher.text_preprocessing(sequence)

        logp = 0
        for word in nltk.word_tokenize(sequence):
            logp += self.word_propability(word)
        return logp

    def encode_text(self, text):
        """Encrypt ``text`` with the secret key.

        Characters without an entry in ``cheat_mapper`` (spaces, and any
        stray symbol) are passed through unchanged.
        """
        return text.translate(str.maketrans(self.cheat_mapper))

    def creat_dna_pool(self, dna_size):
        """Return ``dna_size`` random permutations of the alphabet."""
        dna_pool = []
        for _ in range(dna_size):
            dna = list(self._ALPHABET)
            random.shuffle(dna)
            dna_pool.append(dna)

        return dna_pool

    def evolve_offspring(self, dna_pool, n_child):
        """Return the parents plus ``n_child`` mutated copies of each.

        A mutation swaps two randomly chosen positions of the parent. The
        parents themselves are never modified.
        """
        offspring = dna_pool.copy()

        for dna in dna_pool:
            for _ in range(n_child):
                child = dna.copy()

                # randint's upper bound is exclusive, so it must be the
                # length itself for the last position to be reachable.
                first = np.random.randint(low=0, high=len(child))
                second = np.random.randint(low=0, high=len(child))

                child[first], child[second] = child[second], child[first]
                offspring.append(child)

        return offspring

    @staticmethod
    def text_preprocessing(text):
        """Lower-case ``text`` and reduce it to ``a-z`` words.

        Every character that is not a letter becomes a space, and all words
        of all sentences are joined by single spaces.
        """
        cleaned = []

        for sentence in nltk.sent_tokenize(text):
            review = sentence.lower()
            # Digits and punctuation are removed here as well.
            review = re.sub('[^a-z]', ' ', review)
            review = re.sub(r'\s+', ' ', review)

            tokens = nltk.word_tokenize(review)
            if tokens:
                cleaned.append(' '.join(tokens))

        # Sentences must be separated too, otherwise the last word of one
        # sentence is glued to the first word of the next.
        return ' '.join(cleaned)

    def training(self):
        """Run one generation: score, select survivors, breed offspring.

        Every distinct candidate in ``self.offspring`` decodes the
        ciphertext and is scored by the log-likelihood of the result. The
        best ``dna_size`` candidates become ``self.dna_pool`` (ordered from
        worst to best) and a fresh ``self.offspring`` is bred from them.
        """
        # Keyed by the DNA itself so that different candidates with equal
        # scores are both kept, while exact duplicates are scored once.
        scores = {}

        for dna in self.offspring:
            key = tuple(dna)
            if key in scores:
                continue

            table = str.maketrans(dict(zip(self._ALPHABET, dna)))
            decoded = self.text_to_encode.translate(table)
            scores[key] = self.sequence_propability(decoded)

        ranked = sorted(scores, key=scores.get)
        survivors = ranked[max(len(ranked) - self.dna_size, 0):]

        self.dna_pool = [list(dna) for dna in survivors]
        # Without new offspring every later epoch would re-score the very
        # same candidates and the search would never progress.
        self.offspring = self.evolve_offspring(self.dna_pool, self.n_child)

    @staticmethod
    def dna_score(dna, num, target=None):
        """Print and return the fraction of letters ``dna`` gets right.

        Args:
            dna: sequence of 26 letters (guess for each alphabet position).
            num: label printed in the report line.
            target: the correct letter for each alphabet position. Defaults
                to the alphabet itself (identity mapping).

        Returns:
            Accuracy as a float between 0 and 1.
        """
        if target is None:
            target = Cipher._ALPHABET

        score = sum(1 for guess, truth in zip(dna, target) if guess == truth)
        accuracy = score / len(Cipher._ALPHABET)

        print('Accuracy of DNA#{0} is {1}%'.format(num, accuracy*100))
        return accuracy

    @staticmethod
    def begin(text, dna_size, n_child, epochs):
        """Clean raw ``text`` and build (and thereby run) a ``Cipher``."""
        text = Cipher.text_preprocessing(text)
        return Cipher(text, dna_size, n_child, epochs)
        
        

In [None]:
# Clean the corpus, then build the solver; construction runs all epochs.
obj1 = Cipher.begin(text, dna_size=20, n_child=5, epochs=100)