In [1]:
# import libraries
import pickle
import codecs
import re
import random
import numpy as np
from copy import deepcopy
from nltk.tokenize import word_tokenize

In [2]:
TEST = False
WRITE = True
KEY = "prideAndPrejudice.txt"
CORPUS = "movie_lines.tsv"
MAXLENGTH = 30
MAXREPEAT = 3

In [3]:
def cleanTokenize(data):
    data = re.sub("[^\x41-\x7A]+", ' ', data)
    data = re.sub("[\x5B-\x60]+", ' ', data)
    words = word_tokenize(data.lower())
    return words

In [4]:
class dic_recorder():
    def __init__(self):
        self.__count = 0
        self.dict = {}
        self.rev_dict = {}

    def record(self, w):
        if(type(w)) == list:
            w = tuple(w)
        self.__count = self.__count + 1
        self.dict[w] = self.__count
        self.rev_dict[self.__count] = w

    def find(self, w):
        if w not in self.dict:
            wo = []
            num = []
            wo.append("UNK")
            num.append(self.find("UNK")[1])
            for char in w:
                wo.append(char)
                num.append(self.find(char)[1])
            wo.append("UNK")
            num.append(self.find("UNK")[1])
            w = wo
        else:
            num = self.dict[w]
        return w, num

In [5]:
class map_recorder():
    def __init__(self):
        self.mapping = {}
        self.th = MAXREPEAT

    def count(self, w):
        if w in self.mapping:
            return len(self.mapping[w])
        return 0

    def record(self, p, c):
        count = self.count(p)
        if count == 0:
            self.mapping[p] = [c]
            return True
        if count >= self.th:
            return False
        self.mapping[p].append(c)
        return True

    def encrypt(self, w):
        count = self.count(w)
        if count == 0:
            tmp = []
            tmp.append(self.mapping["UNK"])
            for char in w:
                tmp.append(self.mapping[char][random.randint(0, self.count(char) - 1)])
            tmp.append(self.mapping["UNK"])
        else:
            tmp = self.mapping[w][random.randint(0, count - 1)]
        return tmp

In [6]:
class indexer():
    def __init__(self):
        self.index = [1, 1, 1]
        self.wps = 9
        self.spp = 9

    def next(self):
        ind = deepcopy(self.index)
        if self.index[2] == self.wps - 1:
            if self.index[1] == self.spp - 1:
                self.index[0] = self.index[0] + 1
            self.index[1] = (self.index[1]) % self.spp + 1
        self.index[2] = (self.index[2]) % self.wps + 1
        return ind

In [7]:
def reformat(line, corpus_r):
    l = []
    li = []
    for word in line:
        w, i = corpus_r.find(word)
        if type(w) == list:
            l.extend(w)
            li.extend(i)
        else:
            l.append(w)
            li.append(i)
    return l, li

In [8]:
#with codecs.open(KEY, 'r', 'utf-8-sig') as f:
    #data = f.read()

In [9]:
with open(KEY, 'rb') as f:
    data = f.read().decode('utf8')
words = cleanTokenize(data)
print("key file read and tokenized")

  data = re.sub("[\x5B-\x60]+", ' ', data)


key file read and tokenized


In [10]:
corpus_r = dic_recorder()
cypher_r = dic_recorder()
mr = map_recorder()
id = indexer()
mr.record("START", [0, 0, 1])
mr.record("END", [0, 0, 2])
mr.record("UNK", [0, 0, 3])

for i in range(3):
    cypher_r.record([0, 0, i + 1])

for i in range(len(words)):
    word = words[i]
    ind = id.next()
    if mr.record(word, ind):
        cypher_r.record(ind)
    if i % 100 == 0:
        ind = id.next()
        char = chr(random.randint(33, 126))
        if(mr.record(char, ind)):
            cypher_r.record(ind)
for i in range(33, 127):
    char = chr(i)
    if mr.count(char) == 0:
        ind = id.next()
        if(mr.record(char, ind)):
            cypher_r.record(ind)
        
print("word mapping and cypher dictionaries created")

for key, value in mr.mapping.items():
    corpus_r.record(key)
print("corpus dictionaries created")

word mapping and cypher dictionaries created
corpus dictionaries created


In [11]:
with open(CORPUS, 'rb') as f:
    lines = f.readlines()
new_lines = []
length = len(lines)
if TEST:
    length = 50
for i in range(length):
    line = lines[i].decode('utf8').split('\t')[4]
    new_lines.append(cleanTokenize(line))
print("corpus created")

corpus created


In [12]:
res = np.zeros((len(new_lines), 3, MAXLENGTH), dtype = np.int32)

start = cypher_r.dict[tuple(mr.encrypt("START"))]
end = cypher_r.dict[tuple(mr.encrypt("END"))]
res[:, 0, 0] = start
res[:, 1, 0] = start
res[:, 2, 0] = 1

for i in range(len(new_lines)):
    j = 0
    theline = new_lines[i]
    theline, cypher = reformat(theline, corpus_r)
    while j < len(theline):
        if j == MAXLENGTH - 2:
            break
        w = theline[j]
        num = cypher[j]
        res[i][0][j + 1] = num
        res[i][1][j + 1] = cypher_r.dict[tuple(mr.encrypt(w))]
        res[i][2][j + 1] = j + 2
        j = j + 1
    res[i][0][j + 1] = end
    res[i][1][j + 1] = end
    res[i][2][j + 1] = j + 2
print("corpus mapped")

corpus mapped


In [13]:
if WRITE:
    pickle.dump(corpus_r.dict, open("corpus_dict.p", "wb"))
    pickle.dump(corpus_r.rev_dict, open("corpus_rev_dict.p", "wb"))
    pickle.dump(cypher_r.dict, open("cypher_dict.p", "wb"))
    pickle.dump(cypher_r.rev_dict, open("cypher_rev_dict.p", "wb"))
    pickle.dump(mr.mapping, open("cyptherbook.p", "wb"))
    lenth = int(len(res) * 0.9)
    with open("train.npy", "wb") as f:
        np.save(f, res)
    with open("test.npy", "wb") as f:
        np.save(f, res[lenth:])
    print("all file saved")

all file saved


In [14]:
#corpus_r dict rev_dict
#cypther_r
#mr mapping
#new_lines
#res
count = 0
c2 = 0
for line in new_lines:
        if len(line) > 10:
            count += 1
            if len(line) > 20:
                c2 +=1
        
print(count, c2)

107794 41907


In [15]:
print(res[2])

[[    1   408    32   404    36   872   122     3  2166   469  1267   912
      3    32    11   404    36   132   872   122  1231   258   145    32
   2993  1181     3   253   156     2]
 [    1   643    44   647 10057  2399   214     3  4381   746  2167  1513
      3    44    11   635  9905   212  2804   224  3938   748   851    44
   5583  2017     3   389   227     2]
 [    1     2     3     4     5     6     7     8     9    10    11    12
     13    14    15    16    17    18    19    20    21    22    23    24
     25    26    27    28    29    30]]


In [18]:
print(res.size)

27424710


In [19]:
for i in range(100):
    print(res[i])

[[  1 277 148 132   2   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0]
 [  1 494 209 212   2   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0]
 [  1   2   3   4   5   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0]]
[[    1   156   159   306   123   111   813    17   144  2670   123    18
    375    29   431   432    87   123   122  1325   250     3   437  1775
   1830   432     3    32   437     2]
 [    1   316   260  1190   172  1496  3550    93   561  6688   440    19
   1289   176 12386  2877   213   440   171  2289  1403     3   686  4887
   3285   681     3    37   686     2]
 [    1     2     3     4     5     6     7     8     9    10    11    12
     13    14    15    16    17    18    19    20    21    22    23    24
     25    26    27    28    29    30]]
[[    1   408    32   404    36   872   122     3  2166   469 