### Loading modules

In [1]:
import re
import os
import numpy as np
from numpy.random import rand, randint, choice, shuffle
from tqdm import tqdm_notebook

### Loading parameters

In [2]:
# parameters
FILEPATH = "./data/news.2011.en.shuffled"
CLEAN_FILEPATH = "./data/news.2011.en.cleaned"
N_LINES = 2466169
CHARS = list(" abcdefghijklmnopqrstuvwxyz")
MAX_INPUT_LEN = 40
MIN_INPUT_LEN = 3
AMOUNT_OF_NOISE = 0.3 / MAX_INPUT_LEN

# regex cleanup
RE_DASH_FILTER = re.compile(r'[\-\˗\֊\‐\‑\‒\–\—\⁻\₋\−\﹣\－]', re.UNICODE)
NORMALIZE_WHITESPACE_REGEX = re.compile(r'[^\S\n]+', re.UNICODE)
RE_APOSTROPHE_FILTER = re.compile(r'&#39;|[ʼ՚＇‘’‛❛❜ߴߵ`‵´ˊˋ{}{}{}{}{}{}{}{}{}]'.
                                  format ( 
                                      chr(768), chr(769), chr(832), 
                                      chr(833), chr(2387), chr(5151), 
                                      chr(5152), chr(65344), chr(8242)
                                  ), re.UNICODE)
RE_LEFT_PARENTH_FILTER = re.compile(r'[\(\[\{\⁽\₍\❨\❪\﹙\（]', re.UNICODE)
RE_RIGHT_PARENTH_FILTER = re.compile(r'[\)\]\}\⁾\₎\❩\❫\﹚\）]', re.UNICODE)
RE_BASIC_CLEANER = re.compile(r'[^\w\s\-\)\(]', re.UNICODE)

### Loading and cleaning data

In [3]:
# strong cleaner, replace any characters not in CHARS with a space
def clean_text(txt):
    txt = RE_DASH_FILTER.sub(' ', txt)
    txt = RE_APOSTROPHE_FILTER.sub(' ', txt)
    txt = RE_LEFT_PARENTH_FILTER.sub(' ', txt)
    txt = RE_RIGHT_PARENTH_FILTER.sub(' ', txt)
    txt = RE_BASIC_CLEANER.sub(' ', txt)
    txt = "".join(char for char in txt if char in CHARS)
    txt = NORMALIZE_WHITESPACE_REGEX.sub(' ', txt)
    return txt

In [4]:
%%time

i = 0
length = 0
lines = list()

if os.path.isfile(CLEAN_FILEPATH):
    print("loading pre-cleaned file")
    with open(CLEAN_FILEPATH, 'r', encoding='utf-8') as f:
        for s in f:
            line = s.rstrip('\n')
            lines.append(line)
            i += 1
            new_len = len(s)
            if new_len > length:
                length = new_len
else:
    print("loading and cleaning text")
    with open(FILEPATH, 'r', encoding='utf-8') as f:
        for s in tqdm_notebook(f, total=N_LINES):
            line = clean_text(s.lower()).strip('\n')
            if line != '':
                lines.append(line)
                i += 1
                new_len = len(s)
                if new_len > length:
                    length = new_len

    with open(CLEAN_FILEPATH, 'w', encoding='utf-8') as f:
        for line in lines:
            f.write(line + '\n')

n_lines = i
print("total: %d lines" % n_lines)
print("max length: %d char" % length)

loading pre-cleaned file
total: 2466169 lines
max length: 9803 char
CPU times: user 2.13 s, sys: 247 ms, total: 2.38 s
Wall time: 2.39 s


### Generating mispelled sentence

In [5]:
def add_noise_to_string(a_string, amount_of_noise):
    
    length = len(a_string)
    threshold = amount_of_noise * length
    
    # replace a character with a random character
    if rand() < threshold:
        rdm_char_pos = randint(length)
        a_string = a_string[:rdm_char_pos] + choice(CHARS[:-1]) + a_string[rdm_char_pos + 1:]
        
    # delete a character
    if rand() < threshold:
        rdm_char_pos = randint(length)
        a_string = a_string[:rdm_char_pos] + a_string[rdm_char_pos + 1:]
        
    # add a random character
    if length < MAX_INPUT_LEN and rand() < threshold:
        rdm_char_pos = randint(length)
        a_string = a_string[:rdm_char_pos] + choice(CHARS[:-1]) + a_string[rdm_char_pos:]
        
    # transpose 2 characters
    if rand() < threshold:
        rdm_char_pos = randint(length - 2)
        a_string = (a_string[:rdm_char_pos] +
                    a_string[rdm_char_pos + 1] +
                    a_string[rdm_char_pos] +
                    a_string[rdm_char_pos + 2:])
        
    # delete space
    if rand() < threshold:
        spaces_pos = [pos for pos, char in enumerate(a_string) if char == " "]
        if len(spaces_pos) != 0:
            rdm_space_pos = choice(spaces_pos)
            a_string = a_string[:rdm_space_pos] + a_string[rdm_space_pos + 1:]
        
    return a_string


def generate_examples(corpus):
    
    sources, targets = list(), list()
    
    while corpus:
        line = corpus.pop()
        
        while len(line) > MIN_INPUT_LEN:
            if len(line) <= MAX_INPUT_LEN:
                target = line
                line = ""
            else:
                space_pos = line.rfind(" ", MIN_INPUT_LEN, MAX_INPUT_LEN - 1)
                if space_pos > -1:
                    target = line[:space_pos]
                    line = line[space_pos + 1:]
                else:
                    space_pos = line.rfind(" ")
                    if space_pos == -1:
                        break
                    else:
                        line = line[space_pos + 1:]
                        continue
            targets.append(target)
    
    for target_idx, target in enumerate(targets):
        source = add_noise_to_string(target, AMOUNT_OF_NOISE)
        source += "." * (MAX_INPUT_LEN - len(source))
        target += "." * (MAX_INPUT_LEN - len(target))
        targets[target_idx] = target
        sources.append(source)
        
    return sources, targets

In [6]:
%%time
corpus = lines.copy()
sources, targets = generate_examples(corpus)

CPU times: user 1min 50s, sys: 881 ms, total: 1min 51s
Wall time: 1min 51s


In [7]:
rd_idx = randint(0, len(sources))
print(sources[rd_idx])
print(targets[rd_idx])

to indian prime ministermanmohan........
to indian prime minister manmohan.......


### Vocabularies

In [8]:
PAD, GO, EOS, UNK = SEQ_VOC = ['.', ' _GO ', ' _EOS ', ' _UNK ']
VOC = SEQ_VOC + CHARS

vocabulary = dict()
for i, token in enumerate(VOC):
    vocabulary[token] = i

In [9]:
for _i, (_src, _tgt) in enumerate(zip(sources, targets)):
    print(_i)
    print(_src)
    print(_tgt)
    break

0
president rene preval and former us.....
president rene preval and former us.....


In [10]:
# convert character sequences into id sequences
def char2id(char_seq):
    return [vocabulary[char] for char in char_seq]

def tokenize(src_strings, tgt_strings, reverse=True):
    n = len(src_strings)
    d = MAX_INPUT_LEN
    
    empty_ids = np.empty(shape=(n, d*2+1), dtype=np.int32)
    empty_ids.fill(vocabulary[PAD])
    src_ids = np.copy(empty_ids)
    tgt_ids = np.copy(empty_ids)
    
    gen = enumerate(zip(src_strings, tgt_strings))
    for i, (src_seq, tgt_seq) in tqdm_notebook(gen, total=n):
        src_seq = char2id(src_seq)
        tgt_seq = char2id(tgt_seq)
        src_ids[i] = src_seq[::-1] + [vocabulary[GO]] + tgt_seq
        tgt_ids[i, d:-1] = tgt_seq
        t = 1
        while not tgt_ids[i, t]:
            t -= 1
        tgt_ids[i, t+1] = vocabulary[EOS]
        
    return src_ids, tgt_ids

# convert id sequences into character sequences
def id2char(id_seq):
    return "".join(VOC[idx] for idx in id_seq)

In [11]:
%%time 
x, y = tokenize(sources, targets)


CPU times: user 2min 54s, sys: 1.58 s, total: 2min 56s
Wall time: 2min 56s


In [12]:
x[10]

array([ 0,  0,  0,  0,  0, 22, 19, 24, 15, 13, 26,  4, 14,  9, 24, 23, 13,
       18, 13, 17,  4,  9, 17, 13, 22, 20,  4, 23,  4, 29, 22, 24, 18, 25,
       19,  7,  4,  9, 12, 24,  1, 24, 12,  9,  4,  7, 19, 25, 18, 24, 22,
       29,  4, 23,  4, 20, 22, 13, 17,  9,  4, 17, 13, 18, 13, 23, 24,  9,
       22,  4, 26, 13, 15, 24, 19, 22,  0,  0,  0,  0,  0], dtype=int32)

In [13]:
y[10]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0, 24, 12,  9,  4,  7, 19, 25, 18, 24, 22, 29,
        4, 23,  4, 20, 22, 13, 17,  9,  4, 17, 13, 18, 13, 23, 24,  9, 22,
        4, 26, 13, 15, 24, 19, 22,  2,  0,  0,  0,  0,  0], dtype=int32)

In [14]:
id2char(y[10])

'........................................the country s prime minister viktor _EOS .....'

In [15]:
id2char(x[10])

'.....rotkiv jetsinim emirp s yrtnuoc eht _GO the country s prime minister viktor.....'

In [16]:
from sklearn.model_selection import train_test_split

n_try = 100000
x_train, x_test, y_train, y_test = train_test_split(x[:n_try], y[:n_try], train_size=0.75)

### Seq2Seq architecture

In [17]:
from keras.models import Sequential
from keras.layers import Embedding, Dropout, GRU, Dense

length = np.shape(x)[1]
voc_size = len(VOC)

model = Sequential()
model.add(Embedding(voc_size, 64, input_length=length))
model.add(Dropout(0.3))
model.add(GRU(512, return_sequences=True))
model.add(Dense(voc_size, activation='softmax'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

Using TensorFlow backend.


In [18]:
from keras.callbacks import ModelCheckpoint
from keras.models import load_model

best_model_fname = "./spellcheck_seq2seq_chekpoint.h5"
best_model_cb = ModelCheckpoint(best_model_fname,
                                monitor='val_loss',
                                save_best_only=True, 
                                verbose=1)

In [19]:
%%time
validation_data = (x_test, np.expand_dims(y_test, -1))
history = model.fit(x_train, 
                    np.expand_dims(y_train, -1), 
                    validation_data=validation_data, 
                    epochs=3, 
                    batch_size=64, 
                    callbacks=[best_model_cb]
                   )

Train on 75000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
CPU times: user 3h 23min 12s, sys: 54min 20s, total: 4h 17min 33s
Wall time: 31min 24s


In [20]:
x_test[0]

array([ 4,  9, 17, 13,  4, 24,  4,  9, 12, 24,  4, 24,  5,  4, 22,  9, 24,
       23, 13, 18, 13, 17,  4,  9, 17, 13, 22, 20,  4, 23,  5, 27,  4, 13,
       16,  5, 17, 18,  9,  6,  1,  6,  9, 18,  4,  5, 16, 13,  4, 27,  5,
       23,  4, 20, 22, 13, 17,  9,  4, 17, 13, 18, 13, 23, 24,  9, 22,  4,
        5, 24,  4, 24, 12,  9,  4, 24, 13, 17,  9,  4,  0], dtype=int32)

In [21]:
id2char(x_test[0])

' emi t eht ta retsinim emirp saw ilamneb _GO ben ali was prime minister at the time .'

In [None]:
model.predict([x_test[0]])