### <font color=firebrick> Tmp header for reloading </font>

In [1]:
%load_ext autoreload
%autoreload 2

### Loading modules

In [2]:
import re
import os
import numpy as np
from numpy.random import rand, randint, choice, shuffle
from tqdm import tqdm_notebook

### Loading parameters

In [3]:
# parameters
FILEPATH = "./data/news.2011.en.shuffled"
CLEAN_FILEPATH = "./data/news.2011.en.cleaned"
N_LINES = 2466169
CHARS = list(" abcdefghijklmnopqrstuvwxyz")
MAX_INPUT_LEN = 40
MIN_INPUT_LEN = 3
AMOUNT_OF_NOISE = 0.3 / MAX_INPUT_LEN

# regex cleanup
RE_DASH_FILTER = re.compile(r'[\-\˗\֊\‐\‑\‒\–\—\⁻\₋\−\﹣\－]', re.UNICODE)
NORMALIZE_WHITESPACE_REGEX = re.compile(r'[^\S\n]+', re.UNICODE)
RE_APOSTROPHE_FILTER = re.compile(r'&#39;|[ʼ՚＇‘’‛❛❜ߴߵ`‵´ˊˋ{}{}{}{}{}{}{}{}{}]'.
                                  format ( 
                                      chr(768), chr(769), chr(832), 
                                      chr(833), chr(2387), chr(5151), 
                                      chr(5152), chr(65344), chr(8242)
                                  ), re.UNICODE)
RE_LEFT_PARENTH_FILTER = re.compile(r'[\(\[\{\⁽\₍\❨\❪\﹙\（]', re.UNICODE)
RE_RIGHT_PARENTH_FILTER = re.compile(r'[\)\]\}\⁾\₎\❩\❫\﹚\）]', re.UNICODE)
RE_BASIC_CLEANER = re.compile(r'[^\w\s\-\)\(]', re.UNICODE)

### Loading and cleaning data

In [4]:
# strong cleaner, replace any characters not in CHARS with a space
def clean_text(txt):
    txt = RE_DASH_FILTER.sub(' ', txt)
    txt = RE_APOSTROPHE_FILTER.sub(' ', txt)
    txt = RE_LEFT_PARENTH_FILTER.sub(' ', txt)
    txt = RE_RIGHT_PARENTH_FILTER.sub(' ', txt)
    txt = RE_BASIC_CLEANER.sub(' ', txt)
    txt = "".join(char for char in txt if char in CHARS)
    txt = NORMALIZE_WHITESPACE_REGEX.sub(' ', txt)
    return txt

In [5]:
%%time

i = 0
length = 0
lines = list()

if os.path.isfile(CLEAN_FILEPATH):
    print("loading pre-cleaned file")
    with open(CLEAN_FILEPATH, 'r', encoding='utf-8') as f:
        for s in f:
            line = s.rstrip('\n')
            lines.append(line)
            i += 1
            new_len = len(s)
            if new_len > length:
                length = new_len
else:
    print("loading and cleaning text")
    with open(FILEPATH, 'r', encoding='utf-8') as f:
        for s in tqdm_notebook(f, total=N_LINES):
            line = clean_text(s.lower()).strip('\n')
            if line != '':
                lines.append(line)
                i += 1
                new_len = len(s)
                if new_len > length:
                    length = new_len

    with open(CLEAN_FILEPATH, 'w', encoding='utf-8') as f:
        for line in lines:
            f.write(line + '\n')

n_lines = i
print("total: %d lines" % n_lines)
print("max length: %d char" % length)

loading pre-cleaned file
total: 2466169 lines
max length: 9803 char
CPU times: user 2.14 s, sys: 232 ms, total: 2.37 s
Wall time: 2.36 s


### Generating mispelled sentence

In [6]:
def add_noise_to_string(a_string, amount_of_noise):
    
    length = len(a_string)
    threshold = amount_of_noise * length
    
    # replace a character with a random character
    if rand() < threshold:
        rdm_char_pos = randint(length)
        a_string = a_string[:rdm_char_pos] + choice(CHARS[:-1]) + a_string[rdm_char_pos + 1:]
        
    # delete a character
    if rand() < threshold:
        rdm_char_pos = randint(length)
        a_string = a_string[:rdm_char_pos] + a_string[rdm_char_pos + 1:]
        
    # add a random character
    if length < MAX_INPUT_LEN and rand() < threshold:
        rdm_char_pos = randint(length)
        a_string = a_string[:rdm_char_pos] + choice(CHARS[:-1]) + a_string[rdm_char_pos:]
        
    # transpose 2 characters
    if rand() < threshold:
        rdm_char_pos = randint(length - 2)
        a_string = (a_string[:rdm_char_pos] +
                    a_string[rdm_char_pos + 1] +
                    a_string[rdm_char_pos] +
                    a_string[rdm_char_pos + 2:])
        
    # delete space
    if rand() < threshold:
        spaces_pos = [pos for pos, char in enumerate(a_string) if char == " "]
        if len(spaces_pos) != 0:
            rdm_space_pos = choice(spaces_pos)
            a_string = a_string[:rdm_space_pos] + a_string[rdm_space_pos + 1:]
        
    return a_string


def generate_examples(corpus):
    
    sources, targets = list(), list()
    
    print("split sentences...")
    while corpus:
        line = corpus.pop()
        
        while len(line) > MIN_INPUT_LEN:
            if len(line) <= MAX_INPUT_LEN:
                target = line
                line = ""
            else:
                space_pos = line.rfind(" ", MIN_INPUT_LEN, MAX_INPUT_LEN - 1)
                if space_pos > -1:
                    target = line[:space_pos]
                    line = line[space_pos + 1:]
                else:
                    space_pos = line.rfind(" ")
                    if space_pos == -1:
                        break
                    else:
                        line = line[space_pos + 1:]
                        continue
            targets.append(target)
    print("...done")
    
    for target_idx, target in tqdm_notebook(enumerate(targets), total=len(targets)):
        source = add_noise_to_string(target, AMOUNT_OF_NOISE)
        source += "." * (MAX_INPUT_LEN - len(source))
        target += "." * (MAX_INPUT_LEN - len(target))
        targets[target_idx] = target
        sources.append(source)
        
    return sources, targets

In [7]:
%%time
corpus = lines.copy()
sources, targets = generate_examples(corpus)

split sentences...
...done


Widget Javascript not detected.  It may not be installed or enabled properly.



CPU times: user 2min 3s, sys: 1.24 s, total: 2min 4s
Wall time: 2min 4s


In [8]:
rd_idx = randint(0, len(sources))
print(sources[rd_idx])
print(targets[rd_idx])

thtey have qor a long time yet the......
they have for a long time yet the.......


### Vocabularies

In [9]:
PAD, GO, EOS, UNK = SEQ_VOC = ['.', ' _GO ', ' _EOS ', ' _UNK ']
VOC = SEQ_VOC + CHARS

vocabulary = dict()
for i, token in enumerate(VOC):
    vocabulary[token] = i

In [10]:
# convert character sequences into id sequences
def char2id(char_seq):
    return [vocabulary[char] for char in char_seq]
    
def tokenize(src_strings, tgt_strings):
    n = len(src_strings)
    d = MAX_INPUT_LEN
    
    empty_ids = np.zeros(shape=(n, d*2+1), dtype=np.int32)
    src_ids = np.zeros(shape=(n, d*2+1), dtype=np.int32)
    tgt_ids = np.zeros(shape=(n, d*2+1), dtype=np.int32)
    
    for i, (src_seq, tgt_seq) in tqdm_notebook(enumerate(zip(src_strings, tgt_strings)), total=n):
        src_seq = char2id(src_seq)
        tgt_seq = char2id(tgt_seq)
        src_ids[i] = src_seq[::-1] + [vocabulary[GO]] + tgt_seq
        tgt_ids[i, d:-1] = tgt_seq
        t = 1
        while not tgt_ids[i, t]:
            t -= 1
        tgt_ids[i, t+1] = vocabulary[EOS]
        
    return src_ids, tgt_ids

# convert id sequences into character sequences
def id2char(id_seq):
    return "".join(VOC[idx] for idx in id_seq)

In [11]:
%%time 
x, y = tokenize(sources, targets)

Widget Javascript not detected.  It may not be installed or enabled properly.



CPU times: user 4min 24s, sys: 51.5 s, total: 5min 16s
Wall time: 5min 15s


In [12]:
idx = 15
print(y[idx])
print()
print(x[idx])
print()
print(id2char(y[idx]))
print(id2char(x[idx]))

[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 24 12  9  4 25 23  4 20 22  9
 23 13  8  9 18 24 13  5 16  4  7 19 17 17 13 23 23 13 19 18  4 23  2  0  0
  0  0  0  0  0  0]

[ 0  0  0  0  0  0  0  0  0 23  4 18 19 13 23 23 13 17 19  7  4 16  5 13 24
 18  9  8 13 23  9 22 20  4 23 25  4  9 12 24  1 24 12  9  4 25 23  4 20 22
  9 23 13  8  9 18 24 13  5 16  4  7 19 17 17 13 23 23 13 19 18  4 23  0  0
  0  0  0  0  0  0]

........................................the us presidential commission s _EOS ........
.........s noissimoc laitnediserp su eht _GO the us presidential commission s........


### Seq2Seq architecture

In [13]:
from sklearn.model_selection import train_test_split

n_try = 100000
x_train, x_test, y_train, y_test = train_test_split(x[:n_try], y[:n_try], train_size=0.75)

In [17]:
from keras.models import Sequential
from keras.layers import Embedding, Dropout, GRU, Dense

length = np.shape(x)[1]
voc_size = len(VOC)

model = Sequential()
model.add(Embedding(voc_size, 64, input_length=length))
model.add(Dropout(0.3))
model.add(GRU(1024, return_sequences=True))
model.add(GRU(1024, return_sequences=True))
model.add(Dense(voc_size, activation='softmax'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

In [18]:
from keras.callbacks import ModelCheckpoint
from keras.models import load_model

best_model_fname = "./seq2seq_model_checkpoint.h5"
best_model_cb = ModelCheckpoint(best_model_fname,
                                monitor='val_loss',
                                save_best_only=True, 
                                verbose=1)

In [None]:
%%time
validation_data = (x_test, np.expand_dims(y_test, -1))
history = model.fit(x_train, 
                    np.expand_dims(y_train, -1), 
                    validation_data=validation_data, 
                    epochs=5,
                    batch_size=64, 
                    verbose=1,
                    callbacks=[best_model_cb]
                   )

Train on 75000 samples, validate on 25000 samples
Epoch 1/5
  896/75000 [..............................] - ETA: 3254s - loss: 2.4636

In [None]:
def greedy_correct(model, input_ids):
    
    if type(input_ids) == np.ndarray:
        input_ids = input_ids.tolist()
    assert len(input_ids) == 40
    
    input_ids += [vocabulary[GO]]
    input_array = np.empty(shape=(1, model.input_shape[1]), dtype=np.int32)
    decoded_ids = []
    
    while len(input_ids) <= length:
        
        input_array.fill(vocabulary[PAD])
        input_array[0, -len(input_ids):] = input_ids
        
        next_token_id = model.predict(input_array)[0, -1].argmax()
        
        if next_token_id == vocabulary[EOS]:
            break
            
        decoded_ids.append(next_token_id)
        
        input_ids.append(next_token_id)
    
    return id2char(decoded_ids)

In [None]:
# load model
# best_model = load_model("./seq2seq_model_checkpoint.h5")
# best_model = load_model("./seq2seq_model_checkpoint_1024_008.h5")

In [None]:
n_test = np.shape(x_test)[0]
idx = np.random.randint(n_test)
print("idx: %s" % idx)
print()
input_ids = x_test[idx][:40]
expected_ids = x_test[idx][41:]

print("mispelled string:")
print(id2char(input_ids)[::-1])
print()
print("deep corrected string:")
corrected_string = greedy_correct(best_model, input_ids)
print(corrected_string)
print()
print("expected correction:")
print(id2char(expected_ids))