In [1]:
import re
import numpy as np
from numpy.random import rand, randint, choice, shuffle
from tqdm import tqdm_notebook

### Loading parameters

In [2]:
# parameters
# Location of the raw text corpus, relative to the notebook; the loading cell reads it line by line.
FILEPATH = "../../02_data/05_google_1billion_word/training-monolingual/news.2011.en.shuffled"
# Expected number of lines in FILEPATH (the loading cell uses it as the progress-bar total).
N_LINES = 2466169
# Character vocabulary: lowercase letters, space and full stop.
# add_noise_to_string draws its random characters from CHARS[:-1], i.e. without the trailing ".".
CHARS = list("abcdefghijklmnopqrstuvwxyz .")
# add_noise_to_string only inserts a character into strings shorter than MAX_INPUT_LEN.
MAX_INPUT_LEN = 40
# NOTE(review): MIN_INPUT_LEN is not referenced in the cells visible here -- presumably a
# lower bound on sentence length applied later; confirm against the rest of the notebook.
MIN_INPUT_LEN = 3
# Per-character noise rate. add_noise_to_string multiplies it by the string length to get the
# probability of each edit type, so a MAX_INPUT_LEN-character string gets 0.3 per edit type.
AMOUNT_OF_NOISE = 0.3 / MAX_INPUT_LEN

# regex cleanup
# (re.UNICODE is already the default for str patterns in Python 3; the flag is kept for explicitness.)
# ASCII hyphen plus Unicode dash/minus look-alikes; clean_text maps every match to "-".
RE_DASH_FILTER = re.compile(r'[\-\˗\֊\‐\‑\‒\–\—\⁻\₋\−\﹣\－]', re.UNICODE)
# One or more whitespace characters other than a newline; clean_text collapses each run to one space.
NORMALIZE_WHITESPACE_REGEX = re.compile(r'[^\S\n]+', re.UNICODE)
# The HTML entity "&#39;" or a single apostrophe/quote/accent look-alike. The nine "{}" placeholders
# inside the character class are filled by format() with characters given by code point.
# clean_text replaces every match with a space.
RE_APOSTROPHE_FILTER = re.compile(r'&#39;|[ʼ՚＇‘’‛❛❜ߴߵ`‵´ˊˋ{}{}{}{}{}{}{}{}{}]'.
                                  format ( 
                                      chr(768), chr(769), chr(832), 
                                      chr(833), chr(2387), chr(5151), 
                                      chr(5152), chr(65344), chr(8242)
                                  ), re.UNICODE)
# Opening / closing bracket variants; clean_text maps them to "(" and ")" respectively.
RE_LEFT_PARENTH_FILTER = re.compile(r'[\(\[\{\⁽\₍\❨\❪\﹙\（]', re.UNICODE)
RE_RIGHT_PARENTH_FILTER = re.compile(r'[\)\]\}\⁾\₎\❩\❫\﹚\）]', re.UNICODE)
# Any character that is neither a word character (\w) nor whitespace; clean_text deletes matches.
RE_BASIC_CLEANER = re.compile(r'[^\w\s]', re.UNICODE)

### Loading and cleaning data

In [3]:
def clean_text(txt):
    """Normalise punctuation variants in ``txt`` and strip remaining symbols.

    The substitutions are applied one after another, in this order:

    1. runs of non-newline whitespace -> a single space
    2. dash variants                  -> "-"
    3. apostrophe variants            -> " "
    4. opening bracket variants       -> "("
    5. closing bracket variants       -> ")"
    6. anything that is neither a word character nor whitespace -> removed

    Because step 6 runs last, the "-", "(" and ")" produced by the earlier
    steps are removed as well. Newlines are left untouched by every step.

    Parameters
    ----------
    txt : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # Order matters: each pattern sees the output of the previous substitution.
    substitutions = (
        (NORMALIZE_WHITESPACE_REGEX, ' '),
        (RE_DASH_FILTER, '-'),
        (RE_APOSTROPHE_FILTER, ' '),
        (RE_LEFT_PARENTH_FILTER, '('),
        (RE_RIGHT_PARENTH_FILTER, ')'),
        (RE_BASIC_CLEANER, ''),
    )
    for pattern, replacement in substitutions:
        txt = pattern.sub(replacement, txt)
    return txt

In [4]:
%%time

# loading files
# Read the corpus one line at a time, lower-case and clean each line, and keep
# track of the longest raw line encountered.
lines = list()
length = 0  # longest raw line seen so far, in characters (line terminator included)
with open(FILEPATH, 'r', encoding='utf-8') as f:
    for s in tqdm_notebook(f, total=N_LINES):
        # clean_text leaves "\n" untouched, so drop the line terminator here;
        # otherwise every stored sentence ends with a newline that the noise
        # function treats as an ordinary character.
        lines.append(clean_text(s.lower()).rstrip('\n'))
        length = max(length, len(s))
# Count what was actually loaded instead of maintaining a manual counter.
n_lines = len(lines)
print("total: %d lines" % n_lines)
print("max length: %d char" % length)


total: 2466169 lines
max length: 10118 char
Wall time: 48.4 s


### Generating misspelled sentences

In [5]:
def add_noise_to_string(a_string, amount_of_noise):
    """Add some common spelling mistakes to a string.

    Five kinds of edit are attempted independently, always in this order:
    replace a character, delete a character, insert a character, transpose
    two adjacent characters, delete a space. Each edit fires with probability
    ``amount_of_noise * len(a_string)``, computed once from the input length
    (a value >= 1 means the edit always fires).

    Positions are always drawn from the *current* string, so an earlier
    deletion or insertion can never push a later edit out of range. An edit
    that is impossible on the current string (nothing to delete, fewer than
    two characters to transpose, no space to remove) is skipped.

    Parameters
    ----------
    a_string : str
        The clean input text.
    amount_of_noise : float
        Per-character noise rate; multiplied by the input length to obtain
        the probability of each edit.

    Returns
    -------
    str
        The input with zero or more edits applied. An insertion is only made
        while the string is shorter than ``MAX_INPUT_LEN``.
    """
    # The probability is fixed from the original length; only the positions
    # below track the evolving string.
    threshold = amount_of_noise * len(a_string)

    if rand() < threshold and a_string:
        # replace a character with a random character
        pos = randint(len(a_string))
        a_string = a_string[:pos] + choice(CHARS[:-1]) + a_string[pos + 1:]

    if rand() < threshold and a_string:
        # delete a character
        pos = randint(len(a_string))
        a_string = a_string[:pos] + a_string[pos + 1:]

    if len(a_string) < MAX_INPUT_LEN and rand() < threshold:
        # add a random character; len + 1 slots, so appending at the end is
        # possible and an emptied string can still receive a character
        pos = randint(len(a_string) + 1)
        a_string = a_string[:pos] + choice(CHARS[:-1]) + a_string[pos:]

    if rand() < threshold and len(a_string) >= 2:
        # transpose 2 adjacent characters (pos is the left one of the pair)
        pos = randint(len(a_string) - 1)
        a_string = (a_string[:pos] +
                    a_string[pos + 1] +
                    a_string[pos] +
                    a_string[pos + 2:])

    if rand() < threshold:
        # delete space, if the string contains one
        space_positions = [pos for pos, char in enumerate(a_string) if char == " "]
        if space_positions:
            pos = choice(space_positions)
            a_string = a_string[:pos] + a_string[pos + 1:]

    return a_string

In [6]:
# Draw the index from the lines actually loaded rather than from the
# hard-coded N_LINES, so it is always a valid position in `lines`.
idx = randint(len(lines))

In [7]:
# Show the sampled sentence followed by its noisy counterpart, one per line.
a_string = lines[idx]
mispelled_string = add_noise_to_string(a_string, AMOUNT_OF_NOISE)
print(a_string, mispelled_string, sep="\n")

once reported our staff will be notified and the comment will be reviewed

once reported ourstaffzwill be notified and the comment will be reviewed

