## Create clean data and noisy data for training correctors

## Import modules

In [15]:
import re
from neuspell.noising import CharacterReplacementNoiser

train_data_path = '../dataset/train'
test_data_path = '../dataset/test'

char_repl_noiser = CharacterReplacementNoiser(language="english")
char_repl_noiser.load_resources()

data folder is set to `e:\nlp\nlp-env\neuspell\neuspell\../data` script
no resources are required to be downloaded for this noiser


## Function

In [16]:
def read_file(file_path):
    with open(file_path) as f:
        data = f.read().splitlines()

    if '' in data:
        data.remove('')

    print('len data:', len(data))
    display(data[:10])
    return data

In [17]:
def cleaner(raw_data):
    clean_data = []
    for line in raw_data:
        txt = re.sub(r'\(|\)|\[|\]', '', line).strip().lower()
        if txt not in clean_data:
            clean_data.append(txt)
    clean_data = list(set(clean_data))

    print('len clean_data:', len(clean_data))
    display(clean_data[:10])
    return clean_data

In [18]:
def noiser(clean_data, char_repl_noiser):
    noisy_data = char_repl_noiser.noise(clean_data)

    print('len noisy_data:', len(noisy_data))
    display(noisy_data[:10])
    return noisy_data

## Prepare training set

In [19]:
train_data = read_file('../dataset/train/clean_lyrics.txt')

len data: 7309


['just a lost boy in a small town',
 'singing "love is forever and ever"',
 'good on paper, picture perfect',
 "cause i know i'm addicted to your drama",
 'you see me i be',
 'i gotta tell them to myself',
 "i'm still learning to love",
 "no, i can't sleep until i feel your touch",
 'and all i can think',
 "so beautiful you're leaving me"]

### Clean training set

In [13]:
clean_train = cleaner(train_data)

len clean_data: 7309


['well show me the way',
 "'cause nothin' from nothin' leaves nothin'",
 "she said look, what's your game baby",
 'savage love, did somebody, did somebody break your heart',
 "told you i'll be here forever",
 'oh baby, we found love right where we are maybe',
 'all i know is we said, "hello"',
 "another saturday night and i ain't got nobody",
 'if your schemes like your dreams',
 'you fell, i caught you']

### Add noise to clean training set

In [20]:
noisy_train = noiser(clean_train, char_repl_noiser)

total # of texts after retokenization: 7309
total # of tokens after retokenization: 49947


100%|██████████| 7309/7309 [00:01<00:00, 4139.78it/s]

len noisy_data: 7309





['well show me the way',
 "'cause nothiwn' from nothin' leaves nothin'",
 "she said look, waht's your game baby",
 'savage love, did simebody, did somebody breark yuor heart',
 "told ylu i'll be here foredver",
 'oh bayb, we found love right where we are maybe',
 'all i know is we said, "hello"',
 "anoyher saturday nigt and i ain'xt got nobody",
 'if yojur schemes lkke your dreajs',
 'you fell, i cauggt you']

## Prepare testing set

In [21]:
test_data = read_file('../dataset/test/test_lyrics.txt')

len data: 1418


['the world on drugs',
 'ten yeah thousand dollar plates',
 'thousand dollar plates',
 'thousand dollar plates, fine china',
 'shorty like a, uh',
 'shorty like a',
 'wheezy outta here',
 'shorty like a thousand dollar plate, fine china',
 'tell her that she beautiful every day, i remind her',
 "then i jump in the pussy like a lake, i'm a diver"]

### Clean testing set

In [5]:
clean_test = cleaner(train_data)

len clean_data_test: 1418


['the world on drugs',
 'ten yeah thousand dollar plates',
 'thousand dollar plates',
 'thousand dollar plates, fine china',
 'shorty like a, uh',
 'shorty like a',
 'wheezy outta here',
 'shorty like a thousand dollar plate, fine china',
 'tell her that she beautiful every day, i remind her',
 "then i jump in the pussy like a lake, i'm a diver"]

### Add noise to clean testing set

In [9]:
noisy_test = noiser(clean_train, char_repl_noiser)

no resources are required to be downloaded for this noiser
total # of texts after retokenization: 1418
total # of tokens after retokenization: 9959


100%|██████████| 1418/1418 [00:00<00:00, 3831.57it/s]

len noisy_data: 1418





['the world on drugs',
 'ten yeah thousand dllar plates',
 'thousand dollar plates',
 'thousand dollawr plates, fine cina',
 'sbhorty lkie a, uh',
 'sohrty lile a',
 'wheezy outta hree',
 'shrty loke a thoysand dolar plate, fine cihna',
 'tell her tnat she bqeautiful every day, i remind her',
 "thrn i jump in the pussy leike a lake, i'm a divker"]

## Save to file

In [28]:
with open(train_data_path + '/clean_lyrics.txt', 'w') as f:
    f.write('\n'.join(clean_data))

In [29]:
with open(train_data_path + '/noisy_lyrics.txt', 'w') as f:
    f.write('\n'.join(noisy_data))

In [7]:
with open('../dict/test_lyrics.txt', 'w') as f:
    f.write('\n'.join(clean_data_test))

In [10]:
with open('../dict/test_lyrics_noisy.txt', 'w') as f:
    f.write('\n'.join(noisy_data_test))