## Create clean data and noisy data for training correctors

## Import modules

In [2]:
import os
import re
from neuspell.noising import CharacterReplacementNoiser

train_data_path = '../dataset/train'
test_data_path = '../dataset/test'

char_repl_noiser = CharacterReplacementNoiser(language="english")
char_repl_noiser.load_resources()

data folder is set to `e:\nlp\nlp-env\neuspell\neuspell\../data` script
no resources are required to be downloaded for this noiser


## Function

In [3]:
def read_file(file_path):
    with open(file_path, mode='r', encoding='utf-8-sig') as f:
        data = f.read().splitlines()

    if '' in data:
        data.remove('')

    print('len data:', len(data))
    display(data[:10])
    return data

In [4]:
def cleaner(raw_data):
    clean_data = []
    for line in raw_data:
        txt = re.sub(r'\(|\)|\[|\]', '', line).strip().lower()
        if txt not in clean_data:
            clean_data.append(txt)
    clean_data = list(set(clean_data))

    print('len clean_data:', len(clean_data))
    display(clean_data[:10])
    return clean_data

In [5]:
def noiser(clean_data, char_repl_noiser):
    noisy_data = char_repl_noiser.noise(clean_data)

    print('len noisy_data:', len(noisy_data))
    display(noisy_data[:10])
    return noisy_data

## Prepare training set

In [64]:
train_data = read_file(os.path.join(train_data_path, 'raw_en_lyrics.txt'))

len data: 10510


['just a lost boy in a small town',
 'singing "love is forever and ever"',
 'good on paper, picture perfect',
 "cause i know i'm addicted to your drama",
 'you see me i be',
 'i gotta tell them to myself',
 "i'm still learning to love",
 "no, i can't sleep until i feel your touch",
 'and all i can think',
 "so beautiful you're leaving me"]

### Clean training set

In [66]:
clean_train = cleaner(train_data)

len clean_data: 7309


['well show me the way',
 "'cause nothin' from nothin' leaves nothin'",
 "she said look, what's your game baby",
 'savage love, did somebody, did somebody break your heart',
 "told you i'll be here forever",
 'oh baby, we found love right where we are maybe',
 'all i know is we said, "hello"',
 "another saturday night and i ain't got nobody",
 'if your schemes like your dreams',
 'you fell, i caught you']

### Add noise to clean training set

In [67]:
noisy_train = noiser(clean_train, char_repl_noiser)

total # of texts after retokenization: 7309
total # of tokens after retokenization: 49947


100%|██████████| 7309/7309 [00:01<00:00, 4149.55it/s]

len noisy_data: 7309





['well skhow me the way',
 "'dcause nothin' from nothin' leaves nothin'",
 "sne said look, what's youcr game baby",
 'savage love, did somebody, did simebody break your heoart',
 "told you i'll be here forever",
 'oh baby, we found lobe right where we are mapybe',
 'all i know is we said, "hello"',
 "another saturay night and i an't git nqobody",
 'if yuor schemes lkie your dreams',
 'yoeu fell, i caught you']

## Prepare testing set

In [6]:
test_data = read_file(os.path.join(test_data_path, 'clean_en_lyrics.txt'))

len data: 930


['tell her that she beautiful every day i remind her',
 'i get lost in her eyes like dust from the skies',
 "did i say that out loud i'm so crazy about mine",
 "but we don't have the same soul",
 'tell me that you love me baby say it again',
 "repeat it over and over until it's in my brain",
 "you need to send your location i can't think",
 'did i say that out loud',
 "i'm so crazy about mine",
 'when i look back']

### Clean testing set

In [7]:
# clean_test = cleaner(test_data)

### Add noise to clean testing set

In [8]:
noisy_test = noiser(test_data, char_repl_noiser)

total # of texts after retokenization: 930
total # of tokens after retokenization: 6250


100%|██████████| 930/930 [00:00<00:00, 3956.57it/s]

len noisy_data: 930





['tell her that she beautiufl every dsy i remind her',
 'i get lost in her eyes licke dsut from tge skies',
 "did i say that out louud i'm so crazry about mnie",
 "but we don't have the saqme soul",
 'tell me that you love me babny say it again',
 "repeat it over and over until it's in my brian",
 "you need to send your location i can't think",
 'did i sqy that out luod',
 "i'm so craay about mibe",
 'when i look back']

## Save to file

In [None]:
with open(train_data_path + '/clean_lyrics.txt', 'w') as f:
    f.write('\n'.join(clean_train))

In [53]:
with open(train_data_path + '/noisy_lyrics.txt', 'w') as f:
    f.write('\n'.join(noisy_train))

In [54]:
# with open(test_data_path + '/clean_en_lyrics.txt', 'w') as f:
#     f.write('\n'.join(clean_test))

In [76]:
with open(test_data_path + '/noisy_en_lyrics.txt', 'w') as f:
    f.write('\n'.join(noisy_test))