## Create clean data and noisy data for training correctors

## Import modules

In [22]:
import os
import re
from neuspell.noising import CharacterReplacementNoiser

train_data_path = '../dataset/train'
test_data_path = '../dataset/test'

char_repl_noiser = CharacterReplacementNoiser(language="english")
char_repl_noiser.load_resources()

no resources are required to be downloaded for this noiser


## Function

In [23]:
def read_file(file_path):
    with open(file_path) as f:
        data = f.read().splitlines()

    if '' in data:
        data.remove('')

    print('len data:', len(data))
    display(data[:10])
    return data

In [24]:
def cleaner(raw_data):
    clean_data = []
    for line in raw_data:
        txt = re.sub(r'\(|\)|\[|\]', '', line).strip().lower()
        if txt not in clean_data:
            clean_data.append(txt)
    clean_data = list(set(clean_data))

    print('len clean_data:', len(clean_data))
    display(clean_data[:10])
    return clean_data

In [25]:
def noiser(clean_data, char_repl_noiser):
    noisy_data = char_repl_noiser.noise(clean_data)

    print('len noisy_data:', len(noisy_data))
    display(noisy_data[:10])
    return noisy_data

## Prepare training set

In [26]:
train_data = read_file(os.path.join(train_data_path, 'raw_lyrics.txt'))

len data: 10510


['just a lost boy in a small town',
 'singing "love is forever and ever"',
 'good on paper, picture perfect',
 "cause i know i'm addicted to your drama",
 'you see me i be',
 'i gotta tell them to myself',
 "i'm still learning to love",
 "no, i can't sleep until i feel your touch",
 'and all i can think',
 "so beautiful you're leaving me"]

### Clean training set

In [27]:
clean_train = cleaner(train_data)

len clean_data: 7309


['well show me the way',
 "'cause nothin' from nothin' leaves nothin'",
 "she said look, what's your game baby",
 'savage love, did somebody, did somebody break your heart',
 "told you i'll be here forever",
 'oh baby, we found love right where we are maybe',
 'all i know is we said, "hello"',
 "another saturday night and i ain't got nobody",
 'if your schemes like your dreams',
 'you fell, i caught you']

### Add noise to clean training set

In [28]:
noisy_train = noiser(clean_train, char_repl_noiser)

total # of texts after retokenization: 7309
total # of tokens after retokenization: 49947


100%|██████████| 7309/7309 [00:01<00:00, 4068.39it/s]

len noisy_data: 7309





['well show me the wzy',
 "'cause nothih' frdom nlthin' lesaves nothin'",
 "she siad look, what's yoaur game bbay",
 'savage loce, did somebody, did somrbody break your heeart',
 "told you i'll be here flrever",
 'oh bby, we founbd loce rgiht whree we ate maybe',
 'all i kbnow is we said, "hello"',
 "another satjrday noght and i ain't got nobidy",
 'if yoir schems lkie your dreams',
 'you felk, i caugyt yzou']

## Prepare testing set

In [29]:
test_data = read_file(os.path.join(test_data_path, 'raw_lyrics.txt'))

len data: 2060


['The world on drugs',
 'Ten (yeah) thousand dollar plates',
 'Thousand dollar plates',
 'Thousand dollar plates, fine china',
 'Shorty like a, uh',
 'Shorty like a',
 '(Wheezy outta here)',
 'Shorty like a thousand dollar plate, fine china',
 'Tell her that she beautiful every day, I remind her',
 "Then I jump in the pussy like a lake, I'm a diver"]

### Clean testing set

In [36]:
clean_test = cleaner(test_data)

len clean_data: 1418


["cos he's alright in the city",
 'do you want me or do you not?',
 'and when i take you shopping',
 "i'm going to jupiter with my girl don't call your exes",
 'with your golden grill, true love never dies',
 "now my dick ain't free",
 "i'm wild as can be, and i want",
 'thanks, mr. president',
 'hey, batter, batter swing',
 'city girl, but she grew up in the tri-state']

### Add noise to clean testing set

In [37]:
noisy_test = noiser(clean_test, char_repl_noiser)

total # of texts after retokenization: 1418
total # of tokens after retokenization: 9959


100%|██████████| 1418/1418 [00:00<00:00, 3959.53it/s]

len noisy_data: 1418





["coys he's arlight in the city",
 'do you wanot me or do you not?',
 'ansd when i take ylu shopping',
 "ib'm going to jupiter wuth my gril dont call your exes",
 'wkth ypur golden grill, true loce nwver dwies',
 "npw my dick ainn't free",
 "ix'm wild as can be, and i want",
 'thanks, mr. president',
 'hey, batter, bater swing',
 'city girl, but she grew up in the tri-state']

## Save to file

In [32]:
with open(train_data_path + '/clean_lyrics.txt', 'w') as f:
    f.write('\n'.join(clean_train))

In [33]:
with open(train_data_path + '/noisy_lyrics.txt', 'w') as f:
    f.write('\n'.join(noisy_train))

In [34]:
with open(test_data_path + '/clean_lyrics.txt', 'w') as f:
    f.write('\n'.join(clean_test))

In [35]:
with open(test_data_path + '/noisy_lyrics.txt', 'w') as f:
    f.write('\n'.join(noisy_test))