# Correcting songs and fine-tuing

## Import modules

In [1]:
import os
from neuspell import available_checkers, BertChecker, SclstmChecker

print(f"available checkers: {available_checkers()}")

data folder is set to `e:\nlp\nlp-env\neuspell\neuspell\../data` script
available checkers: ['BertsclstmChecker', 'CnnlstmChecker', 'ElmosclstmChecker', 'NestedlstmChecker', 'SclstmChecker', 'SclstmbertChecker', 'SclstmelmoChecker', 'BertChecker']


## Define data path

In [2]:
train_data_path = '../dataset/train'
clean_train_path = os.path.join(train_data_path, 'clean_lyrics.txt')
noisy_train_path = os.path.join(train_data_path, 'noisy_lyrics.txt')

test_data_path = '../dataset/test'
clean_test_path = os.path.join(test_data_path, 'test_lyrics.txt')
noisy_test_path = os.path.join(test_data_path, 'test_lyrics_noisy.txt')

## Load training data

In [3]:
with open(clean_train_path) as f:
    clean_train = f.read().splitlines()

clean_train[:10]

['just a lost boy in a small town',
 'singing "love is forever and ever"',
 'good on paper, picture perfect',
 "cause i know i'm addicted to your drama",
 'you see me i be',
 'i gotta tell them to myself',
 "i'm still learning to love",
 "no, i can't sleep until i feel your touch",
 'and all i can think',
 "so beautiful you're leaving me"]

In [4]:
with open(noisy_train_path) as f:
    noisy_train = f.read().splitlines()

noisy_train[:10]

['just a lsot boy in a smmall tiwn',
 'singing "lve is foreiver and eve"',
 'good on paper, piccture perfect',
 "cayse i know i'm adicted to your dtama",
 'yoou see me i be',
 'i gotta tell tehm to myseclf',
 "i'm sitll learning to lovke",
 "no, i can't sleep until i feel your toch",
 'and all i caan think',
 "so beautiful yuo're letaving me"]

In [5]:
with open(clean_test_path) as f:
    clean_test = f.read().splitlines()

clean_test[:10]

['the world on drugs',
 'ten yeah thousand dollar plates',
 'thousand dollar plates',
 'thousand dollar plates, fine china',
 'shorty like a, uh',
 'shorty like a',
 'wheezy outta here',
 'shorty like a thousand dollar plate, fine china',
 'tell her that she beautiful every day, i remind her',
 "then i jump in the pussy like a lake, i'm a diver"]

In [6]:
with open(noisy_train_path) as f:
    noisy_test = f.read().splitlines()

noisy_test[:10]

['just a lsot boy in a smmall tiwn',
 'singing "lve is foreiver and eve"',
 'good on paper, piccture perfect',
 "cayse i know i'm adicted to your dtama",
 'yoou see me i be',
 'i gotta tell tehm to myseclf',
 "i'm sitll learning to lovke",
 "no, i can't sleep until i feel your toch",
 'and all i caan think',
 "so beautiful yuo're letaving me"]

## BERTChecker

### Load pretrained model

In [7]:
# create BertChecker
bert_checker = BertChecker()
# use pretrained model
bert_checker.from_pretrained()

loading vocab from path:e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/subwordbert-probwordnoise\vocab.pkl
initializing model
Number of parameters in the model: 185211810
Loading model params from checkpoint dir: e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/subwordbert-probwordnoise


### Evaluate pretrained model on testing set

In [8]:
bert_checker.evaluate(clean_file="test_lyrics.txt", corrupt_file="test_lyrics_noisy.txt", data_dir=test_data_path)

../dataset/test test_lyrics.txt test_lyrics_noisy.txt


1418it [00:00, 472625.80it/s]
1418it [00:00, 708291.42it/s]


loaded tuples of (corr,incorr) examples from ../dataset/test
###############################################
data size: 1418


355it [01:29,  3.96it/s]


Epoch None valid_loss: 0.6923443188262008
total inference time for this data is: 89.627995 secs
###############################################


total token count: 11906
_corr2corr:8991, _corr2incorr:664, _incorr2corr:1761, _incorr2incorr:490
accuracy is 0.9030740802956493
word correction rate is 0.7823189693469569
###############################################





### Example of wrong outputs 

In [9]:
bert_correct = bert_checker.correct_strings(noisy_test)
bert_correct[:10]

KeyboardInterrupt: 

In [None]:
clean_test_prep = [i.replace(' ', '').strip().lower() for i in clean_test]
bert_correct_prep = [i.replace(' ', '').strip().lower() for i in bert_correct]

print('len data:', len(clean_test_prep))
print('Number of correct data after bert:', sum(a == b for a, b in zip(bert_correct_prep, clean_test_prep)))

example: justalotboyinasmalltown
len data: 7309
correct data after bert: 4404


In [None]:
for i in range(len(clean_test_prep)):
    if clean_test_prep[i] != bert_correct_prep[i]:
        print(clean_test[i], ' | ', bert_correct[i])

just a lost boy in a small town  |  just a lot boy in a small town
singing "love is forever and ever"  |  singing " love is forever and even "
i'm still learning to love  |  I ' m still learning to look
i feel like i could die walking up to the room, oh yeah  |  I feel like I could die waking up to the room , oh yeah
i'm really gonna miss you picking fights  |  I ' m really gonna miss you picking gifts
even tried to bite my tongue when you start shit  |  even tried to bite my tongue when you start shot
what the fuck did i do  |  what the folk did i do
blurring all the lines, you intoxicate me  |  blurring all the lines , you intoxicated me
kick in the door waving the coco  |  kick in the floor wearing the condo
and your craigslist couch and the way your voice sounds  |  and your craigslist such as the way your voice sounds
vroom vroom, i'll see you latеr, bye  |  from freedom , I ' ll see you later , bye
and a neigh neigh there.  |  and a new night there .
i just wish i can be there wi

### Fine-tuning BertChecker

In [None]:
# create new BertChecker
bert_checker_new = BertChecker()
# load pretrained model
bert_checker_new.from_pretrained()
# fine-tune on training data
bert_checker_new.finetune(clean_file="clean_lyrics.txt", corrupt_file="noisy_lyrics.txt", data_dir=train_data_path)

loading vocab from path:e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/subwordbert-probwordnoise\vocab.pkl
initializing model
Number of parameters in the model: 185211810
Loading model params from checkpoint dir: e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/subwordbert-probwordnoise


7309it [00:00, 811976.37it/s]
7309it [00:00, 1461139.50it/s]


loaded tuples of (corr,incorr) examples from e:\nlp\nlp-env\neuspell\neuspell\../data\traintest
len of train and test data:  5848 1461
CHECKPOINT_PATH: e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/subwordbert-probwordnoise\new_models\bert-base-cased
Training model params
In epoch: 0
train_data size: 5848
Percent: [>                             ] 1% || batch_time: 4.1719 || batch_loss: 0.7811 || avg_batch_loss: 0.5298 || batch_acc: 0.9580 || avg_batch_acc: 0.9580 

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  ..\torch\csrc\utils\python_arg_parser.cpp:882.)
  next_m.mul_(beta1).add_(1 - beta1, grad)


Percent: [----------------------------->] 100% || batch_time: 3.0696 || batch_loss: 0.0887 || avg_batch_loss: 0.2350 || batch_acc: 0.9580 || avg_batch_acc: 0.9580 
Epoch 0 train_loss: 0.23499097294540558
valid_data size: 1461
Percent: [----------------------------->] 100% || batch_time: 1.9114 || batch_loss: 0.0270 || avg_batch_loss: 0.1347 || batch_acc: 0.9938 || avg_batch_acc: 0.9714 
Epoch 0 valid_loss: 0.1346610203061415
validation accuracy improved from -1.0000 to 44.6831
Model saved at e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/subwordbert-probwordnoise\new_models\bert-base-cased\pytorch_model.bin in epoch 0
In epoch: 1
train_data size: 5848
Percent: [----------------------------->] 100% || batch_time: 3.6861 || batch_loss: 0.0008 || avg_batch_loss: 0.0230 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------------------------->] 100% || batch_time: 2.8948 || batch_loss: 0.0225 || avg_batch_loss: 0.0230 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 
Epoch 1 train_loss: 0.022979982243882636
valid_data size: 1461
Percent: [----------------------------->] 100% || batch_time: 1.8254 || batch_loss: 0.0135 || avg_batch_loss: 0.1153 || batch_acc: 0.9938 || avg_batch_acc: 0.9754 
Epoch 1 valid_loss: 0.11528032609140096
validation accuracy improved from 44.6831 to 44.8663
Model saved at e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/subwordbert-probwordnoise\new_models\bert-base-cased\pytorch_model.bin in epoch 1
In epoch: 2
train_data size: 5848
Percent: [>                             ] 1% || batch_time: 2.9128 || batch_loss: 0.0028 || avg_batch_loss: 0.0088 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [>                             ] 2% || batch_time: 2.6306 || batch_loss: 0.0123 || avg_batch_loss: 0.0060 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [>                             ] 3% || batch_time: 2.9026 || batch_loss: 0.0017 || avg_batch_loss: 0.0090 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [>                             ] 4% || batch_time: 3.2799 || batch_loss: 0.0005 || avg_batch_loss: 0.0082 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [->                            ] 5% || batch_time: 3.9419 || batch_loss: 0.0083 || avg_batch_loss: 0.0097 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [->                            ] 6% || batch_time: 2.4786 || batch_loss: 0.0009 || avg_batch_loss: 0.0093 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [->                            ] 7% || batch_time: 3.2838 || batch_loss: 0.0053 || avg_batch_loss: 0.0090 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-->                           ] 8% || batch_time: 2.6781 || batch_loss: 0.0140 || avg_batch_loss: 0.0084 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-->                           ] 10% || batch_time: 3.6858 || batch_loss: 0.0054 || avg_batch_loss: 0.0081 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-->                           ] 11% || batch_time: 3.3328 || batch_loss: 0.0085 || avg_batch_loss: 0.0077 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--->                          ] 12% || batch_time: 3.4228 || batch_loss: 0.0020 || avg_batch_loss: 0.0076 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--->                          ] 13% || batch_time: 3.4913 || batch_loss: 0.0006 || avg_batch_loss: 0.0073 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--->                          ] 14% || batch_time: 1.9404 || batch_loss: 0.0006 || avg_batch_loss: 0.0076 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [---->                         ] 15% || batch_time: 3.7448 || batch_loss: 0.0061 || avg_batch_loss: 0.0074 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [---->                         ] 16% || batch_time: 3.0177 || batch_loss: 0.0008 || avg_batch_loss: 0.0074 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [---->                         ] 17% || batch_time: 2.8596 || batch_loss: 0.0020 || avg_batch_loss: 0.0072 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [---->                         ] 18% || batch_time: 3.4928 || batch_loss: 0.0034 || avg_batch_loss: 0.0072 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----->                        ] 19% || batch_time: 2.3495 || batch_loss: 0.0035 || avg_batch_loss: 0.0070 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----->                        ] 20% || batch_time: 3.1327 || batch_loss: 0.0016 || avg_batch_loss: 0.0069 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----->                        ] 22% || batch_time: 4.0560 || batch_loss: 0.0045 || avg_batch_loss: 0.0067 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------>                       ] 23% || batch_time: 3.5878 || batch_loss: 0.0030 || avg_batch_loss: 0.0065 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------>                       ] 24% || batch_time: 3.2748 || batch_loss: 0.0061 || avg_batch_loss: 0.0074 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------>                       ] 25% || batch_time: 3.3018 || batch_loss: 0.0075 || avg_batch_loss: 0.0075 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------->                      ] 26% || batch_time: 3.2458 || batch_loss: 0.0045 || avg_batch_loss: 0.0075 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------->                      ] 27% || batch_time: 2.5786 || batch_loss: 0.0035 || avg_batch_loss: 0.0073 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------->                      ] 28% || batch_time: 2.8085 || batch_loss: 0.0148 || avg_batch_loss: 0.0074 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-------->                     ] 29% || batch_time: 3.1407 || batch_loss: 0.0039 || avg_batch_loss: 0.0074 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-------->                     ] 30% || batch_time: 3.0117 || batch_loss: 0.0160 || avg_batch_loss: 0.0080 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-------->                     ] 31% || batch_time: 3.3291 || batch_loss: 0.0097 || avg_batch_loss: 0.0080 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--------->                    ] 33% || batch_time: 3.8559 || batch_loss: 0.0067 || avg_batch_loss: 0.0083 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--------->                    ] 34% || batch_time: 3.7522 || batch_loss: 0.0084 || avg_batch_loss: 0.0082 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--------->                    ] 35% || batch_time: 3.0367 || batch_loss: 0.0023 || avg_batch_loss: 0.0080 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [---------->                   ] 36% || batch_time: 3.8697 || batch_loss: 0.0062 || avg_batch_loss: 0.0080 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [---------->                   ] 37% || batch_time: 3.6778 || batch_loss: 0.0028 || avg_batch_loss: 0.0079 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [---------->                   ] 38% || batch_time: 3.5768 || batch_loss: 0.0074 || avg_batch_loss: 0.0081 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------->                  ] 39% || batch_time: 4.1040 || batch_loss: 0.0101 || avg_batch_loss: 0.0085 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------->                  ] 40% || batch_time: 4.7096 || batch_loss: 0.0016 || avg_batch_loss: 0.0084 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------->                  ] 41% || batch_time: 3.3513 || batch_loss: 0.0010 || avg_batch_loss: 0.0083 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------>                 ] 42% || batch_time: 3.8969 || batch_loss: 0.0531 || avg_batch_loss: 0.0084 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------>                 ] 43% || batch_time: 3.0387 || batch_loss: 0.0010 || avg_batch_loss: 0.0082 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------>                 ] 45% || batch_time: 3.7529 || batch_loss: 0.0059 || avg_batch_loss: 0.0081 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------->                ] 46% || batch_time: 3.0337 || batch_loss: 0.0052 || avg_batch_loss: 0.0081 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------->                ] 47% || batch_time: 4.0139 || batch_loss: 0.0072 || avg_batch_loss: 0.0081 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------->                ] 48% || batch_time: 3.1607 || batch_loss: 0.0016 || avg_batch_loss: 0.0081 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-------------->               ] 49% || batch_time: 3.4818 || batch_loss: 0.0056 || avg_batch_loss: 0.0081 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-------------->               ] 50% || batch_time: 4.9862 || batch_loss: 0.0035 || avg_batch_loss: 0.0080 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-------------->               ] 51% || batch_time: 3.9641 || batch_loss: 0.0125 || avg_batch_loss: 0.0080 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--------------->              ] 52% || batch_time: 2.4956 || batch_loss: 0.0074 || avg_batch_loss: 0.0080 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--------------->              ] 53% || batch_time: 3.4388 || batch_loss: 0.0044 || avg_batch_loss: 0.0079 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--------------->              ] 54% || batch_time: 4.1459 || batch_loss: 0.0157 || avg_batch_loss: 0.0082 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [---------------->             ] 55% || batch_time: 3.1227 || batch_loss: 0.0010 || avg_batch_loss: 0.0081 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [---------------->             ] 57% || batch_time: 3.6597 || batch_loss: 0.0079 || avg_batch_loss: 0.0082 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [---------------->             ] 58% || batch_time: 3.2527 || batch_loss: 0.0031 || avg_batch_loss: 0.0081 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------------->            ] 59% || batch_time: 3.4048 || batch_loss: 0.0020 || avg_batch_loss: 0.0081 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------------->            ] 60% || batch_time: 4.0024 || batch_loss: 0.0027 || avg_batch_loss: 0.0080 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------------->            ] 61% || batch_time: 3.9219 || batch_loss: 0.0026 || avg_batch_loss: 0.0079 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------------>           ] 62% || batch_time: 3.7098 || batch_loss: 0.0031 || avg_batch_loss: 0.0078 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------------>           ] 63% || batch_time: 6.5744 || batch_loss: 0.0218 || avg_batch_loss: 0.0080 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------------>           ] 64% || batch_time: 2.8736 || batch_loss: 0.0554 || avg_batch_loss: 0.0082 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------------->          ] 65% || batch_time: 2.9096 || batch_loss: 0.0026 || avg_batch_loss: 0.0083 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------------->          ] 66% || batch_time: 4.1623 || batch_loss: 0.0051 || avg_batch_loss: 0.0084 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------------->          ] 67% || batch_time: 3.8222 || batch_loss: 0.0169 || avg_batch_loss: 0.0084 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-------------------->         ] 69% || batch_time: 3.9889 || batch_loss: 0.0060 || avg_batch_loss: 0.0084 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-------------------->         ] 70% || batch_time: 5.4402 || batch_loss: 0.0025 || avg_batch_loss: 0.0083 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-------------------->         ] 71% || batch_time: 3.7058 || batch_loss: 0.0056 || avg_batch_loss: 0.0083 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--------------------->        ] 72% || batch_time: 3.7408 || batch_loss: 0.0042 || avg_batch_loss: 0.0082 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--------------------->        ] 73% || batch_time: 2.7077 || batch_loss: 0.0009 || avg_batch_loss: 0.0081 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--------------------->        ] 74% || batch_time: 2.8367 || batch_loss: 0.0018 || avg_batch_loss: 0.0080 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [---------------------->       ] 75% || batch_time: 3.0327 || batch_loss: 0.0063 || avg_batch_loss: 0.0080 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [---------------------->       ] 76% || batch_time: 2.9076 || batch_loss: 0.0027 || avg_batch_loss: 0.0081 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [---------------------->       ] 77% || batch_time: 3.2609 || batch_loss: 0.0036 || avg_batch_loss: 0.0084 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------------------->      ] 78% || batch_time: 2.8806 || batch_loss: 0.0210 || avg_batch_loss: 0.0084 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------------------->      ] 80% || batch_time: 4.1124 || batch_loss: 0.0055 || avg_batch_loss: 0.0083 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------------------->      ] 81% || batch_time: 3.0067 || batch_loss: 0.0096 || avg_batch_loss: 0.0084 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------------------>     ] 82% || batch_time: 3.4088 || batch_loss: 0.0010 || avg_batch_loss: 0.0083 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------------------>     ] 83% || batch_time: 3.0767 || batch_loss: 0.0077 || avg_batch_loss: 0.0083 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------------------>     ] 84% || batch_time: 2.7496 || batch_loss: 0.0193 || avg_batch_loss: 0.0083 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------------------>     ] 85% || batch_time: 3.0067 || batch_loss: 0.0051 || avg_batch_loss: 0.0082 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------------------->    ] 86% || batch_time: 3.0137 || batch_loss: 0.0009 || avg_batch_loss: 0.0083 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------------------->    ] 87% || batch_time: 3.5776 || batch_loss: 0.0069 || avg_batch_loss: 0.0082 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------------------->    ] 88% || batch_time: 4.8166 || batch_loss: 0.0155 || avg_batch_loss: 0.0082 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-------------------------->   ] 89% || batch_time: 3.3297 || batch_loss: 0.0135 || avg_batch_loss: 0.0083 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-------------------------->   ] 90% || batch_time: 3.5928 || batch_loss: 0.0199 || avg_batch_loss: 0.0083 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-------------------------->   ] 92% || batch_time: 3.3888 || batch_loss: 0.0008 || avg_batch_loss: 0.0082 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--------------------------->  ] 93% || batch_time: 2.8776 || batch_loss: 0.0021 || avg_batch_loss: 0.0082 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--------------------------->  ] 94% || batch_time: 3.7618 || batch_loss: 0.0407 || avg_batch_loss: 0.0083 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--------------------------->  ] 95% || batch_time: 2.9147 || batch_loss: 0.0051 || avg_batch_loss: 0.0086 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------------------------> ] 96% || batch_time: 3.2888 || batch_loss: 0.0024 || avg_batch_loss: 0.0085 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------------------------> ] 97% || batch_time: 3.5609 || batch_loss: 0.0042 || avg_batch_loss: 0.0085 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------------------------> ] 98% || batch_time: 2.7897 || batch_loss: 0.0069 || avg_batch_loss: 0.0085 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------------------------->] 99% || batch_time: 3.4971 || batch_loss: 0.0012 || avg_batch_loss: 0.0084 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------------------------->] 100% || batch_time: 3.6199 || batch_loss: 0.0036 || avg_batch_loss: 0.0084 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------------------------->] 100% || batch_time: 2.9512 || batch_loss: 0.0012 || avg_batch_loss: 0.0084 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 
Epoch 2 train_loss: 0.008352796749966957
valid_data size: 1461
Percent: [----------------------------->] 100% || batch_time: 1.9136 || batch_loss: 0.0135 || avg_batch_loss: 0.1153 || batch_acc: 0.9938 || avg_batch_acc: 0.9754 
Epoch 2 valid_loss: 0.11528032609140096
validation accuracy improved from 44.8663 to 44.8663
Model saved at e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/subwordbert-probwordnoise\new_models\bert-base-cased\pytorch_model.bin in epoch 2
Model and logs saved at e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/subwordbert-probwordnoise\new_models\bert-base-cased


### Evaluate on testing set after fine-tuning

In [None]:
bert_checker_new.evaluate(clean_file="test_lyrics.txt", corrupt_file="test_lyrics_noisy.txt", data_dir=test_data_path)

e:\nlp\nlp-env\neuspell\neuspell\../data\traintest clean_lyrics.txt noisy_lyrics.txt


7309it [00:00, 730743.90it/s]
7309it [00:00, 1217577.57it/s]


loaded tuples of (corr,incorr) examples from e:\nlp\nlp-env\neuspell\neuspell\../data\traintest
###############################################
data size: 7309


1828it [07:04,  4.31it/s]


Epoch None valid_loss: 0.026905515569071756
total inference time for this data is: 424.256846 secs
###############################################


total token count: 58943
_corr2corr:43238, _corr2incorr:60, _incorr2corr:14843, _incorr2incorr:802
accuracy is 0.9853757019493409
word correction rate is 0.9487376158517098
###############################################





### Result analysis

In [None]:
bert_correct_new = bert_checker_new.correct_strings(noisy_test)
bert_correct_new[:10]

['just a lost boy in a small town',
 'singing " love is forever and eve "',
 'good on paper , picture perfect',
 "cause i know i ' m addicted to your drama",
 'you see me i be',
 'i gotta tell them to myself',
 "i ' m still learning to love",
 "no , i can ' t sleep until i feel your touch",
 'and all i can think',
 "so beautiful you ' re leaving me"]

In [None]:
clean_test_prep = [i.replace(' ', '').strip().lower() for i in clean_test]
bert_correct_new_prep = [i.replace(' ', '').strip().lower() for i in bert_correct_new]

print('len data:', len(clean_test_prep))
print('correct data after bert:', sum(a == b for a, b in zip(bert_correct_new_prep, clean_test_prep)))

example: justalostboyinasmalltown
len data: 7309
correct data after bert: 6456


In [None]:
for i in range(len(clean_test_prep)):
    if clean_test_prep[i] != bert_correct_new_prep[i]:
        print(clean_test_prep[i], ' | ', bert_correct_new[i])

singing "love is forever and ever"  |  singing " love is forever and eve "
i'm really gonna miss you picking fights  |  i ' m really gonna miss you picking gifts
blurring all the lines, you intoxicate me  |  blurring all the lines , you inotxicate me
kick in the door waving the coco  |  kick in the door waving the codo
vroom vroom, i'll see you latеr, bye  |  from vriom , i ' ll see you later , bye
and a neigh neigh there.  |  and a neih neigh there .
i gave a second chance to cupid  |  i gave a second chance to cuepid
that'll work, come over  |  thrill work , come over
got a lot of love, well you better save it for me  |  got a lot of love , well you better saved it for me
we went to bed in france, then we woke up in japan  |  we went to bed in france , then we woke up in jaoan
they don't want to see us get too attached  |  they dont want to see us get too attached
oh, make you try fo understand  |  oh , make you try to understand
oh, i might stop talking to people before i snap, snap

## SC-LSTM plus ELMO (at input)

### Load pretrained model

In [None]:
# create SclstmChecker
elmo_checker = SclstmChecker()
# add ELMO at input
elmo_checker = elmo_checker.add_("elmo", at="input")  # "elmo" or "bert", "input" or "output"
# load pretrained model
elmo_checker.from_pretrained()

loading vocab from path:e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/elmoscrnn-probwordnoise\vocab.pkl
initializing model
Number of parameters in the model: 209906438
Loading model params from checkpoint dir: e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/elmoscrnn-probwordnoise
new model loaded: <class 'neuspell.corrector_elmosclstm.ElmosclstmChecker'>
loading vocab from path:e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/elmoscrnn-probwordnoise\vocab.pkl
initializing model
Number of parameters in the model: 209906438
Loading model params from checkpoint dir: e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/elmoscrnn-probwordnoise
creating spacy models ...
spacy models initialized




['just a lost boy in a small town',
 'singing " love is forever and ever "',
 'good on paper , picture perfect',
 "cause I know i 'm addicted to your data",
 'you see me and be',
 'I got to tell them to myself',
 "I 'm still learning to love",
 'no , I can not sleep until i feel your touch',
 'and all I can think',
 'so beautiful your leaving me']

### Evaluate pretrained model on testing set

In [None]:
elmo_checker.evaluate(clean_file="test_lyrics.txt", corrupt_file="test_lyrics_noisy.txt", data_dir=test_data_path)

e:\nlp\nlp-env\neuspell\neuspell\../data\traintest clean_lyrics.txt noisy_lyrics.txt


7309it [00:00, 812126.95it/s]
7309it [00:00, 1217916.17it/s]


loaded tuples of (corr,incorr) examples from e:\nlp\nlp-env\neuspell\neuspell\../data\traintest
data size: 7309


1828it [14:40,  2.08it/s]


Epoch None valid_loss: 1.237444352181441
total inference time for this data is: 880.299118 secs
###############################################
total token count: 49947
corr2corr:30071, corr2incorr:4235, incorr2corr:10823, incorr2incorr:4818
accuracy is 0.8187478727451099
word correction rate is 0.691963429448245
###############################################





### Example of wrong outputs

In [None]:
# correct noisy data
elmo_correct = elmo_checker.correct_strings(noisy_test)
elmo_correct[:10]

In [None]:
clean_test_prep = [i.replace(' ', '').strip().lower() for i in clean_test]
elmo_correct_prep = [i.replace(' ', '').strip().lower() for i in elmo_correct]

print('len data:', len(clean_test_prep))
print('correct data after bert:', sum(a == b for a, b in zip(elmo_correct_prep, clean_data_prep)))

example: justalostboyinasmalltown
len data: 7309
correct data after bert: 3885


In [None]:
for i in range(len(clean_test_prep)):
    if clean_test_prep[i] != elmo_correct_prep[i]:
        print(clean_test[i], ' | ', elmo_correct[i])

cause i know i'm addicted to your drama  |  cause I know i 'm addicted to your data
you see me i be  |  you see me and be
i gotta tell them to myself  |  I got to tell them to myself
no, i can't sleep until i feel your touch  |  no , I can not sleep until i feel your touch
so beautiful you're leaving me  |  so beautiful your leaving me
when it wasn't yours, yeah  |  when it was not yours , yeah
i feel like i could die walking up to the room, oh yeah  |  I feel like I could die waking up to the room , oh yeah
i'm really gonna miss you picking fights  |  I 'm really on no misos you picking fights
even tried to bite my tongue when you start shit  |  even tried to bite my tongue when you start shut
and i won't be your victim  |  and it wont be your victim
what the fuck did i do  |  what the fuck did it do
blurring all the lines, you intoxicate me  |  blurring all the lines , you inotxicate me
kick in the door waving the coco  |  kick in the door waving the code
and your craigslist couch an

### Fine-tuning SclstmChecker

In [None]:
# create AclstmChecker
elmo_checker_new = SclstmChecker()
# add elmo at input
elmo_checker_new = elmo_checker_new.add_("elmo", at="input")  # "elmo" or "bert", "input" or "output"
# load pretrained model
elmo_checker_new.from_pretrained()
# fine-tune on training data
elmo_checker_new.finetune(clean_file="clean_lyrics.txt", corrupt_file="noisy_lyrics.txt", data_dir=train_data_path)

loading vocab from path:e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/elmoscrnn-probwordnoise\vocab.pkl
initializing model
Number of parameters in the model: 209906438
Loading model params from checkpoint dir: e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/elmoscrnn-probwordnoise
new model loaded: <class 'neuspell.corrector_elmosclstm.ElmosclstmChecker'>
loading vocab from path:e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/elmoscrnn-probwordnoise\vocab.pkl
initializing model
Number of parameters in the model: 209906438
Loading model params from checkpoint dir: e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/elmoscrnn-probwordnoise


7309it [00:00, 730709.06it/s]
7309it [00:00, 1826729.11it/s]


loaded tuples of (corr,incorr) examples from e:\nlp\nlp-env\neuspell\neuspell\../data\traintest
len of train and test data:  5848 1461
CHECKPOINT_PATH: e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/elmoscrnn-probwordnoise\new_models\elmo-base-cased
Training model params from scratch
In epoch: 0
train_data size: 5848
Percent: [----------------------------->] 100% || batch_time: 1.7134 || batch_loss: 0.0763 || avg_batch_loss: 0.2813 || batch_acc: 0.7664 || avg_batch_acc: 0.7664 
Epoch 0 train_loss: 0.2813334097383452
valid_data size: 1461
Percent: [----------------------------->] 100% || batch_time: 1.3783 || batch_loss: 0.0173 || avg_batch_loss: 0.1022 || batch_acc: 1.0000 || avg_batch_acc: 0.9698 
Epoch 0 valid_loss: 0.10223011644152195
Model saved at e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/elmoscrnn-probwordnoise\new_models\elmo-base-cased\model.pth.tar in epoch 0
In epoch: 1
train_data size: 5848
Percent: [----------------------------->] 100% || batch_time: 1.5775

### Evaluate on testing set after fine-tuning

In [None]:
elmo_checker_new.evaluate(clean_file="test_lyrics.txt", corrupt_file="test_lyrics_noisy.txt", data_dir=test_data_path)

e:\nlp\nlp-env\neuspell\neuspell\../data\traintest clean_lyrics.txt noisy_lyrics.txt


7309it [00:00, 811976.37it/s]
7309it [00:00, 1044076.29it/s]


loaded tuples of (corr,incorr) examples from e:\nlp\nlp-env\neuspell\neuspell\../data\traintest
data size: 7309


1828it [14:29,  2.10it/s]


Epoch None valid_loss: 0.024568097715663473
total inference time for this data is: 869.061731 secs
###############################################
total token count: 49947
corr2corr:34261, corr2incorr:45, incorr2corr:11927, incorr2incorr:3714
accuracy is 0.9247402246381164
word correction rate is 0.7625471517166421
###############################################





### Result analysis

In [None]:
elmo_correct_new = elmo_checker_new.correct_strings(noisy_test)
elmo_correct_new[:10]

['just a lost boy in a small town',
 'singing " love is forever and eve "',
 'good on paper , picture perfect',
 "cause i know i 'm addicted to your drama",
 'you see me i be',
 'i got to tell them to myself',
 "i 'm still learning to love",
 'no , i can not sleep until i feel your touch',
 'and all i can think',
 "so beautiful yuo're leaving me"]

In [None]:
clean_test_prep = [i.replace(' ', '').strip().lower() for i in clean_test]
elmo_correct_new_prep = [i.replace(' ', '').strip().lower() for i in elmo_correct_new]

print('len data:', len(clean_test_prep))
print('correct data after bert:', sum(a == b for a, b in zip(elmo_correct_new_prep, clean_test_prep)))

example: justalostboyinasmalltown
len data: 7309
correct data after bert: 4988


In [None]:
for i in range(len(clean_test_prep)):
    if clean_test_prep[i] != elmo_correct_new_prep[i]:
        print(clean_test[i], ' | ', elmo_correct_new[i])

singing "love is forever and ever"  |  singing " love is forever and eve "
i gotta tell them to myself  |  i got to tell them to myself
no, i can't sleep until i feel your touch  |  no , i can not sleep until i feel your touch
so beautiful you're leaving me  |  so beautiful yuo're leaving me
when it wasn't yours, yeah  |  when it was not yours , yeah
and i won't be your victim  |  and i wn't be your victim
blurring all the lines, you intoxicate me  |  blurring all the lines , you inotxicate me
kick in the door waving the coco  |  kick in the door waving the codo
vroom vroom, i'll see you latеr, bye  |  vfoom vriom , i 'll see you later , bye
and a neigh neigh there.  |  and a neih neigh there .
i can't describe  |  i cn't describe
if you love me won't you  |  if you love me wons't you
don't leave me stuck here in the streets no, no  |  don'xt leave me stuck here in the streets no , no
cause i'm in a field of dandelions  |  cause i'lm in a field of dandelions
i don't mean no harm  |  i 