# Correcting songs and fine-tuing

## Import modules

In [1]:
import os
from neuspell import available_checkers, BertChecker, SclstmChecker

print(f"available checkers: {available_checkers()}")

data folder is set to `e:\nlp\nlp-env\neuspell\neuspell\../data` script
available checkers: ['BertsclstmChecker', 'CnnlstmChecker', 'ElmosclstmChecker', 'NestedlstmChecker', 'SclstmChecker', 'SclstmbertChecker', 'SclstmelmoChecker', 'BertChecker']


## Define data path

In [2]:
clean_file = 'clean_lyrics.txt'
noisy_file = 'noisy_lyrics.txt'

train_data_path = '../dataset/train'
clean_train_path = os.path.join(train_data_path, clean_file)
noisy_train_path = os.path.join(train_data_path, noisy_file)

test_data_path = '../dataset/test'
clean_test_path = os.path.join(test_data_path, clean_file)
noisy_test_path = os.path.join(test_data_path, noisy_file)

## Load training data

In [3]:
with open(clean_train_path) as f:
    clean_train = f.read().splitlines()

clean_train[:10]

['well show me the way',
 "'cause nothin' from nothin' leaves nothin'",
 "she said look, what's your game baby",
 'savage love, did somebody, did somebody break your heart',
 "told you i'll be here forever",
 'oh baby, we found love right where we are maybe',
 'all i know is we said, "hello"',
 "another saturday night and i ain't got nobody",
 'if your schemes like your dreams',
 'you fell, i caught you']

In [4]:
with open(noisy_train_path) as f:
    noisy_train = f.read().splitlines()

noisy_train[:10]

['well show me the way',
 "'caue nothvin' from nothin' leaves nothin'",
 "sye said look, wha's yocur gmae baby",
 'svage live, did somebodvy, did somebody beak your heart',
 "tlod you i'll be helre forever",
 'oh baby, we found love riaght where we ahre mabe',
 'all i know is we saiid, "ello"',
 "another saturfay ngiht and i ain't got nobody",
 'if your schemes lkie your dreas',
 'you fell, i caught you']

In [5]:
with open(clean_test_path) as f:
    clean_test = f.read().splitlines()

clean_test[:10]

["cos he's alright in the city",
 'do you want me or do you not?',
 'and when i take you shopping',
 "i'm going to jupiter with my girl don't call your exes",
 'with your golden grill, true love never dies',
 "now my dick ain't free",
 "i'm wild as can be, and i want",
 'thanks, mr. president',
 'hey, batter, batter swing',
 'city girl, but she grew up in the tri-state']

In [6]:
with open(noisy_train_path) as f:
    noisy_test = f.read().splitlines()

noisy_test[:10]

['well show me the way',
 "'caue nothvin' from nothin' leaves nothin'",
 "sye said look, wha's yocur gmae baby",
 'svage live, did somebodvy, did somebody beak your heart',
 "tlod you i'll be helre forever",
 'oh baby, we found love riaght where we ahre mabe',
 'all i know is we saiid, "ello"',
 "another saturfay ngiht and i ain't got nobody",
 'if your schemes lkie your dreas',
 'you fell, i caught you']

## BERTChecker

### Load pretrained model

In [7]:
# create BertChecker
bert_checker = BertChecker()
# use pretrained model
bert_checker.from_pretrained()

loading vocab from path:e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/subwordbert-probwordnoise\vocab.pkl
initializing model
Number of parameters in the model: 185211810
Loading model params from checkpoint dir: e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/subwordbert-probwordnoise


### Evaluate pretrained model on testing set

In [8]:
bert_checker.evaluate(clean_file=clean_file, corrupt_file=noisy_file, data_dir=test_data_path)

../dataset/test clean_lyrics.txt noisy_lyrics.txt


1418it [00:00, 472363.04it/s]
1418it [00:00, 708460.16it/s]


loaded tuples of (corr,incorr) examples from ../dataset/test
###############################################
data size: 1418


355it [01:40,  3.52it/s]


Epoch None valid_loss: 0.6795691047670503
total inference time for this data is: 100.818982 secs
###############################################


total token count: 11906
_corr2corr:8944, _corr2incorr:681, _incorr2corr:1805, _incorr2incorr:476
accuracy is 0.9028221065009239
word correction rate (precision) is 0.791319596668128
word correction rate (recall) is 0.7260659694288013
###############################################





### Example of wrong outputs 

In [9]:
bert_correct = bert_checker.correct_strings(noisy_test)
bert_correct[:10]

['will show me the way',
 "' cause nothin ' from nothin ' leaves nothin '",
 "I said look , who ' s your game baby",
 'savage life , did somebody , did somebody break your heart',
 "told you I ' ll be here forever",
 'oh baby , we found love right where we are made',
 'all I know is we said , " hello "',
 "another saturfay night and I ain ' t got nobody",
 'if your schemes like your dreams',
 'you fell , I caught you']

In [10]:
clean_test_prep = [i.replace(' ', '').strip().lower() for i in clean_test]
bert_correct_prep = [i.replace(' ', '').strip().lower() for i in bert_correct]

print('len data:', len(clean_test_prep))
print('Number of correct data after bert:', sum(a == b for a, b in zip(bert_correct_prep, clean_test_prep)))

len data: 1418
Number of correct data after bert: 0


In [11]:
for i in range(len(clean_test_prep)):
    if clean_test_prep[i] != bert_correct_prep[i]:
        print(clean_test[i], ' | ', bert_correct[i])

cos he's alright in the city  |  will show me the way
do you want me or do you not?  |  ' cause nothin ' from nothin ' leaves nothin '
and when i take you shopping  |  I said look , who ' s your game baby
i'm going to jupiter with my girl don't call your exes  |  savage life , did somebody , did somebody break your heart
with your golden grill, true love never dies  |  told you I ' ll be here forever
now my dick ain't free  |  oh baby , we found love right where we are made
i'm wild as can be, and i want  |  all I know is we said , " hello "
thanks, mr. president  |  another saturfay night and I ain ' t got nobody
hey, batter, batter swing  |  if your schemes like your dreams
city girl, but she grew up in the tri-state  |  you fell , I caught you
i need that brazilian, wavy, twenty-eight inch, you playin'  |  I ' m on my way now
thinking of your love, boo.  |  see i ' d like to see you looking still
be a gentleman  |  go , give us the reason
in such a sweet way  |  ooh , oh
in a bad he

### Fine-tuning BertChecker

In [12]:
# create new BertChecker
bert_checker_new = BertChecker()
# load pretrained model
bert_checker_new.from_pretrained()
# fine-tune on training data
bert_checker_new.finetune(clean_file=clean_file, corrupt_file=noisy_file, data_dir=train_data_path)

loading vocab from path:e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/subwordbert-probwordnoise\vocab.pkl
initializing model
Number of parameters in the model: 185211810
Loading model params from checkpoint dir: e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/subwordbert-probwordnoise


7309it [00:00, 521940.38it/s]
7309it [00:00, 115986.53it/s]


loaded tuples of (corr,incorr) examples from ../dataset/train
len of train and test data:  5848 1461
CHECKPOINT_PATH: e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/subwordbert-probwordnoise\new_models\bert-base-cased
Training model params
In epoch: 0
train_data size: 5848
Percent: [>                             ] 1% || batch_time: 3.5092 || batch_loss: 0.4359 || avg_batch_loss: 0.3590 || batch_acc: 0.9381 || avg_batch_acc: 0.9381 

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  ..\torch\csrc\utils\python_arg_parser.cpp:882.)
  next_m.mul_(beta1).add_(1 - beta1, grad)


Percent: [----------------------------->] 100% || batch_time: 2.8880 || batch_loss: 0.1148 || avg_batch_loss: 0.1962 || batch_acc: 0.9381 || avg_batch_acc: 0.9381 
Epoch 0 train_loss: 0.19620183913725495
valid_data size: 1461
Percent: [----------------------------->] 100% || batch_time: 1.4874 || batch_loss: 0.0244 || avg_batch_loss: 0.1062 || batch_acc: 0.9944 || avg_batch_acc: 0.9781 
Epoch 0 valid_loss: 0.10617777105906735
validation accuracy improved from -1.0000 to 44.9934
Model saved at e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/subwordbert-probwordnoise\new_models\bert-base-cased\pytorch_model.bin in epoch 0
In epoch: 1
train_data size: 5848
Percent: [----------------------------->] 100% || batch_time: 3.3233 || batch_loss: 0.0009 || avg_batch_loss: 0.0174 || batch_acc: 0.9928 || avg_batch_acc: 0.9928 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------------------------->] 100% || batch_time: 3.1210 || batch_loss: 0.0006 || avg_batch_loss: 0.0174 || batch_acc: 0.9928 || avg_batch_acc: 0.9928 
Epoch 1 train_loss: 0.01738386039008721
valid_data size: 1461
Percent: [----------------------------->] 100% || batch_time: 1.4163 || batch_loss: 0.0198 || avg_batch_loss: 0.0955 || batch_acc: 0.9888 || avg_batch_acc: 0.9799 
Epoch 1 valid_loss: 0.09546498669837804
validation accuracy improved from 44.9934 to 45.0753
Model saved at e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/subwordbert-probwordnoise\new_models\bert-base-cased\pytorch_model.bin in epoch 1
In epoch: 2
train_data size: 5848
Percent: [>                             ] 1% || batch_time: 3.7189 || batch_loss: 0.0013 || avg_batch_loss: 0.0021 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [>                             ] 2% || batch_time: 2.4036 || batch_loss: 0.0660 || avg_batch_loss: 0.0117 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [>                             ] 3% || batch_time: 3.9850 || batch_loss: 0.0023 || avg_batch_loss: 0.0091 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [>                             ] 4% || batch_time: 2.4826 || batch_loss: 0.0005 || avg_batch_loss: 0.0071 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [->                            ] 5% || batch_time: 3.3658 || batch_loss: 0.0008 || avg_batch_loss: 0.0062 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [->                            ] 6% || batch_time: 3.1748 || batch_loss: 0.0009 || avg_batch_loss: 0.0055 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [->                            ] 7% || batch_time: 3.8951 || batch_loss: 0.0014 || avg_batch_loss: 0.0055 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-->                           ] 8% || batch_time: 2.4127 || batch_loss: 0.0010 || avg_batch_loss: 0.0050 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-->                           ] 10% || batch_time: 2.9597 || batch_loss: 0.0002 || avg_batch_loss: 0.0048 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-->                           ] 11% || batch_time: 3.4498 || batch_loss: 0.0024 || avg_batch_loss: 0.0046 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--->                          ] 12% || batch_time: 3.2378 || batch_loss: 0.0017 || avg_batch_loss: 0.0048 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--->                          ] 13% || batch_time: 3.4958 || batch_loss: 0.0177 || avg_batch_loss: 0.0050 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--->                          ] 14% || batch_time: 2.1900 || batch_loss: 0.0051 || avg_batch_loss: 0.0062 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [---->                         ] 15% || batch_time: 3.7073 || batch_loss: 0.0010 || avg_batch_loss: 0.0085 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [---->                         ] 16% || batch_time: 3.3328 || batch_loss: 0.0009 || avg_batch_loss: 0.0081 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [---->                         ] 17% || batch_time: 2.9534 || batch_loss: 0.0124 || avg_batch_loss: 0.0081 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [---->                         ] 18% || batch_time: 3.7510 || batch_loss: 0.0154 || avg_batch_loss: 0.0079 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----->                        ] 19% || batch_time: 3.6068 || batch_loss: 0.0030 || avg_batch_loss: 0.0077 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----->                        ] 20% || batch_time: 3.0938 || batch_loss: 0.0043 || avg_batch_loss: 0.0074 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----->                        ] 22% || batch_time: 3.9093 || batch_loss: 0.0029 || avg_batch_loss: 0.0071 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------>                       ] 23% || batch_time: 4.5281 || batch_loss: 0.0029 || avg_batch_loss: 0.0071 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------>                       ] 24% || batch_time: 2.2179 || batch_loss: 0.0007 || avg_batch_loss: 0.0070 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------>                       ] 25% || batch_time: 2.9857 || batch_loss: 0.0014 || avg_batch_loss: 0.0070 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------->                      ] 26% || batch_time: 3.6699 || batch_loss: 0.0005 || avg_batch_loss: 0.0070 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------->                      ] 27% || batch_time: 2.6657 || batch_loss: 0.0103 || avg_batch_loss: 0.0070 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------->                      ] 28% || batch_time: 2.3986 || batch_loss: 0.0008 || avg_batch_loss: 0.0069 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-------->                     ] 29% || batch_time: 2.9462 || batch_loss: 0.0004 || avg_batch_loss: 0.0067 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-------->                     ] 30% || batch_time: 3.4738 || batch_loss: 0.0130 || avg_batch_loss: 0.0069 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-------->                     ] 31% || batch_time: 4.0110 || batch_loss: 0.0204 || avg_batch_loss: 0.0069 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--------->                    ] 33% || batch_time: 3.0747 || batch_loss: 0.0034 || avg_batch_loss: 0.0068 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--------->                    ] 34% || batch_time: 3.8807 || batch_loss: 0.0056 || avg_batch_loss: 0.0067 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--------->                    ] 35% || batch_time: 2.8868 || batch_loss: 0.0012 || avg_batch_loss: 0.0066 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [---------->                   ] 36% || batch_time: 3.5906 || batch_loss: 0.0234 || avg_batch_loss: 0.0066 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [---------->                   ] 37% || batch_time: 2.6405 || batch_loss: 0.0067 || avg_batch_loss: 0.0069 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [---------->                   ] 38% || batch_time: 4.2830 || batch_loss: 0.0193 || avg_batch_loss: 0.0071 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------->                  ] 39% || batch_time: 3.0527 || batch_loss: 0.0018 || avg_batch_loss: 0.0070 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------->                  ] 40% || batch_time: 3.2188 || batch_loss: 0.0033 || avg_batch_loss: 0.0070 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------->                  ] 41% || batch_time: 3.5638 || batch_loss: 0.0056 || avg_batch_loss: 0.0072 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------>                 ] 42% || batch_time: 4.2969 || batch_loss: 0.0302 || avg_batch_loss: 0.0073 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------>                 ] 43% || batch_time: 3.1288 || batch_loss: 0.0309 || avg_batch_loss: 0.0073 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------>                 ] 45% || batch_time: 3.8754 || batch_loss: 0.0013 || avg_batch_loss: 0.0072 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------->                ] 46% || batch_time: 4.0394 || batch_loss: 0.0088 || avg_batch_loss: 0.0071 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------->                ] 47% || batch_time: 2.6806 || batch_loss: 0.0023 || avg_batch_loss: 0.0072 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------->                ] 48% || batch_time: 3.3461 || batch_loss: 0.0167 || avg_batch_loss: 0.0072 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-------------->               ] 49% || batch_time: 3.8189 || batch_loss: 0.0021 || avg_batch_loss: 0.0074 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-------------->               ] 50% || batch_time: 2.7287 || batch_loss: 0.0012 || avg_batch_loss: 0.0073 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-------------->               ] 51% || batch_time: 3.1553 || batch_loss: 0.0006 || avg_batch_loss: 0.0074 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--------------->              ] 52% || batch_time: 2.8667 || batch_loss: 0.0226 || avg_batch_loss: 0.0073 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--------------->              ] 53% || batch_time: 3.6324 || batch_loss: 0.0047 || avg_batch_loss: 0.0072 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--------------->              ] 54% || batch_time: 3.6089 || batch_loss: 0.0081 || avg_batch_loss: 0.0073 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [---------------->             ] 55% || batch_time: 3.8009 || batch_loss: 0.0080 || avg_batch_loss: 0.0072 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [---------------->             ] 57% || batch_time: 2.8763 || batch_loss: 0.0005 || avg_batch_loss: 0.0071 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [---------------->             ] 58% || batch_time: 3.7709 || batch_loss: 0.0011 || avg_batch_loss: 0.0071 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------------->            ] 59% || batch_time: 3.0353 || batch_loss: 0.0015 || avg_batch_loss: 0.0070 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------------->            ] 60% || batch_time: 3.3008 || batch_loss: 0.0023 || avg_batch_loss: 0.0069 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------------->            ] 61% || batch_time: 4.0910 || batch_loss: 0.0009 || avg_batch_loss: 0.0068 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------------>           ] 62% || batch_time: 2.5828 || batch_loss: 0.0079 || avg_batch_loss: 0.0070 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------------>           ] 63% || batch_time: 3.1039 || batch_loss: 0.0006 || avg_batch_loss: 0.0069 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------------>           ] 64% || batch_time: 3.0287 || batch_loss: 0.0019 || avg_batch_loss: 0.0069 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------------->          ] 65% || batch_time: 3.9350 || batch_loss: 0.0016 || avg_batch_loss: 0.0068 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------------->          ] 66% || batch_time: 3.7570 || batch_loss: 0.0005 || avg_batch_loss: 0.0068 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------------->          ] 67% || batch_time: 3.5398 || batch_loss: 0.0018 || avg_batch_loss: 0.0068 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-------------------->         ] 69% || batch_time: 3.6611 || batch_loss: 0.0024 || avg_batch_loss: 0.0068 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-------------------->         ] 70% || batch_time: 4.0730 || batch_loss: 0.0033 || avg_batch_loss: 0.0067 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-------------------->         ] 71% || batch_time: 3.6825 || batch_loss: 0.0159 || avg_batch_loss: 0.0070 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--------------------->        ] 72% || batch_time: 3.1417 || batch_loss: 0.0002 || avg_batch_loss: 0.0069 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--------------------->        ] 73% || batch_time: 3.5955 || batch_loss: 0.0352 || avg_batch_loss: 0.0070 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--------------------->        ] 74% || batch_time: 3.1863 || batch_loss: 0.0018 || avg_batch_loss: 0.0070 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [---------------------->       ] 75% || batch_time: 3.0897 || batch_loss: 0.0033 || avg_batch_loss: 0.0070 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [---------------------->       ] 76% || batch_time: 3.8099 || batch_loss: 0.0010 || avg_batch_loss: 0.0070 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [---------------------->       ] 77% || batch_time: 3.3353 || batch_loss: 0.0007 || avg_batch_loss: 0.0076 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------------------->      ] 78% || batch_time: 3.5368 || batch_loss: 0.0008 || avg_batch_loss: 0.0076 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------------------->      ] 80% || batch_time: 3.0117 || batch_loss: 0.0228 || avg_batch_loss: 0.0077 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------------------->      ] 81% || batch_time: 3.3027 || batch_loss: 0.0018 || avg_batch_loss: 0.0077 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------------------>     ] 82% || batch_time: 2.9087 || batch_loss: 0.0013 || avg_batch_loss: 0.0076 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------------------>     ] 83% || batch_time: 3.4752 || batch_loss: 0.0041 || avg_batch_loss: 0.0076 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------------------>     ] 84% || batch_time: 2.9756 || batch_loss: 0.0110 || avg_batch_loss: 0.0076 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------------------>     ] 85% || batch_time: 2.1815 || batch_loss: 0.0002 || avg_batch_loss: 0.0075 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------------------->    ] 86% || batch_time: 4.8018 || batch_loss: 0.0007 || avg_batch_loss: 0.0076 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------------------->    ] 87% || batch_time: 2.9657 || batch_loss: 0.0004 || avg_batch_loss: 0.0076 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------------------->    ] 88% || batch_time: 3.2673 || batch_loss: 0.0063 || avg_batch_loss: 0.0075 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-------------------------->   ] 89% || batch_time: 3.2533 || batch_loss: 0.0001 || avg_batch_loss: 0.0075 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-------------------------->   ] 90% || batch_time: 3.3283 || batch_loss: 0.0198 || avg_batch_loss: 0.0076 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-------------------------->   ] 92% || batch_time: 2.9787 || batch_loss: 0.0027 || avg_batch_loss: 0.0076 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--------------------------->  ] 93% || batch_time: 3.6297 || batch_loss: 0.0122 || avg_batch_loss: 0.0076 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--------------------------->  ] 94% || batch_time: 3.2042 || batch_loss: 0.0019 || avg_batch_loss: 0.0075 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--------------------------->  ] 95% || batch_time: 3.1037 || batch_loss: 0.0046 || avg_batch_loss: 0.0075 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------------------------> ] 96% || batch_time: 4.8444 || batch_loss: 0.0009 || avg_batch_loss: 0.0074 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------------------------> ] 97% || batch_time: 3.2352 || batch_loss: 0.0012 || avg_batch_loss: 0.0074 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------------------------> ] 98% || batch_time: 2.9588 || batch_loss: 0.0024 || avg_batch_loss: 0.0073 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------------------------->] 99% || batch_time: 3.0717 || batch_loss: 0.0007 || avg_batch_loss: 0.0073 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------------------------->] 100% || batch_time: 2.5456 || batch_loss: 0.0012 || avg_batch_loss: 0.0073 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------------------------->] 100% || batch_time: 2.4236 || batch_loss: 0.0001 || avg_batch_loss: 0.0072 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 
Epoch 2 train_loss: 0.007243361844282974
valid_data size: 1461
Percent: [----------------------------->] 100% || batch_time: 1.4113 || batch_loss: 0.0198 || avg_batch_loss: 0.0955 || batch_acc: 0.9888 || avg_batch_acc: 0.9799 
Epoch 2 valid_loss: 0.09546498669837804
validation accuracy improved from 45.0753 to 45.0753
Model saved at e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/subwordbert-probwordnoise\new_models\bert-base-cased\pytorch_model.bin in epoch 2
Model and logs saved at e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/subwordbert-probwordnoise\new_models\bert-base-cased


### Evaluate on testing set after fine-tuning

In [13]:
bert_checker_new.evaluate(clean_file=clean_file, corrupt_file=noisy_file, data_dir=test_data_path)

../dataset/test clean_lyrics.txt noisy_lyrics.txt


1418it [00:00, 708797.89it/s]
1418it [00:00, 706692.38it/s]


loaded tuples of (corr,incorr) examples from ../dataset/test
###############################################
data size: 1418


355it [01:33,  3.78it/s]


Epoch None valid_loss: 0.15807566335202147
total inference time for this data is: 94.004017 secs
###############################################


total token count: 11906
_corr2corr:9549, _corr2incorr:76, _incorr2corr:1937, _incorr2incorr:344
accuracy is 0.9647236687384512
word correction rate (precision) is 0.8491889522139413
word correction rate (recall) is 0.9622454048683556
###############################################





### Result analysis

In [14]:
bert_correct_new = bert_checker_new.correct_strings(noisy_test)
bert_correct_new[:10]

['well show me the way',
 "' cause nothin ' from nothin ' leaves nothin '",
 "she said look , what ' s your game baby",
 'savage love , did somebody , did somebody break your heart',
 "told you i ' ll be here forever",
 'oh baby , we found love right where we are maybe',
 'all i know is we said , " hello "',
 "another saturfay night and i ain ' t got nobody",
 'if your schemes like your dreams',
 'you fell , i caught you']

In [15]:
clean_test_prep = [i.replace(' ', '').strip().lower() for i in clean_test]
bert_correct_new_prep = [i.replace(' ', '').strip().lower() for i in bert_correct_new]

print('len data:', len(clean_test_prep))
print('correct data after bert:', sum(a == b for a, b in zip(bert_correct_new_prep, clean_test_prep)))

len data: 1418
correct data after bert: 0


In [16]:
for i in range(len(clean_test_prep)):
    if clean_test_prep[i] != bert_correct_new_prep[i]:
        print(clean_test_prep[i], ' | ', bert_correct_new[i])

coshe'salrightinthecity  |  well show me the way
doyouwantmeordoyounot?  |  ' cause nothin ' from nothin ' leaves nothin '
andwhenitakeyoushopping  |  she said look , what ' s your game baby
i'mgoingtojupiterwithmygirldon'tcallyourexes  |  savage love , did somebody , did somebody break your heart
withyourgoldengrill,trueloveneverdies  |  told you i ' ll be here forever
nowmydickain'tfree  |  oh baby , we found love right where we are maybe
i'mwildascanbe,andiwant  |  all i know is we said , " hello "
thanks,mr.president  |  another saturfay night and i ain ' t got nobody
hey,batter,batterswing  |  if your schemes like your dreams
citygirl,butshegrewupinthetri-state  |  you fell , i caught you
ineedthatbrazilian,wavy,twenty-eightinch,youplayin'  |  i ' m on my way now
thinkingofyourlove,boo.  |  gee i ' d like to see you looking swell
beagentleman  |  god , give us the reason
insuchasweetway  |  ooh , oh
inabadhey  |  i know she gon ' be ready when i slide through
theballadofadove  |  

## SC-LSTM plus ELMO (at input)

### Load pretrained model

In [17]:
# create SclstmChecker
elmo_checker = SclstmChecker()
# add ELMO at input
elmo_checker = elmo_checker.add_("elmo", at="input")  # "elmo" or "bert", "input" or "output"
# load pretrained model
elmo_checker.from_pretrained()

loading vocab from path:e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/elmoscrnn-probwordnoise\vocab.pkl
initializing model
Number of parameters in the model: 209906438
Loading model params from checkpoint dir: e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/elmoscrnn-probwordnoise
new model loaded: <class 'neuspell.corrector_elmosclstm.ElmosclstmChecker'>
loading vocab from path:e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/elmoscrnn-probwordnoise\vocab.pkl
initializing model
Number of parameters in the model: 209906438
Loading model params from checkpoint dir: e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/elmoscrnn-probwordnoise


### Evaluate pretrained model on testing set

In [18]:
elmo_checker.evaluate(clean_file=clean_file, corrupt_file=noisy_file, data_dir=test_data_path)

../dataset/test clean_lyrics.txt noisy_lyrics.txt


1418it [00:00, 472776.08it/s]
1418it [00:00, 709389.68it/s]


loaded tuples of (corr,incorr) examples from ../dataset/test
data size: 1418


355it [03:09,  1.87it/s]


Epoch None valid_loss: 1.2718946785574228
total inference time for this data is: 189.911908 secs
###############################################
total token count: 9959
corr2corr:6608, corr2incorr:1068, incorr2corr:1559, incorr2incorr:724
accuracy is 0.8200622552465107
word correction rate is 0.6828734121769602
###############################################





### Example of wrong outputs

In [19]:
# correct noisy data
elmo_correct = elmo_checker.correct_strings(noisy_test)
elmo_correct[:10]

creating spacy models ...
spacy models initialized


['well show me the way',
 "' case nothin ' from nothing leaves nothing",
 "she said look , who 's your game baby",
 'save lives , did somebody , did somebody break your heart',
 "told you i 'll be here forever",
 'oh baby , we found love right where we have maybe',
 'all I know is we said " ello "',
 'another saturfay night and i am not got nobody',
 'if your schemes like your dreams',
 'you fell , I caught you']

In [20]:
clean_test_prep = [i.replace(' ', '').strip().lower() for i in clean_test]
elmo_correct_prep = [i.replace(' ', '').strip().lower() for i in elmo_correct]

print('len data:', len(clean_test_prep))
print('correct data after bert:', sum(a == b for a, b in zip(elmo_correct_prep, clean_data_prep)))

len data: 1418


NameError: name 'clean_data_prep' is not defined

In [None]:
for i in range(len(clean_test_prep)):
    if clean_test_prep[i] != elmo_correct_prep[i]:
        print(clean_test[i], ' | ', elmo_correct[i])

cause i know i'm addicted to your drama  |  cause I know i 'm addicted to your data
you see me i be  |  you see me and be
i gotta tell them to myself  |  I got to tell them to myself
no, i can't sleep until i feel your touch  |  no , I can not sleep until i feel your touch
so beautiful you're leaving me  |  so beautiful your leaving me
when it wasn't yours, yeah  |  when it was not yours , yeah
i feel like i could die walking up to the room, oh yeah  |  I feel like I could die waking up to the room , oh yeah
i'm really gonna miss you picking fights  |  I 'm really on no misos you picking fights
even tried to bite my tongue when you start shit  |  even tried to bite my tongue when you start shut
and i won't be your victim  |  and it wont be your victim
what the fuck did i do  |  what the fuck did it do
blurring all the lines, you intoxicate me  |  blurring all the lines , you inotxicate me
kick in the door waving the coco  |  kick in the door waving the code
and your craigslist couch an

### Fine-tuning SclstmChecker

In [None]:
# create AclstmChecker
elmo_checker_new = SclstmChecker()
# add elmo at input
elmo_checker_new = elmo_checker_new.add_("elmo", at="input")  # "elmo" or "bert", "input" or "output"
# load pretrained model
elmo_checker_new.from_pretrained()
# fine-tune on training data
elmo_checker_new.finetune(clean_file=clean_file, corrupt_file=noisy_file, data_dir=train_data_path)

loading vocab from path:e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/elmoscrnn-probwordnoise\vocab.pkl
initializing model
Number of parameters in the model: 209906438
Loading model params from checkpoint dir: e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/elmoscrnn-probwordnoise
new model loaded: <class 'neuspell.corrector_elmosclstm.ElmosclstmChecker'>
loading vocab from path:e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/elmoscrnn-probwordnoise\vocab.pkl
initializing model
Number of parameters in the model: 209906438
Loading model params from checkpoint dir: e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/elmoscrnn-probwordnoise


7309it [00:00, 730709.06it/s]
7309it [00:00, 1826729.11it/s]


loaded tuples of (corr,incorr) examples from e:\nlp\nlp-env\neuspell\neuspell\../data\traintest
len of train and test data:  5848 1461
CHECKPOINT_PATH: e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/elmoscrnn-probwordnoise\new_models\elmo-base-cased
Training model params from scratch
In epoch: 0
train_data size: 5848
Percent: [----------------------------->] 100% || batch_time: 1.7134 || batch_loss: 0.0763 || avg_batch_loss: 0.2813 || batch_acc: 0.7664 || avg_batch_acc: 0.7664 
Epoch 0 train_loss: 0.2813334097383452
valid_data size: 1461
Percent: [----------------------------->] 100% || batch_time: 1.3783 || batch_loss: 0.0173 || avg_batch_loss: 0.1022 || batch_acc: 1.0000 || avg_batch_acc: 0.9698 
Epoch 0 valid_loss: 0.10223011644152195
Model saved at e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/elmoscrnn-probwordnoise\new_models\elmo-base-cased\model.pth.tar in epoch 0
In epoch: 1
train_data size: 5848
Percent: [----------------------------->] 100% || batch_time: 1.5775

### Evaluate on testing set after fine-tuning

In [None]:
elmo_checker_new.evaluate(clean_file=clean_file, corrupt_file=noisy_file, data_dir=test_data_path)

e:\nlp\nlp-env\neuspell\neuspell\../data\traintest clean_lyrics.txt noisy_lyrics.txt


7309it [00:00, 811976.37it/s]
7309it [00:00, 1044076.29it/s]


loaded tuples of (corr,incorr) examples from e:\nlp\nlp-env\neuspell\neuspell\../data\traintest
data size: 7309


1828it [14:29,  2.10it/s]


Epoch None valid_loss: 0.024568097715663473
total inference time for this data is: 869.061731 secs
###############################################
total token count: 49947
corr2corr:34261, corr2incorr:45, incorr2corr:11927, incorr2incorr:3714
accuracy is 0.9247402246381164
word correction rate is 0.7625471517166421
###############################################





### Result analysis

In [None]:
elmo_correct_new = elmo_checker_new.correct_strings(noisy_test)
elmo_correct_new[:10]

['just a lost boy in a small town',
 'singing " love is forever and eve "',
 'good on paper , picture perfect',
 "cause i know i 'm addicted to your drama",
 'you see me i be',
 'i got to tell them to myself',
 "i 'm still learning to love",
 'no , i can not sleep until i feel your touch',
 'and all i can think',
 "so beautiful yuo're leaving me"]

In [None]:
clean_test_prep = [i.replace(' ', '').strip().lower() for i in clean_test]
elmo_correct_new_prep = [i.replace(' ', '').strip().lower() for i in elmo_correct_new]

print('len data:', len(clean_test_prep))
print('correct data after bert:', sum(a == b for a, b in zip(elmo_correct_new_prep, clean_test_prep)))

example: justalostboyinasmalltown
len data: 7309
correct data after bert: 4988


In [None]:
for i in range(len(clean_test_prep)):
    if clean_test_prep[i] != elmo_correct_new_prep[i]:
        print(clean_test[i], ' | ', elmo_correct_new[i])

singing "love is forever and ever"  |  singing " love is forever and eve "
i gotta tell them to myself  |  i got to tell them to myself
no, i can't sleep until i feel your touch  |  no , i can not sleep until i feel your touch
so beautiful you're leaving me  |  so beautiful yuo're leaving me
when it wasn't yours, yeah  |  when it was not yours , yeah
and i won't be your victim  |  and i wn't be your victim
blurring all the lines, you intoxicate me  |  blurring all the lines , you inotxicate me
kick in the door waving the coco  |  kick in the door waving the codo
vroom vroom, i'll see you latеr, bye  |  vfoom vriom , i 'll see you later , bye
and a neigh neigh there.  |  and a neih neigh there .
i can't describe  |  i cn't describe
if you love me won't you  |  if you love me wons't you
don't leave me stuck here in the streets no, no  |  don'xt leave me stuck here in the streets no , no
cause i'm in a field of dandelions  |  cause i'lm in a field of dandelions
i don't mean no harm  |  i 