# Correcting songs and fine-tuing

## Import modules

In [1]:
import os
from neuspell import available_checkers, BertChecker, SclstmChecker

print(f"available checkers: {available_checkers()}")

data folder is set to `e:\nlp\nlp-env\neuspell\neuspell\../data` script
available checkers: ['BertsclstmChecker', 'CnnlstmChecker', 'ElmosclstmChecker', 'NestedlstmChecker', 'SclstmChecker', 'SclstmbertChecker', 'SclstmelmoChecker', 'BertChecker']


## Define data path

In [2]:
clean_file = 'clean_en_lyrics.txt'
noisy_file = 'noisy_en_lyrics.txt'

train_data_path = '../dataset/train'
clean_train_path = os.path.join(train_data_path, clean_file)
noisy_train_path = os.path.join(train_data_path, noisy_file)

test_data_path = '../dataset/test'
clean_test_path = os.path.join(test_data_path, clean_file)
noisy_test_path = os.path.join(test_data_path, noisy_file)

## Load training data

In [3]:
with open(clean_train_path) as f:
    clean_train = f.read().splitlines()

clean_train[:10]

['well show me the way',
 "'cause nothin' from nothin' leaves nothin'",
 "she said look, what's your game baby",
 'savage love, did somebody, did somebody break your heart',
 "told you i'll be here forever",
 'oh baby, we found love right where we are maybe',
 'all i know is we said, "hello"',
 "another saturday night and i ain't got nobody",
 'if your schemes like your dreams',
 'you fell, i caught you']

In [4]:
with open(noisy_train_path) as f:
    noisy_train = f.read().splitlines()

noisy_train[:10]

['well show me the way',
 "'caue nothvin' from nothin' leaves nothin'",
 "sye said look, wha's yocur gmae baby",
 'svage live, did somebodvy, did somebody beak your heart',
 "tlod you i'll be helre forever",
 'oh baby, we found love riaght where we ahre mabe',
 'all i know is we saiid, "ello"',
 "another saturfay ngiht and i ain't got nobody",
 'if your schemes lkie your dreas',
 'you fell, i caught you']

In [5]:
with open(clean_test_path) as f:
    clean_test = f.read().splitlines()

clean_test[:10]

['\ufefftell her that she beautiful every day i remind her',
 'i get lost in her eyes like dust from the skies',
 "did i say that out loud i'm so crazy about mine",
 "but we don't have the same soul",
 'tell me that you love me baby say it again',
 "repeat it over and over until it's in my brain",
 "you need to send your location i can't think",
 'did i say that out loud',
 "i'm so crazy about mine",
 'when i look back']

In [6]:
with open(noisy_test_path) as f:
    noisy_test = f.read().splitlines()

noisy_test[:10]

['tell her that she beautiful evrey day i remind her',
 'i get lost in her eeys like dust from the skies',
 "did i say that out loud i'm so crazy abput mine",
 "but we don't have tye smae soul",
 'tekl me taht you llve me baby say it agatin',
 "repet it over anfd ovdr unitl it's in my brain",
 "you need to send your location i can't thik",
 'did i say that out loud',
 "i'm so crzy about mine",
 'when i look bqck']

## BERTChecker

### Load pretrained model

In [7]:
# create BertChecker
bert_checker = BertChecker()
# use pretrained model
bert_checker.from_pretrained()

loading vocab from path:e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/subwordbert-probwordnoise\vocab.pkl
initializing model
Number of parameters in the model: 185211810
Loading model params from checkpoint dir: e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/subwordbert-probwordnoise


### Evaluate pretrained model on testing set

In [8]:
bert_checker.evaluate(clean_file=clean_file, corrupt_file=noisy_file, data_dir=test_data_path)

../dataset/test clean_en_lyrics.txt noisy_en_lyrics.txt


1046it [00:00, 1046075.82it/s]
1046it [00:00, 1046325.30it/s]


loaded tuples of (corr,incorr) examples from ../dataset/test
###############################################
data size: 1046


262it [01:02,  4.18it/s]


Epoch None valid_loss: 0.7659597544334386
total inference time for this data is: 62.650257 secs
###############################################


total token count: 7477
_corr2corr:5462, _corr2incorr:528, _incorr2corr:1225, _incorr2incorr:262
accuracy is 0.8943426507957737
word correction rate is 0.8238063214525891
precision is 0.9118530884808014
recall is 0.9542278127183788
f1 score is 0.9325593307153833
###############################################





### Example of wrong outputs 

In [9]:
bert_correct = bert_checker.correct_strings(noisy_test)
bert_correct[:10]

['tell her that she beautiful every day i remind her',
 'I get lost in her eyes like dust from the skies',
 "did I say that out loud i ' m so crazy about mine",
 "but we don ' t have the same soul",
 'tell me that you love the baby say it again',
 "repeat it over and over until it ' s in my brain",
 "you need to send your location I can ' t think",
 'did I say that out loud',
 "I ' m so crazy about mine",
 'when i look back']

In [10]:
clean_test_prep = [i.replace(' ', '').strip().lower() for i in clean_test]
bert_correct_prep = [i.replace(' ', '').strip().lower() for i in bert_correct]

print('len data:', len(clean_test_prep))
print('Number of correct data after bert:', sum(a == b for a, b in zip(bert_correct_prep, clean_test_prep)))

len data: 1046
Number of correct data after bert: 695


In [11]:
for i in range(len(clean_test_prep)):
    if clean_test_prep[i] != bert_correct_prep[i]:
        print(clean_test[i], ' | ', bert_correct[i])

tell her that she beautiful every day i remind her  |  tell her that she beautiful every day i remind her
tell me that you love me baby say it again  |  tell me that you love the baby say it again
and if i do will you be there with me father sister brother  |  and if I do will you be there with my father sister brother
will the baby be alright  |  will the baby be brought
will i have one of mine  |  will i have one of mainly
your spirit can sit and watch tv by my side  |  your spirit can cut and watch to by my side
but lately i can't see  |  but latterly I can ' t see
baby if you care  |  baby if your care
if don't really care see  |  if don ' t really care so
don't know what they mean they're special just for you  |  don ' t know what they mean they special just for you
i like it down like it down way low  |  I like it does like it down way low
i ain't afraid of a little pain  |  I ain ' t aware of a little pain
hey i could take a big bat  |  hey i could take a big skate
i could fuck 

### Fine-tuning BertChecker

In [12]:
# create new BertChecker
bert_checker_new = BertChecker()
# load pretrained model
bert_checker_new.from_pretrained()
# fine-tune on training data
bert_checker_new.finetune(clean_file=clean_file, corrupt_file=noisy_file, data_dir=train_data_path)

loading vocab from path:e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/subwordbert-probwordnoise\vocab.pkl
initializing model
Number of parameters in the model: 185211810
Loading model params from checkpoint dir: e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/subwordbert-probwordnoise


7309it [00:00, 664257.93it/s]
7309it [00:00, 1218206.55it/s]


loaded tuples of (corr,incorr) examples from ../dataset/train
len of train and test data:  5848 1461
CHECKPOINT_PATH: e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/subwordbert-probwordnoise\new_models\bert-base-cased-2
Training model params
In epoch: 0
train_data size: 5848
Percent: [>                             ] 1% || batch_time: 3.7478 || batch_loss: 0.3949 || avg_batch_loss: 0.3523 || batch_acc: 0.9381 || avg_batch_acc: 0.9381 

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  ..\torch\csrc\utils\python_arg_parser.cpp:882.)
  next_m.mul_(beta1).add_(1 - beta1, grad)


Percent: [----------------------------->] 100% || batch_time: 2.9423 || batch_loss: 0.1038 || avg_batch_loss: 0.1967 || batch_acc: 0.9381 || avg_batch_acc: 0.9381 
Epoch 0 train_loss: 0.196699008597201
valid_data size: 1461
Percent: [----------------------------->] 100% || batch_time: 1.5124 || batch_loss: 0.0387 || avg_batch_loss: 0.1091 || batch_acc: 0.9944 || avg_batch_acc: 0.9761 
Epoch 0 valid_loss: 0.10908372323636127
validation accuracy improved from -1.0000 to 44.9012
Model saved at e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/subwordbert-probwordnoise\new_models\bert-base-cased-2\pytorch_model.bin in epoch 0
In epoch: 1
train_data size: 5848
Percent: [----------------------------->] 100% || batch_time: 3.5188 || batch_loss: 0.0074 || avg_batch_loss: 0.0167 || batch_acc: 0.9856 || avg_batch_acc: 0.9856 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------------------------->] 100% || batch_time: 3.3414 || batch_loss: 0.0012 || avg_batch_loss: 0.0167 || batch_acc: 0.9856 || avg_batch_acc: 0.9856 
Epoch 1 train_loss: 0.016688930290026584
valid_data size: 1461
Percent: [----------------------------->] 100% || batch_time: 1.5868 || batch_loss: 0.0353 || avg_batch_loss: 0.0971 || batch_acc: 0.9944 || avg_batch_acc: 0.9790 
Epoch 1 valid_loss: 0.09710691681982059
validation accuracy improved from 44.9012 to 45.0326
Model saved at e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/subwordbert-probwordnoise\new_models\bert-base-cased-2\pytorch_model.bin in epoch 1
In epoch: 2
train_data size: 5848
Percent: [>                             ] 1% || batch_time: 4.1288 || batch_loss: 0.0049 || avg_batch_loss: 0.0091 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [>                             ] 2% || batch_time: 2.6186 || batch_loss: 0.0095 || avg_batch_loss: 0.0105 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [>                             ] 3% || batch_time: 4.5000 || batch_loss: 0.0008 || avg_batch_loss: 0.0083 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [>                             ] 4% || batch_time: 2.6866 || batch_loss: 0.0021 || avg_batch_loss: 0.0067 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [->                            ] 5% || batch_time: 3.8559 || batch_loss: 0.0003 || avg_batch_loss: 0.0057 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [->                            ] 6% || batch_time: 3.5864 || batch_loss: 0.0003 || avg_batch_loss: 0.0050 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [->                            ] 7% || batch_time: 4.1980 || batch_loss: 0.0073 || avg_batch_loss: 0.0047 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-->                           ] 8% || batch_time: 2.6735 || batch_loss: 0.0083 || avg_batch_loss: 0.0046 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-->                           ] 10% || batch_time: 3.2987 || batch_loss: 0.0015 || avg_batch_loss: 0.0044 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-->                           ] 11% || batch_time: 3.7477 || batch_loss: 0.0052 || avg_batch_loss: 0.0042 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--->                          ] 12% || batch_time: 3.5520 || batch_loss: 0.0012 || avg_batch_loss: 0.0045 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--->                          ] 13% || batch_time: 3.5983 || batch_loss: 0.0022 || avg_batch_loss: 0.0044 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--->                          ] 14% || batch_time: 2.2926 || batch_loss: 0.0102 || avg_batch_loss: 0.0043 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [---->                         ] 15% || batch_time: 3.9968 || batch_loss: 0.0018 || avg_batch_loss: 0.0044 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [---->                         ] 16% || batch_time: 3.7989 || batch_loss: 0.0020 || avg_batch_loss: 0.0044 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [---->                         ] 17% || batch_time: 3.1157 || batch_loss: 0.0033 || avg_batch_loss: 0.0042 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [---->                         ] 18% || batch_time: 4.1949 || batch_loss: 0.0023 || avg_batch_loss: 0.0042 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----->                        ] 19% || batch_time: 4.0574 || batch_loss: 0.0102 || avg_batch_loss: 0.0043 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----->                        ] 20% || batch_time: 3.3445 || batch_loss: 0.0032 || avg_batch_loss: 0.0043 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----->                        ] 22% || batch_time: 4.3300 || batch_loss: 0.0024 || avg_batch_loss: 0.0042 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------>                       ] 23% || batch_time: 5.2617 || batch_loss: 0.0005 || avg_batch_loss: 0.0041 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------>                       ] 24% || batch_time: 2.4075 || batch_loss: 0.0005 || avg_batch_loss: 0.0041 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------>                       ] 25% || batch_time: 3.1812 || batch_loss: 0.0009 || avg_batch_loss: 0.0040 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------->                      ] 26% || batch_time: 4.0028 || batch_loss: 0.0057 || avg_batch_loss: 0.0048 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------->                      ] 27% || batch_time: 2.8716 || batch_loss: 0.0045 || avg_batch_loss: 0.0049 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------->                      ] 28% || batch_time: 2.7476 || batch_loss: 0.0006 || avg_batch_loss: 0.0048 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-------->                     ] 29% || batch_time: 3.0897 || batch_loss: 0.0083 || avg_batch_loss: 0.0052 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-------->                     ] 30% || batch_time: 3.6668 || batch_loss: 0.0093 || avg_batch_loss: 0.0061 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-------->                     ] 31% || batch_time: 4.4170 || batch_loss: 0.0060 || avg_batch_loss: 0.0063 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--------->                    ] 33% || batch_time: 3.2407 || batch_loss: 0.0025 || avg_batch_loss: 0.0061 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--------->                    ] 34% || batch_time: 4.1244 || batch_loss: 0.0110 || avg_batch_loss: 0.0061 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--------->                    ] 35% || batch_time: 3.0647 || batch_loss: 0.0009 || avg_batch_loss: 0.0059 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [---------->                   ] 36% || batch_time: 3.9106 || batch_loss: 0.0319 || avg_batch_loss: 0.0060 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [---------->                   ] 37% || batch_time: 2.8879 || batch_loss: 0.0058 || avg_batch_loss: 0.0061 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [---------->                   ] 38% || batch_time: 4.5518 || batch_loss: 0.0094 || avg_batch_loss: 0.0061 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------->                  ] 39% || batch_time: 3.3087 || batch_loss: 0.0009 || avg_batch_loss: 0.0063 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------->                  ] 40% || batch_time: 3.3528 || batch_loss: 0.0312 || avg_batch_loss: 0.0064 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------->                  ] 41% || batch_time: 3.7638 || batch_loss: 0.0028 || avg_batch_loss: 0.0063 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------>                 ] 42% || batch_time: 4.5980 || batch_loss: 0.0025 || avg_batch_loss: 0.0063 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------>                 ] 43% || batch_time: 3.3107 || batch_loss: 0.0094 || avg_batch_loss: 0.0062 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------>                 ] 45% || batch_time: 4.1769 || batch_loss: 0.0038 || avg_batch_loss: 0.0065 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------->                ] 46% || batch_time: 4.2515 || batch_loss: 0.0320 || avg_batch_loss: 0.0067 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------->                ] 47% || batch_time: 2.8646 || batch_loss: 0.0006 || avg_batch_loss: 0.0068 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------->                ] 48% || batch_time: 3.4798 || batch_loss: 0.0501 || avg_batch_loss: 0.0070 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-------------->               ] 49% || batch_time: 4.0573 || batch_loss: 0.0035 || avg_batch_loss: 0.0070 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-------------->               ] 50% || batch_time: 2.8987 || batch_loss: 0.0005 || avg_batch_loss: 0.0068 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-------------->               ] 51% || batch_time: 3.3313 || batch_loss: 0.0036 || avg_batch_loss: 0.0067 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--------------->              ] 52% || batch_time: 2.9497 || batch_loss: 0.0040 || avg_batch_loss: 0.0066 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--------------->              ] 53% || batch_time: 3.7742 || batch_loss: 0.0046 || avg_batch_loss: 0.0065 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--------------->              ] 54% || batch_time: 4.0149 || batch_loss: 0.0009 || avg_batch_loss: 0.0065 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [---------------->             ] 55% || batch_time: 4.0781 || batch_loss: 0.0009 || avg_batch_loss: 0.0067 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [---------------->             ] 57% || batch_time: 3.0198 || batch_loss: 0.0031 || avg_batch_loss: 0.0066 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [---------------->             ] 58% || batch_time: 3.9797 || batch_loss: 0.0012 || avg_batch_loss: 0.0065 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------------->            ] 59% || batch_time: 3.2879 || batch_loss: 0.0158 || avg_batch_loss: 0.0065 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------------->            ] 60% || batch_time: 3.6559 || batch_loss: 0.0055 || avg_batch_loss: 0.0066 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------------->            ] 61% || batch_time: 4.4521 || batch_loss: 0.0041 || avg_batch_loss: 0.0065 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------------>           ] 62% || batch_time: 2.9078 || batch_loss: 0.0024 || avg_batch_loss: 0.0067 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------------>           ] 63% || batch_time: 3.3982 || batch_loss: 0.0029 || avg_batch_loss: 0.0066 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------------>           ] 64% || batch_time: 3.1650 || batch_loss: 0.0009 || avg_batch_loss: 0.0065 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------------->          ] 65% || batch_time: 4.1505 || batch_loss: 0.1416 || avg_batch_loss: 0.0071 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------------->          ] 66% || batch_time: 4.1074 || batch_loss: 0.0006 || avg_batch_loss: 0.0071 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------------->          ] 67% || batch_time: 3.8299 || batch_loss: 0.0027 || avg_batch_loss: 0.0070 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-------------------->         ] 69% || batch_time: 3.9074 || batch_loss: 0.0140 || avg_batch_loss: 0.0070 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-------------------->         ] 70% || batch_time: 4.3230 || batch_loss: 0.0023 || avg_batch_loss: 0.0069 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-------------------->         ] 71% || batch_time: 3.9829 || batch_loss: 0.0028 || avg_batch_loss: 0.0070 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--------------------->        ] 72% || batch_time: 3.3037 || batch_loss: 0.0006 || avg_batch_loss: 0.0070 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--------------------->        ] 73% || batch_time: 3.8769 || batch_loss: 0.0003 || avg_batch_loss: 0.0069 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--------------------->        ] 74% || batch_time: 3.4382 || batch_loss: 0.0004 || avg_batch_loss: 0.0069 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [---------------------->       ] 75% || batch_time: 3.2508 || batch_loss: 0.0309 || avg_batch_loss: 0.0070 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [---------------------->       ] 76% || batch_time: 4.1364 || batch_loss: 0.0011 || avg_batch_loss: 0.0069 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [---------------------->       ] 77% || batch_time: 3.5897 || batch_loss: 0.0004 || avg_batch_loss: 0.0072 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------------------->      ] 78% || batch_time: 3.8417 || batch_loss: 0.0003 || avg_batch_loss: 0.0071 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------------------->      ] 80% || batch_time: 3.3348 || batch_loss: 0.0104 || avg_batch_loss: 0.0072 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------------------->      ] 81% || batch_time: 3.5809 || batch_loss: 0.0065 || avg_batch_loss: 0.0071 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------------------>     ] 82% || batch_time: 3.1507 || batch_loss: 0.0042 || avg_batch_loss: 0.0071 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------------------>     ] 83% || batch_time: 3.7748 || batch_loss: 0.0045 || avg_batch_loss: 0.0072 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------------------>     ] 84% || batch_time: 3.1967 || batch_loss: 0.0026 || avg_batch_loss: 0.0072 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------------------>     ] 85% || batch_time: 2.3362 || batch_loss: 0.0024 || avg_batch_loss: 0.0071 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------------------->    ] 86% || batch_time: 5.1466 || batch_loss: 0.0016 || avg_batch_loss: 0.0071 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------------------->    ] 87% || batch_time: 3.1572 || batch_loss: 0.0014 || avg_batch_loss: 0.0070 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [------------------------->    ] 88% || batch_time: 3.6597 || batch_loss: 0.0037 || avg_batch_loss: 0.0070 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-------------------------->   ] 89% || batch_time: 3.3758 || batch_loss: 0.0003 || avg_batch_loss: 0.0070 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-------------------------->   ] 90% || batch_time: 3.5408 || batch_loss: 0.0308 || avg_batch_loss: 0.0071 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [-------------------------->   ] 92% || batch_time: 3.2297 || batch_loss: 0.0029 || avg_batch_loss: 0.0071 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--------------------------->  ] 93% || batch_time: 3.9320 || batch_loss: 0.0208 || avg_batch_loss: 0.0071 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--------------------------->  ] 94% || batch_time: 3.4041 || batch_loss: 0.0011 || avg_batch_loss: 0.0071 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [--------------------------->  ] 95% || batch_time: 3.2597 || batch_loss: 0.0010 || avg_batch_loss: 0.0071 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------------------------> ] 96% || batch_time: 5.0861 || batch_loss: 0.0007 || avg_batch_loss: 0.0070 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------------------------> ] 97% || batch_time: 3.4748 || batch_loss: 0.0032 || avg_batch_loss: 0.0070 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------------------------> ] 98% || batch_time: 3.1527 || batch_loss: 0.0188 || avg_batch_loss: 0.0070 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------------------------->] 99% || batch_time: 3.3257 || batch_loss: 0.0018 || avg_batch_loss: 0.0070 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------------------------->] 100% || batch_time: 2.7056 || batch_loss: 0.0008 || avg_batch_loss: 0.0069 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


Percent: [----------------------------->] 100% || batch_time: 2.6521 || batch_loss: 0.0002 || avg_batch_loss: 0.0069 || batch_acc: 1.0000 || avg_batch_acc: 1.0000 
Epoch 2 train_loss: 0.006928394167203567
valid_data size: 1461
Percent: [----------------------------->] 100% || batch_time: 1.6624 || batch_loss: 0.0353 || avg_batch_loss: 0.0971 || batch_acc: 0.9944 || avg_batch_acc: 0.9790 
Epoch 2 valid_loss: 0.09710691681982059
validation accuracy improved from 45.0326 to 45.0326
Model saved at e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/subwordbert-probwordnoise\new_models\bert-base-cased-2\pytorch_model.bin in epoch 2
Model and logs saved at e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/subwordbert-probwordnoise\new_models\bert-base-cased-2


### Evaluate on testing set after fine-tuning

In [13]:
bert_checker_new.evaluate(clean_file=clean_file, corrupt_file=noisy_file, data_dir=test_data_path)

../dataset/test clean_en_lyrics.txt noisy_en_lyrics.txt


1046it [00:00, 522664.04it/s]
1046it [00:00, 1046574.90it/s]


loaded tuples of (corr,incorr) examples from ../dataset/test
###############################################
data size: 1046


262it [01:01,  4.23it/s]


Epoch None valid_loss: 0.1617238586094562
total inference time for this data is: 62.013959 secs
###############################################


total token count: 7477
_corr2corr:5871, _corr2incorr:119, _incorr2corr:1351, _incorr2incorr:136
accuracy is 0.9658954125986359
word correction rate is 0.9085406859448554
precision is 0.9801335559265443
recall is 0.9773597469618778
f1 score is 0.9787446861715429
###############################################





### Result analysis

In [14]:
bert_correct_new = bert_checker_new.correct_strings(noisy_test)
bert_correct_new[:10]

['tell her that she beautiful every day i remind her',
 'i get lost in her eyes like dust from the skies',
 "did i say that out loud i ' m so crazy about mine",
 "but we don ' t have the same soul",
 'tell me that you love me baby say it again',
 "repeat it over and over until it ' s in my brain",
 "you need to send your location i can ' t think",
 'did i say that out loud',
 "i ' m so crazy about mine",
 'when i look back']

In [15]:
clean_test_prep = [i.replace(' ', '').strip().lower() for i in clean_test]
bert_correct_new_prep = [i.replace(' ', '').strip().lower() for i in bert_correct_new]

print('len data:', len(clean_test_prep))
print('correct data after bert:', sum(a == b for a, b in zip(bert_correct_new_prep, clean_test_prep)))

len data: 1046
correct data after bert: 834


In [16]:
for i in range(len(clean_test_prep)):
    if clean_test_prep[i] != bert_correct_new_prep[i]:
        print(clean_test[i], ' | ', bert_correct_new[i])

tell her that she beautiful every day i remind her  |  tell her that she beautiful every day i remind her
don't know what they mean they're special just for you  |  don ' t know what they mean there special just for you
turn it up hot loving you is free  |  turn it up how loving you is free
hey i could take a big bat  |  hey i could take a big beat
i don't know why it is that i wanna stay  |  i dont know why it is that i wanna stay
fuck you think is in more shit  |  fuck you think is in more sight
don't even know what you're good for  |  don ' t even know what your good for
i i i've never ever been this far away from home  |  i i i ' ve never ever given this far away from home
yeah my feet came off the ground  |  yeah my feet chime off the ground
i am you can won't tell no one about it  |  i am you can wont tell no one about it
for and for shut up shut up  |  for and for sit up shut up
he said show me what you got girl  |  he sraid show me what you got girl
you're so fun  |  youre so f

## SC-LSTM plus ELMO (at input)

### Load pretrained model

In [7]:
# create SclstmChecker
elmo_checker = SclstmChecker()
# add ELMO at input
elmo_checker = elmo_checker.add_("elmo", at="input")  # "elmo" or "bert", "input" or "output"
# load pretrained model
elmo_checker.from_pretrained()

loading vocab from path:e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/elmoscrnn-probwordnoise\vocab.pkl
initializing model
Number of parameters in the model: 209906438
Loading model params from checkpoint dir: e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/elmoscrnn-probwordnoise
new model loaded: <class 'neuspell.corrector_elmosclstm.ElmosclstmChecker'>
loading vocab from path:e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/elmoscrnn-probwordnoise\vocab.pkl
initializing model
Number of parameters in the model: 209906438
Loading model params from checkpoint dir: e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/elmoscrnn-probwordnoise


### Evaluate pretrained model on testing set

In [8]:
elmo_checker.evaluate(clean_file=clean_file, corrupt_file=noisy_file, data_dir=test_data_path)

../dataset/test clean_en_lyrics.txt noisy_en_lyrics.txt


1046it [00:00, 522913.23it/s]
1046it [00:00, 1047574.49it/s]


loaded tuples of (corr,incorr) examples from ../dataset/test
data size: 1046


262it [02:18,  1.89it/s]


Epoch None valid_loss: 0.8036500722212532
total inference time for this data is: 138.547093 secs
###############################################
total token count: 7071
corr2corr:5150, corr2incorr:428, incorr2corr:1221, incorr2incorr:272
accuracy is 0.9010041012586621
word correction rate is 0.8178164768921634
precision is 0.9232699892434565
recall is 0.949834009590557
f1 score is 0.9363636363636364
###############################################





### Example of wrong outputs

In [9]:
# correct noisy data
elmo_correct = elmo_checker.correct_strings(noisy_test)
elmo_correct[:10]

creating spacy models ...
spacy models initialized


['tell her that the beautiful every day i remind her',
 'I get lost in her eyes like dust from the skies',
 "did I say that out loud I 'm so crazy about mine",
 'but we do not have the same soul',
 'tell me that you love me baby say it again',
 "repeat it over and over until it 's in my brain",
 'you need to send your location i can not think',
 'did I say that out loud',
 "I 'm so crazy about mine",
 'when I look back']

In [10]:
clean_test_prep = [i.replace(' ', '').strip().lower() for i in clean_test]
elmo_correct_prep = [i.replace(' ', '').strip().lower() for i in elmo_correct]

print('len data:', len(clean_test_prep))
print('correct data after bert:', sum(a == b for a, b in zip(elmo_correct_prep, clean_test_prep)))

len data: 1046
correct data after bert: 597


In [11]:
for i in range(len(clean_test_prep)):
    if clean_test_prep[i] != elmo_correct_prep[i]:
        print(clean_test[i], ' | ', elmo_correct[i])

tell her that she beautiful every day i remind her  |  tell her that the beautiful every day i remind her
but we don't have the same soul  |  but we do not have the same soul
you need to send your location i can't think  |  you need to send your location i can not think
and if i do will you be there with me father sister brother  |  and if I do will you be there with my father sister brother
will the baby be alright  |  will the baby be brought
you said that i might  |  you said that it might
i guess i'll be fine  |  the guess will be fine
your spirit can sit and watch tv by my side  |  your spirit can cut and watch to by my side
don't you dare say  |  do not you dare say
if you don't really care  |  if you dont really care
but lately i can't see  |  but literally i cannot see
can't you see it  |  can not you see it
don't you see it  |  do not you see it
baby if you care  |  many if your care
baby don't you dare say  |  baby dont you dare say
don't just say it  |  do not just say it
ba

### Fine-tuning SclstmChecker

In [12]:
# create AclstmChecker
elmo_checker_new = SclstmChecker()
# add elmo at input
elmo_checker_new = elmo_checker_new.add_("elmo", at="input")  # "elmo" or "bert", "input" or "output"
# load pretrained model
elmo_checker_new.from_pretrained()
# fine-tune on training data
elmo_checker_new.finetune(clean_file=clean_file, corrupt_file=noisy_file, data_dir=train_data_path)

loading vocab from path:e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/elmoscrnn-probwordnoise\vocab.pkl
initializing model
Number of parameters in the model: 209906438
Loading model params from checkpoint dir: e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/elmoscrnn-probwordnoise
new model loaded: <class 'neuspell.corrector_elmosclstm.ElmosclstmChecker'>
loading vocab from path:e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/elmoscrnn-probwordnoise\vocab.pkl
initializing model
Number of parameters in the model: 209906438
Loading model params from checkpoint dir: e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/elmoscrnn-probwordnoise


7309it [00:00, 811868.85it/s]
7309it [00:00, 1828145.26it/s]


loaded tuples of (corr,incorr) examples from ../dataset/train
len of train and test data:  5848 1461
CHECKPOINT_PATH: e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/elmoscrnn-probwordnoise\new_models\elmo-base-cased-1
Training model params from scratch
In epoch: 0
train_data size: 5848
Percent: [----------------------------->] 100% || batch_time: 1.8574 || batch_loss: 0.0289 || avg_batch_loss: 0.2516 || batch_acc: 0.8835 || avg_batch_acc: 0.8835 
Epoch 0 train_loss: 0.25164507261268754
valid_data size: 1461
Percent: [----------------------------->] 100% || batch_time: 1.6337 || batch_loss: 0.0423 || avg_batch_loss: 0.0786 || batch_acc: 0.9800 || avg_batch_acc: 0.9773 
Epoch 0 valid_loss: 0.07855986807819294
Model saved at e:\nlp\nlp-env\neuspell\neuspell\../data/checkpoints/elmoscrnn-probwordnoise\new_models\elmo-base-cased-1\model.pth.tar in epoch 0
In epoch: 1
train_data size: 5848
Percent: [----------------------------->] 100% || batch_time: 2.0445 || batch_loss: 0.0225 || avg

### Evaluate on testing set after fine-tuning

In [13]:
elmo_checker_new.evaluate(clean_file=clean_file, corrupt_file=noisy_file, data_dir=test_data_path)

../dataset/test clean_en_lyrics.txt noisy_en_lyrics.txt


1046it [00:00, 349330.52it/s]
1046it [00:00, 1046824.62it/s]


loaded tuples of (corr,incorr) examples from ../dataset/test
data size: 1046


262it [02:24,  1.82it/s]


Epoch None valid_loss: 0.10063038498620554
total inference time for this data is: 144.303340 secs
###############################################
total token count: 7071
corr2corr:5528, corr2incorr:50, incorr2corr:1314, incorr2incorr:179
accuracy is 0.9676141988403337
word correction rate is 0.8801071667782987
precision is 0.9910362136966655
recall is 0.9686350096372875
f1 score is 0.979707576428888
###############################################





### Result analysis

In [14]:
elmo_correct_new = elmo_checker_new.correct_strings(noisy_test)
elmo_correct_new[:10]

['tell her that she beautiful every day i remind her',
 'i get lost in her eyes like dust from the skies',
 "did i say that out loud i 'm so crazy about mine",
 'but we do not have the same soul',
 'tell me that you love me baby say it again',
 "repeat it over and over until it 's in my brain",
 'you need to send your location i can not think',
 'did i say that out loud',
 "i 'm so crazy about mine",
 'when i look back']

In [15]:
clean_test_prep = [i.replace(' ', '').strip().lower() for i in clean_test]
elmo_correct_new_prep = [i.replace(' ', '').strip().lower() for i in elmo_correct_new]

print('len data:', len(clean_test_prep))
print('correct data after bert:', sum(a == b for a, b in zip(elmo_correct_new_prep, clean_test_prep)))

len data: 1046
correct data after bert: 753


In [16]:
for i in range(len(clean_test_prep)):
    if clean_test_prep[i] != elmo_correct_new_prep[i]:
        print(clean_test[i], ' | ', elmo_correct_new[i])

tell her that she beautiful every day i remind her  |  tell her that she beautiful every day i remind her
but we don't have the same soul  |  but we do not have the same soul
you need to send your location i can't think  |  you need to send your location i can not think
i guess i'll be fine  |  i guess i'kl be fine
your spirit can sit and watch tv by my side  |  your spirit can cut and watch tv by my side
don't you dare say  |  do not you dare say
if you don't really care  |  if you dno't really care
but lately i can't see  |  but lately i cawn't see
can't you see it  |  can not you see it
don't you see it  |  do not you see it
baby don't you dare say  |  baby dn't you dare say
don't just say it  |  do not just say it
baby if you care then don't you dare say  |  baby if you care then dn't you dare say
yeah don't you dare say  |  yeah do not you dare say
if don't really care see  |  if do not really care see
don't know what they mean they're special just for you  |  do not know what the