In [1]:
from lib.utilities import *
from lib.experiments.an4_speech_encoder_decoder import *

In [2]:
# Hyperparameter set shared by every training / evaluation cell in this notebook.
# Later cells override individual fields on H (MODEL_NAME, PRELOAD_MODEL_PATH,
# MANIFESTS, MIN_TRANSCRIPT_CONFIDENCE, AUGMENTATION_PROBABILITY) before each run.
H = HYPERPARAMETERS({
    # --- Experiment identity -------------------------------------------------
    'EXPERIMENT': 'AN4',
    'DESCRIPTION': 'Sequence To Sequence model',
    # Created once, when this cell runs: the logged configuration of every later
    # run shows the same value, and the checkpoint paths in the logs are nested
    # under it.
    'TIMESTAMP': HYPERPARAMETERS.create_timestamp(),

    # Placeholder name; each stage assigns its own MODEL_NAME before running.
    'MODEL_NAME': 'AN4_SPEECH_ENCODER_DECODER',

    # None here; later stages set a checkpoint file name to start from.
    'PRELOAD_MODEL_PATH': None,  # 'AN4_STS_NewNew_no_augmentation.tar',

    # --- Data location -------------------------------------------------------
    # NOTE(review): absolute, machine-specific path.
    'ROOT_DIR': '/Volumes/SSD1',
    'MANIFESTS': ['manifest.json'],  # , 'sts_manifest_pseudo.json'],

    'TARGET_ENCODING': 'sts',  # alternative noted by the author: 'ctc'

    # --- Data loading --------------------------------------------------------
    'BATCH_SIZE': 20,
    'NUM_WORKERS': 8,

    # --- Model architecture --------------------------------------------------
    'RNN_HIDDEN_SIZE': 256,
    'RNN_NUM_LAYERS': 2,
    'RNN_DROPOUT': 0.5,
    'CNN_DROPOUT': 0.5,
    'BIDIRECTIONAL': True,

    # --- Optimisation --------------------------------------------------------
    'LR': 0.0003,
    # Factor 0.78 ** floor((epoch + 1) / 200), never smaller than 0.01.
    # Presumably multiplied into LR by a scheduler inside run_training - TODO confirm.
    # NOTE(review): the logged configuration appears to echo this entry's source
    # text verbatim, so keep it on a single line without a trailing comment.
    'LR_LAMBDA': lambda epoch: max(math.pow(0.78, math.floor((1 + epoch) / 200.0)), 0.01),
    'WEIGHT_DECAY': 0,
    'MOMENTUM': 0.9,
    'NESTEROV': True,

    'TEACHER_FORCING_RATIO': 0.5,

    'LABEL_SMOOTHING' : 0.2,

    'MAX_GRAD_NORM': 400,

    # --- Training loop control -----------------------------------------------
    'MAX_EPOCHS': 200,

    'STOPPING_PATIENCE': 80,

    'CHECKPOINT_INTERVAL': 10,
    'CHECKPOINT_RESTORE': False,

    'USE_CUDA': torch.cuda.is_available(),

    'SEED': 123456,

    # --- Audio preprocessing -------------------------------------------------
    # NOTE(review): the dataset summaries printed by the evaluation cells show
    # AudioNormalize(mean=None, std=None); whether these statistics are used
    # anywhere is not visible from this notebook.
    'DATASET_MEAN_STD': (0.060487103, 0.16884679),

    'NORMALIZE_DB': -40,
    'NORMALIZE_MAX_GAIN': 300,

    # Dataset filters; all disabled here. MIN_TRANSCRIPT_CONFIDENCE is set to
    # 0.95 by the final training cell.
    'MIN_MAX_AUDIO_DURATION': None,  # (1, 15),
    'MIN_MAX_TRANSCRIPT_LEN': None,  # (0, 15),
    'MIN_TRANSCRIPT_CONFIDENCE': None,  # 0.95,

    'AUDIO_SAMPLE_RATE': 16000,

    'SPECT_WINDOW_SIZE': 0.02,
    'SPECT_WINDOW_STRIDE': 0.01,
    'SPECT_WINDOW': 'hamming',

    # --- Augmentation --------------------------------------------------------
    # 0.0 for the first two stages; the final training cell raises it to 0.6.
    # Presumably this gates the per-transform probabilities below - TODO confirm.
    'AUGMENTATION_PROBABILITY': 0.0,

    'NOISE_BG_PROBABILITY': 0.4,
    'NOISE_BG_LEVELS': (0.0, 0.5),
    # NOTE(review): repeats the ROOT_DIR prefix as a literal.
    'NOISE_BG_DIR': '/Volumes/SSD1/BACKGROUND_NOISE',

    'AUDIO_PITCH_PROBABILITY': 0.4,
    'AUDIO_PITCH_PM': 4,

    'AUDIO_SPEED_PROBABILITY': 0.4,
    'AUDIO_SPEED_LOW_HIGH': (0.9, 1.1),

    'AUDIO_DYNAMIC_PROBABILITY': 0.4,
    'AUDIO_DYNAMIC_LOW_HIGH': (0.5, 1.1),

    'AUDIO_SHIFT_PROBABILITY': 0.4,
    'AUDIO_SHIFT_MIN_MAX': (-10, 10),

    'AUDIO_NOISE_PROBABILITY': 0.4,
    'AUDIO_NOISE_LEVELS': (0.0, 0.5),
    'AUDIO_NOISE_COLORS': ['white', 'pink', 'blue', 'brown', 'violet'],
})

In [3]:
# Stage 1: train under the '_1st' name. PRELOAD_MODEL_PATH is still None at
# this point, so no earlier checkpoint is named as a starting point.
H.MODEL_NAME = 'AN4_SPEECH_ENCODER_DECODER_1st'

run_training(H)

2019-02-03 17:47:37,390 | INFO : Training start.
2019-02-03 17:47:37,392 | INFO : {
    'EXPERIMENT'                    : 'AN4' ,
    'DESCRIPTION'                   : 'Sequence To Sequence model' ,
    'TIMESTAMP'                     : '2019-02-03-17-47-37-000037' ,
    'MODEL_NAME'                    : 'AN4_SPEECH_ENCODER_DECODER_1st' ,
    'PRELOAD_MODEL_PATH'            : None ,
    'ROOT_DIR'                      : '/Volumes/SSD1' ,
    'MANIFESTS'                     : ['manifest.json'] ,
    'TARGET_ENCODING'               : 'sts' ,
    'BATCH_SIZE'                    : 20 ,
    'NUM_WORKERS'                   : 8 ,
    'RNN_HIDDEN_SIZE'               : 256 ,
    'RNN_NUM_LAYERS'                : 2 ,
    'RNN_DROPOUT'                   : 0.5 ,
    'CNN_DROPOUT'                   : 0.5 ,
    'BIDIRECTIONAL'                 : True ,
    'LR'                            : 0.0003 ,
    'LR_LAMBDA': lambda epoch: max(math.pow(0.78, math.floor((1 + epoch) / 200.0)), 0.01),
    'WEIGHT_

2019-02-03 19:13:23,927 | INFO : TensorboardLogger
    Last Epoch/LR:    200 / 0.000234
    Train Loss/Score: 0.26588307654304344 / 0.9958874927229358
    Valid Loss/Score: 14.401549001840445 / 0.8841775425429271
    Best Epoch/Score: 171 / 0.901217436409744

2019-02-03 19:13:23,937 | INFO : Stopping
    Patience: 80
    Best Score: 0.9012
    Epoch of Best Score: 171

2019-02-03 19:13:23,938 | INFO : Checkpoint
    Timestamp: 2019-02-03-19-13-23-000023
    Last Checkpoint: AN4/chkpt/2019-02-03-17-47-37-000037/2019-02-03-19-13-23-000023/state.tar

2019-02-03 19:13:23,939 | INFO : Training end.


In [4]:
# Keep only the trained encoder from stage 1: load the stage-1 weights into one
# model, move its encoder into a second, freshly constructed model, and save
# that second model's weights back under the stage-1 file name.
H.MODEL_NAME = 'AN4_SPEECH_ENCODER_DECODER_1st'

from lib.models.speech_encoder_decoder import *
from lib.vocabulary import Vocabulary

vocab = Vocabulary(os.path.join(H.ROOT_DIR, H.EXPERIMENT), encoding=H.TARGET_ENCODING)


def _build_recognizer(max_seq_length=50):
    """Construct a NeuralSpeechRecognizer configured from the current H.

    Args:
        max_seq_length: value passed as the second positional argument of the
            constructor. The prediction cell passes
            ``test_loader.dataset.max_seq_length`` in this position; 50 is the
            literal this cell has always used.

    Returns:
        A newly constructed model whose weights come from ``torch_weight_init``.
    """
    return NeuralSpeechRecognizer(
        vocab,
        max_seq_length,
        rnn_hidden_size=H.RNN_HIDDEN_SIZE,
        rnn_num_layers=H.RNN_NUM_LAYERS,
        rnn_dropout=H.RNN_DROPOUT,
        cnn_dropout=H.CNN_DROPOUT,
        teacher_forcing_ratio=H.TEACHER_FORCING_RATIO,
        sample_rate=H.AUDIO_SAMPLE_RATE,
        window_size=H.SPECT_WINDOW_SIZE,
        initialize=torch_weight_init,
    )


checkpoint_path = os.path.join(H.EXPERIMENT, H.MODEL_NAME + '.tar')

model = _build_recognizer()

# map_location='cpu': `model` is never moved to a GPU in this cell, and
# load_state_dict copies the values into the existing parameters. Loading onto
# the CPU therefore gives the same result and also works on a host without
# CUDA when the checkpoint holds CUDA tensors.
state = torch.load(checkpoint_path, map_location='cpu')
model.load_state_dict(state)

new_model = _build_recognizer()

# Only the encoder is carried over; everything else in new_model keeps the
# values it received at construction time.
new_model.enc = model.enc

# NOTE(review): this overwrites the stage-1 file in place, so the weights that
# are not part of `enc` are no longer on disk afterwards. The file name is kept
# because the next training cell preloads exactly this name.
torch.save(new_model.state_dict(), checkpoint_path)

In [5]:
# Stage 2: train under the '_2nd' name, starting from the stage-1 file. The
# preceding cell rewrote that file so that it holds the trained encoder inside
# an otherwise freshly constructed model.
H.MODEL_NAME = 'AN4_SPEECH_ENCODER_DECODER_2nd'
H.PRELOAD_MODEL_PATH = 'AN4_SPEECH_ENCODER_DECODER_1st.tar'

run_training(H)

2019-02-03 19:13:24,251 | INFO : Training start.
2019-02-03 19:13:24,253 | INFO : {
    'EXPERIMENT'                    : 'AN4' ,
    'DESCRIPTION'                   : 'Sequence To Sequence model' ,
    'TIMESTAMP'                     : '2019-02-03-17-47-37-000037' ,
    'MODEL_NAME'                    : 'AN4_SPEECH_ENCODER_DECODER_2nd' ,
    'PRELOAD_MODEL_PATH'            : 'AN4_SPEECH_ENCODER_DECODER_1st.tar' ,
    'ROOT_DIR'                      : '/Volumes/SSD1' ,
    'MANIFESTS'                     : ['manifest.json'] ,
    'TARGET_ENCODING'               : 'sts' ,
    'BATCH_SIZE'                    : 20 ,
    'NUM_WORKERS'                   : 8 ,
    'RNN_HIDDEN_SIZE'               : 256 ,
    'RNN_NUM_LAYERS'                : 2 ,
    'RNN_DROPOUT'                   : 0.5 ,
    'CNN_DROPOUT'                   : 0.5 ,
    'BIDIRECTIONAL'                 : True ,
    'LR'                            : 0.0003 ,
    'LR_LAMBDA': lambda epoch: max(math.pow(0.78, math.floor((1 + epoch

2019-02-03 19:13:24,426 | INFO : Preloaded model: AN4/AN4_SPEECH_ENCODER_DECODER_1st.tar
2019-02-03 20:23:30,886 | INFO : Early stopping at epoch: 82, score 0.930966
2019-02-03 20:23:31,137 | INFO : TensorboardLogger
    Last Epoch/LR:    163 / 0.0003
    Train Loss/Score: 0.27625186282371167 / 0.9945111703814236
    Valid Loss/Score: 11.992471900353065 / 0.8974900526823604
    Best Epoch/Score: 82 / 0.9309660318314165

2019-02-03 20:23:31,138 | INFO : Stopping
    Patience: 80
    Best Score: 0.9310
    Epoch of Best Score: 82

2019-02-03 20:23:31,140 | INFO : Checkpoint
    Timestamp: 2019-02-03-20-23-30-000030
    Last Checkpoint: AN4/chkpt/2019-02-03-17-47-37-000037/2019-02-03-20-23-30-000030/state.tar

2019-02-03 20:23:31,140 | INFO : Training end.


In [6]:
# Evaluate stage 2. MODEL_NAME is assigned again so the cell names the model it
# evaluates explicitly rather than relying on the value left by the cell above.
# Presumably run_evaluation loads <EXPERIMENT>/<MODEL_NAME>.tar - TODO confirm.
H.MODEL_NAME = 'AN4_SPEECH_ENCODER_DECODER_2nd'
run_evaluation(H)

2019-02-03 20:23:31,166 | INFO : AudioDataset
    Total of datapoints: 130
    Total of duration (min): 5.949999999999999
    Root Location: /Volumes/SSD1/AN4
    Transforms: 
        AudioNormalizeDB(db=-40, max_gain_db=300)
        AudioSpectrogram(sample_rate=16000, window_size=0.02, window_stride=0.01, window=hamming)
        AudioNormalize(mean=None, std=None)
        FromNumpyToTensor(tensor_type=FloatTensor)
    Label Transforms: 
        TranscriptEncodeSTS(vocab=_'ABCDEFGHIJKLMNOPQRSTUVWXYZ <SOS><EOS><UNK>)
        FromNumpyToTensor(tensor_type=LongTensor)
2019-02-03 20:23:33,336 | INFO : Test Summary 
Bleu: 85.170
WER:  6.903
CER:  5.249
ACC:  70.769


In [7]:
# Build the test-split dataset and loader by hand from the values in H, then
# display the dataset summary together with the number of batches.

# Audio side: dB normalisation -> spectrogram -> normalisation -> float tensor.
audio_steps = [
    AudioNormalizeDB(db=H.NORMALIZE_DB, max_gain_db=H.NORMALIZE_MAX_GAIN),
    AudioSpectrogram(
        sample_rate=H.AUDIO_SAMPLE_RATE,
        window_size=H.SPECT_WINDOW_SIZE,
        window_stride=H.SPECT_WINDOW_STRIDE,
        window=H.SPECT_WINDOW,
    ),
    AudioNormalize(),
    FromNumpyToTensor(tensor_type=torch.FloatTensor),
]
audio_transform = transforms.Compose(audio_steps)

# Label side: encode the transcript with the shared vocabulary -> long tensor.
label_steps = [
    TranscriptEncodeSTS(vocab),
    FromNumpyToTensor(tensor_type=torch.LongTensor),
]
label_transform = transforms.Compose(label_steps)

test_dataset = AudioDataset(
    os.path.join(H.ROOT_DIR, H.EXPERIMENT),
    manifests_files=H.MANIFESTS,
    datasets="test",
    transform=audio_transform,
    label_transform=label_transform,
    max_data_size=None,
    sorted_by='recording_duration',
)

# shuffle=False keeps the batches in the order the dataset yields them.
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=H.BATCH_SIZE,
    num_workers=H.NUM_WORKERS,
    shuffle=False,
    collate_fn=collate_fn,
    pin_memory=True,
)

test_dataset, len(test_loader)

(AudioDataset
     Total of datapoints: 130
     Total of duration (min): 5.949999999999999
     Root Location: /Volumes/SSD1/AN4
     Transforms: 
         AudioNormalizeDB(db=-40, max_gain_db=300)
         AudioSpectrogram(sample_rate=16000, window_size=0.02, window_stride=0.01, window=hamming)
         AudioNormalize(mean=None, std=None)
         FromNumpyToTensor(tensor_type=FloatTensor)
     Label Transforms: 
         TranscriptEncodeSTS(vocab=_'ABCDEFGHIJKLMNOPQRSTUVWXYZ <SOS><EOS><UNK>)
         FromNumpyToTensor(tensor_type=LongTensor), 7)

In [8]:
# Build the model used for pseudo-labelling and load the weights of the model
# currently named by H.MODEL_NAME (set by the evaluation cell above).
model_pred = NeuralSpeechRecognizer(vocab, test_loader.dataset.max_seq_length, rnn_hidden_size=H.RNN_HIDDEN_SIZE,
                               rnn_num_layers=H.RNN_NUM_LAYERS, rnn_dropout=H.RNN_DROPOUT, cnn_dropout=H.CNN_DROPOUT,
                               teacher_forcing_ratio=H.TEACHER_FORCING_RATIO,
                               sample_rate=H.AUDIO_SAMPLE_RATE, window_size=H.SPECT_WINDOW_SIZE,
                               initialize=torch_weight_init)
if H.USE_CUDA:
    model_pred.cuda()

# map_location='cpu': load_state_dict copies the values into the parameters on
# whatever device the model already lives on, so loading onto the CPU first
# gives the same result and also works when the checkpoint holds CUDA tensors
# but this host has no CUDA.
state = torch.load(os.path.join(H.EXPERIMENT, H.MODEL_NAME + '.tar'), map_location='cpu')
model_pred.load_state_dict(state)

# This model is only used for inference. Switch it to evaluation mode so that
# dropout (RNN_DROPOUT / CNN_DROPOUT are 0.5) is not applied while producing
# pseudo labels. Harmless if Recognizer already does this itself.
model_pred.eval()

# NOTE(review): label_smoothing is hard-coded to 0.1 here while
# H.LABEL_SMOOTHING is 0.2; `criterion` is not used by the cells visible in
# this notebook - confirm which value is intended before relying on it.
criterion = LabelSmoothingLoss(padding_idx=0, label_smoothing=0.1)

sts_decoder = STSDecoder(vocab)

In [9]:
# Run the stage-2 model over the raw test recordings and collect, for every
# hypothesis, the audio file it belongs to.
import glob

from lib.dataloader.audio import AudioDataset, BucketingSampler, collate_fn, DummyDataset, dummy_collate_fn

# NOTE(review): this pattern is relative to the current working directory,
# unlike the H.ROOT_DIR-based paths used elsewhere in the notebook.
test_path = './data/AN4/test/wav/*.wav'

# glob returns matches in arbitrary, platform-dependent order; sorting makes the
# file list (and the order of entries derived from it) reproducible.
audio_files = sorted(glob.glob(test_path))

# Fail fast: an empty match would otherwise flow silently through the loader
# and end up as an empty pseudo manifest.
if not audio_files:
    raise FileNotFoundError('No audio files matched: ' + test_path)

audio_transform = transforms.Compose([
    AudioNormalizeDB(db=H.NORMALIZE_DB,
                     max_gain_db=H.NORMALIZE_MAX_GAIN),
    AudioSpectrogram(sample_rate=H.AUDIO_SAMPLE_RATE,
                     window_size=H.SPECT_WINDOW_SIZE,
                     window_stride=H.SPECT_WINDOW_STRIDE,
                     window=H.SPECT_WINDOW),
    AudioNormalize(),
    FromNumpyToTensor(tensor_type=torch.FloatTensor)
])

dummy_dataset = DummyDataset(audio_files, H.AUDIO_SAMPLE_RATE, vocab, transform=audio_transform)

dummy_loader = torch.utils.data.DataLoader(dummy_dataset, batch_size=H.BATCH_SIZE,
                                           num_workers=H.NUM_WORKERS, shuffle=False,
                                           collate_fn=dummy_collate_fn, pin_memory=True)

recognizer = Recognizer(model_pred, sts_decoder, dummy_loader, probabilities=True)

hypotheses = recognizer()

# Second pass over the loader, only to read the third element of every batch,
# which is used below as indices into dummy_dataset.audio_files.
# Presumably these are the dataset indices in the order the batch was collated
# - TODO confirm against dummy_collate_fn.
idxs = [idx for batch in dummy_loader for idx in batch[2]]

audio = [dummy_dataset.audio_files[idx] for idx in idxs]

# The manifest cell pairs `audio` with `hypotheses`, so a length mismatch would
# mean mislabelled entries; stop here instead.
assert len(hypotheses) == len(audio), (len(hypotheses), len(audio))

len(hypotheses), len(audio)

(130, 130)

In [10]:
# Write the recognizer output as an additional manifest file so that a later
# training run can list it in H.MANIFESTS.
from lib.datasets.an4 import create_manifest, create_pseudo_manifest, create_data_pipelines

# os.path.join drops the leading '.' when H.ROOT_DIR is absolute (the log of
# this cell reports /Volumes/SSD1/AN4); it only has an effect for a relative
# ROOT_DIR.
root_path = os.path.join('.', H.ROOT_DIR, H.EXPERIMENT)

# `audio` and `hypotheses` come from the inference cell and are equally long.
create_pseudo_manifest(root_path, audio, hypotheses, manifest_file='sts_manifest_pseudo.json')


2019-02-03 20:23:36,025 | INFO : Creating pseudo dataset in root dir: /Volumes/SSD1/AN4
2019-02-03 20:23:36,355 | INFO : Creation completed - manifest file: /Volumes/SSD1/AN4/sts_manifest_pseudo.json
2019-02-03 20:23:36,356 | INFO : Total Entries: 130
2019-02-03 20:23:36,357 | INFO : ... done.


In [11]:
# Final stage: train under the '_final1' name, starting from the stage-2
# weights.
H.MODEL_NAME = 'AN4_SPEECH_ENCODER_DECODER_final1'
H.PRELOAD_MODEL_PATH = 'AN4_SPEECH_ENCODER_DECODER_2nd.tar'
        
# Add the pseudo-labelled manifest, set a transcript-confidence threshold and
# turn augmentation on (it was 0.0 for the earlier stages).
# NOTE(review): the pseudo manifest was produced from ./data/AN4/test/wav/*.wav,
# i.e. test recordings, and the following cell evaluates on the test split -
# confirm that training on pseudo labels of the evaluated audio is intended.
H.MANIFESTS = ['manifest.json','sts_manifest_pseudo.json']
H.MIN_TRANSCRIPT_CONFIDENCE = 0.95
H.AUGMENTATION_PROBABILITY = 0.6

run_training(H)

2019-02-03 20:23:36,371 | INFO : Training start.
2019-02-03 20:23:36,373 | INFO : {
    'EXPERIMENT'                    : 'AN4' ,
    'DESCRIPTION'                   : 'Sequence To Sequence model' ,
    'TIMESTAMP'                     : '2019-02-03-17-47-37-000037' ,
    'MODEL_NAME'                    : 'AN4_SPEECH_ENCODER_DECODER_final1' ,
    'PRELOAD_MODEL_PATH'            : 'AN4_SPEECH_ENCODER_DECODER_2nd.tar' ,
    'ROOT_DIR'                      : '/Volumes/SSD1' ,
    'MANIFESTS'                     : ['manifest.json', 'sts_manifest_pseudo.json'] ,
    'TARGET_ENCODING'               : 'sts' ,
    'BATCH_SIZE'                    : 20 ,
    'NUM_WORKERS'                   : 8 ,
    'RNN_HIDDEN_SIZE'               : 256 ,
    'RNN_NUM_LAYERS'                : 2 ,
    'RNN_DROPOUT'                   : 0.5 ,
    'CNN_DROPOUT'                   : 0.5 ,
    'BIDIRECTIONAL'                 : True ,
    'LR'                            : 0.0003 ,
    'LR_LAMBDA': lambda epoch: max(math.

2019-02-03 20:23:36,587 | INFO : Preloaded model: AN4/AN4_SPEECH_ENCODER_DECODER_2nd.tar


SIGINT received. Delaying KeyboardInterrupt.
SIGINT received. Delaying KeyboardInterrupt.
SIGINT received. Delaying KeyboardInterrupt.
SIGINT received. Delaying KeyboardInterrupt.
SIGINT received. Delaying KeyboardInterrupt.
SIGINT received. Delaying KeyboardInterrupt.
SIGINT received. Delaying KeyboardInterrupt.
SIGINT received. Delaying KeyboardInterrupt.
SIGINT received. Delaying KeyboardInterrupt.


2019-02-03 21:09:28,672 | INFO : Training interrupted at: 100
2019-02-03 21:09:28,930 | INFO : TensorboardLogger
    Last Epoch/LR:    100 / 0.0003
    Train Loss/Score: 0.9308745736562754 / 0.9687570897222796
    Valid Loss/Score: 8.466199464064378 / 0.9184221227490458
    Best Epoch/Score: 97 / 0.9270190279805665

2019-02-03 21:09:28,931 | INFO : Stopping
    Patience: 80
    Best Score: 0.9270
    Epoch of Best Score: 97

2019-02-03 21:09:28,932 | INFO : Checkpoint
    Timestamp: 2019-02-03-21-09-28-000028
    Last Checkpoint: AN4/chkpt/2019-02-03-17-47-37-000037/2019-02-03-21-09-28-000028/state.tar

2019-02-03 21:09:28,933 | INFO : Training end.


In [12]:
# Evaluate with H as left by the training cell above
# (MODEL_NAME = 'AN4_SPEECH_ENCODER_DECODER_final1').
# NOTE(review): that training run was interrupted manually (its log reports
# "Training interrupted at: 100"), so the figures below describe a partial run.
run_evaluation(H)

2019-02-03 21:09:28,958 | INFO : AudioDataset
    Total of datapoints: 130
    Total of duration (min): 5.949999999999999
    Root Location: /Volumes/SSD1/AN4
    Transforms: 
        AudioNormalizeDB(db=-40, max_gain_db=300)
        AudioSpectrogram(sample_rate=16000, window_size=0.02, window_stride=0.01, window=hamming)
        AudioNormalize(mean=None, std=None)
        FromNumpyToTensor(tensor_type=FloatTensor)
    Label Transforms: 
        TranscriptEncodeSTS(vocab=_'ABCDEFGHIJKLMNOPQRSTUVWXYZ <SOS><EOS><UNK>)
        FromNumpyToTensor(tensor_type=LongTensor)
2019-02-03 21:09:31,144 | INFO : Test Summary 
Bleu: 83.770
WER:  7.529
CER:  5.735
ACC:  72.308
