In [1]:
import math

from lib.utilities import *
from lib.experiments.an4_deepspeech2_ctc import *

In [2]:
# Root of the local data volume, named once so the paths below cannot drift apart.
DATA_ROOT_DIR = '/Volumes/SSD1'

# Hyperparameters for the AN4 DeepSpeech2 / CTC run. The same object `H` is
# passed to run_training / run_evaluation and is modified by later cells.
H = HYPERPARAMETERS({
    # -- Experiment identity --
    'EXPERIMENT'                    : 'AN4',
    'DESCRIPTION'                   : 'Deepspeech2 model ',
    'TIMESTAMP'                     : HYPERPARAMETERS.create_timestamp(),

    'MODEL_NAME'                    : 'AN4_DEEPSPEECH2_CTC',

    'PRELOAD_MODEL_PATH'            : None,  # e.g. 'AN4_CTC_deepspeech2_no_augmentation.tar'

    # -- Data location --
    'ROOT_DIR'                      : DATA_ROOT_DIR,
    'MANIFESTS'                     : ['manifest.json'],  # optionally also 'manifest_pseudo.json'

    'TARGET_ENCODING'               : 'ctc',  # alternative: 's2s'

    # -- Batching --
    'BATCH_SIZE'                    : 20,
    'NUM_WORKERS'                   : 4,

    # -- Network shape --
    'RNN_HIDDEN_SIZE'               : 800,
    'NUM_LAYERS'                    : 5,
    'RNN_DROPOUT'                   : 0.5,
    'CNN_DROPOUT'                   : 0.5,
    'BIDIRECTIONAL'                 : True,

    # -- Learning rate and optimiser settings --
    'LR'                            : 0.0003,
    # Returns 0.78 ** floor((epoch + 1) / 200), never below 0.01. With
    # MAX_EPOCHS = 200 the value only drops from 1.0 to 0.78 at epoch 199.
    'LR_LAMBDA'                     : lambda epoch: max(math.pow(0.78, math.floor((1 + epoch) / 200.0)), 0.01),
    'WEIGHT_DECAY'                  : 0,
    'MOMENTUM'                      : 0.9,
    'NESTEROV'                      : True,

    'MAX_GRAD_NORM'                 : 400,

    # -- Training-loop control --
    'MAX_EPOCHS'                    : 200,

    'STOPPING_PATIENCE'             : 80,

    'CHECKPOINT_INTERVAL'           : 140,
    'CHECKPOINT_RESTORE'            : False,

    'USE_CUDA'                      : torch.cuda.is_available(),

    'SEED'                          : 123456,

    # -- Audio pre-processing --
    'DATASET_MEAN_STD'              : (0.060487103, 0.16884679),

    'NORMALIZE_DB'                  : -40,
    'NORMALIZE_MAX_GAIN'            : 300,

    # Optional dataset filters; None leaves the filter unset.
    'MIN_MAX_AUDIO_DURATION'        : None,  # e.g. (1, 15)
    'MIN_MAX_TRANSCRIPT_LEN'        : None,  # e.g. (3, 60)
    'MIN_TRANSCRIPT_CONFIDENCE'     : None,  # e.g. 0.92

    'AUDIO_SAMPLE_RATE'             : 16000,

    'SPECT_WINDOW_SIZE'             : 0.02,
    'SPECT_WINDOW_STRIDE'           : 0.01,
    'SPECT_WINDOW'                  : 'hamming',

    # -- Augmentation settings --
    # NOTE(review): presumably this overall probability gates the per-effect
    # probabilities below, so 0.0 would disable augmentation -- confirm in lib.
    'AUGMENTATION_PROBABILITY'      : 0.0,

    'NOISE_BG_PROBABILITY'          : 0.4,
    'NOISE_BG_LEVELS'               : (0.0, 0.5),
    'NOISE_BG_DIR'                  : os.path.join(DATA_ROOT_DIR, 'BACKGROUND_NOISE'),

    'AUDIO_PITCH_PROBABILITY'       : 0.4,
    'AUDIO_PITCH_PM'                : 4,

    'AUDIO_SPEED_PROBABILITY'       : 0.4,
    'AUDIO_SPEED_LOW_HIGH'          : (0.9, 1.1),

    'AUDIO_DYNAMIC_PROBABILITY'     : 0.4,
    'AUDIO_DYNAMIC_LOW_HIGH'        : (0.5, 1.1),

    'AUDIO_SHIFT_PROBABILITY'       : 0.4,
    'AUDIO_SHIFT_MIN_MAX'           : (-5, 5),

    'AUDIO_NOISE_PROBABILITY'       : 0.4,
    'AUDIO_NOISE_LEVELS'            : (0.0, 0.5),
    'AUDIO_NOISE_COLORS'            : ['white', 'pink', 'blue', 'brown', 'violet'],
})

In [3]:
# First training run with the configuration in H: at this point H.MANIFESTS
# lists only 'manifest.json' and H.PRELOAD_MODEL_PATH is None.
run_training(H)

2019-02-03 13:32:28,864 | INFO : Training start.
2019-02-03 13:32:28,865 | INFO : {
    'EXPERIMENT'                    : 'AN4' ,
    'DESCRIPTION'                   : 'Deepspeech2 model ' ,
    'TIMESTAMP'                     : '2019-02-03-13-32-28-000028' ,
    'MODEL_NAME'                    : 'AN4_DEEPSPEECH2_CTC' ,
    'PRELOAD_MODEL_PATH'            : None ,
    'ROOT_DIR'                      : '/Volumes/SSD1' ,
    'MANIFESTS'                     : ['manifest.json'] ,
    'TARGET_ENCODING'               : 'ctc' ,
    'BATCH_SIZE'                    : 20 ,
    'NUM_WORKERS'                   : 4 ,
    'RNN_HIDDEN_SIZE'               : 800 ,
    'NUM_LAYERS'                    : 5 ,
    'RNN_DROPOUT'                   : 0.5 ,
    'CNN_DROPOUT'                   : 0.5 ,
    'BIDIRECTIONAL'                 : True ,
    'LR'                            : 0.0003 ,
    'LR_LAMBDA'                     : lambda epoch: max(math.pow(0.78, math.floor((1 + epoch) / 200.0)), 0.01), 
    'WEIG

2019-02-03 15:45:33,847 | INFO : TensorboardLogger
    Last Epoch/LR:    200 / 0.000234
    Train Loss/Score: 0.04859395519504127 / 0.9979756881655616
    Valid Loss/Score: 3.7829791655907266 / 0.9419370053023899
    Best Epoch/Score: 199 / 0.9437181408335255

2019-02-03 15:45:33,853 | INFO : Stopping
    Patience: 80
    Best Score: 0.9437
    Epoch of Best Score: 199

2019-02-03 15:45:33,854 | INFO : Checkpoint
    Timestamp: 2019-02-03-15-45-32-000032
    Last Checkpoint: AN4/chkpt/2019-02-03-13-32-28-000028/2019-02-03-15-45-32-000032/state.tar

2019-02-03 15:45:33,854 | INFO : Training end.


In [4]:
# Evaluate using the current H (still MODEL_NAME 'AN4_DEEPSPEECH2_CTC');
# the logged test summary reports Bleu, WER, CER and ACC.
run_evaluation(H)

2019-02-03 15:45:33,867 | INFO : Evaluation start.
2019-02-03 15:45:33,877 | INFO : AudioDataset
    Total of datapoints: 130
    Total of duration (min): 5.949999999999999
    Root Location: /Volumes/SSD1/AN4
    Transforms: 
        AudioNormalizeDB(db=-40, max_gain_db=300)
        AudioSpectrogram(sample_rate=16000, window_size=0.02, window_stride=0.01, window=hamming)
        AudioNormalize(mean=None, std=None)
        FromNumpyToTensor(tensor_type=FloatTensor)
    Label Transforms: 
        TranscriptEncodeCTC(vocab=_'ABCDEFGHIJKLMNOPQRSTUVWXYZ )
        FromNumpyToTensor(tensor_type=LongTensor)
2019-02-03 15:45:38,015 | INFO : Test Summary 
Bleu: 89.320
WER:  5.628
CER:  3.224
ACC:  75.385


In [5]:
# Rebuild the network from the hyperparameters in H and load the saved
# weights so it can be used to transcribe audio files.
create_logger(H)

vocab = Vocabulary(os.path.join(H.ROOT_DIR, H.EXPERIMENT), encoding=H.TARGET_ENCODING)

model_pred = DeepSpeech2(
    len(vocab),
    rnn_hidden_size=H.RNN_HIDDEN_SIZE,
    nb_layers=H.NUM_LAYERS,
    bidirectional=H.BIDIRECTIONAL,
    cnn_dropout=H.CNN_DROPOUT,
    rnn_dropout=H.RNN_DROPOUT,
    sample_rate=H.AUDIO_SAMPLE_RATE,
    window_size=H.SPECT_WINDOW_SIZE,
    initialize=torch_weight_init,
)
if H.USE_CUDA:
    model_pred.cuda()

# map_location='cpu' lets a checkpoint that was written on a GPU machine be
# read on a CPU-only one; load_state_dict then copies the values into the
# model's own parameters, wherever those live.
state = torch.load(os.path.join(H.EXPERIMENT, H.MODEL_NAME + '.tar'), map_location='cpu')
model_pred.load_state_dict(state)

# The model is only used for inference from here on: switch to evaluation
# mode so that dropout (configured at 0.5) is not applied to predictions.
model_pred.eval()

ctc_decoder = CTCGreedyDecoder(vocab)


In [6]:
from lib.dataloader.audio import AudioDataset, BucketingSampler, collate_fn, DummyDataset, dummy_collate_fn
import glob

# Transcribe every wav file matching test_path with the loaded model and
# collect the file paths in the same order as the hypotheses.
# NOTE(review): these files live under the test directory and the resulting
# pseudo manifest is later added to the training manifests, while evaluation
# also reports 130 datapoints -- confirm that training on pseudo-labelled
# test audio is intended.
test_path = './data/AN4/test/wav/*.wav'

# glob returns matches in arbitrary order; sorting makes the dataset order
# (and therefore the generated manifest) reproducible between runs.
audio_files = sorted(glob.glob(test_path))
if not audio_files:
    # Fail loudly instead of silently producing an empty pseudo manifest.
    raise FileNotFoundError('No audio files match: ' + test_path)

# Same audio transforms that the evaluation log prints for AudioDataset.
audio_transform = transforms.Compose([
    AudioNormalizeDB(db=H.NORMALIZE_DB,
                     max_gain_db=H.NORMALIZE_MAX_GAIN),
    AudioSpectrogram(sample_rate=H.AUDIO_SAMPLE_RATE,
                     window_size=H.SPECT_WINDOW_SIZE,
                     window_stride=H.SPECT_WINDOW_STRIDE,
                     window=H.SPECT_WINDOW),
    AudioNormalize(),
    FromNumpyToTensor(tensor_type=torch.FloatTensor),
])

dummy_dataset = DummyDataset(audio_files, H.AUDIO_SAMPLE_RATE, vocab, transform=audio_transform)

# shuffle=False keeps the batch order identical on every pass over the loader,
# which the index collection below relies on.
dummy_loader = torch.utils.data.DataLoader(dummy_dataset, batch_size=H.BATCH_SIZE,
                                           num_workers=H.NUM_WORKERS, shuffle=False,
                                           collate_fn=dummy_collate_fn, pin_memory=True)

recognizer = Recognizer(model_pred, ctc_decoder, dummy_loader, probabilities=True)

hypotheses = recognizer()

# Second pass over the loader to recover which dataset item each hypothesis
# belongs to. Assumes data[2] holds the dataset indices of the batch items in
# batch order -- TODO confirm against dummy_collate_fn.
idxs = []
for data in dummy_loader:
    idxs.extend(data[2])

# File path for each hypothesis, in hypothesis order.
audio = [dummy_dataset.audio_files[i] for i in idxs]
    
    
    

In [7]:
from lib.datasets.an4 import create_manifest, create_pseudo_manifest

# Dataset root directory. os.curdir is '.', and os.path.join discards it
# whenever H.ROOT_DIR is an absolute path (as it is in this notebook).
root_path = os.path.join(os.curdir, H.ROOT_DIR, H.EXPERIMENT)

# Build the pseudo manifest from the collected audio paths and the model's
# hypotheses for them.
create_pseudo_manifest(
    root_path,
    audio,
    hypotheses,
    manifest_file='ctc_manifest_pseudo.json',
)

2019-02-03 15:45:43,308 | INFO : Creating pseudo dataset in root dir: /Volumes/SSD1/AN4
2019-02-03 15:45:43,577 | INFO : Creation completed - manifest file: /Volumes/SSD1/AN4/ctc_manifest_pseudo.json
2019-02-03 15:45:43,578 | INFO : Total Entries: 130
2019-02-03 15:45:43,579 | INFO : ... done.


In [8]:
# Reconfigure H in place for the second training run: new model name, start
# from the weights saved by the first run, and train on both manifests.
# NOTE(review): H.TIMESTAMP is not refreshed, so the second run logs its
# checkpoint under the first run's timestamp directory -- confirm intended.
# NOTE(review): AUGMENTATION_PROBABILITY is already 0.0 in the original
# configuration, although the new model name mentions augmentation.
for _name, _value in (
    ('MODEL_NAME', 'AN4_DEEPSPEECH2_CTC_augmentation_pseudo'),
    ('PRELOAD_MODEL_PATH', 'AN4_DEEPSPEECH2_CTC.tar'),
    ('MANIFESTS', ['manifest.json', 'ctc_manifest_pseudo.json']),
    ('LR', 0.0006),
    ('AUGMENTATION_PROBABILITY', 0.0),
):
    setattr(H, _name, _value)


In [9]:
# Second training run: H now preloads 'AN4_DEEPSPEECH2_CTC.tar', uses both
# 'manifest.json' and 'ctc_manifest_pseudo.json', and has LR = 0.0006.
run_training(H)

2019-02-03 15:45:43,597 | INFO : Training start.
2019-02-03 15:45:43,599 | INFO : {
    'EXPERIMENT'                    : 'AN4' ,
    'DESCRIPTION'                   : 'Deepspeech2 model ' ,
    'TIMESTAMP'                     : '2019-02-03-13-32-28-000028' ,
    'MODEL_NAME'                    : 'AN4_DEEPSPEECH2_CTC_augmentation_pseudo' ,
    'PRELOAD_MODEL_PATH'            : 'AN4_DEEPSPEECH2_CTC.tar' ,
    'ROOT_DIR'                      : '/Volumes/SSD1' ,
    'MANIFESTS'                     : ['manifest.json', 'ctc_manifest_pseudo.json'] ,
    'TARGET_ENCODING'               : 'ctc' ,
    'BATCH_SIZE'                    : 20 ,
    'NUM_WORKERS'                   : 4 ,
    'RNN_HIDDEN_SIZE'               : 800 ,
    'NUM_LAYERS'                    : 5 ,
    'RNN_DROPOUT'                   : 0.5 ,
    'CNN_DROPOUT'                   : 0.5 ,
    'BIDIRECTIONAL'                 : True ,
    'LR'                            : 0.0006 ,
    'LR_LAMBDA'                     : lambda epoch: m

2019-02-03 16:56:51,382 | INFO : Early stopping at epoch: 14, score 0.949060
2019-02-03 16:56:52,603 | INFO : TensorboardLogger
    Last Epoch/LR:    95 / 0.0006
    Train Loss/Score: 0.22651696422678144 / 0.992492701073406
    Valid Loss/Score: 9.369868880051833 / 0.9424360895514742
    Best Epoch/Score: 14 / 0.9490600211754058

2019-02-03 16:56:52,604 | INFO : Stopping
    Patience: 80
    Best Score: 0.9491
    Epoch of Best Score: 14

2019-02-03 16:56:52,606 | INFO : Checkpoint
    Timestamp: 2019-02-03-16-56-51-000051
    Last Checkpoint: AN4/chkpt/2019-02-03-13-32-28-000028/2019-02-03-16-56-51-000051/state.tar

2019-02-03 16:56:52,607 | INFO : Training end.


In [10]:
# Evaluate again with the updated H (MODEL_NAME is now
# 'AN4_DEEPSPEECH2_CTC_augmentation_pseudo') for comparison with the first summary.
run_evaluation(H)

2019-02-03 16:56:52,620 | INFO : Evaluation start.
2019-02-03 16:56:52,631 | INFO : AudioDataset
    Total of datapoints: 130
    Total of duration (min): 5.949999999999999
    Root Location: /Volumes/SSD1/AN4
    Transforms: 
        AudioNormalizeDB(db=-40, max_gain_db=300)
        AudioSpectrogram(sample_rate=16000, window_size=0.02, window_stride=0.01, window=hamming)
        AudioNormalize(mean=None, std=None)
        FromNumpyToTensor(tensor_type=FloatTensor)
    Label Transforms: 
        TranscriptEncodeCTC(vocab=_'ABCDEFGHIJKLMNOPQRSTUVWXYZ )
        FromNumpyToTensor(tensor_type=LongTensor)
2019-02-03 16:56:56,807 | INFO : Test Summary 
Bleu: 89.890
WER:  5.094
CER:  2.993
ACC:  76.923
