In [1]:
import math

from lib.utilities import *
from lib.experiments.an4_speech_cnn_ctc import *

In [2]:
# Backend configuration for reproducible runs.
# `torch` is not imported explicitly in this notebook; it is expected to be
# brought into scope by the star imports in the first cell.

# Debug toggles -- leave commented out for a normal run:
# torch.cuda.is_available = lambda : False  # pretend no GPU is present (USE_CUDA below then evaluates to False)
# torch.backends.cudnn.enabled=False        # bypass cuDNN entirely

# Ask cuDNN to pick deterministic algorithms.
torch.backends.cudnn.deterministic = True
# Keep the cuDNN autotuner off explicitly: benchmark mode may select different
# algorithms from run to run, and setting the flag here (instead of relying on
# the default) makes this cell idempotent even if the flag was flipped earlier
# in the same kernel session.
torch.backends.cudnn.benchmark = False


In [3]:
# Hyperparameter set shared by every run_training(H) / run_evaluation(H) call
# in this notebook. Later cells override individual entries via attribute
# assignment (H.MODEL_NAME = ...), so this object is mutable shared state.
H = HYPERPARAMETERS({
    'EXPERIMENT': 'AN4',
    # NOTE(review): the description says sequence-to-sequence, but MODEL_NAME and
    # TARGET_ENCODING below describe a CNN + CTC setup -- confirm which is intended.
    'DESCRIPTION': 'Sequence To Sequence model',
    # Evaluated once, when this cell runs; the recorded logs of the second
    # training run show the same timestamp value being reused.
    'TIMESTAMP': HYPERPARAMETERS.create_timestamp(),

    'MODEL_NAME': 'AN4_CNN_CTC',

    # None here; a later cell sets it to 'AN4_CNN_CTC.tar' to continue from a saved model.
    'PRELOAD_MODEL_PATH': None,  # alternative: 'AN4_CNN_CTC.tar'

    # NOTE(review): machine-specific absolute path (NOISE_BG_DIR below repeats it);
    # the notebook will not run elsewhere without editing both.
    'ROOT_DIR': '/Volumes/SSD1',
    'MANIFESTS': ['manifest.json'],  # optional second manifest noted by the author: 'sts_manifest_pseudo.json'

    'TARGET_ENCODING': 'ctc',  # alternative noted by the author: 'sts' (meaning not visible here -- TODO confirm)

    'BATCH_SIZE': 20,
    'NUM_WORKERS': 8,

    'CNN_HIDDEN_SIZE': 1024,
    'CNN_DROPOUT': 0.5,

    'LR': 0.0003,
    # Schedule factor as a function of the epoch index: 0.78 ** floor((epoch + 1) / 80),
    # never below 0.01 -- i.e. the factor shrinks by 0.78x every 80 epochs.
    # Presumably multiplied with LR by the training loop (TODO confirm); the recorded
    # "Last Epoch/LR" value 0.00018252 equals 0.0003 * 0.78 ** 2.
    # NOTE: the recorded hyperparameter dump shows this entry as its literal source
    # line, so keep the lambda on a single line unless that printing logic is checked.
    'LR_LAMBDA': lambda epoch: max(math.pow(0.78, math.floor((1 + epoch) / 80.0)), 0.01),
    'WEIGHT_DECAY': 0,
    'MOMENTUM': 0.9,
    'NESTEROV': True,

    # Presumably the gradient-norm clipping threshold -- TODO confirm against the training code.
    'MAX_GRAD_NORM': 400,

    'MAX_EPOCHS': 200,

    # Reported as "Patience: 80" in the recorded early-stopping log.
    'STOPPING_PATIENCE': 80,

    'CHECKPOINT_INTERVAL': 10,
    'CHECKPOINT_RESTORE': False,

    # Evaluated once, when this cell runs.
    'USE_CUDA': torch.cuda.is_available(),

    'SEED': 123456,

    # NOTE(review): the recorded evaluation output prints AudioNormalize(mean=None, std=None);
    # confirm whether these statistics are actually applied.
    'DATASET_MEAN_STD': (0.060487103, 0.16884679),

    # Appear in the recorded transform list as AudioNormalizeDB(db=-40, max_gain_db=300).
    'NORMALIZE_DB': -40,
    'NORMALIZE_MAX_GAIN': 300,

    # Dataset filters, all disabled (None); the trailing comments keep the author's example values.
    'MIN_MAX_AUDIO_DURATION': None,  # e.g. (1, 15)
    'MIN_MAX_TRANSCRIPT_LEN': None,  # e.g. (0, 15)
    'MIN_TRANSCRIPT_CONFIDENCE': None,  # e.g. 0.95

    'AUDIO_SAMPLE_RATE': 16000,

    # Appear in the recorded transform list as
    # AudioSpectrogram(sample_rate=16000, window_size=0.02, window_stride=0.01, window=hamming).
    # Units are presumably seconds -- TODO confirm.
    'SPECT_WINDOW_SIZE': 0.02,
    'SPECT_WINDOW_STRIDE': 0.01,
    'SPECT_WINDOW': 'hamming',

    # Presumably the overall probability of applying any augmentation to a sample;
    # if so, 0.0 disables all the per-augmentation settings below -- TODO confirm.
    'AUGMENTATION_PROBABILITY': 0.0,

    # Background-noise mixing.
    'NOISE_BG_PROBABILITY': 0.4,
    'NOISE_BG_LEVELS': (0.0, 0.5),
    'NOISE_BG_DIR': '/Volumes/SSD1/BACKGROUND_NOISE',

    # Pitch change.
    'AUDIO_PITCH_PROBABILITY': 0.4,
    'AUDIO_PITCH_PM': 4,

    # Speed change.
    'AUDIO_SPEED_PROBABILITY': 0.4,
    'AUDIO_SPEED_LOW_HIGH': (0.9, 1.1),

    # Dynamic (gain) change.
    'AUDIO_DYNAMIC_PROBABILITY': 0.4,
    'AUDIO_DYNAMIC_LOW_HIGH': (0.5, 1.1),

    # Shift.
    'AUDIO_SHIFT_PROBABILITY': 0.4,
    'AUDIO_SHIFT_MIN_MAX': (-5, 5),

    # Synthetic coloured noise.
    'AUDIO_NOISE_PROBABILITY': 0.4,
    'AUDIO_NOISE_LEVELS': (0.0, 0.5),
    'AUDIO_NOISE_COLORS': ['white', 'pink', 'blue', 'brown', 'violet'],
})

In [4]:
# Baseline run: train with the hyperparameters in H as configured in the cell
# above (PRELOAD_MODEL_PATH is None and AUGMENTATION_PROBABILITY is 0.0 there).
# The recorded output starts with "Training start." followed by a dump of H.
# A later cell preloads 'AN4_CNN_CTC.tar' -- presumably the model saved by this
# run under MODEL_NAME; TODO confirm.
run_training(H)

2019-02-03 10:31:27,961 | INFO : Training start.
2019-02-03 10:31:27,963 | INFO : {
    'EXPERIMENT'                    : 'AN4' ,
    'DESCRIPTION'                   : 'Sequence To Sequence model' ,
    'TIMESTAMP'                     : '2019-02-03-10-31-27-000027' ,
    'MODEL_NAME'                    : 'AN4_CNN_CTC' ,
    'PRELOAD_MODEL_PATH'            : None ,
    'ROOT_DIR'                      : '/Volumes/SSD1' ,
    'MANIFESTS'                     : ['manifest.json'] ,
    'TARGET_ENCODING'               : 'ctc' ,
    'BATCH_SIZE'                    : 20 ,
    'NUM_WORKERS'                   : 8 ,
    'CNN_HIDDEN_SIZE'               : 1024 ,
    'CNN_DROPOUT'                   : 0.5 ,
    'LR'                            : 0.0003 ,
    'LR_LAMBDA': lambda epoch: max(math.pow(0.78, math.floor((1 + epoch) / 80.0)), 0.01),
    'WEIGHT_DECAY'                  : 0 ,
    'MOMENTUM'                      : 0.9 ,
    'NESTEROV'                      : True ,
    'MAX_GRAD_NORM'            

In [5]:
# Evaluate with the current contents of H (at this point still the baseline
# configuration, MODEL_NAME 'AN4_CNN_CTC'). The recorded output describes a
# 130-datapoint AudioDataset and a "Test Summary" with Bleu, WER, CER and ACC.
# Which saved model is loaded is decided inside run_evaluation -- presumably
# via H.MODEL_NAME; TODO confirm.
run_evaluation(H)

2019-02-03 12:00:04,271 | INFO : AudioDataset
    Total of datapoints: 130
    Total of duration (min): 5.949999999999999
    Root Location: /Volumes/SSD1/AN4
    Transforms: 
        AudioNormalizeDB(db=-40, max_gain_db=300)
        AudioSpectrogram(sample_rate=16000, window_size=0.02, window_stride=0.01, window=hamming)
        AudioNormalize(mean=None, std=None)
        FromNumpyToTensor(tensor_type=FloatTensor)
    Label Transforms: 
        TranscriptEncodeCTC(vocab=_'ABCDEFGHIJKLMNOPQRSTUVWXYZ )
        FromNumpyToTensor(tensor_type=LongTensor)
2019-02-03 12:00:06,916 | INFO : Test Summary 
Bleu: 68.110
WER:  18.438
CER:  11.353
ACC:  43.846


In [6]:
# Second experiment: continue training from the baseline model under a new name.
# NOTE: these assignments mutate the shared H in place, so any cell executed
# after this one (including a re-run of an earlier cell) sees the overridden
# values. TIMESTAMP is not refreshed -- the recorded log of this run shows the
# same timestamp as the baseline run.
H.MODEL_NAME = 'AN4_CNN_CTC_augmented'

# Start from a previously saved model; the recorded output prints
# "Preloaded model: AN4/AN4_CNN_CTC.tar".
H.PRELOAD_MODEL_PATH = 'AN4_CNN_CTC.tar'

# NOTE(review): the model is named "_augmented", yet this keeps the value at 0.0,
# the same as in the baseline configuration. If this is the overall augmentation
# probability, no augmentation is applied in this run -- confirm the intended value.
H.AUGMENTATION_PROBABILITY = 0.0

run_training(H)

2019-02-03 12:00:06,936 | INFO : Training start.
2019-02-03 12:00:06,937 | INFO : {
    'EXPERIMENT'                    : 'AN4' ,
    'DESCRIPTION'                   : 'Sequence To Sequence model' ,
    'TIMESTAMP'                     : '2019-02-03-10-31-27-000027' ,
    'MODEL_NAME'                    : 'AN4_CNN_CTC_augmented' ,
    'PRELOAD_MODEL_PATH'            : 'AN4_CNN_CTC.tar' ,
    'ROOT_DIR'                      : '/Volumes/SSD1' ,
    'MANIFESTS'                     : ['manifest.json'] ,
    'TARGET_ENCODING'               : 'ctc' ,
    'BATCH_SIZE'                    : 20 ,
    'NUM_WORKERS'                   : 8 ,
    'CNN_HIDDEN_SIZE'               : 1024 ,
    'CNN_DROPOUT'                   : 0.5 ,
    'LR'                            : 0.0003 ,
    'LR_LAMBDA': lambda epoch: max(math.pow(0.78, math.floor((1 + epoch) / 80.0)), 0.01),
    'WEIGHT_DECAY'                  : 0 ,
    'MOMENTUM'                      : 0.9 ,
    'NESTEROV'                      : True ,
    'MAX

Preloaded model: AN4/AN4_CNN_CTC.tar


2019-02-03 13:26:14,079 | INFO : Early stopping at epoch: 114, score 0.835176
2019-02-03 13:26:14,420 | INFO : TensorboardLogger
    Last Epoch/LR:    195 / 0.00018252
    Train Loss/Score: 0.030306612593215154 / 0.9995056045688957
    Valid Loss/Score: 9.997469623272236 / 0.8053685950801335
    Best Epoch/Score: 114 / 0.8351761699838622

2019-02-03 13:26:14,421 | INFO : Stopping
    Patience: 80
    Best Score: 0.8352
    Epoch of Best Score: 114

2019-02-03 13:26:14,422 | INFO : Checkpoint
    Timestamp: 2019-02-03-13-26-14-000014
    Last Checkpoint: AN4/chkpt/2019-02-03-10-31-27-000027/2019-02-03-13-26-14-000014/state.tar

2019-02-03 13:26:14,423 | INFO : Training end.


In [7]:
# Evaluate again with the current contents of H, which now carries
# MODEL_NAME 'AN4_CNN_CTC_augmented' from the preceding training cell.
# The recorded output reports the same 130-datapoint AudioDataset and a
# "Test Summary" with Bleu, WER, CER and ACC for comparison with the first
# evaluation.
run_evaluation(H)

2019-02-03 13:26:14,443 | INFO : AudioDataset
    Total of datapoints: 130
    Total of duration (min): 5.949999999999999
    Root Location: /Volumes/SSD1/AN4
    Transforms: 
        AudioNormalizeDB(db=-40, max_gain_db=300)
        AudioSpectrogram(sample_rate=16000, window_size=0.02, window_stride=0.01, window=hamming)
        AudioNormalize(mean=None, std=None)
        FromNumpyToTensor(tensor_type=FloatTensor)
    Label Transforms: 
        TranscriptEncodeCTC(vocab=_'ABCDEFGHIJKLMNOPQRSTUVWXYZ )
        FromNumpyToTensor(tensor_type=LongTensor)
2019-02-03 13:26:17,048 | INFO : Test Summary 
Bleu: 70.180
WER:  16.482
CER:  9.792
ACC:  49.231
