In [0]:
import argparse
import os
import random
import sys
import time

import keras
import librosa
from keras import optimizers, losses
from keras.backend import set_session
from keras.callbacks import TensorBoard
import tensorflow as tf
import pandas as pd
import h5py
from sklearn.metrics import roc_curve, auc, accuracy_score
import matplotlib.pyplot as plt
import numpy as np

import input_data
import dnn_models
import audio_utility as au

# Random integer tag in [1111, 9999], drawn once when this cell runs; main()
# embeds it in the checkpoint filename.
r = random.randint(1111, 9999)


def data_gen(sess, features_settings, mode='training', batch_size=5):
    """Yield ``(X, y)`` batches from ``features_settings`` forever.

    Args:
        sess: Unused. Kept in the signature so existing callers that pass a
            session keep working.
        features_settings: Object exposing
            ``get_data(how_many=..., offset=..., mode=...)``, which returns an
            ``(X, y)`` pair, and ``set_size(mode)``, which returns the number
            of samples in that split.
        mode: Split name forwarded unchanged to ``get_data`` and ``set_size``.
        batch_size: Number of samples requested per batch.

    Yields:
        The ``(X, y)`` pair returned by ``features_settings.get_data``.
    """
    offset = 0
    while True:
        # Training always requests offset 0 (presumably get_data samples the
        # split randomly in that mode -- TODO confirm); every other mode walks
        # the split in batch_size strides.
        X, y = features_settings.get_data(
            how_many=batch_size,
            offset=0 if mode == 'training' else offset,
            mode=mode)

        offset += batch_size
        # Wrap as soon as a full batch no longer fits, so a trailing partial
        # batch is never requested.
        if offset > features_settings.set_size(mode) - batch_size:
            offset = 0
        yield X, y
    

def main():
    """Build the speech-command features, train the selected model, save it.

    Steps, in order:
      1. Create one TensorFlow session with the GPU options below and
         register it with Keras.
      2. Initialise ``input_data.GetData`` for the wanted words using the
         'cgram' feature.
      3. Build the model returned by ``dnn_models.select_model`` and compile
         it with RMSprop and categorical cross-entropy.
      4. Train with ``fit_generator`` (TensorBoard, checkpoint, LR-reduction
         and early-stopping callbacks), validating on the validation split.
      5. Save the final model under ``models/``.

    Side effects: writes ``models/<run>_<r>.hdf5`` (checkpoint),
    ``models/<run>.hdf5`` (final model) and TensorBoard logs under
    ``./retrain_logs``.
    """
    # ------------------- TF session -------------------
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.7
    config.gpu_options.allow_growth = True
    # A single session is shared by Keras and the data generators, so the GPU
    # options above apply to the session that is actually used.
    sess = tf.InteractiveSession(config=config)
    set_session(sess)

    # ------------------- Features configuration -------------------
    wanted_words = 'left,right,forward,backward,stop,go'
    speech_feature = 'cgram'
    features = input_data.GetData(wanted_words=wanted_words, feature=speech_feature)
    # initialize dataset
    features.initialize()
    model_settings = features.model_settings

    # Run tag: the local time in asctime format with spaces replaced by
    # underscores (time-based, not random).
    version_number = (time.asctime(time.localtime(time.time()))).replace(' ', '_')

    # ------------------- Model configuration -------------------
    # One batch size drives both the generators and the steps-per-epoch
    # arithmetic so that an epoch covers the split it claims to cover.
    batch_size = 10
    train_gen = data_gen(sess, features, mode='training', batch_size=batch_size)
    val_gen = data_gen(sess, features, mode='validation', batch_size=batch_size)

    print(features.silence_percentage, features.unknown_percentage)

    learning_rate = 0.0001
    # NOTE(review): no learning-rate decay is passed to the optimizer; the
    # ReduceLROnPlateau callback below is the only LR schedule.
    opt = optimizers.RMSprop(lr=learning_rate)
    architecture = 'cnn'

    input_size = features.input_shape
    dnn_model = dnn_models.select_model(input_size, model_settings['label_count'],
                                        architecture)
    dnn_model.compile(
        optimizer=opt, loss=losses.categorical_crossentropy,
        metrics=[
            'accuracy',
            'categorical_accuracy'
        ]
    )

    run_name = f"{version_number}_{speech_feature}_{wanted_words.replace(',', '_')}"
    dnn_model.summary()

    # Saving the checkpoint and the final model fails if this directory is missing.
    os.makedirs('models', exist_ok=True)

    # save_best_only=False and the path has no epoch placeholder, so each
    # save overwrites the same file.
    model_checkpoint = keras.callbacks.ModelCheckpoint(
        filepath='models/{}_{}.hdf5'.format(run_name, r),
        verbose=1, save_best_only=False,
        monitor='val_categorical_accuracy')
    # NOTE(review): patience equals the epoch count used below, so early
    # stopping is unlikely to ever trigger -- confirm the intended values.
    model_stop_training = keras.callbacks.EarlyStopping(
        monitor='loss', patience=100, verbose=1)
    tensorboard = TensorBoard(log_dir='./retrain_logs', histogram_freq=0)
    lr_reduce_op = keras.callbacks.ReduceLROnPlateau(
        factor=0.01, min_lr=0.00001, monitor='categorical_accuracy')

    # ------------------- Train -------------------
    print(50*'=', 'STAGE 2 TRAINING', 50*'=')
    dnn_model.fit_generator(
        train_gen,
        steps_per_epoch=features.set_size('training') // batch_size,
        epochs=100, verbose=1,
        # Validation is required for the 'val_categorical_accuracy' metric the
        # checkpoint callback names; at least one step is always run.
        validation_data=val_gen,
        validation_steps=max(1, features.set_size('validation') // batch_size),
        callbacks=[
            tensorboard,
            model_checkpoint,
            lr_reduce_op, model_stop_training])

    dnn_model.save(f"models/{run_name}.hdf5")

# Entry point: run training when this code is executed as the main module.
if __name__ == '__main__':
    main()


Using TensorFlow backend.


>> Downloading speech_commands_v0.02.tar.gz 34.6%Buffered data was truncated after reaching the output size limit.

In [0]:
# Install the extra packages used by this notebook; versions are not pinned.
# NOTE(review): this cell appears after the training cell -- on a fresh kernel
# it presumably has to run first; confirm which imports depend on it.
!pip install speechpy python_speech_features SimpleITK
!pip install git+https://github.com/z430/pycochleagram.git

Collecting speechpy
  Downloading https://files.pythonhosted.org/packages/8f/12/dbda397a998063d9541d9e149c4f523ed138a48824d20598e37632ba33b1/speechpy-2.4-py2.py3-none-any.whl
Collecting python_speech_features
  Downloading https://files.pythonhosted.org/packages/ff/d1/94c59e20a2631985fbd2124c45177abaa9e0a4eee8ba8a305aa26fc02a8e/python_speech_features-0.6.tar.gz
Collecting SimpleITK
[?25l  Downloading https://files.pythonhosted.org/packages/45/ee/8ddd251cf447e1fc131622e925d538c2542780195ae75e26c122587630d1/SimpleITK-1.2.0-cp36-cp36m-manylinux1_x86_64.whl (42.5MB)
[K    100% |████████████████████████████████| 42.5MB 811kB/s 
Building wheels for collected packages: python-speech-features
  Building wheel for python-speech-features (setup.py) ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/3c/42/7c/f60e9d1b40015cd69b213ad90f7c18a9264cd745b9888134be
Successfully built python-speech-features
Installing collected packages: speechpy, python-speech-features, SimpleITK
Succ