### 必要なライブラリのインストール

In [0]:
!pip install pysoundfile

### データのダウンロード・展開（3分程度）

In [0]:
%%shell

# Stop at the first failing command so that a failed download is reported
# directly instead of surfacing later as a confusing tar error.
set -e

# Directory the archive is extracted into; the archive itself is stored next
# to it as "$DATA_DIR.tar.gz".
DATA_DIR=data/wave
# Not used by this cell.
MODEL_DIR=model

# Print the start time of the download.
date

mkdir -p "$DATA_DIR"
wget -q http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz -O "$DATA_DIR.tar.gz"
tar xzf "$DATA_DIR.tar.gz" -C "$DATA_DIR"

### MFCC特徴量の抽出（数分～数十分）

In [0]:
import librosa
import numpy as np
import os
import re
import hashlib
import logging
import argparse
import pickle
import soundfile as sf
import random

# Emit INFO-level progress messages from this cell.
logging.basicConfig(level=logging.INFO)

# Root directories for the data set and for trained models.
DATA_ROOT = "data"
MODEL_DIR = "model"

# Percentage (0-100) of the training / validation partitions that is actually
# kept; the partitions are randomly sub-sampled to this size before feature
# extraction. The testing partition is always used in full.
training_userate = 60.0
validation_userate = 10.0

# wavedir: extracted wave files, one sub-directory per command word.
# txtdir:  where the train/eval/valid list files are written.
# mfccdir: where the pickled MFCC dictionary is written.
# datalist: the command words (sub-directory names) that are processed.
wavedir       = DATA_ROOT + "/wave/"
txtdir        = DATA_ROOT + "/"
mfccdir       = DATA_ROOT + "/mfcc/"
datalist      = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go']
# Sampling rate passed to the MFCC extractor and number of MFCC coefficients
# kept per frame.
SAMPLING_RATE = 16000
MFCC_DIM      = 13

# Partitioning rule taken from README.md of speech_commands.
MAX_NUM_WAVS_PER_CLASS = 2**27 - 1  # ~134M
def which_set(filename, validation_percentage, testing_percentage):
  """Assign a wav file to the training, validation or testing partition.

  The assignment is a pure function of the file name and of the requested
  proportions: the name is hashed and the hash is mapped onto a percentage.
  Because nothing else is consulted, a file stays in the same partition when
  other files are added later, which keeps test samples from leaking into
  training between runs.

  Everything from '_nohash_' onwards is dropped from the name before hashing,
  so related recordings such as 'bobby_nohash_0.wav' and 'bobby_nohash_1.wav'
  always end up in the same partition.

  Args:
    filename: File path of the data sample; only its base name is used.
    validation_percentage: Share of the data (0-100) used for validation.
    testing_percentage: Share of the data (0-100) used for testing.

  Returns:
    String, one of 'training', 'validation', or 'testing'.
  """
  # Grouping key: the base name without the '_nohash_...' suffix.
  group_key = re.sub(r'_nohash_.*$', '', os.path.basename(filename))
  digest = hashlib.sha1(group_key.encode('utf-8')).hexdigest()

  # Fold the digest into [0, MAX_NUM_WAVS_PER_CLASS] and scale it to a
  # percentage that is compared with the cumulative partition bounds.
  bucket = int(digest, 16) % (MAX_NUM_WAVS_PER_CLASS + 1)
  percentage = bucket * (100.0 / MAX_NUM_WAVS_PER_CLASS)

  if percentage < validation_percentage:
    return 'validation'
  if percentage < (testing_percentage + validation_percentage):
    return 'testing'
  return 'training'


# Build the train/valid/eval file lists and extract MFCC features for every
# selected wave file. Results are written to:
#   <txtdir>train.txt, eval.txt, valid.txt : one "<command>/<file>" per line
#   <mfccdir>mfcc.pkl                      : dict mapping "<command>/<file>" to
#                                            a float32 array (frames, MFCC_DIM)
logging.info('Start making mfcc')
os.makedirs(txtdir, exist_ok=True)
os.makedirs(mfccdir, exist_ok=True)
training_list   = []
testing_list    = []
validation_list = []
result = {}

if os.path.exists(mfccdir+'mfcc.pkl'):
    # Skip the work when the features already exist. This must not call
    # exit(): inside a notebook that tears down the kernel and discards every
    # definition made by the other cells.
    logging.info('mfcc data is already prepared')
else:
    for command in datalist:
        for wavfile in os.listdir(wavedir+command):
            # Split into "training", "validation" and "testing" (10% each for
            # the latter two), based on a hash of the file name.
            partition = which_set(wavfile, 10, 10)
            filepath = command+'/'+wavfile
            if partition == 'training':
                training_list.append(filepath)
            elif partition == 'testing':
                testing_list.append(filepath)
            elif partition == 'validation':
                validation_list.append(filepath)

    # Randomly keep only the configured share of the training and validation
    # partitions; the testing partition is kept in full.
    training_list = random.sample(training_list, int(len(training_list)*(training_userate / 100.0)))
    validation_list = random.sample(validation_list, int(len(validation_list)*(validation_userate / 100.0)))

    for filename in training_list + testing_list + validation_list:
        # NOTE: the sampling rate returned by sf.read is not used; the
        # extractor is told SAMPLING_RATE for every file.
        audio, sr = sf.read(wavedir+filename)
        # The signal is passed by keyword because recent librosa releases no
        # longer accept it as a positional argument.
        mfcc      = librosa.feature.mfcc(y=audio, sr=SAMPLING_RATE, n_mfcc=MFCC_DIM, n_fft=400, hop_length=160)  # extract mfcc features
        mfcc      = np.asarray(mfcc, dtype=np.float32)  # convert to np.float32
        # Store the transpose, i.e. one row per frame.
        result[filename] = mfcc.T

    with open(txtdir+'train.txt', 'w') as f:
        f.write('\n'.join(training_list))
    with open(txtdir+'eval.txt', 'w') as f:
        f.write('\n'.join(testing_list))
    with open(txtdir+'valid.txt', 'w') as f:
        f.write('\n'.join(validation_list))

    with open(mfccdir+'mfcc.pkl', 'wb') as f:
        pickle.dump(result, f)

logging.info('Done')

### モデルの定義

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Classifier(nn.Module):
    """Audio classifier: stacked bidirectional LSTM encoder plus a linear layer.

    The encoder reads the input sequence; the final hidden states of the two
    directions of its last layer are concatenated into a context vector, and
    the linear layer maps that vector to one score per class.

    Args:
        in_size:     Input feature dimension (typically the MFCC dimension).
        num_class:   Number of classes to infer.
        hidden_size: Number of LSTM units per direction. The encoder is
                     bidirectional, so the context vector has 2*hidden_size
                     entries.
        num_stack:   Number of stacked LSTM layers in the encoder.
        dropout:     Dropout ratio handed to the LSTM (a regulariser against
                     overfitting).
    """

    def __init__(self, in_size, num_class, hidden_size, num_stack, dropout):
        super().__init__()
        self.encoder = nn.LSTM(
            in_size,
            hidden_size,
            num_stack,
            batch_first=True,
            dropout=dropout,
            bidirectional=True,
        )
        self.inferring = nn.Linear(2 * hidden_size, num_class)

    def forward(self, inputs):
        """Return the class scores computed from the encoder's context vector."""
        _, (hidden, _) = self.encoder(inputs)

        # The last two entries of the hidden-state stack are concatenated
        # along the feature axis to form the context vector.
        second_last, last = hidden[-2], hidden[-1]
        context = torch.cat([second_last, last], dim=1)
        return self.inferring(context)

### （その他関数定義）

In [0]:
import torch
import numpy as np
import random
import os
import pickle

command_list   = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go']

def Apply_cmvn(inputs, eps=1e-8): # apply cepstral mean and variance normalization
    """Normalise each feature dimension of every sequence to zero mean, unit variance.

    The statistics are computed over axis 1 of ``inputs`` separately for each
    batch entry and feature dimension. All positions along axis 1 take part,
    so any zero padding present in ``inputs`` is included in the statistics.

    Args:
        inputs: Tensor of shape (batch, time, dim).
        eps:    Lower bound applied to the standard deviation before dividing.
                It keeps the result finite when a feature is constant over
                time; such a feature is mapped to zeros instead of NaN.

    Returns:
        Tensor with the same shape as ``inputs``.
    """
    mu = torch.mean(inputs, dim=1, keepdim=True)
    variance = torch.mean(torch.pow(inputs, 2), dim=1, keepdim=True) - torch.pow(mu, 2)
    # E[x^2] - E[x]^2 can come out slightly negative through rounding, which
    # would turn the square root into NaN.
    sigma = torch.sqrt(torch.clamp(variance, min=0.0))
    return (inputs - mu) / torch.clamp(sigma, min=eps)

def insert_index_descending_order(query, num_list):
    """Return the position at which ``query`` keeps ``num_list`` in descending order.

    The result is the index of the first element that is strictly smaller
    than ``query``; when there is no such element the result is
    ``len(num_list)``, i.e. the position just past the end. Elements equal to
    ``query`` are skipped, so a new value goes after existing equal ones.
    """
    for position, value in enumerate(num_list):
        if value < query:
            return position
    return len(num_list)
    
# Cache of the pickled MFCC dictionary, shared by every generator so that the
# file is read only once.
mfcc_dict = None

def Batch_generator(mfcc_root, dataset, batch_size):
    """Endlessly yield padded mini-batches sorted by descending sequence length.

    Args:
        mfcc_root:  Directory containing 'mfcc.pkl', a pickled dict that maps
                    "<command>/<file>" to an array of shape (frames, dim).
                    Only read on the first call; later calls reuse the cache.
        dataset:    Path of a text file listing one "<command>/<file>" per line.
        batch_size: Number of samples per batch.

    Yields:
        Tuples ``(data_batch, label_batch, length_batch, epoch)``:
          data_batch   -- float32 array (batch_size, max_len, dim), zero padded
          label_batch  -- int64 array with indices into ``command_list``
          length_batch -- list of unpadded lengths, in descending order (as
                          required by pack_padded_sequence)
          epoch        -- 1-based counter; it is increased on the batch after
                          which fewer than ``batch_size`` unused samples are
                          left, and those leftovers are consumed first in the
                          next pass

    Raises:
        ValueError: when the first batch is requested, if ``batch_size`` is
            not positive or the list file contains no entries.
    """
    global mfcc_dict
    if mfcc_dict is None:
        # NOTE: unpickling can execute arbitrary code; only load feature files
        # produced by the extraction step of this notebook.
        with open(mfcc_root+'/mfcc.pkl', 'rb') as f:
            mfcc_dict = pickle.load(f)

    if batch_size < 1:
        raise ValueError('batch_size must be positive, got {}'.format(batch_size))

    with open(dataset, 'r') as datalist_txt:
        datalist = [line for line in datalist_txt.read().strip().split('\n') if line]
    if not datalist:
        raise ValueError('no samples listed in {}'.format(dataset))

    shuffled_data = random.sample(datalist, len(datalist))
    # A list shorter than one batch is repeated until a batch can be filled.
    while len(shuffled_data) < batch_size:
        shuffled_data = random.sample(datalist, len(datalist)) + shuffled_data
    epoch         = 1

    while True:
        # Pop the samples of this batch from the end of the shuffled pool.
        samples = [shuffled_data.pop() for _ in range(batch_size)]
        # Stable sort by descending length: samples of equal length keep the
        # order in which they were drawn.
        samples.sort(key=lambda name: len(mfcc_dict[name]), reverse=True)

        features     = [mfcc_dict[name] for name in samples]
        length_batch = [len(mfcc) for mfcc in features]
        label_batch  = np.asarray([command_list.index(name.split('/')[0]) for name in samples], dtype=np.int64)

        # Zero-pad every sequence up to the longest one (the first after sorting).
        data_batch = np.zeros((batch_size, length_batch[0], features[0].shape[1]), dtype=np.float32)
        for row, mfcc in enumerate(features):
            data_batch[row, :len(mfcc)] = mfcc

        if len(shuffled_data) < batch_size: # not enough samples left for another batch
            # Put a fresh shuffle in front of the leftovers; since samples are
            # popped from the end, the leftovers are used first.
            while len(shuffled_data) < batch_size:
                shuffled_data = random.sample(datalist, len(datalist)) + shuffled_data
            epoch        += 1

        yield data_batch, label_batch, length_batch, epoch

### モデルの学習（数分～数時間）

In [0]:
import numpy as np
import sys
import os
import logging
import argparse
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import time

# Train the classifier, validate it once per epoch, then save the model, an
# epoch/accuracy plot and the elapsed training time.
start_time = time.time()

logging.basicConfig(level=logging.INFO)

DATA_ROOT = "data"
MODEL_DIR = "model"

IN_SIZE       = 13       # input feature dimension per frame
NUM_CLASS     = 10       # number of classes to infer
HIDDEN_SIZE   = 128      # LSTM units per direction
NUM_STACK     = 5        # number of stacked LSTM layers
DROPOUT       = 0.5
USE_CMVN      = False    # apply Apply_cmvn to every batch when True
MAX_ITERATION = 1000000  # upper bound on the number of training batches
MAX_EPOCH     = 100      # training stops once this many epochs are finished
BATCH_SIZE    = 256
MFCC_ROOT     = DATA_ROOT + "/mfcc"
TRAIN_LIST    = DATA_ROOT + "/train.txt"
VALID_LIST    = DATA_ROOT + "/valid.txt"
SAVE_FILE     = MODEL_DIR + "/trained.model"

os.makedirs(MODEL_DIR, exist_ok=True)

# Build up model and batch generator
device      = 'cuda' if torch.cuda.is_available() else 'cpu'  # check available gpu
model       = Classifier(IN_SIZE, NUM_CLASS, HIDDEN_SIZE, NUM_STACK, DROPOUT).to(device) # build up model
loss_fun    = nn.CrossEntropyLoss() # cross entropy as loss function (objective function)
optimizer   = torch.optim.Adam(model.parameters()) # Adam optimizer (others can be tried as well)
batch_train = Batch_generator(MFCC_ROOT, TRAIN_LIST, BATCH_SIZE) # batch generator
batch_valid = Batch_generator(MFCC_ROOT, VALID_LIST, BATCH_SIZE)

# print out settings
logging.info('Batch_size: {}'.format(BATCH_SIZE))
logging.info('Max epoch: {}'.format(MAX_EPOCH))
logging.info('Max iteration: {}'.format(MAX_ITERATION))
logging.info('Hidden size: {}'.format(HIDDEN_SIZE))
logging.info('Num stack: {}'.format(NUM_STACK))
logging.info('Use cmvn: {}'.format(USE_CMVN))

# Training part
now_epoch   = 1
total_num   = 0 # number of training samples seen in the current epoch
correct_num = 0 # number of correct predictions among them
acc_plt = []
epoch_plt = []
for iteration in range(1, MAX_ITERATION+1):
    model.train() # training mode (enables dropout)
    inputs, labels, lengths, epoch = next(batch_train) # generate next batch
    inputs  = torch.from_numpy(inputs).to(device)
    if USE_CMVN:
        inputs  = Apply_cmvn(inputs)
    # pack the padded sequence so that the LSTM skips the padding
    inputs  = nn.utils.rnn.pack_padded_sequence(inputs, lengths, batch_first=True)
    labels  = torch.from_numpy(labels).to(device)

    outputs = model(inputs)
    loss    = loss_fun(outputs, labels) # compute loss
    optimizer.zero_grad() # clear the gradients of all optimized tensors
    loss.backward() # gradient backpropagation
    optimizer.step() # update parameters

    total_num   += len(outputs)
    correct_num += (torch.argmax(outputs, dim=1) == labels).sum().item()

    if now_epoch < epoch: # this iteration was the last one of an epoch
        now_epoch     = epoch
        correct_num_v = 0
        total_num_v   = 0

        # validation: run the validation generator until it reports the same
        # epoch number as the training generator
        model.eval()
        with torch.no_grad():
            while True:
                inputs, labels, lengths, valid_epoch = next(batch_valid)
                inputs  = torch.from_numpy(inputs).to(device)
                if USE_CMVN:
                    inputs  = Apply_cmvn(inputs)
                inputs  = nn.utils.rnn.pack_padded_sequence(inputs, lengths, batch_first=True)
                labels  = torch.from_numpy(labels).to(device)
                outputs = model(inputs)
                total_num_v   += len(outputs)
                correct_num_v += (torch.argmax(outputs, dim=1) == labels).sum().item()
                # ">=" rather than "==" so that the loop still terminates if
                # the counter ever moves past the expected value.
                if valid_epoch >= now_epoch: break

        train_acc = correct_num / total_num
        valid_acc = correct_num_v / total_num_v
        if (now_epoch-1) % 10 == 0:
            logging.info('[epoch {}, accuracy] training: {:.04f}, validation: {:.04f}'.format(
                now_epoch-1, train_acc, valid_acc)) # print validation score
        acc_plt.append(round(valid_acc, 4))
        epoch_plt.append(now_epoch-1)
        correct_num = 0
        total_num   = 0

    if MAX_EPOCH < now_epoch:
        break

# Save the trained model first so that a plotting problem cannot lose it.
torch.save(model.state_dict(), SAVE_FILE)

# plot graph epoch-accuracy.jpg
# this shows how the validation accuracy improved as training went on
plt.figure()
plt.plot(epoch_plt, acc_plt, "ro-")
plt.xlabel("epoch number")
plt.ylabel("accuracy")
plt.savefig("epoch-accuracy.jpg")
plt.close()

logging.info('done')

elapsed_time = time.time() - start_time
with open('train.time.log', 'w') as f:
    f.write('training time = {} (sec)\n'.format(int(elapsed_time)))

### モデルの評価（数秒程度）

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import sys
import os
import logging
import argparse

# Evaluate the trained model on the evaluation list and write the accuracy to
# score.txt.
logging.basicConfig(level=logging.INFO)

# These settings must describe the same network structure as the one that was
# trained, otherwise the saved parameters cannot be loaded.
IN_SIZE       = 13
NUM_CLASS     = 10
HIDDEN_SIZE   = 128
NUM_STACK     = 5
USE_CMVN      = False
BATCH_SIZE    = 256
PARAM_FILE    = MODEL_DIR + "/trained.model"
EVAL_LIST     = DATA_ROOT + "/eval.txt"
MFCC_ROOT     = DATA_ROOT + "/mfcc"

# Build up model and batch generator
device      = 'cuda' if torch.cuda.is_available() else 'cpu'   # check available gpu
model       = Classifier(IN_SIZE, NUM_CLASS, HIDDEN_SIZE, NUM_STACK, 0.0).to(device) # build model (same structure as trained model)
# map_location lets parameters that were saved on a GPU be loaded on a
# CPU-only runtime as well.
model.load_state_dict(torch.load(PARAM_FILE, map_location=device)) # load parameters from trained model
batch_test  = Batch_generator(MFCC_ROOT, EVAL_LIST, BATCH_SIZE) # data batch generator for evaluation data

# Print out setting
logging.info('Batch_size: {}'.format(BATCH_SIZE))
logging.info('Hidden size: {}'.format(HIDDEN_SIZE))
logging.info('Num stack: {}'.format(NUM_STACK))
logging.info('Use cmvn: {}'.format(USE_CMVN))

# Evaluation part
model.eval()
total_num   = 0 # number of evaluated samples
correct_num = 0 # number of correct predictions
with torch.no_grad(): # disable gradient calculation, reduce memory consumption
    while True:
        inputs, labels, lengths, epoch = next(batch_test) # generate a data batch
        inputs  = torch.from_numpy(inputs).to(device)
        if USE_CMVN:
            inputs  = Apply_cmvn(inputs)
        # pack the padded sequence so that the LSTM skips the padding
        inputs  = nn.utils.rnn.pack_padded_sequence(inputs, lengths, batch_first=True)
        labels  = torch.from_numpy(labels).to(device)
        outputs = model(inputs)
        total_num   += len(outputs)
        correct_num += (torch.argmax(outputs, dim=1) == labels).sum().item()
        # NOTE: the generator reports epoch 2 together with the last full
        # batch of its first pass, so a final partial batch is not scored.
        if epoch >= 2: break

score = correct_num / total_num
logging.info('accuracy: {:.04f}'.format(score))
with open('score.txt', 'w') as f:
    f.write('recognition rate = {:.1%}\n'.format(score))

logging.info('done')

### 結果の出力

In [0]:
# Show the elapsed training time and the recognition rate that the training
# and evaluation cells wrote to these files.
!cat train.time.log
!cat score.txt