In [1]:
import glob
import random
import os
import pickle

from scipy.io import wavfile
from scipy import signal
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn.functional as F
from torch import nn
from torch import optim
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

%matplotlib inline

In [2]:
# Wav paths to leave out of the training set; GoogleVoiceDataset subtracts
# this collection from the globbed file set.
# NOTE: unpickling can execute arbitrary code -- only load a trusted file.
with open('bad_wav_files', mode='rb') as f:
    bad_wav_files = pickle.loads(f.read())

In [3]:
# Root directory of the speech dataset; passed as `root` to GoogleVoiceDataset.
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
ROOT = '/home/wilsonyan/data/speech'

In [4]:
class GoogleVoiceDataset(Dataset):
    """Dataset of labelled ``.wav`` clips stored as ``<root>/<label>/<clip>.wav``.

    ``root`` must also contain ``validation_list.txt`` and
    ``testing_list.txt`` with one clip path per line. Entries are resolved
    against ``root`` (absolute entries are kept as they are) so they can be
    compared with, and removed from, the globbed training files.

    Args:
        root: Dataset root directory.
        preprocessor: Callable ``(samples, rate, window_size) -> ndarray``
            applied to every clip that is read.
        mode: ``'train'``, ``'val'`` or ``'test'``; selects the active split.
        window_size: Forwarded unchanged to ``preprocessor``.
        bad_files: Optional iterable of wav paths to drop from the training
            split. Defaults to the notebook-level ``bad_wav_files``.

    Raises:
        ValueError: If ``mode`` is not one of the three supported values.
    """

    # Noise clips (relative to ``root``) that are never used for training.
    EXCLUDED_NOISE_FILES = (
        os.path.join('_background_noise_', 'white_noise.wav'),
        os.path.join('_background_noise_', 'pink_noise.wav'),
    )

    def __init__(self, root, preprocessor, mode='train', window_size=0.02,
                 bad_files=None):
        super(GoogleVoiceDataset, self).__init__()
        self.root = root
        self.window_size = window_size
        self.preprocessor = preprocessor

        if bad_files is None:
            # Fall back to the collection loaded at notebook level.
            bad_files = bad_wav_files

        self.valset = self._read_split('validation_list.txt')
        self.testset = self._read_split('testing_list.txt')

        excluded = set(os.path.join(root, rel) for rel in self.EXCLUDED_NOISE_FILES)
        # '*/*.wav' matches the same files as the former '*/**.wav' pattern:
        # '**' is only recursive when it is a whole path component.
        found = set(glob.glob(os.path.join(root, '*', '*.wav')))
        held_out = set(self.valset) | set(self.testset) | set(bad_files) | excluded
        # Sorted so the index -> file mapping is reproducible across runs.
        self.trainset = sorted(found - held_out)
        self.set_mode(mode)

        self.label_id_map = {label: i for i, label in enumerate(self.get_labels())}
        self.id_label_map = {v: k for k, v in self.label_id_map.items()}
        self.n_classes = len(self.label_id_map)

    def _read_split(self, list_name):
        """Return the non-empty lines of ``<root>/<list_name>`` joined onto root."""
        with open(os.path.join(self.root, list_name), 'r') as f:
            entries = (line.strip() for line in f)
            # os.path.join keeps an already-absolute entry unchanged.
            return [os.path.join(self.root, entry) for entry in entries if entry]

    def set_mode(self, mode):
        """Select which split ``__len__`` and ``__getitem__`` operate on."""
        if mode == 'train':
            self.currentset = self.trainset
        elif mode == 'val':
            self.currentset = self.valset
        elif mode == 'test':
            self.currentset = self.testset
        else:
            raise ValueError('mode must be train, val, or test')
        self.mode = mode

    def get_labels(self):
        """Return the names of the sub-directories of ``root``, sorted.

        Sorting keeps label ids stable between runs and between dataset
        instances; ``os.listdir`` order is arbitrary.
        """
        return sorted(
            entry for entry in os.listdir(self.root)
            if os.path.isdir(os.path.join(self.root, entry))
        )

    def get_parent_dir(self, filepath):
        """Return the name of the directory that directly contains ``filepath``."""
        return os.path.basename(os.path.dirname(filepath))

    def __len__(self):
        return len(self.currentset)

    def __getitem__(self, index):
        """Read clip ``index`` of the active split.

        Returns:
            ``(x, y)``: the preprocessed clip as a ``torch.FloatTensor`` and
            the integer id of its label (its parent directory name).

        Raises:
            IndexError: If ``index`` is past the end of the active split.
        """
        if index >= len(self):
            raise IndexError('index: %s out of bounds for size %s' % (index, len(self)))
        fname = self.currentset[index]
        label = self.get_parent_dir(fname)
        rate, data = wavfile.read(fname)

        x = torch.FloatTensor(self.preprocessor(data, rate, self.window_size).astype(float))
        y = self.label_id_map[label]
        return x, y

In [5]:
class RNN(nn.Module):
    """Single-layer GRU encoder followed by a three-layer MLP classifier.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the GRU hidden state.
        n_classes: Number of output scores per sample.
    """

    def __init__(self, input_size, hidden_size, n_classes):
        super(RNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_classes = n_classes
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, n_classes)
        )

    def forward(self, x):
        """Classify a batch of sequences.

        Args:
            x: Tensor of shape ``(batch, n_steps, input_size)``.

        Returns:
            Tensor of shape ``(batch, n_classes)`` with unnormalised scores.
        """
        # Omitting h_0 makes the GRU start from zeros with the correct
        # (num_layers, batch, hidden) layout and on x's device. The previous
        # (batch, 1, hidden) tensor was rejected for any batch size != 1,
        # because batch_first does not apply to the hidden state.
        _, final_hidden = self.gru(x)
        # final_hidden is (1, batch, hidden); drop the layer axis.
        return self.classifier(final_hidden.squeeze(0))

In [6]:
def get_set_accuracy(model, loader):
    """Return the fraction of samples in ``loader`` that ``model`` classifies correctly.

    Args:
        model: Callable mapping an input batch to ``(batch, n_classes)`` scores.
        loader: Iterable yielding ``(x, y)`` batches.

    Returns:
        Accuracy as a float in ``[0, 1]``; ``0.0`` if the loader yields nothing.
    """
    correct, total = 0, 0
    # Iterate the batches directly: enumerate() would yield (index, batch)
    # pairs and break the (x, y) unpacking.
    for x, y in loader:
        x, y = Variable(x), Variable(y)
        score = model(x)
        _, y_pred = torch.max(score, 1)
        correct += int(y_pred.eq(y).sum())
        # Count the samples actually seen instead of relying on a global dataset.
        total += y.size(0)
    return correct / total if total else 0.0

In [7]:
def train_model(model, optimizer, criterion, loader_train, loader_val, num_epochs=100):
    """Train ``model`` for ``num_epochs`` passes over ``loader_train``.

    Prints the mean batch loss and the training accuracy after every epoch.

    Args:
        model: Module mapping an input batch to ``(batch, n_classes)`` scores.
        optimizer: Optimizer that updates ``model``'s parameters.
        criterion: Loss callable taking ``(score, y)``.
        loader_train: Iterable yielding ``(x, y)`` training batches.
        loader_val: Currently unused; validation accuracy is not computed.
        num_epochs: Number of passes over ``loader_train``.

    Returns:
        The same ``model`` object, updated in place.
    """
    for epoch in range(num_epochs):
        print('Epoch %s' % epoch)
        print('=' * 40)

        losses = []
        correct_train, total = 0, 0
        for x, y in loader_train:
            x, y = Variable(x), Variable(y)
            score = model(x)
            _, y_pred = torch.max(score, 1)
            # Accumulate plain Python numbers; indexing a 0-dim result with
            # `.data[0]` fails on PyTorch >= 0.4.
            correct_train += int(y_pred.eq(y).sum())
            total += x.size(0)
            loss = criterion(score, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            losses.append(float(loss))

        # Guard against an empty loader so the epoch summary still prints.
        train_acc = correct_train / total if total else 0.0
        mean_loss = np.mean(losses) if losses else float('nan')
        # TODO: report validation accuracy on loader_val.
        print('Loss: %s, train_acc: %s' % (mean_loss, train_acc))
    return model

In [8]:
def partition_sequence(sequence, rate, window_size, min_len=16000):
    """Split a 1-D sample sequence into equal, non-overlapping windows.

    The sequence is zero-padded at the end to at least ``min_len`` samples
    and then up to the next multiple of the window length, so every row of
    the result has the same width.

    Args:
        sequence: 1-D array-like of samples.
        rate: Sampling rate in samples per second.
        window_size: Window duration in seconds.
        min_len: Minimum number of samples after padding.

    Returns:
        A new 2-D array of shape ``(n_windows, int(rate * window_size))``.

    Raises:
        ValueError: If ``rate * window_size`` is less than one sample.
    """
    sequence = np.asarray(sequence)
    partition_len = int(rate * window_size)
    if partition_len <= 0:
        raise ValueError('rate * window_size must be at least one sample')

    target_len = max(len(sequence), min_len)
    remainder = target_len % partition_len
    if remainder:
        # Round up so the last window is full; a short last window would
        # make the rows ragged and impossible to stack.
        target_len += partition_len - remainder

    padded = np.zeros(target_len, dtype=sequence.dtype)
    padded[:len(sequence)] = sequence
    return padded.reshape(-1, partition_len)

In [9]:
# Number of clips per mini-batch, shared by the training and validation loaders.
BATCH_SIZE = 128

In [10]:
# Training split and its loader.
dset_train = GoogleVoiceDataset(root=ROOT, preprocessor=partition_sequence, mode='train')
loader_train = DataLoader(dset_train, batch_size=BATCH_SIZE, shuffle=True)

# Validation split and its loader.
dset_val = GoogleVoiceDataset(root=ROOT, preprocessor=partition_sequence, mode='val')
loader_val = DataLoader(dset_val, batch_size=BATCH_SIZE, shuffle=True)

In [None]:
# GRU classifier over 320-feature steps with a 256-unit hidden state, one
# output score per dataset label; trained with Adam on cross-entropy loss.
model = RNN(
    input_size=320,
    hidden_size=256,
    n_classes=dset_train.n_classes,
)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters())

In [None]:
# Run the training loop with its default number of epochs.
model = train_model(
    model,
    optimizer,
    criterion,
    loader_train=loader_train,
    loader_val=loader_val,
)

Epoch 0


In [None]:
# dset_test = GoogleVoiceDataset(ROOT, partition_sequence, mode='test')
# loader_test = DataLoader(dset_test, shuffle=True, batch_size=BATCH_SIZE)
# print('test_acc: %s' % (get_set_accuracy(model, loader_test)))