In [1]:
# Import packages

import os
import sys
import h5py
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torchvision import datasets
from torchvision import transforms

In [2]:
# Download drive file

import requests

def download_file_from_google_drive(id, destination):
    """Download a shared Google Drive file and write it to a local path.

    A first request is sent with only the file id. If that response carries
    a confirmation token (see ``get_confirm_token``), the request is repeated
    with the token before the body is streamed to disk.

    Args:
        id: Google Drive file id. The name shadows the builtin ``id`` but is
            kept so existing keyword callers continue to work.
        destination: Path of the local file to write.

    Raises:
        requests.HTTPError: If a request returns a 4xx/5xx status, so that an
            error page is never saved to ``destination`` as if it were the file.
    """
    URL = "https://docs.google.com/uc?export=download"

    # The session carries cookies from the first request over to the second;
    # using it as a context manager closes it once the download is done.
    with requests.Session() as session:
        response = session.get(URL, params={'id': id}, stream=True)
        response.raise_for_status()
        token = get_confirm_token(response)

        if token:
            # Drop the unread first response before asking again with the token.
            response.close()
            params = {'id': id, 'confirm': token}
            response = session.get(URL, params=params, stream=True)
            response.raise_for_status()

        save_response_content(response, destination)

def get_confirm_token(response):
    """Return the value of the first ``download_warning*`` cookie.

    Args:
        response: Object exposing a ``cookies`` mapping with ``items()``.

    Returns:
        The value of the first cookie whose name starts with
        ``'download_warning'``, or ``None`` when no cookie matches.
    """
    matching_values = (
        cookie_value
        for cookie_name, cookie_value in response.cookies.items()
        if cookie_name.startswith('download_warning')
    )
    return next(matching_values, None)

def save_response_content(response, destination):
    """Write the body of a streamed response to ``destination``.

    Args:
        response: Object exposing ``iter_content(chunk_size)``.
        destination: Path of the file to create or overwrite (binary mode).
    """
    CHUNK_SIZE = 32768  # value passed to response.iter_content

    with open(destination, "wb") as out_file:
        # Empty chunks (keep-alive markers) are skipped rather than written.
        out_file.writelines(
            piece for piece in response.iter_content(CHUNK_SIZE) if piece
        )

In [3]:
# Download and unpack data
# DATA_PATH keeps its trailing slash because file names are appended to it
# with plain string concatenation (here and in later cells).
DATA_PATH = 'packed_features/'

# bal_train.h5 is used as the marker that the archive has already been
# fetched and extracted, so re-running this cell skips the download.
if not os.path.exists(DATA_PATH + 'bal_train.h5'):
    download_file_from_google_drive('0B49XSFgf-0yVQk01eG92RHg4WTA', 'packed_features.zip')
    # IPython shell escape: extracts into the current working directory.
    # NOTE(review): presumably the archive contains a top-level
    # packed_features/ folder matching DATA_PATH -- confirm.
    !unzip packed_features.zip

# List the data directory so the available .h5 files are visible in the output.
!ls -la packed_features

total 3700516
drwxr-xr-x 2 ec2-user ec2-user       4096 Sep 14  2017 .
drwxr-xr-x 8 ec2-user ec2-user       4096 Dec  8 01:51 ..
-rw-r--r-- 1 ec2-user ec2-user   40289024 Sep 13  2017 bal_train.h5
-rw-r--r-- 1 ec2-user ec2-user   37036622 Sep 13  2017 eval.h5
-rw-r--r-- 1 ec2-user ec2-user        644 Sep 14  2017 README
-rw-r--r-- 1 ec2-user ec2-user        653 Sep 14  2017 README~
-rw-r--r-- 1 ec2-user ec2-user 3711974546 Sep 13  2017 unbal_train.h5


In [4]:
# Examine the data

def load_data(hdf5_path):
    """Read the ``x``, ``y`` and ``video_id_list`` entries of an HDF5 file.

    Args:
        hdf5_path: Path of the HDF5 file, opened read-only.

    Returns:
        A tuple ``(x, y, video_id_list)`` where ``x`` is a numpy array and
        ``y`` and ``video_id_list`` are Python lists built by iterating the
        corresponding datasets.
    """
    with h5py.File(hdf5_path, 'r') as hf:
        # Everything is materialised while the file is still open.
        features_ds = hf.get('x')
        labels_ds = hf.get('y')
        ids_ds = hf.get('video_id_list')

        features = np.array(features_ds)
        labels = list(labels_ds)
        video_ids = list(ids_ds)

    return features, labels, video_ids

def uint8_to_float32(x):
    """Convert ``x`` to float32 and rescale it as ``(x - 128) / 128``.

    With inputs in 0..255 the result lies in [-1, 0.9921875].
    """
    centered = np.float32(x) - 128.
    return centered / 128.
    
def bool_to_float32(y):
    """Return ``y`` converted to float32 via ``np.float32``."""
    as_float = np.float32(y)
    return as_float

# Load the balanced training split and convert it to float32 for inspection.
bal_train_path = DATA_PATH + 'bal_train.h5'
x, y, video_id_list = load_data(bal_train_path)
x, y = (
    uint8_to_float32(x),  # shape: (N, 10, 128)
    bool_to_float32(y),   # shape: (N, 527)
)
print(x, y)

[[[-0.2578125 -0.046875   0.1484375 ... -0.984375   0.484375  -1.       ]
  [-0.0390625 -0.1484375  0.0703125 ... -0.0390625  0.09375    0.9375   ]
  [-0.59375    0.0546875  0.1875    ... -1.         0.9921875 -1.       ]
  ...
  [-0.1328125 -0.0546875 -0.1640625 ... -1.         0.9921875  0.9921875]
  [-0.40625   -0.2421875  0.3125    ... -0.6796875  0.9921875  0.9921875]
  [-0.484375   0.125     -0.171875  ... -0.2421875  0.65625   -1.       ]]

 [[-0.375      0.4140625  0.6875    ...  0.6171875 -0.0390625  0.8203125]
  [-0.578125   0.9921875  0.3671875 ...  0.9921875  0.6015625 -0.5703125]
  [-0.03125    0.203125   0.2890625 ... -0.46875    0.2890625 -1.       ]
  ...
  [-0.34375    0.21875    0.15625   ... -1.        -0.484375   0.3359375]
  [-0.390625  -0.1484375  0.1640625 ... -0.9609375  0.703125   0.9921875]
  [-0.4375    -0.03125    0.21875   ...  0.0625     0.4140625  0.8359375]]

 [[-0.1640625  0.171875  -0.3046875 ...  0.9921875  0.9921875 -0.6328125]
  [ 0.34375    0.42968

In [6]:
# Import audio classifier

# Put the 'audioset-classify' directory (relative to the current working
# directory) on the module search path before the local imports below.
# NOTE(review): appending on every run of this cell adds duplicate entries
# to sys.path; harmless, but not idempotent.
sys.path.append('audioset-classify')

import time
import math
import logging
from sklearn import metrics
import importlib

# NOTE(review): `utils` and `core` are presumably the modules inside
# 'audioset-classify' made importable by the sys.path entry above -- confirm.
from utils import utilities, data_generator
import core

# `cPickle` only exists on Python 2; on Python 3 the C pickle implementation
# is `_pickle`. Only a failed import should trigger the fallback, so catch
# ImportError rather than BaseException (which would also swallow
# KeyboardInterrupt and SystemExit).
try:
    import cPickle
except ImportError:
    import _pickle as cPickle

In [7]:
# Define models
# https://github.com/IBM/audioset-classification/blob/master/audioset_classify

def init_layer(layer):
    """Initialise a layer's weight with He-uniform values and zero its bias.

    The weight is filled in place from U(-b, b) with
    ``b = sqrt(3) * sqrt(2 / fan_in)``, where ``fan_in`` is the product of
    every weight dimension except the first (output) one. For a 2-D weight
    this is ``n_in``; for a 4-D weight it is ``n_in * height * width``.
    Other ranks >= 2 (e.g. 3-D or 5-D convolution weights) follow the same
    rule instead of failing on an unbound variable.

    Args:
        layer: Object with a ``weight`` tensor of at least two dimensions and
            a ``bias`` attribute that is either a tensor or ``None``.

    Raises:
        ValueError: If ``layer.weight`` has fewer than two dimensions, in
            which case no fan-in can be derived.
    """
    weight = layer.weight
    n_dims = weight.ndimension()
    if n_dims < 2:
        raise ValueError(
            'init_layer expects a weight with at least 2 dimensions, '
            'got {}'.format(n_dims))

    # fan_in = n_in * (product of any kernel dimensions)
    n = 1
    for dim_size in weight.size()[1:]:
        n *= dim_size

    std = math.sqrt(2. / n)
    scale = std * math.sqrt(3.)
    weight.data.uniform_(-scale, scale)

    if layer.bias is not None:
        layer.bias.data.fill_(0.)


def init_bn(bn):
    """Set every element of a batch-norm layer's ``weight`` to 1, in place.

    Only ``bn.weight`` is touched; the bias and running statistics are left
    as they are.
    """
    scale = bn.weight.data
    scale.fill_(1.)


class EmbeddingLayers(nn.Module):
    """Stack of three bias-free 1x1 convolutions with batch norm, ReLU and dropout.

    The input is batch-normalised over its frequency bins (``bn0``) and then
    passed through three identical stages: 1x1 ``Conv2d`` -> ``BatchNorm2d``
    -> ReLU -> dropout.
    """

    def __init__(self, freq_bins, hidden_units, drop_rate):
        """Build the layers and initialise their weights.

        Args:
            freq_bins: Number of input channels of the first convolution.
            hidden_units: Number of output channels of every convolution.
            drop_rate: Dropout probability used after each stage.
        """
        super(EmbeddingLayers, self).__init__()

        self.freq_bins = freq_bins
        self.hidden_units = hidden_units
        self.drop_rate = drop_rate

        def pointwise_conv(n_in):
            # 1x1 convolution without bias, always producing hidden_units channels
            return nn.Conv2d(
                in_channels=n_in, out_channels=hidden_units,
                kernel_size=(1, 1), stride=(1, 1), padding=(0, 0), bias=False)

        self.conv1 = pointwise_conv(freq_bins)
        self.conv2 = pointwise_conv(hidden_units)
        self.conv3 = pointwise_conv(hidden_units)

        self.bn0 = nn.BatchNorm2d(freq_bins)
        self.bn1 = nn.BatchNorm2d(hidden_units)
        self.bn2 = nn.BatchNorm2d(hidden_units)
        self.bn3 = nn.BatchNorm2d(hidden_units)

        self.init_weights()

    def init_weights(self):
        """Apply ``init_layer`` to the convolutions and ``init_bn`` to the norms."""
        for conv in (self.conv1, self.conv2, self.conv3):
            init_layer(conv)

        for bn in (self.bn0, self.bn1, self.bn2, self.bn3):
            init_bn(bn)

    def forward(self, input, return_layers=False):
        """Embed a batch of feature sequences.

        Args:
            input: Tensor of shape (samples_num, time_steps, freq_bins).
            return_layers: When exactly ``False`` only the final embedding is
                returned; otherwise all intermediate activations are.

        Returns:
            The embedding of shape (samples_num, hidden_units, time_steps, 1),
            or the list ``[a0, a1, a2, emb]`` of the normalised input and the
            output of each stage.
        """
        drop_rate = self.drop_rate

        # (samples_num, time_steps, freq_bins)
        #   -> (samples_num, freq_bins, time_steps)
        #   -> (samples_num, freq_bins, time_steps, 1), the layout Conv2d needs
        x = input.transpose(1, 2).unsqueeze(3).contiguous()

        activations = [self.bn0(x)]
        stages = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
        )
        for conv, bn in stages:
            rectified = F.relu(bn(conv(activations[-1])))
            activations.append(
                F.dropout(rectified, p=drop_rate, training=self.training))

        if return_layers is False:
            # (samples_num, hidden_units, time_steps, 1)
            return activations[-1]

        return activations


class DecisionLevelAveragePooling(nn.Module):
    """Classifier that averages per-time-step sigmoid predictions.

    Each time step is embedded by ``EmbeddingLayers``, mapped to
    ``classes_num`` sigmoid outputs by a shared linear layer, and the
    outputs are averaged over the time axis.
    """

    def __init__(self, freq_bins, classes_num, hidden_units, drop_rate):
        """Build the embedding stack and the final linear layer.

        Args:
            freq_bins: Size of the last input dimension.
            classes_num: Number of output classes.
            hidden_units: Width of the embedding fed to the final layer.
            drop_rate: Dropout probability passed to ``EmbeddingLayers``.
        """
        super(DecisionLevelAveragePooling, self).__init__()

        self.emb = EmbeddingLayers(freq_bins, hidden_units, drop_rate)
        self.fc_final = nn.Linear(hidden_units, classes_num)

        # init_weights was defined but never invoked, so fc_final kept
        # nn.Linear's default initialisation; call it here, as
        # EmbeddingLayers does for its own layers.
        self.init_weights()

    def init_weights(self):
        """Apply ``init_layer`` to the final linear layer."""
        init_layer(self.fc_final)

    def forward(self, input):
        """Compute clip-level class probabilities.

        Args:
            input: Tensor of shape (samples_num, time_steps, freq_bins), the
                layout expected by ``EmbeddingLayers.forward``.

        Returns:
            Tensor of shape (samples_num, classes_num) with values in [0, 1].
        """

        # (samples_num, hidden_units, time_steps, 1)
        b1 = self.emb(input)

        # (samples_num, time_steps, hidden_units)
        b1 = b1[:, :, :, 0].transpose(1, 2)

        # (samples_num, time_steps, classes_num)
        # torch.sigmoid replaces F.sigmoid, which PyTorch has flagged as deprecated.
        b2 = torch.sigmoid(self.fc_final(b1))

        # Average over the time axis -> (samples_num, classes_num)
        output = torch.mean(b2, dim=1)

        return output

In [8]:
# Train model

# Reload packages so edits to the local `core` module are picked up
importlib.reload(core)

# Args
data_dir = 'packed_features'
workspace = 'results'
filename = 'log.txt'
model_type = 'decision_level_average_pooling'
balance_type = 'balance_in_batch'
mini_data = True
cuda = False

# Logs
sub_dir = os.path.join(filename,
                       'balance_type={}'.format(balance_type),
                       'model_type={}'.format(model_type))

logs_dir = os.path.join(workspace, 'logs', sub_dir)
utilities.create_folder(logs_dir)
# NOTE(review): this rebinds the name `logging`, which an earlier cell bound
# to the standard-library module -- confirm that create_logging returns
# something compatible before relying on `logging.<attr>` afterwards.
logging = utilities.create_logging(logs_dir, filemode='w')

# Train
freq_bins = 128
classes_num = 527

# Hyper parameters
hidden_units = 1024
drop_rate = 0.5

# TODO: batch_size = 1 is a temporary debugging value; the intended setting
# is batch_size = 500. Restore it before running a real training job.
batch_size = 1

learning_rate = 1e-3

# Global average pooling.
#
# [2] Lin, Min, Qiang Chen, and Shuicheng Yan. "Network in network."
#     arXiv preprint arXiv:1312.4400 (2013).
model = DecisionLevelAveragePooling(freq_bins, classes_num, hidden_units, drop_rate)

# Pass the data_dir variable defined above instead of repeating its literal,
# so changing the argument in one place is enough.
core.train(data_dir=data_dir,
           workspace=workspace,
           mini_data=mini_data,
           balance_type=balance_type,
           learning_rate=learning_rate,
           filename=filename,
           model_type=model_type,
           model=model,
           batch_size=batch_size,
           cuda=cuda)

root        : INFO     Loading data time: 0.179 s
root        : INFO     Training data shape: (22160, 10, 128)


KeyboardInterrupt: 