In [1]:
import numpy as np
import random
import os
import tensorflow as tf

In [2]:
os.environ["CUDA_VISIBLE_DEVICES"]="1"    

In [3]:
nFolds = 5
n_classes = 50
n_splits = 1

# Dataset preparation

Splitting the dataset into train & validation set

In [4]:
def one_hot_encoding(labels):
    b = np.zeros((len(labels), n_classes))
    b[np.arange(len(labels)), labels] = 1
    return b

In [5]:
def setup(split):
    dataset = np.load(os.path.join('.','wav16.npz'))
    # Split to train and val
    train_sounds = []
    train_labels = []
    val_sounds = []
    val_labels = []
    for i in range(1, nFolds + 1):
        sounds = dataset['fold{}'.format(i)].item()['sounds']
        labels = dataset['fold{}'.format(i)].item()['labels']
        if i == split:
            val_sounds.extend(sounds)
            val_labels.extend(labels)
        else:
            train_sounds.extend(sounds)
            train_labels.extend(labels)
                  
    train_labels= one_hot_encoding(train_labels)
    val_labels= one_hot_encoding(val_labels)
    
    return train_sounds,train_labels, val_sounds,val_labels

In [6]:
#name={'crying_baby':0, 'glass_breaking':1, 'coughing':2}
#train_labels=list(map(name.get, train_labels))
#val_labels=list(map(name.get, val_labels))

Normalize to have value between -1 and 1

In [6]:
def normalize(sound, factor=32768):
    return [s/factor for s in sound]

Random crop of the sound values to have T-s window of sound

In [7]:
def random_crop(sound, size=24014):
    cropped_sound=[]
    for s in sound:
        org_size = len(s)
        start = random.randint(0, org_size - size)
        cropped_s=s[start: start + size]
        cropped_sound.append(cropped_s)
    return cropped_sound

In [8]:
def padding(sound, pad=24014//2):
    padded_sound=[]
    for i in range(len(sound)):
        padded_s=np.pad(sound[i], pad, 'constant')
        padded_sound.append(padded_s)
    return padded_sound

Multi-crop for testing phase

In [9]:
def multi_crop(sounds,input_length=24014, n_crops=10):
    multi_cropped_sounds=[]
    for s in sounds:
        stride = (len(s) - input_length) // (n_crops - 1)
        multi_cropped_sound = [s[stride * i: stride * i + input_length] for i in range(n_crops)]
        multi_cropped_sounds.append(np.array(multi_cropped_sound))
    return multi_cropped_sounds

In [46]:
def random_scale(sounds,max_scale=1.25, interpolate='Linear'):
    sounds_augmented=[]
    for s in sounds:
        scale = np.power(max_scale, random.uniform(-1, 1))
        output_size = int(len(s) * scale)
        ref = np.arange(output_size) / scale
        if interpolate == 'Linear':
            ref1 = ref.astype(np.int32)
            ref2 = np.minimum(ref1 + 1, len(s) - 1)
            r = ref - ref1
            scaled_sound = s[ref1] * (1 - r) + s[ref2] * r
        elif interpolate == 'Nearest':
            scaled_sound = s[ref.astype(np.int32)]
        else:
            raise Exception('Invalid interpolation mode {}'.format(interpolate))

        sounds_augmented.append(scaled_sound)
    return sounds_augmented

In [11]:
# what is supposed to be the pad parameter ?
#train_sounds=padding(train_sounds, 24014//2) 
#val_sounds=padding(train_sounds, 24014//2)

Size of window = 1.5s (24014)

Normalization constant = 32768 

# Model

## Input of the model

Recall that the tensor is 4D : [batch, height, width, channel]

In [26]:
tf.reset_default_graph()

In [27]:
X = tf.placeholder(tf.float32, shape=[None, 24014,1, 1])
y = tf.placeholder(tf.float32, shape=[None, n_classes])

In [28]:
keep_prob = tf.placeholder(tf.float32) # 50%

In [29]:
is_training = tf.placeholder(tf.bool)

## Hyperparameters

In [30]:
training_epoch = 150 
batch_size = 64
learning_rate = tf.placeholder(tf.float32)

Lr decay

In [31]:
def lr(epoch):
    if (0<=epoch<=80):
        return 0.01
    if (80<epoch<=100):
        return 0.001
    if (100<epoch<= 120):
        return 0.0001
    else: return 0.00001

Stride =1

No Padding

In [32]:
def conv2d(x, W, b,is_training, strides=1):
    # Conv2D wrapper, with bias and relu activation
    x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1], padding='VALID')
    x = tf.nn.bias_add(x, b)
    x = tf.contrib.layers.batch_norm(x, is_training=is_training)
    return tf.nn.relu(x) 

def maxpool2d(x, k_h=2, k_w=2): #non-overlapping max-pooling
    return tf.nn.max_pool(x, ksize=[1, k_h, k_w, 1], strides=[1, k_h, k_w, 1],padding='VALID')

## Parameters of the model

Initialization of the parameters = He initialization

Review the order (does the order actually matters ?)

[heigth_filter, width_filter, depth_filter, number of filters]

In [33]:
weights = {
    #conv1
    'wc1': tf.get_variable('W0', shape=(8,1,1,40), initializer=tf.contrib.layers.variance_scaling_initializer(dtype=tf.float32)),
    #conv2
    'wc2': tf.get_variable('W1', shape=(8,1,40,40), initializer=tf.contrib.layers.variance_scaling_initializer(dtype=tf.float32)),
    #conv3
    'wc3': tf.get_variable('W2', shape=(13,8,1,50), initializer=tf.contrib.layers.variance_scaling_initializer(dtype=tf.float32)),
    #conv4
    'wc4': tf.get_variable('W3', shape=(5,1,50,50), initializer=tf.contrib.layers.variance_scaling_initializer(dtype=tf.float32)),
    #fc5
    'wfc5': tf.get_variable('W4', shape=(50*11*14,4096), initializer=tf.contrib.layers.variance_scaling_initializer(dtype=tf.float32)),
    #fc5
    'wfc6': tf.get_variable('W5', shape=(4096,4096), initializer=tf.contrib.layers.variance_scaling_initializer(dtype=tf.float32)),
    #output
    'out': tf.get_variable('W6', shape=(4096,n_classes), initializer=tf.contrib.layers.variance_scaling_initializer(dtype=tf.float32)), 
}
biases = {
    'bc1': tf.get_variable('B0', shape=(40), initializer=tf.contrib.layers.variance_scaling_initializer(dtype=tf.float32)),
    'bc2': tf.get_variable('B1', shape=(40), initializer=tf.contrib.layers.variance_scaling_initializer(dtype=tf.float32)),
    'bc3': tf.get_variable('B2', shape=(50), initializer=tf.contrib.layers.variance_scaling_initializer(dtype=tf.float32)),
    'bc4': tf.get_variable('B3', shape=(50), initializer=tf.contrib.layers.variance_scaling_initializer(dtype=tf.float32)),
    'bfc5': tf.get_variable('B4', shape=(4096), initializer=tf.contrib.layers.variance_scaling_initializer(dtype=tf.float32)),
    'bfc6': tf.get_variable('B5', shape=(4096), initializer=tf.contrib.layers.variance_scaling_initializer(dtype=tf.float32)),
    'out': tf.get_variable('B6', shape=(n_classes), initializer=tf.contrib.layers.variance_scaling_initializer(dtype=tf.float32)),
}

## EnvNet

In [34]:
def env_net(x, weights, biases):  

    conv1 = conv2d(x, weights['wc1'], biases['bc1'], is_training)
    
    conv2 = conv2d(conv1, weights['wc2'], biases['bc2'], is_training)
    conv2 = maxpool2d(conv2, k_h=160, k_w=1)
    
    conv2=tf.reshape(conv2, [-1, 150, 40, 1])
    
    conv3 = conv2d(conv2, weights['wc3'], biases['bc3'], is_training)
    conv3 = maxpool2d(conv3, k_h=3, k_w=3)
    
    conv4 = conv2d(conv3, weights['wc4'], biases['bc4'], is_training)
    conv4 = maxpool2d(conv4, k_h=3, k_w=1)
    
    # Fully connected layer
    fc5 = tf.reshape(conv4, [-1, weights['wfc5'].get_shape().as_list()[0]])
    fc5 = tf.add(tf.matmul(fc5, weights['wfc5']), biases['bfc5'])
    fc5 = tf.nn.relu(fc5)
    # Drop out
    drop_out_fc5 = tf.nn.dropout(fc5, keep_prob)
    
    fc6 = tf.add(tf.matmul(drop_out_fc5, weights['wfc6']), biases['bfc6'])
    fc6 = tf.nn.relu(fc6)
    drop_out_fc6 = tf.nn.dropout(fc6, keep_prob)
    # Output, class prediction
    # finally we multiply the fully connected layer with the weights and add a bias term. 
    out = tf.add(tf.matmul(drop_out_fc6, weights['out']), biases['out'])
    return out

## Loss & optimizer of the model

In [35]:
init_momentum = 0.9
momentum = tf.Variable(init_momentum, trainable=False)

In [36]:
beta=5e-4 

pred = env_net(X, weights, biases)

cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=pred, labels=y))

# regularizers = tf.nn.l2_loss(weights['wc1']) + tf.nn.l2_loss(weights['wc2']) + \
#                 tf.nn.l2_loss(weights['wc3']) + tf.nn.l2_loss(weights['wc4']) + \
#                 tf.nn.l2_loss(weights['wfc5']) + tf.nn.l2_loss(weights['wfc6']) + \
#                 tf.nn.l2_loss(weights['out'])

# cost = tf.reduce_mean(cost + beta * regularizers)

with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
    optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=0.9, use_nesterov=True).minimize(cost)

In [37]:
#Here you check whether the index of the maximum value of the predicted image is equal to the actual labelled image. and both will be a column vector.
#correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))

correct_prediction_train = tf.equal(tf.argmax(pred_train, 1), tf.argmax(y, 1))
correct_prediction_test = tf.equal(tf.argmax(pred_test, 0), tf.argmax(y, 1)[0])

#calculate accuracy across all the given images and average them out. 
accuracy_train = tf.reduce_mean(tf.cast(correct_prediction_train, tf.float32))
accuracy_test=tf.cast(correct_prediction_test, tf.float32)

In [38]:
# Initializing the variables
init = tf.global_variables_initializer()

Integrate the version of Perla

In [None]:
def train(split,train_loss,train_accuracy,test_loss,test_accuracy):
    train_sounds,train_y, val_sounds,test_y=setup(split)
    for i in range(training_epoch):
        train_X=random_crop(normalize(padding(random_scale(train_sounds))))  # Choosing randomly a 1.5s section
        train_X = np.reshape(train_X, (-1, 24014, 1, 1)) # Reshape to have 1D input
        for batch in range(len(train_X)//batch_size):
            batch_x = train_X[batch*batch_size:min((batch+1)*batch_size,len(train_X))]
            batch_y = train_y[batch*batch_size:min((batch+1)*batch_size,len(train_y))]    
                # Run optimization op (backprop).
                    # Calculate batch loss and accuracy
            opt = sess.run(optimizer, feed_dict={X: batch_x, y: batch_y, 
                                                     keep_prob:0.5, is_training:True, 
                                                     learning_rate:lr(i)})
            loss, acc = sess.run([cost, accuracy_train], feed_dict={X: batch_x, y: batch_y, 
                                                                     keep_prob:0.5, is_training:True,
                                                                 learning_rate:lr(i)})
        print("Iter " + str(i) + ", Loss= " + \
                          "{:.6f}".format(loss) + ", Training Accuracy= " + \
                          "{:.5f}".format(acc))
        print("Optimization Finished!")

        acc_test=[]
        for crop in range(len(test_X)//crop_size):
            crop_x = test_X[crop*crop_size:min((crop+1)*crop_size,len(test_X))]
            crop_y = test_y[crop*crop_size:min((crop+1)*crop_size,len(test_y))] 

            acc_t= sess.run([accuracy_test], feed_dict={X: crop_x, y : crop_y, 
                                                                       keep_prob:1.0, is_training:False,
                                                        learning_rate:lr(i)})
            acc_test.append(acc_t)

        test_acc=np.mean(acc_test)
        #print(len(acc_test))

        train_loss.append(loss)
        #test_loss.append(valid_loss)
        train_accuracy.append(acc)
        test_accuracy.append(test_acc)
        print("Testing Accuracy:","{:.5f}".format(test_acc))


In [56]:
def train(split,train_loss,train_accuracy,test_accuracy):
    train_sounds,train_y, val_sounds,test_y=setup(split)
    for i in range(training_epoch):
        train_X=random_crop(normalize(padding(train_sounds))) # Choosing randomly a 1.5s section
        train_X = np.reshape(train_X, (-1, 24014, 1, 1)) # Reshape to have 1D input
        test_X=random_crop(normalize(padding(val_sounds)))
        test_X = np.reshape(test_X, (-1, 24014, 1, 1))
        for batch in range(len(train_X)//batch_size):
            batch_x = train_X[batch*batch_size:min((batch+1)*batch_size,len(train_X))]
            batch_y = train_y[batch*batch_size:min((batch+1)*batch_size,len(train_y))]    
                # Run optimization op (backprop).
                    # Calculate batch loss and accuracy
            opt = sess.run(optimizer, feed_dict={X: batch_x, y: batch_y, 
                                                     keep_prob:0.5, is_training:True, 
                                                     learning_rate:lr(i)})
            loss, acc = sess.run([cost, accuracy], feed_dict={X: batch_x, y: batch_y, 
                                                                     keep_prob:0.5, is_training:True,
                                                                 learning_rate:lr(i)})
        train_loss.append(loss)
        train_accuracy.append(acc)
 
        test_acc = sess.run([accuracy], feed_dict={X: test_X, y : test_y, 
                                                                        keep_prob:1.0, is_training:False,
                                                                        learning_rate:lr(i)})
        test_accuracy.append(test_acc[0])
        if i % 10 == 0:
            print('| Epoch: {}/{} | Train: LR {}  Loss {:.6f} Training Accuracy : {:.3f} Testing Accuracy: {:.2f}\n'.format(
                        i, training_epoch, lr(i), train_loss[i],train_accuracy[i],test_accuracy[i]))

Version with mean

In [53]:
def train(split,train_loss,train_accuracy,test_loss,test_accuracy):
    train_sounds,train_y, val_sounds,test_y=setup(split)
    for i in range(training_epoch):
        train_X=random_crop(normalize(padding(random_scale(train_sounds)))) # Choosing randomly a 1.5s section
        train_X = np.reshape(train_X, (-1, 24014, 1, 1)) # Reshape to have 1D input
        test_X=random_crop(normalize(padding(val_sounds)))
        test_X = np.reshape(test_X, (-1, 24014, 1, 1))
        loss_by_epoch=0
        acc_by_epoch=0
        for batch in range(len(train_X)//batch_size):
            batch_x = train_X[batch*batch_size:min((batch+1)*batch_size,len(train_X))]
            batch_y = train_y[batch*batch_size:min((batch+1)*batch_size,len(train_y))]    
            # Run optimization op (backprop).
            # Calculate batch loss and accuracy
            opt = sess.run(optimizer, feed_dict={X: batch_x, y: batch_y, 
                                                     keep_prob:0.5, is_training:True, 
                                                     learning_rate:lr(i)})
            loss, acc = sess.run([cost, accuracy], feed_dict={X: batch_x, y: batch_y, 
                                                                     keep_prob:0.5, is_training:True,
                                                                 learning_rate:lr(i)})
            loss_by_epoch+=loss*len(batch_y)
            acc_by_epoch+=acc*len(batch_y)
        loss_by_epoch/= len(train_y)
        train_loss.append(loss_by_epoch)
        
        acc_by_epoch = 100 * (acc_by_epoch / len(train_y))
        train_accuracy.append(acc_by_epoch)
        
        loss_acc,test_acc = sess.run([cost,accuracy], feed_dict={X: test_X, y : test_y, 
                                                                        keep_prob:1.0, is_training:False,
                                                                        learning_rate:lr(i)})
        test_accuracy.append(test_acc*100)
        test_loss.append(loss_acc*100)
        if i % 10 == 0 or i == training_epoch-1:
            print('| Epoch: {}/{} | Train: LR {}  Loss {:.6f} Training Accuracy : {:.3f} Testing Accuracy: {:.2f}\n'.format(
                        i+1, training_epoch, lr(i), train_loss[i],train_accuracy[i],test_accuracy[i]))

In [54]:
with tf.device("/gpu:1"):
    with tf.Session() as sess:
        sess.run(init) 
        train_loss = [ [] for i in range(n_splits) ]
        train_accuracy = [ [] for i in range(n_splits) ]
        test_loss = [ [] for i in range(n_splits) ]
        test_accuracy = [ [] for i in range(n_splits) ]
        summary_writer = tf.summary.FileWriter('./Output', sess.graph)
        for split in range(1,n_splits+1):
            print('+-- Split {} --+'.format(split))
            train(split,train_loss[split-1],train_accuracy[split-1],test_loss[split-1],test_accuracy[split-1])            
        summary_writer.close()

+-- Split 1 --+
| Epoch: 1/150 | Train: LR 0.01  Loss 5.074906 Training Accuracy : 6.562 Testing Accuracy: 2.00

| Epoch: 11/150 | Train: LR 0.01  Loss 3.202474 Training Accuracy : 14.375 Testing Accuracy: 4.25



KeyboardInterrupt: 

In [47]:
with tf.device("/gpu:1"):
    with tf.Session() as sess:
        sess.run(init) 
        train_loss = [ [] for i in range(n_splits) ]
        train_accuracy = [ [] for i in range(n_splits) ]
        test_loss = [ [] for i in range(n_splits) ]
        test_accuracy = [ [] for i in range(n_splits) ]
        summary_writer = tf.summary.FileWriter('./Output', sess.graph)
        for split in range(1,n_splits+1):
            print('+-- Split {} --+'.format(split))
            train(split,train_loss[split-1],train_accuracy[split-1],test_loss[split-1],test_accuracy[split-1])            
        summary_writer.close()

+-- Split 1 --+
| Epoch: 0/150 | Train: LR 0.01  Loss 5.147108 Training Accuracy : 7.187 Testing Accuracy: 0.02

| Epoch: 10/150 | Train: LR 0.01  Loss 3.204636 Training Accuracy : 13.875 Testing Accuracy: 0.07

| Epoch: 20/150 | Train: LR 0.01  Loss 2.652540 Training Accuracy : 25.625 Testing Accuracy: 0.04

| Epoch: 30/150 | Train: LR 0.01  Loss 2.282438 Training Accuracy : 34.500 Testing Accuracy: 0.04

| Epoch: 40/150 | Train: LR 0.01  Loss 2.013028 Training Accuracy : 40.312 Testing Accuracy: 0.03

| Epoch: 50/150 | Train: LR 0.01  Loss 1.729627 Training Accuracy : 49.562 Testing Accuracy: 0.05

| Epoch: 60/150 | Train: LR 0.01  Loss 1.564614 Training Accuracy : 53.938 Testing Accuracy: 0.06

| Epoch: 70/150 | Train: LR 0.01  Loss 1.370370 Training Accuracy : 59.188 Testing Accuracy: 0.06

| Epoch: 80/150 | Train: LR 0.01  Loss 1.270736 Training Accuracy : 62.313 Testing Accuracy: 0.09

| Epoch: 90/150 | Train: LR 0.001  Loss 1.172295 Training Accuracy : 65.625 Testing Accuracy: 0

In [None]:
with tf.device("/gpu:1"):
    with tf.Session() as sess:
        sess.run(init) 
        train_loss = [ [] for i in range(n_splits) ]
        train_accuracy = [ [] for i in range(n_splits) ]
        test_accuracy = [ [] for i in range(n_splits) ]
        summary_writer = tf.summary.FileWriter('./Output', sess.graph)
        for split in range(1,n_splits+1):
            print('+-- Split {} --+'.format(split))
            train(split,train_loss[split-1],train_accuracy[split-1],test_accuracy[split-1])            
        summary_writer.close()

+-- Split 1 --+
| Epoch: 0/150 | Train: LR 0.01  Loss 5.077315 Testing Accuracy: 0.02

| Epoch: 10/150 | Train: LR 0.01  Loss 3.152364 Testing Accuracy: 0.05

| Epoch: 20/150 | Train: LR 0.01  Loss 2.567582 Testing Accuracy: 0.05

| Epoch: 30/150 | Train: LR 0.01  Loss 2.201598 Testing Accuracy: 0.03

| Epoch: 40/150 | Train: LR 0.01  Loss 1.889521 Testing Accuracy: 0.04

| Epoch: 50/150 | Train: LR 0.01  Loss 1.672241 Testing Accuracy: 0.04

| Epoch: 60/150 | Train: LR 0.01  Loss 1.490539 Testing Accuracy: 0.05

| Epoch: 70/150 | Train: LR 0.01  Loss 1.278729 Testing Accuracy: 0.07

| Epoch: 80/150 | Train: LR 0.01  Loss 1.243228 Testing Accuracy: 0.08

| Epoch: 90/150 | Train: LR 0.001  Loss 1.110288 Testing Accuracy: 0.10

| Epoch: 100/150 | Train: LR 0.001  Loss 1.034573 Testing Accuracy: 0.13

| Epoch: 110/150 | Train: LR 0.0001  Loss 1.052393 Testing Accuracy: 0.20

| Epoch: 120/150 | Train: LR 0.0001  Loss 1.011366 Testing Accuracy: 0.25

| Epoch: 130/150 | Train: LR 1e-05  Loss

In [None]:
with tf.device("/gpu:1"):
    with tf.Session() as sess:
        sess.run(init) 
        train_loss = [ [] for i in range(nFolds) ]
        train_accuracy = [ [] for i in range(nFolds) ]
        test_accuracy = [ [] for i in range(nFolds) ]
        summary_writer = tf.summary.FileWriter('./Output', sess.graph)
        for split in range(1,nFolds+1):
            print('+-- Split {} --+'.format(split))
            train(split,train_loss[split-1],train_accuracy[split-1],test_accuracy[split-1])            
        summary_writer.close()

TO DO : 
- looks like it's overfitting (check if drop out is working)
- look if there is a bug somewhere
- look padding function ? what is the use ?
- maybe a problem with the testing phase ? what is multi_crop function ?
- Accuracy for now : 0.2950 (with my dataset), 0.35750 (with their dataset)
- why is training accuracy low ?
- check for silent windows -> remove them