In [1]:
import numpy as np
import random
import os

In [2]:
os.environ["CUDA_VISIBLE_DEVICES"]="2"    

In [3]:
#dataset=np.load('wav16.npz')

In [4]:
#dataset.files

In [5]:
#len(dataset['fold1'].item().get('sounds'))

# Dataset preparation

Splitting the dataset into train & validation set

In [6]:
dataset = np.load('data_dl/ESC-50-master/audio_resampled/wav_complete_silent.npz')
split=2
# Split to train and val
train_sounds = []
train_labels = []
val_sounds = []
val_labels = []
for i in range(1, 6):
    sounds = dataset['fold{}'.format(i)].item()['sounds']
    labels = dataset['fold{}'.format(i)].item()['labels']
    if i == split:
        val_sounds.extend(sounds)
        val_labels.extend(labels)
    else:
        train_sounds.extend(sounds)
        train_labels.extend(labels)

In [7]:
#name={'crying_baby':0, 'glass_breaking':1, 'coughing':2}
#train_labels=list(map(name.get, train_labels))
#val_labels=list(map(name.get, val_labels))

In [8]:
def one_hot_encode(labels):
    n_labels = len(labels)
    n_unique_labels = len(np.unique(labels))
    one_hot_encode = np.zeros((n_labels,n_unique_labels))
    one_hot_encode[np.arange(n_labels), labels] = 1
    return one_hot_encode

In [9]:
train_labels=one_hot_encode(train_labels)
val_labels=one_hot_encode(val_labels)

Normalize to have value between -1 and 1

In [10]:
def normalize(sound, factor):
    return [s/factor for s in sound]

Random crop of the sound values to have T-s window of sound

In [11]:
def random_crop(sound, size):
    cropped_sound=[]
    for s in sound:
        org_size = len(s)
        start = random.randint(0, org_size - size)
        cropped_s=s[start: start + size]
        cropped_sound.append(cropped_s)
    return cropped_sound

In [12]:
def padding(sound, pad):
    padded_sound=[]
    for i in range(len(sound)):
        padded_s=np.pad(sound[i], pad, 'constant')
        padded_sound.append(padded_s)
    return padded_sound

In [13]:
# what is supposed to be the pad parameter ?
train_sounds=padding(train_sounds, 24014//2) 
val_sounds=padding(train_sounds, 24014//2)

Size of window = 1.5s (24014)

Normalization constant = 32768 

In [14]:
train_X=random_crop(normalize(train_sounds,32768), 24014)
train_y=train_labels

In [15]:
test_X=random_crop(normalize(val_sounds,32768), 24014)
test_y=val_labels

In [16]:
train_X = np.reshape(train_X, (-1, 24014, 1, 1))
test_X = np.reshape(test_X, (-1, 24014, 1, 1))

In [17]:
len(test_y[1])

50

50

To improve : do the cross validation

# Model

In [18]:
import tensorflow as tf

In [None]:
n_classes=50 # subject to change

## Input of the model

Recall that the tensor is 4D : [batch, height, width, channel]

In [None]:
X = tf.placeholder(tf.float32, shape=[None, 24014,1, 1])
y = tf.placeholder(tf.float32, shape=[None, n_classes])

In [None]:
keep_prob = tf.placeholder(tf.float32) # 50%

In [None]:
is_training = tf.placeholder(tf.bool)

## Hyperparameters

In [None]:
training_epoch = 150 
batch_size = 64
learning_rate = tf.placeholder(tf.float32)

In [None]:
def lr(epoch):
    if (0<=epoch<=80):
        return 0.01
    if (80<epoch<=100):
        return 0.001
    if (100<epoch<= 120):
        return 0.0001
    else: return 0.00001

Stride =1

No Padding

In [None]:
def conv2d(x, W, b,is_training, strides=1):
    # Conv2D wrapper, with bias and relu activation
    x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1], padding='VALID')
    x = tf.nn.bias_add(x, b)
    x = tf.contrib.layers.batch_norm(x, is_training=is_training)
    return tf.nn.relu(x) 

def maxpool2d(x, k_h=2, k_w=2):
    return tf.nn.max_pool(x, ksize=[1, k_h, k_w, 1], strides=[1, k_h, k_w, 1],padding='VALID')

## Parameters of the model

Initialization of the parameters = He initialization

Review the order (does the order actually matters ?)

[heigth_filter, width_filter, depth_filter, number of filters]

In [None]:
weights = {
    #conv1
    'wc1': tf.get_variable('W0', shape=(8,1,1,40), initializer=tf.contrib.layers.variance_scaling_initializer(dtype=tf.float32)),
    #conv2
    'wc2': tf.get_variable('W1', shape=(8,1,40,40), initializer=tf.contrib.layers.variance_scaling_initializer(dtype=tf.float32)),
    #conv3
    'wc3': tf.get_variable('W2', shape=(13,8,1,50), initializer=tf.contrib.layers.variance_scaling_initializer(dtype=tf.float32)),
    #conv4
    'wc4': tf.get_variable('W3', shape=(5,1,50,50), initializer=tf.contrib.layers.variance_scaling_initializer(dtype=tf.float32)),
    #fc5
    'wfc5': tf.get_variable('W4', shape=(50*11*14,4096), initializer=tf.contrib.layers.variance_scaling_initializer(dtype=tf.float32)),
    #fc5
    'wfc6': tf.get_variable('W5', shape=(4096,4096), initializer=tf.contrib.layers.variance_scaling_initializer(dtype=tf.float32)),
    #output
    'out': tf.get_variable('W6', shape=(4096,n_classes), initializer=tf.contrib.layers.variance_scaling_initializer(dtype=tf.float32)), 
}
biases = {
    'bc1': tf.get_variable('B0', shape=(40), initializer=tf.contrib.layers.variance_scaling_initializer(dtype=tf.float32)),
    'bc2': tf.get_variable('B1', shape=(40), initializer=tf.contrib.layers.variance_scaling_initializer(dtype=tf.float32)),
    'bc3': tf.get_variable('B2', shape=(50), initializer=tf.contrib.layers.variance_scaling_initializer(dtype=tf.float32)),
    'bc4': tf.get_variable('B3', shape=(50), initializer=tf.contrib.layers.variance_scaling_initializer(dtype=tf.float32)),
    'bfc5': tf.get_variable('B4', shape=(4096), initializer=tf.contrib.layers.variance_scaling_initializer(dtype=tf.float32)),
    'bfc6': tf.get_variable('B5', shape=(4096), initializer=tf.contrib.layers.variance_scaling_initializer(dtype=tf.float32)),
    'out': tf.get_variable('B6', shape=(n_classes), initializer=tf.contrib.layers.variance_scaling_initializer(dtype=tf.float32)),
}

## EnvNet

In [None]:
def env_net(x, weights, biases):  

    conv1 = conv2d(x, weights['wc1'], biases['bc1'], is_training)
    
    conv2 = conv2d(conv1, weights['wc2'], biases['bc2'], is_training)
    conv2 = maxpool2d(conv2, k_h=160, k_w=1)
    
    conv2=tf.reshape(conv2, [-1, 150, 40, 1])
    
    conv3 = conv2d(conv2, weights['wc3'], biases['bc3'], is_training)
    conv3 = maxpool2d(conv3, k_h=3, k_w=3)
    
    conv4 = conv2d(conv3, weights['wc4'], biases['bc4'], is_training)
    conv4 = maxpool2d(conv4, k_h=3, k_w=1)
    
    # Fully connected layer
    fc5 = tf.reshape(conv4, [-1, weights['wfc5'].get_shape().as_list()[0]])
    fc5 = tf.add(tf.matmul(fc5, weights['wfc5']), biases['bfc5'])
    fc5 = tf.nn.relu(fc5)
    # Drop out
    drop_out_fc5 = tf.nn.dropout(fc5, keep_prob)
    
    fc6 = tf.add(tf.matmul(drop_out_fc5, weights['wfc6']), biases['bfc6'])
    fc6 = tf.nn.relu(fc6)
    drop_out_fc6 = tf.nn.dropout(fc6, keep_prob)
    # Output, class prediction
    # finally we multiply the fully connected layer with the weights and add a bias term. 
    out = tf.add(tf.matmul(drop_out_fc6, weights['out']), biases['out'])
    return out

## Loss & optimizer of the model

In [None]:
init_momentum = 0.9
momentum = tf.Variable(init_momentum, trainable=False)

In [None]:
pred = env_net(X, weights, biases)

cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=pred, labels=y))

with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
    optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=0.9, use_nesterov=True).minimize(cost)

In [None]:
#Here you check whether the index of the maximum value of the predicted image is equal to the actual labelled image. and both will be a column vector.
correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))

#calculate accuracy across all the given images and average them out. 
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

In [None]:
# Initializing the variables
init = tf.global_variables_initializer()

In [None]:
with tf.device("/gpu:0"):
    with tf.Session() as sess:
        sess.run(init) 
        train_loss = []
        test_loss = []
        train_accuracy = []
        test_accuracy = []
        summary_writer = tf.summary.FileWriter('./Output', sess.graph)
        for i in range(training_epoch):
                for batch in range(len(train_X)//batch_size):
                    batch_x = train_X[batch*batch_size:min((batch+1)*batch_size,len(train_X))]
                    batch_y = train_y[batch*batch_size:min((batch+1)*batch_size,len(train_y))]    
                    # Run optimization op (backprop).
                        # Calculate batch loss and accuracy
                    opt = sess.run(optimizer, feed_dict={X: batch_x, y: batch_y, 
                                                         keep_prob:0.5, is_training:True, 
                                                         learning_rate:lr(i)})
                    loss, acc = sess.run([cost, accuracy], feed_dict={X: batch_x, y: batch_y, 
                                                                         keep_prob:0.5, is_training:True,
                                                                     learning_rate:lr(i)})
                print("Iter " + str(i) + ", Loss= " + \
                              "{:.6f}".format(loss) + ", Training Accuracy= " + \
                              "{:.5f}".format(acc))
                print("Optimization Finished!")
                
                #for batch in range(len(test_X)//batch_size):
                #batch_x = test_X[batch*batch_size:min((batch+1)*batch_size,len(test_X))]
                #batch_y = test_y[batch*batch_size:min((batch+1)*batch_size,len(test_y))]  
                test_acc,valid_loss = sess.run([accuracy,cost], feed_dict={X: batch_x, y: batch_y, 
                                                                           keep_prob:1.0, is_training:False,
                                                                          learning_rate:lr(i)})
                train_loss.append(loss)
                test_loss.append(valid_loss)
                train_accuracy.append(acc)
                test_accuracy.append(test_acc)
                print("Testing Accuracy:","{:.5f}".format(test_acc))
        summary_writer.close()

Iter 0, Loss= 4.816803, Training Accuracy= 0.01562
Optimization Finished!
Testing Accuracy: 0.07812
Iter 0, Loss= 4.816803, Training Accuracy= 0.01562
Optimization Finished!
Testing Accuracy: 0.07812
Iter 1, Loss= 4.167206, Training Accuracy= 0.04688
Optimization Finished!
Testing Accuracy: 0.04688
Iter 1, Loss= 4.167206, Training Accuracy= 0.04688
Optimization Finished!
Testing Accuracy: 0.04688
Iter 2, Loss= 4.420867, Training Accuracy= 0.03125
Optimization Finished!
Testing Accuracy: 0.04688
Iter 2, Loss= 4.420867, Training Accuracy= 0.03125
Optimization Finished!
Testing Accuracy: 0.04688
Iter 3, Loss= 3.995804, Training Accuracy= 0.03125
Optimization Finished!
Testing Accuracy: 0.04688
Iter 3, Loss= 3.995804, Training Accuracy= 0.03125
Optimization Finished!
Testing Accuracy: 0.04688
Iter 4, Loss= 3.662889, Training Accuracy= 0.07812
Optimization Finished!
Testing Accuracy: 0.01562
Iter 4, Loss= 3.662889, Training Accuracy= 0.07812
Optimization Finished!
Testing Accuracy: 0.01562


Iter 41, Loss= 0.905854, Training Accuracy= 0.76562
Optimization Finished!
Testing Accuracy: 0.03125
Iter 41, Loss= 0.905854, Training Accuracy= 0.76562
Optimization Finished!
Testing Accuracy: 0.03125
Iter 42, Loss= 1.178743, Training Accuracy= 0.71875
Optimization Finished!
Testing Accuracy: 0.06250
Iter 42, Loss= 1.178743, Training Accuracy= 0.71875
Optimization Finished!
Testing Accuracy: 0.06250
Iter 43, Loss= 0.994262, Training Accuracy= 0.76562
Optimization Finished!
Testing Accuracy: 0.04688
Iter 43, Loss= 0.994262, Training Accuracy= 0.76562
Optimization Finished!
Testing Accuracy: 0.04688
Iter 44, Loss= 0.966207, Training Accuracy= 0.70312
Optimization Finished!
Testing Accuracy: 0.03125
Iter 44, Loss= 0.966207, Training Accuracy= 0.70312
Optimization Finished!
Testing Accuracy: 0.03125
Iter 45, Loss= 0.851860, Training Accuracy= 0.79688
Optimization Finished!
Testing Accuracy: 0.04688
Iter 45, Loss= 0.851860, Training Accuracy= 0.79688
Optimization Finished!
Testing Accuracy

Iter 82, Loss= 0.479891, Training Accuracy= 0.87500
Optimization Finished!
Testing Accuracy: 0.26562
Iter 82, Loss= 0.479891, Training Accuracy= 0.87500
Optimization Finished!
Testing Accuracy: 0.26562
Iter 83, Loss= 0.418802, Training Accuracy= 0.89062
Optimization Finished!
Testing Accuracy: 0.23438
Iter 83, Loss= 0.418802, Training Accuracy= 0.89062
Optimization Finished!
Testing Accuracy: 0.23438
Iter 84, Loss= 0.431335, Training Accuracy= 0.87500
Optimization Finished!
Testing Accuracy: 0.26562
Iter 84, Loss= 0.431335, Training Accuracy= 0.87500
Optimization Finished!
Testing Accuracy: 0.26562
Iter 85, Loss= 0.424109, Training Accuracy= 0.89062
Optimization Finished!
Testing Accuracy: 0.26562
Iter 85, Loss= 0.424109, Training Accuracy= 0.89062
Optimization Finished!
Testing Accuracy: 0.26562
Iter 86, Loss= 0.401060, Training Accuracy= 0.89062
Optimization Finished!
Testing Accuracy: 0.28125
Iter 86, Loss= 0.401060, Training Accuracy= 0.89062
Optimization Finished!
Testing Accuracy

Iter 122, Loss= 0.391676, Training Accuracy= 0.87500
Optimization Finished!
Testing Accuracy: 0.68750
Iter 123, Loss= 0.376788, Training Accuracy= 0.90625
Optimization Finished!
Testing Accuracy: 0.70312
Iter 123, Loss= 0.376788, Training Accuracy= 0.90625
Optimization Finished!
Testing Accuracy: 0.70312
Iter 124, Loss= 0.438491, Training Accuracy= 0.89062
Optimization Finished!
Testing Accuracy: 0.70312
Iter 124, Loss= 0.438491, Training Accuracy= 0.89062
Optimization Finished!
Testing Accuracy: 0.70312
Iter 125, Loss= 0.404416, Training Accuracy= 0.89062
Optimization Finished!
Testing Accuracy: 0.70312
Iter 125, Loss= 0.404416, Training Accuracy= 0.89062
Optimization Finished!
Testing Accuracy: 0.70312
Iter 126, Loss= 0.411138, Training Accuracy= 0.89062
Optimization Finished!
Testing Accuracy: 0.70312
Iter 126, Loss= 0.411138, Training Accuracy= 0.89062
Optimization Finished!
Testing Accuracy: 0.70312
Iter 127, Loss= 0.406894, Training Accuracy= 0.89062
Optimization Finished!
Testin

TO DO : 
- implement the cross-validation
- looks like it's overfitting (check if drop out is working)
- look if there is a bug somewhere
- look padding function ? what is the use ?
- maybe a problem with the testing phase ? what is multi_crop function ?