## Acoustic Scene Classification using transfer learning on a pre-trained VGGish model

This model is related to a paper that presents an acoustic scene classification method using transfer learning on a pre-trained VGGish model. Transfer learning is a method in which knowledge gained while solving one problem is stored and subsequently applied to a different but related problem. The performance of the method is evaluated on the TUT Acoustic Scenes 2017 data set, which was collected in Finland by Tampere University of Technology. The data collection received funding from the European Research Council and is part of the DCASE *(Detection and Classification of Acoustic Scenes and Events)* 2017 challenge.

By Tobias Toft Christensen, Mikkel Heber Hahn Petersen, and Anders Hansen Warming. 

In [1]:
from __future__ import absolute_import, division, print_function

import os
import sys
sys.path.append(os.path.join('.', '..'))
import utils
import tensorflow as tf
import numpy as np
from tensorflow.contrib.layers import fully_connected, batch_norm, dropout
from tensorflow.python.ops.nn import relu, softmax
%matplotlib inline
import matplotlib.pyplot as plt

None


In [2]:
# Records that make up the training set, in concatenation order.
# Note that the 'test*' and 'val*' records are part of the training data here;
# the data used for validation comes from the separate 'Evalval*' records below.
TRAIN_RECORDS = ('train1', 'train2', 'train3', 'train4', 'train5', 'train6',
                 'test1', 'test2', 'val1', 'val2')

train_embedding_parts, train_label_parts = [], []
for shard_no, record_name in enumerate(TRAIN_RECORDS, start=1):
    # utils.tfRead returns an (embeddings, labels) pair for the named record.
    shard_embeddings, shard_labels = utils.tfRead(record_name)
    train_embedding_parts.append(shard_embeddings)
    train_label_parts.append(shard_labels)
    # Shards are reported by position (1..10), not by record name, so e.g.
    # 'test1' is announced as "train7".
    print("tfRecord train%d uploaded!" % shard_no)

# Per-shard names are kept so that any cell referring to them keeps working.
(e1_train, e2_train, e3_train, e4_train, e5_train,
 e6_train, e7_train, e8_train, e9_train, e10_train) = train_embedding_parts
(l1_train, l2_train, l3_train, l4_train, l5_train,
 l6_train, l7_train, l8_train, l9_train, l10_train) = train_label_parts

embedding_train = np.concatenate(train_embedding_parts, axis=0)
print("Train embedding shape: ", embedding_train.shape)

embedding_labels_train = np.concatenate(train_label_parts, axis=0)
print(embedding_labels_train.shape)

# Validation data: the two 'Evalval*' records.
(e1_val, l1_val) = utils.tfRead('Evalval1')
print("tfRecord val1 uploaded!")
(e2_val, l2_val) = utils.tfRead('Evalval2')

embedding_val = np.concatenate((e1_val, e2_val), axis=0)
print("Val embedding shape: ", embedding_val.shape)

embedding_labels_val = np.concatenate((l1_val, l2_val), axis=0)
print(embedding_labels_val.shape)

# Every embedding row needs exactly one label.
assert len(embedding_train) == len(embedding_labels_train), "train samples/labels mismatch"
assert len(embedding_val) == len(embedding_labels_val), "val samples/labels mismatch"


(4771, 128)
4771
tfRecord train1 uploaded!
(4772, 128)
4772
tfRecord train2 uploaded!
(4772, 128)
4772
tfRecord train3 uploaded!
(4772, 128)
4772
tfRecord train4 uploaded!
(4772, 128)
4772
tfRecord train5 uploaded!
(4211, 128)
4211
tfRecord train6 uploaded!
(4680, 128)
4680
tfRecord train7 uploaded!
(4680, 128)
4680
tfRecord train8 uploaded!
(4680, 128)
4680
tfRecord train9 uploaded!
(4680, 128)
4680
tfRecord train10 uploaded!
Train embedding shape:  (46790, 128)
(46790,)
(4820, 128)
4820
tfRecord val1 uploaded!
(4810, 128)
4810
Val embedding shape:  (9630, 128)
(9630,)


In [3]:
# Pass both label arrays through utils.labelMinimizer (train first, then val).
# Presumably this collapses the raw scene labels into a smaller label set --
# TODO confirm against utils.labelMinimizer.
# NOTE(review): the results overwrite the inputs, so re-running this cell feeds
# already-processed labels back in; verify that labelMinimizer tolerates that.
embedding_labels_train, embedding_labels_val = [
    utils.labelMinimizer(raw_labels)
    for raw_labels in (embedding_labels_train, embedding_labels_val)
]

In [4]:

# One-hot encode the train and validation labels (train first, then val).
# The encoded arrays replace the integer-label arrays under the same names.
embedding_labels_train, embedding_labels_val = map(
    utils.OnehotEnc, (embedding_labels_train, embedding_labels_val)
)

# Truncated-normal weight initializer (stddev 0.1).
# NOTE(review): it is only defined here; the layers built in the next cell do not
# receive it, so they use fully_connected's default initializer.
weight_initializer = tf.truncated_normal_initializer(stddev=0.1)

In [5]:

# Input width comes from the first embedding row, the class count from the
# first one-hot encoded label row.
num_features = embedding_train[0].shape[0]
num_classes = embedding_labels_train[0].shape[0]
print('number of features: ', num_features)
print('number of classes: ', num_classes)

reuse_flag = True

## Hidden layer sizes (num_hidden2 is defined but no second hidden layer is built below)
num_hidden1 = 128
num_hidden2 = 128

## Placeholders: a batch of embeddings and the matching one-hot labels
x_pl = tf.placeholder(dtype=tf.float32, shape=[None, num_features], name='xPlaceholder')
y_pl = tf.placeholder(dtype=tf.float32, shape=[None, num_classes], name='yPlaceholder')

## Network: one ReLU hidden layer followed by a linear layer producing the logits
l1 = fully_connected(inputs=x_pl, num_outputs=num_hidden1, activation_fn=relu)
lout = fully_connected(inputs=l1, num_outputs=num_classes, activation_fn=None)

# Class probabilities; the op is named so it can be looked up by name later.
prediction = tf.nn.softmax(logits=lout, name="op_to_restore")



number of features:  128
number of classes:  3


In [6]:
### Implement training ops

# Step size for plain gradient descent.
LEARNING_RATE = 0.1

# 1) Define cross entropy loss
# The loss is computed from the raw logits (`lout`) rather than as
# -sum(y * log(softmax(logits))): taking tf.log of the softmax output yields
# -inf/NaN as soon as a predicted probability underflows to 0, whereas the fused
# op evaluates the same quantity in a numerically stable way.
per_example_loss = tf.nn.softmax_cross_entropy_with_logits(labels=y_pl, logits=lout)
# Scalar loss: mean over the examples in the batch.
cross_entropy = tf.reduce_mean(per_example_loss)

# 2) Define the training op
train_op = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(cross_entropy)

# 3) Define accuracy op
# A prediction is correct when the most probable class matches the one-hot label.
correct_prediction = tf.equal(tf.argmax(prediction, axis=1), tf.argmax(y_pl, axis=1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))


In [7]:
max_epochs = 100
batch_size = 128

# Epoch counters and batch cursors, threaded through utils.next_batch separately
# for the training and the validation stream.
epochs_completed_train = 0
epochs_completed_val = 0

idx_epochs_train = 0
idx_epochs_val = 0

# Epoch count seen on the previous iteration; used to detect an epoch boundary.
old_epochs_completed = 0
# restricting memory usage, TensorFlow is greedy and will use all memory otherwise
gpu_opts = tf.GPUOptions(per_process_gpu_memory_fraction=0.2)

# Per-batch history of loss/accuracy. The lists are created once here and only
# appended to inside the loop, so all four cover the whole run (resetting the
# training lists on every iteration would leave just the last batch in them).
train_cost, val_cost, train_acc, val_acc = [], [], [], []
with tf.Session(config=tf.ConfigProto(gpu_options=gpu_opts)) as sess:
    sess.run(tf.global_variables_initializer())
    try:
        while epochs_completed_train < max_epochs:
            # next_batch hands back the batch together with the updated epoch
            # counter, cursor and data arrays, which are fed into the next call.
            (x_tr, y_tr, epochs_completed_train, idx_epochs_train,
             embedding_train, embedding_labels_train) = utils.next_batch(
                 batch_size, idx_epochs_train, epochs_completed_train,
                 embedding_train, embedding_labels_train)
            (x_val, y_val, epochs_completed_val, idx_epochs_val,
             embedding_val, embedding_labels_val) = utils.next_batch(
                 batch_size, idx_epochs_val, epochs_completed_val,
                 embedding_val, embedding_labels_val)

            # Training step: run the optimizer and record loss/accuracy of this batch.
            feed_dict_train = {x_pl: x_tr, y_pl: y_tr}
            _, batch_train_cost, batch_train_acc = sess.run(
                [train_op, cross_entropy, accuracy], feed_dict=feed_dict_train)
            train_cost.append(batch_train_cost)
            train_acc.append(batch_train_acc)

            # Validation: evaluate (no train_op) on one validation batch.
            # NOTE(review): this is a single-batch estimate, not the full validation set.
            feed_dict_valid = {x_pl: x_val, y_pl: y_val}
            batch_val_cost, batch_val_acc = sess.run(
                [cross_entropy, accuracy], feed_dict=feed_dict_valid)
            val_cost.append(batch_val_cost)
            val_acc.append(batch_val_acc)

            # Report once per completed training epoch, using the latest batch values.
            if old_epochs_completed != epochs_completed_train:
                print("Epoch %i, Train Cost: %0.3f\tVal Cost: %0.3f\t Val acc: %0.3f"
                      % (epochs_completed_train, train_cost[-1], val_cost[-1], val_acc[-1]))

            old_epochs_completed = epochs_completed_train

        # Save the trained variables to a local checkpoint. This is only reached
        # when training runs to completion; an interrupted run is not saved.
        # NOTE(review): absolute, Windows-style path -- consider making it configurable.
        saver = tf.train.Saver()
        saver.save(sess, "C:/tmp/audio_classifier")

    except KeyboardInterrupt:
        print('KeyboardInterrupt')

print('Done')


Epoch 1, Train Cost: 0.407	Val Cost: 0.506	 Val acc: 0.795
Epoch 2, Train Cost: 0.457	Val Cost: 0.526	 Val acc: 0.756
Epoch 3, Train Cost: 0.438	Val Cost: 0.649	 Val acc: 0.748
Epoch 4, Train Cost: 0.322	Val Cost: 0.359	 Val acc: 0.835
Epoch 5, Train Cost: 0.262	Val Cost: 0.512	 Val acc: 0.803
Epoch 6, Train Cost: 0.250	Val Cost: 0.621	 Val acc: 0.811
Epoch 7, Train Cost: 0.238	Val Cost: 0.610	 Val acc: 0.748
Epoch 8, Train Cost: 0.328	Val Cost: 0.460	 Val acc: 0.858
Epoch 9, Train Cost: 0.299	Val Cost: 0.582	 Val acc: 0.811
Epoch 10, Train Cost: 0.264	Val Cost: 0.554	 Val acc: 0.787
Epoch 11, Train Cost: 0.208	Val Cost: 0.537	 Val acc: 0.780
Epoch 12, Train Cost: 0.242	Val Cost: 0.614	 Val acc: 0.819
Epoch 13, Train Cost: 0.284	Val Cost: 0.523	 Val acc: 0.772
Epoch 14, Train Cost: 0.339	Val Cost: 0.452	 Val acc: 0.819
Epoch 15, Train Cost: 0.186	Val Cost: 0.595	 Val acc: 0.756
Epoch 16, Train Cost: 0.171	Val Cost: 0.456	 Val acc: 0.843
Epoch 17, Train Cost: 0.271	Val Cost: 0.699	 Val 