In [1]:
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
from glob import glob
from google.colab import drive
import os
import pandas as pd
from sklearn.model_selection import train_test_split
drive.mount("/content/gdrive")

sr=16000

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
import tensorflow.compat.v1 as tf
import matplotlib.pyplot as plt
tf.disable_v2_behavior()

Instructions for updating:
non-resource variables are not supported in the long term


In [3]:
def readDir(filename, Fs):
    x , sr = librosa.load(filename , sr = Fs)
    return x

In [4]:
def readSpectrogram(infilename):
    X = np.load(infilename)
    return X

In [5]:
def zero_pad(x):
    curr_len = np.size(x)
    #Making the duration exactly 10 sec
    if curr_len > 160000:
        x = x[0:160000]
    else:
        x = np.pad(x, (0, 160000-curr_len), 'constant')
    return x

In [6]:
def calc_spec(x):
    n_fft = 1024
    hop_length = 512
    win_length = 1024
    X = np.abs(librosa.stft(x, n_fft = n_fft, hop_length = hop_length, win_length = win_length, window='hann', dtype = complex))
    X = librosa.power_to_db(X**2,ref=np.max)
    return X

In [7]:
def label_framewise(arr):
    #generate framewise one hot vectors for 3 classes(music,silence, speech)
    hop_len = 512
    win_length = 1024
    Fs = 16000
    no_of_events, x = np.shape(arr)
    x = np.array([[0,1,0]]*313)

    for i in range(no_of_events):
      start_frame = round((arr[i][0]*Fs - win_length)/(hop_len)+1)
      end_frame = round((arr[i][1]*Fs - win_length)/(hop_len)+1)
      curr_class = arr[i][2]

      if curr_class == 0: #music class
        x[int(start_frame):int(end_frame)+1] = [1,0,0]
      if curr_class == 2: #speech class
        x[int(start_frame):int(end_frame)+1] = [0,0,1]
      
      
    return x

In [8]:
def frame_to_time(m):
    hop_len = 512
    win_length = 1024
    Fs = 16000
    return ((m-1)*hop_len+win_length)/Fs

In [9]:
def randomize(x, y):
    """ Randomizes the order of data samples and their corresponding labels"""
    permutation = np.random.permutation(y.shape[0])
    shuffled_x = x[permutation, :]
    shuffled_y = y[permutation]
    return shuffled_x, shuffled_y

def get_next_batch(x, y, start, end):
    x_batch = x[start:end]
    y_batch = y[start:end]
    return x_batch, y_batch

In [10]:
epochs = 10             # Total number of training epochs
batch_size = 100        # Training batch size
display_freq = 100      # Frequency of displaying the training results
learning_rate = 0.001 

In [11]:
def weight_variable(shape):
    initer = tf.truncated_normal_initializer(stddev=0.01)
    return tf.get_variable('W',dtype=tf.float32,shape=shape,initializer=initer)

def bias_variable(shape):
    initial = tf.constant(0., shape=shape, dtype=tf.float32)
    return tf.get_variable('b',dtype=tf.float32,initializer=initial)
 

In [12]:
Fs = 16000
#folder containing all the wav files for training
curr_folder = '/content/gdrive/My Drive/coding-1/wav_folder_final'
path = os.path.join(curr_folder, '*.wav')
folder = glob(path)

file_count = 0
for file in folder:
    file_count = file_count + 1

#csv file containing the classes and their onset and offset time
csv_file = '/content/gdrive/My Drive/coding-1/wav_folder_final/labels.csv'
df = pd.read_csv(csv_file)
df = df.to_numpy()    

file_list = os.listdir(curr_folder)
file_name_list = df[:,0]

feature_per_frame = 513
frame_per_file = 313

X_train = np.zeros((feature_per_frame,frame_per_file*file_count))
Y_train = np.array([[0,1,0]]*frame_per_file*file_count)

start = 0
end = frame_per_file

i = 0
for file in folder:
    x_data = readDir(file,Fs)
    curr_len = np.size(x_data)
    #data preprocessing
    x_data = zero_pad(x_data)
    
    curr_spectrogram = calc_spec(x_data)
    X_train[:, start:end] = curr_spectrogram

    curr_file = (file_list[i])
    curr_file = curr_file[0:len(curr_file)-4]
    file_index = np.where(file_name_list == curr_file)
    event_count = np.size(file_index)
    timestamp_array = np.ones((event_count,3))
    
    for event in range(event_count):
        curr_index = file_index[0][event]
        onset = df[curr_index][1]
        offset =  df[curr_index][2]
        curr_class = df[curr_index][3]

        timestamp_array[event][0] = onset
        timestamp_array[event][1] = offset

        if curr_class == 'music':
            timestamp_array[event][2] = 0
        if curr_class == 'speech':
            timestamp_array[event][2] = 2

    curr_label = label_framewise(timestamp_array)
    Y_train[start:end,:] = curr_label

    start = end
    end = end + frame_per_file
    i = i + 1

In [13]:
#Splitting Data into training and validation
X_train=np.transpose(X_train)
x_train, x_valid, y_train, y_valid = train_test_split(X_train, Y_train, train_size=0.8)
img_size_flat = 513
n_classes=3

In [14]:
# Create the graph for the linear model
# Placeholders for inputs (x) and outputs(y)
x = tf.placeholder(tf.float32, shape=[None, img_size_flat], name='X')
y = tf.placeholder(tf.float32, shape=[None, n_classes], name='Y')

In [15]:
W = weight_variable(shape=[img_size_flat, n_classes])
b = bias_variable(shape=[n_classes])

output_logits = tf.matmul(x, W) + b

In [16]:
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=output_logits), name='loss')
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, name='Adam-op').minimize(loss)
correct_prediction = tf.equal(tf.argmax(output_logits, 1), tf.argmax(y, 1), name='correct_pred')
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name='accuracy')

# Model predictions
cls_prediction = tf.argmax(output_logits, axis=1, name='predictions')
 

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



In [17]:
# Creating the op for initializing all variables
init = tf.global_variables_initializer()

In [18]:
# Create an interactive session (to keep the session in the other cells)
sess = tf.InteractiveSession()
# Initialize all variables
sess.run(init)
# Number of training iterations in each epoch
num_tr_iter = int(len(y_train) / batch_size)
for epoch in range(epochs):
    print('Training epoch: {}'.format(epoch + 1))
    # Randomly shuffle the training data at the beginning of each epoch 
    x_train, y_train = randomize(x_train, y_train)
    for iteration in range(num_tr_iter):
        start = iteration * batch_size
        end = (iteration + 1) * batch_size
        x_batch, y_batch = get_next_batch(x_train, y_train, start, end)

        # Run optimization op (backprop)
        feed_dict_batch = {x: x_batch, y: y_batch}
        # print(feed_dict_batch)
        sess.run(optimizer, feed_dict=feed_dict_batch)

        if iteration % display_freq == 0:
            # Calculate and display the batch loss and accuracy
            loss_batch, acc_batch = sess.run([loss, accuracy],
                                             feed_dict=feed_dict_batch)

            print("iter {0:3d}:\t Loss={1:.2f},\tTraining Accuracy={2:.01%}".
                  format(iteration, loss_batch, acc_batch))

    # Run validation after every epoch
    feed_dict_valid = {x: x_valid[:1000], y: y_valid[:1000]}
    loss_valid, acc_valid = sess.run([loss, accuracy], feed_dict=feed_dict_valid)
    print('---------------------------------------------------------')
    print("Epoch: {0}, validation loss: {1:.2f}, validation accuracy: {2:.01%}".
          format(epoch + 1, loss_valid, acc_valid))
    print('---------------------------------------------------------')

Training epoch: 1
iter   0:	 Loss=11.98,	Training Accuracy=17.0%
iter 100:	 Loss=0.58,	Training Accuracy=83.0%
---------------------------------------------------------
Epoch: 1, validation loss: 0.55, validation accuracy: 79.2%
---------------------------------------------------------
Training epoch: 2
iter   0:	 Loss=0.62,	Training Accuracy=79.0%
iter 100:	 Loss=0.65,	Training Accuracy=74.0%
---------------------------------------------------------
Epoch: 2, validation loss: 0.41, validation accuracy: 88.0%
---------------------------------------------------------
Training epoch: 3
iter   0:	 Loss=0.34,	Training Accuracy=89.0%
iter 100:	 Loss=0.25,	Training Accuracy=88.0%
---------------------------------------------------------
Epoch: 3, validation loss: 0.47, validation accuracy: 85.0%
---------------------------------------------------------
Training epoch: 4
iter   0:	 Loss=0.30,	Training Accuracy=89.0%
iter 100:	 Loss=0.57,	Training Accuracy=87.0%
-------------------------------

In [19]:
#Testing
x_test, y_test = x_valid, y_valid
feed_dict_test = {x: x_test[:1000], y: y_test[:1000]}
loss_test, acc_test = sess.run([loss, accuracy], feed_dict=feed_dict_test)
print('---------------------------------------------------------')
print("Test loss: {0:.2f}, test accuracy: {1:.01%}".format(loss_test, acc_test))
print('---------------------------------------------------------')
 

---------------------------------------------------------
Test loss: 0.52, test accuracy: 87.1%
---------------------------------------------------------


In [20]:
XX=W.eval()
bb=b.eval()
np.save('/content/gdrive/My Drive/coding-1/W.npy',XX)
np.save('/content/gdrive/My Drive/coding-1/b.npy',bb)