In [13]:
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
from glob import glob
from google.colab import drive
import os
import pandas as pd
from sklearn.model_selection import train_test_split
drive.mount("/content/gdrive")
import keras
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from keras.layers import Dense, Embedding, LSTM, Dropout
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping


sr=16000

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [14]:
def readDir(filename, Fs):
    x , sr = librosa.load(filename , sr = Fs)
    return x

In [15]:
def readSpectrogram(infilename):
    X = np.load(infilename)
    return X

In [16]:
def zero_pad(x):
    curr_len = np.size(x)
    #Making the duration exactly 10 sec
    if curr_len > 160000:
        x = x[0:160000]
    else:
        x = np.pad(x, (0, 160000-curr_len), 'constant')
    return x

In [17]:
def calc_spec(x):
    n_fft = 1024
    hop_length = 512
    win_length = 1024
    X = np.abs(librosa.stft(x, n_fft = n_fft, hop_length = hop_length, win_length = win_length, window='hann', dtype = complex))
    X = librosa.power_to_db(X**2,ref=np.max)
    return X

In [18]:
def label_framewise(arr):
    #generate framewise one hot vectors for 3 classes(music,silence, speech)
    hop_len = 512
    win_length = 1024
    Fs = 16000
    no_of_events, x = np.shape(arr)
    x = np.array([[0,1,0]]*313)

    for i in range(no_of_events):
      start_frame = round((arr[i][0]*Fs - win_length)/(hop_len)+1)
      end_frame = round((arr[i][1]*Fs - win_length)/(hop_len)+1)
      curr_class = arr[i][2]

      if curr_class == 0: #music class
        x[int(start_frame):int(end_frame)+1] = [1,0,0]
      if curr_class == 2: #speech class
        x[int(start_frame):int(end_frame)+1] = [0,0,1]
      
      
    return x

In [19]:
def frame_to_time(m):
    hop_len = 512
    win_length = 1024
    Fs = 16000
    return ((m-1)*hop_len+win_length)/Fs

In [20]:

Fs = 16000
curr_folder = '/content/gdrive/My Drive/wav_folder'
path = os.path.join(curr_folder, '*.wav')
folder = glob(path)

file_count = 0
for file in folder:
    file_count = file_count + 1

csv_file = '/content/gdrive/My Drive/wav_folder/labels.csv'
df = pd.read_csv(csv_file)
df = df.to_numpy()    

file_list = os.listdir(curr_folder)
file_name_list = df[:,0]

feature_per_frame = 513
frame_per_file = 313

X_train = np.zeros((feature_per_frame,frame_per_file*file_count))
Y_train = np.array([[0,1,0]]*frame_per_file*file_count)

start = 0
end = frame_per_file

i = 0
X_train=[]
for file in folder:
    x_data = readDir(file,Fs)
    curr_len = np.size(x_data)
    #data preprocessing
    x_data = zero_pad(x_data)
    
    curr_spectrogram = calc_spec(x_data)
    X_train.append(curr_spectrogram)

    curr_file = (file_list[i])
    curr_file = curr_file[0:len(curr_file)-4]
    file_index = np.where(file_name_list == curr_file)
    event_count = np.size(file_index)
    timestamp_array = np.ones((event_count,3))
    
    for event in range(event_count):
        curr_index = file_index[0][event]
        onset = df[curr_index][1]
        offset =  df[curr_index][2]
        curr_class = df[curr_index][3]

        timestamp_array[event][0] = onset
        timestamp_array[event][1] = offset

        if curr_class == 'music':
            timestamp_array[event][2] = 0
        if curr_class == 'speech':
            timestamp_array[event][2] = 2

    curr_label = label_framewise(timestamp_array)
    Y_train[start:end,:] = curr_label

    start = end
    end = end + frame_per_file
    i = i + 1

X_train=np.array(X_train)
tt=int((np.shape(Y_train)[0]/313))
Y_train=np.reshape(Y_train,(tt,313,3))
# X_train=np.transpose(X_train)
print(np.shape(X_train))
print(np.shape(Y_train))

(30, 513, 313)
(30, 313, 3)


In [21]:
i, j, k = np.where(Y_train==1)
y = np.zeros((30,313))
y[i,j] = k
Y_train = y.astype(int)

In [22]:
x_train, x_valid, y_train, y_valid = train_test_split(X_train, Y_train, train_size=0.8)
epochs = 1
emb_dim = 128
n_most_common_words=10000
batch_size = 10

In [23]:
input_shape=(513,313)
model = keras.Sequential()
model.add(LSTM(513,input_shape=input_shape))
model.add(Dropout(0.2))
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(48, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(3, activation='softmax'))
model.summary() 

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_1 (LSTM)               (None, 513)               1697004   
                                                                 
 dropout (Dropout)           (None, 513)               0         
                                                                 
 dense (Dense)               (None, 128)               65792     
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 48)                3120      
                                                                 
 dropout_2 (Dropout)         (None, 48)               

In [24]:
model.compile(optimizer='adam',loss='SparseCategoricalCrossentropy',metrics=['acc'])
history = model.fit(x_train, y_train, epochs=50, batch_size=72,  validation_data=(x_valid, y_valid), shuffle=False)

Epoch 1/50


InvalidArgumentError: ignored

In [None]:
accr = model.evaluate(X_test,y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))