In [1]:
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
from glob import glob
from google.colab import drive
import os
import pandas as pd
# Mount Google Drive at /content/gdrive; every data path used below lives under it.
drive.mount("/content/gdrive")

# Presumably the audio sampling rate.
# NOTE(review): the visible cells define and use their own `Fs` (also 16000);
# this module-level `sr` does not appear to be read anywhere - confirm before relying on it.
sr=16000

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping

In [3]:
def readDir(filename, Fs):
    """Load one audio file as a waveform at the requested sampling rate.

    Parameters
    ----------
    filename : path of the audio file, passed straight to ``librosa.load``.
    Fs : sampling rate handed to ``librosa.load`` as ``sr``.

    Returns
    -------
    The sample array returned by ``librosa.load``; the sampling rate it
    returns alongside is discarded.
    """
    samples, _ = librosa.load(filename, sr=Fs)
    return samples

In [4]:
def readSpectrogram(infilename):
    """Return the array stored in the ``.npy`` file ``infilename`` (via ``np.load``)."""
    return np.load(infilename)

In [5]:
def zero_pad(x, target_len=160000):
    """Force a 1-D signal to exactly ``target_len`` samples.

    Longer signals are truncated; shorter ones are right-padded with zeros.

    Parameters
    ----------
    x : 1-D array-like of samples.
    target_len : desired number of samples. The default, 160000, is
        10 seconds at the 16 kHz rate used elsewhere in this notebook.

    Returns
    -------
    numpy.ndarray with ``target_len`` samples.
    """
    x = np.asarray(x)
    curr_len = np.size(x)
    if curr_len >= target_len:
        return x[:target_len]
    return np.pad(x, (0, target_len - curr_len), 'constant')

In [6]:
def calc_spec(x):
    """Compute a log-power spectrogram of the waveform ``x``.

    The STFT uses a 1024-point FFT, a 1024-sample Hann window and a hop of
    512 samples. The squared magnitude is converted to decibels with
    ``librosa.power_to_db`` using ``ref=np.max``, i.e. relative to the
    largest value in the spectrogram.
    """
    stft_settings = dict(
        n_fft=1024,
        hop_length=512,
        win_length=1024,
        window='hann',
        dtype=complex,
    )
    magnitude = np.abs(librosa.stft(x, **stft_settings))
    return librosa.power_to_db(magnitude**2, ref=np.max)

In [7]:
def label_framewise(arr, n_frames=313, hop_len=512, win_length=1024, Fs=16000):
    """Build frame-wise one-hot labels for 3 classes (music, silence, speech).

    Parameters
    ----------
    arr : 2-D array-like, one row per event: ``[onset_s, offset_s, class]``
        where class 0 is music and 2 is speech. Any other class value leaves
        the frames at the default (silence).
    n_frames : number of frames to label (313 by default).
    hop_len, win_length, Fs : framing parameters used to convert a time in
        seconds to a frame index: ``round((t*Fs - win_length)/hop_len + 1)``.

    Returns
    -------
    Integer array of shape ``(n_frames, 3)``; columns are
    ``[music, silence, speech]``. Frames not covered by any event are
    ``[0, 1, 0]``. Later events overwrite earlier ones where they overlap.
    """
    labels = np.tile(np.array([0, 1, 0]), (n_frames, 1))

    events = np.asarray(arr, dtype=float)
    if events.size == 0:
        return labels

    for onset, offset, curr_class in events[:, :3]:
        if curr_class == 0:      # music class
            one_hot = [1, 0, 0]
        elif curr_class == 2:    # speech class
            one_hot = [0, 0, 1]
        else:                    # silence / unknown: keep the default
            continue

        start_frame = int(round((onset * Fs - win_length) / hop_len + 1))
        end_frame = int(round((offset * Fs - win_length) / hop_len + 1))

        # An onset within the first window maps to frame -1. Used directly as
        # a slice bound it would wrap to the end of the array and silently
        # label nothing, so clamp both bounds to the valid frame range.
        first = max(start_frame, 0)
        stop = min(end_frame + 1, n_frames)   # end_frame is inclusive
        if stop > first:
            labels[first:stop] = one_hot

    return labels

In [8]:
def frame_to_time(m):
    """Convert frame index ``m`` to seconds.

    Computes ``((m - 1) * 512 + 1024) / 16000``: a hop of 512 samples, a
    window of 1024 samples and a sampling rate of 16000 Hz.
    """
    samples_per_second = 16000
    hop, window = 512, 1024
    sample_position = window + hop * (m - 1)
    return sample_position / samples_per_second

In [9]:
if __name__=="__main__":
    # Build the training set: one spectrogram column and one one-hot label row
    # per STFT frame, for every .wav file in the dataset folder.

    Fs = 16000
    curr_folder = '/content/gdrive/My Drive/coding-1/wav_folder_final'
    path = os.path.join(curr_folder, '*.wav')
    # Sorted so the frame order in X_train / Y_train is the same on every run.
    folder = sorted(glob(path))
    file_count = len(folder)

    csv_file = '/content/gdrive/My Drive/coding-1/wav_folder_final/labels.csv'
    df = pd.read_csv(csv_file)
    df = df.to_numpy()

    # Column 0 is compared against the wav file name without its extension;
    # columns 1, 2 and 3 are read as onset, offset and class name.
    file_name_list = df[:,0]

    feature_per_frame = 513
    frame_per_file = 313

    X_train = np.zeros((feature_per_frame, frame_per_file*file_count))
    Y_train = np.tile(np.array([0,1,0]), (frame_per_file*file_count, 1))

    # Numeric class codes understood by label_framewise; anything else is
    # encoded as 1, which label_framewise leaves as silence.
    class_codes = {'music': 0, 'speech': 2}

    for i, file in enumerate(folder):
        start = i * frame_per_file
        end = start + frame_per_file

        # data preprocessing: load, force to a fixed length, take the spectrogram
        x_data = zero_pad(readDir(file, Fs))
        X_train[:, start:end] = calc_spec(x_data)

        # Look the labels up by the name of the file actually being processed.
        # Indexing os.listdir() by position is unsafe here: that listing also
        # contains labels.csv and need not follow the glob order, so audio
        # could be paired with another file's labels.
        curr_file = os.path.splitext(os.path.basename(file))[0]
        file_index = np.where(file_name_list == curr_file)[0]
        event_count = np.size(file_index)
        timestamp_array = np.ones((event_count, 3))

        for event, curr_index in enumerate(file_index):
            timestamp_array[event][0] = df[curr_index][1]   # onset
            timestamp_array[event][1] = df[curr_index][2]   # offset
            timestamp_array[event][2] = class_codes.get(df[curr_index][3], 1)

        # A file with no rows in the CSV is labelled as silence throughout.
        Y_train[start:end, :] = label_framewise(timestamp_array)
     

In [10]:
# Keras expects one sample (frame) per row, while the dataset cell builds
# X_train as (features, frames). Transpose only while the row count still
# disagrees with Y_train, so re-running this cell does not flip the matrix
# back and build a model with the wrong input size.
if X_train.shape[0] != Y_train.shape[0]:
    X_train = np.transpose(X_train)

# Small fully connected classifier: one hidden ReLU layer, softmax over the
# three one-hot classes.
model = Sequential()
model.add(Dense(16, input_shape=(X_train.shape[1],), activation='relu'))
model.add(Dense(3, activation='softmax'))
model.summary()

model.compile(optimizer='adam', loss='categorical_crossentropy',metrics=['accuracy'])

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 16)                8224      
                                                                 
 dense_1 (Dense)             (None, 3)                 51        
                                                                 
Total params: 8,275
Trainable params: 8,275
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Stop when the validation loss has not improved for 10 epochs and restore
# the best weights; the epoch count is only an upper bound.
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=10,
    restore_best_weights=True,
)
history = model.fit(
    X_train,
    Y_train,
    validation_split=0.1,
    shuffle=True,
    batch_size=32,
    epochs=8000,
    callbacks=[es],
    verbose=1,
)

Epoch 1/8000
Epoch 2/8000
Epoch 3/8000
Epoch 4/8000
Epoch 5/8000
Epoch 6/8000
Epoch 7/8000
Epoch 8/8000
Epoch 9/8000
Epoch 10/8000
Epoch 11/8000
Epoch 12/8000
Epoch 13/8000
Epoch 14/8000


In [12]:
# Sample test spectrogram whose ground truth is known.
# `array_spec` and `z` both refer to the loaded array.
array_spec = readSpectrogram('/content/gdrive/My Drive/coding-1/spectrogram_folder/test3.npy')
z = array_spec

In [13]:
# Convert the stored spectrogram (presumably in dB) back to linear power.
# Derived from the untouched `array_spec` rather than from `z` itself, so
# re-running this cell does not apply the conversion a second time.
z = 10**(array_spec/10)
# Total power per frame (sum over axis 0), thresholded into a binary mask.
s = np.sum(z, axis=0)
loc = np.where(s<0.001, 0, 1) # 1- Not Silent, 0- Silent

In [14]:
#TASK 1 - Event Detection
# Find runs of consecutive non-silent frames in `loc`.
# The mask is padded with one silent frame on each side so that a run touching
# the first or the last frame still has a rising and a falling edge. Without
# this, an event lasting until the end of the clip would never be closed and
# would be dropped.
padded = np.concatenate(([0], np.asarray(loc).astype(int), [0]))
edges = np.diff(padded)
run_starts = np.flatnonzero(edges == 1)       # first non-silent frame of each run
run_ends = np.flatnonzero(edges == -1) - 1    # last non-silent frame (inclusive)

# Runs of a single frame are skipped.
list1 = [[int(first), int(last)]
         for first, last in zip(run_starts, run_ends)
         if last > first]

event_count = len(list1)
arr2 = np.zeros((event_count,2))
for i, (first, last) in enumerate(list1):
  arr2[i][0] = frame_to_time(first)   # onset
  arr2[i][1] = frame_to_time(last)    # offset
print(list1)
print(arr2)
#arr2 contains the onset and offset times for every detected event in the clip

[[3, 118], [181, 302]]
[[0.128 3.808]
 [5.824 9.696]]


In [15]:
#TASK 2 - Audio Classification
# Class names in the column order of the one-hot training labels:
# [1,0,0] music, [0,1,0] silence, [0,0,1] speech.
class_names = ['music', 'silence', 'speech']

pred=model.predict(np.transpose(array_spec))
output=[]
for first, last in list1:
  # Intervals in list1 are inclusive, so the slice must reach last + 1;
  # otherwise the final frame of every event is left out of the vote.
  scores=np.sum(pred[first:last+1],axis=0)
  cat=class_names[int(np.argmax(scores))]
  output.append([frame_to_time(first),frame_to_time(last),cat])
print(output)

[[0.128, 3.808, 'music'], [5.824, 9.696, 'music']]


In [16]:
# Persist the trained model under the project folder on the mounted Drive.
# NOTE(review): the target is the project root directory itself, so the saved
# files land next to the data folders - consider a dedicated subfolder.
model.save('/content/gdrive/My Drive/coding-1/')

INFO:tensorflow:Assets written to: /content/gdrive/My Drive/coding-1/assets
