<a href="https://colab.research.google.com/github/xychong/edgeaimonitoring/blob/main/Preprocessing/Train_and_Test_Split.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import librosa, librosa.display
import numpy as np
import os
import IPython
import pandas as pd

# make plot outputs appear and be stored within notebook
%matplotlib inline

import sklearn
import matplotlib.pyplot as plt

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [None]:
# Mount Google Drive inside the Colab VM; every data path used below lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Folder on the mounted Drive that holds one sub-folder of clips per sound class.
# Defined once so that relocating the dataset is a single edit.
DATA_ROOT = "/content/drive/MyDrive/FYP Data/Data (2500)"

pathname_ambience = DATA_ROOT + "/Ambience"
pathname_footsteps = DATA_ROOT + "/Footsteps"
pathname_horn = DATA_ROOT + "/Horn"
pathname_music = DATA_ROOT + "/Music"
pathname_shout = DATA_ROOT + "/Shout"

### Ambience

In [None]:
# Empty frame with the three columns used for the ambience clips.
df_melspec_ambience = pd.DataFrame(
    columns=["Class", "Filename", "Log Mel Spectrogram"]
)

In [None]:
# Build one 3-channel log-mel "image" per ambience clip.
# - STFT window (n_fft) = 2048 samples.
# - Number of time frames ~= sample_rate / hop_length * duration, so hop_length is
#   chosen per clip to land on roughly 224 frames (the exact count depends on rounding).
# - The spectrum is pooled into 224 mel bands, i.e. bands spaced evenly on the
#   perceptual mel scale rather than evenly in Hz.
# Rows are collected in a list and converted to a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
ambience_rows = []
# sorted(): os.listdir returns entries in arbitrary order, which made the row order run-dependent.
for filename in sorted(os.listdir(pathname_ambience)):
  audiopath = os.path.join(pathname_ambience, filename)
  # sr is passed by keyword because librosa >= 0.10 no longer accepts it positionally.
  y, sr = librosa.load(audiopath, sr=44100)
  # Clip length in seconds. The previous get_duration(y)/2 gave the same number only
  # because get_duration's default sr (22050) happens to be half of 44100.
  duration = librosa.get_duration(y=y, sr=sr)
  hop_len = round(sr/(224/duration))
  # Minimum frequency 20 Hz; maximum frequency defaults to sr/2.
  mel_spect = librosa.feature.melspectrogram(y=y, sr=sr, n_fft = 2048, hop_length = hop_len, n_mels=224, fmin=20)
  # Convert power to dB relative to the loudest bin of this clip.
  log_mel_spect = librosa.power_to_db(mel_spect, ref=np.max)
  # Repeat the single channel three times along a new last axis.
  log_mel_spect_newdim = np.repeat(log_mel_spect[:,:, np.newaxis], 3, axis =2)
  ambience_rows.append({'Class': "Ambience",'Filename': filename,'Log Mel Spectrogram': log_mel_spect_newdim})

df_melspec_ambience = pd.DataFrame(ambience_rows, columns = ['Class', 'Filename', 'Log Mel Spectrogram'])

In [None]:
# Preview the first few ambience rows to confirm the frame was filled.
df_melspec_ambience.head()

Unnamed: 0,Class,Filename,Log Mel Spectrogram
0,Ambience,ambience (300).wav,"[[[-23.015066, -23.015066, -23.015066], [-36.7..."
1,Ambience,ambience (317).wav,"[[[-35.291317, -35.291317, -35.291317], [-37.1..."
2,Ambience,ambience (308).wav,"[[[-28.944914, -28.944914, -28.944914], [-33.7..."
3,Ambience,ambience (307).wav,"[[[-41.734478, -41.734478, -41.734478], [-45.7..."
4,Ambience,ambience (314).wav,"[[[-40.293793, -40.293793, -40.293793], [-38.5..."


In [None]:
# Stack the per-clip ambience spectrograms into one NumPy array.
melspec_list = np.array(
    df_melspec_ambience['Log Mel Spectrogram'].tolist()
)

In [None]:
# MobileNetv2 Input Shape
# (number of mel bins, number of features, number of channels)
# Check every ambience spectrogram in one pass instead of printing one line per clip.
expected_shape = (224, 224, 3)
count = len(melspec_list)
mismatched = [idx for idx, item in enumerate(melspec_list) if item.shape != expected_shape]
assert not mismatched, (
    f"{len(mismatched)} of {count} spectrograms are not {expected_shape}; "
    f"first offending positions: {mismatched[:10]}"
)
print(f"{count} spectrograms, each of shape {expected_shape}")

(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 

In [None]:
# Hold out 20% of the ambience clips for testing. Each class is split on its own,
# so every class gets the same 80/20 ratio.
# random_state pins the shuffle so a fresh run reproduces the same split.
df_input = df_melspec_ambience[['Filename','Log Mel Spectrogram']]
train_ambience, test_ambience = train_test_split(df_input, test_size = 0.2, shuffle = True, random_state = 42)
train_ambience

Unnamed: 0,Filename,Log Mel Spectrogram
367,ambience (454).wav,"[[[-18.808973, -18.808973, -18.808973], [-15.6..."
172,ambience (137).wav,"[[[-24.659868, -24.659868, -24.659868], [-23.1..."
391,ambience (437).wav,"[[[-10.943905, -10.943905, -10.943905], [-8.67..."
24,ambience (295).wav,"[[[-24.503147, -24.503147, -24.503147], [-33.1..."
378,ambience (474).wav,"[[[-16.491426, -16.491426, -16.491426], [-15.7..."
...,...,...
111,ambience (202).wav,"[[[-35.981052, -35.981052, -35.981052], [-44.0..."
314,ambience (324).wav,"[[[-13.557112, -13.557112, -13.557112], [-14.6..."
310,ambience (5).wav,"[[[-8.872603, -8.872603, -8.872603], [-14.4596..."
411,ambience (422).wav,"[[[-26.567402, -26.567402, -26.567402], [-17.0..."


In [None]:
# Display the held-out ambience rows produced by the split above.
test_ambience

Unnamed: 0,Filename,Log Mel Spectrogram
14,ambience (319).wav,"[[[-37.62004, -37.62004, -37.62004], [-32.8695..."
174,ambience (151).wav,"[[[-18.781565, -18.781565, -18.781565], [-21.1..."
473,ambience (350).wav,"[[[-9.855973, -9.855973, -9.855973], [-14.2419..."
149,ambience (164).wav,"[[[-20.290745, -20.290745, -20.290745], [-11.1..."
289,ambience (39).wav,"[[[-14.63257, -14.63257, -14.63257], [-15.7625..."
...,...,...
399,ambience (431).wav,"[[[-21.115847, -21.115847, -21.115847], [-21.7..."
280,ambience (38).wav,"[[[-10.878064, -10.878064, -10.878064], [-5.91..."
197,ambience (128).wav,"[[[-35.569466, -35.569466, -35.569466], [-39.1..."
10,ambience (315).wav,"[[[-27.22994, -27.22994, -27.22994], [-35.6758..."


### Footsteps

In [None]:
# Empty frame with the three columns used for the footsteps clips.
df_melspec_footsteps = pd.DataFrame(
    columns=["Class", "Filename", "Log Mel Spectrogram"]
)

In [None]:
# Build one 3-channel log-mel "image" per footsteps clip.
# - STFT window (n_fft) = 2048 samples.
# - Number of time frames ~= sample_rate / hop_length * duration, so hop_length is
#   chosen per clip to land on roughly 224 frames (the exact count depends on rounding).
# - The spectrum is pooled into 224 mel bands, i.e. bands spaced evenly on the
#   perceptual mel scale rather than evenly in Hz.
# Rows are collected in a list and converted to a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
footsteps_rows = []
# sorted(): os.listdir returns entries in arbitrary order, which made the row order run-dependent.
for filename in sorted(os.listdir(pathname_footsteps)):
  audiopath = os.path.join(pathname_footsteps, filename)
  # sr is passed by keyword because librosa >= 0.10 no longer accepts it positionally.
  y, sr = librosa.load(audiopath, sr=44100)
  # Clip length in seconds. The previous get_duration(y)/2 gave the same number only
  # because get_duration's default sr (22050) happens to be half of 44100.
  duration = librosa.get_duration(y=y, sr=sr)
  hop_len = round(sr/(224/duration))
  # Minimum frequency 20 Hz; maximum frequency defaults to sr/2.
  mel_spect = librosa.feature.melspectrogram(y=y, sr=sr, n_fft = 2048, hop_length = hop_len, n_mels=224, fmin=20)
  # Convert power to dB relative to the loudest bin of this clip.
  log_mel_spect = librosa.power_to_db(mel_spect, ref=np.max)
  # Repeat the single channel three times along a new last axis.
  log_mel_spect_newdim = np.repeat(log_mel_spect[:,:, np.newaxis], 3, axis =2)
  footsteps_rows.append({'Class': "Footsteps",'Filename': filename,'Log Mel Spectrogram': log_mel_spect_newdim})

df_melspec_footsteps = pd.DataFrame(footsteps_rows, columns = ['Class', 'Filename', 'Log Mel Spectrogram'])

In [None]:
# Preview the first few footsteps rows to confirm the frame was filled.
df_melspec_footsteps.head()

Unnamed: 0,Class,Filename,Log Mel Spectrogram
0,Footsteps,footsteps (10).wav,"[[[-49.480644, -49.480644, -49.480644], [-42.0..."
1,Footsteps,footsteps (9).wav,"[[[-36.579136, -36.579136, -36.579136], [-46.3..."
2,Footsteps,footsteps (346).wav,"[[[-32.801277, -32.801277, -32.801277], [-23.4..."
3,Footsteps,footsteps (13).wav,"[[[-44.793125, -44.793125, -44.793125], [-42.6..."
4,Footsteps,footsteps (340).wav,"[[[-47.264633, -47.264633, -47.264633], [-40.9..."


In [None]:
# Stack the per-clip footsteps spectrograms into one NumPy array.
melspec_list = np.array(
    df_melspec_footsteps['Log Mel Spectrogram'].tolist()
)

In [None]:
# MobileNetv2 Input Shape
# (number of mel bins, number of features, number of channels)
# Check every footsteps spectrogram in one pass instead of printing one line per clip.
expected_shape = (224, 224, 3)
count = len(melspec_list)
mismatched = [idx for idx, item in enumerate(melspec_list) if item.shape != expected_shape]
assert not mismatched, (
    f"{len(mismatched)} of {count} spectrograms are not {expected_shape}; "
    f"first offending positions: {mismatched[:10]}"
)
print(f"{count} spectrograms, each of shape {expected_shape}")

(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 

In [None]:
# Hold out 20% of the footsteps clips for testing. Each class is split on its own,
# so every class gets the same 80/20 ratio.
# random_state pins the shuffle so a fresh run reproduces the same split.
df_input = df_melspec_footsteps[['Filename','Log Mel Spectrogram']]
train_footsteps, test_footsteps = train_test_split(df_input, test_size = 0.2, shuffle = True, random_state = 42)
train_footsteps

Unnamed: 0,Filename,Log Mel Spectrogram
226,footsteps (44).wav,"[[[-22.833313, -22.833313, -22.833313], [-23.4..."
180,footsteps (73).wav,"[[[-38.709854, -38.709854, -38.709854], [-29.2..."
115,footsteps (175).wav,"[[[-26.410227, -26.410227, -26.410227], [-29.0..."
347,footsteps (496).wav,"[[[-43.380745, -43.380745, -43.380745], [-29.5..."
488,footsteps (15).wav,"[[[-48.47395, -48.47395, -48.47395], [-29.5890..."
...,...,...
342,footsteps (472).wav,"[[[-33.339066, -33.339066, -33.339066], [-28.9..."
235,footsteps (166).wav,"[[[-63.121853, -63.121853, -63.121853], [-54.8..."
135,footsteps (126).wav,"[[[-11.822969, -11.822969, -11.822969], [-11.7..."
313,footsteps (314).wav,"[[[-45.166862, -45.166862, -45.166862], [-36.0..."


In [None]:
# Display the held-out footsteps rows produced by the split above.
test_footsteps

Unnamed: 0,Filename,Log Mel Spectrogram
49,footsteps (304).wav,"[[[-29.66637, -29.66637, -29.66637], [-36.1903..."
348,footsteps (434).wav,"[[[-16.428417, -16.428417, -16.428417], [-29.5..."
298,footsteps (267).wav,"[[[-34.04514, -34.04514, -34.04514], [-22.7396..."
240,footsteps (51).wav,"[[[-46.687088, -46.687088, -46.687088], [-48.5..."
466,footsteps (410).wav,"[[[-63.137775, -63.137775, -63.137775], [-67.9..."
...,...,...
401,footsteps (455).wav,"[[[-27.651564, -27.651564, -27.651564], [-35.5..."
329,footsteps (493).wav,"[[[-5.7827263, -5.7827263, -5.7827263], [-7.03..."
35,footsteps (326).wav,"[[[-6.0533, -6.0533, -6.0533], [-10.223169, -1..."
202,footsteps (20).wav,"[[[-14.047472, -14.047472, -14.047472], [-11.7..."


### Horn

In [None]:
# Empty frame with the three columns used for the horn clips.
df_melspec_horn = pd.DataFrame(
    columns=["Class", "Filename", "Log Mel Spectrogram"]
)

In [None]:
# Build one 3-channel log-mel "image" per horn clip.
# - STFT window (n_fft) = 2048 samples.
# - Number of time frames ~= sample_rate / hop_length * duration, so hop_length is
#   chosen per clip to land on roughly 224 frames (the exact count depends on rounding).
# - The spectrum is pooled into 224 mel bands, i.e. bands spaced evenly on the
#   perceptual mel scale rather than evenly in Hz.
# Rows are collected in a list and converted to a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
horn_rows = []
# sorted(): os.listdir returns entries in arbitrary order, which made the row order run-dependent.
for filename in sorted(os.listdir(pathname_horn)):
  audiopath = os.path.join(pathname_horn, filename)
  # sr is passed by keyword because librosa >= 0.10 no longer accepts it positionally.
  y, sr = librosa.load(audiopath, sr=44100)
  # Clip length in seconds. The previous get_duration(y)/2 gave the same number only
  # because get_duration's default sr (22050) happens to be half of 44100.
  duration = librosa.get_duration(y=y, sr=sr)
  hop_len = round(sr/(224/duration))
  # Minimum frequency 20 Hz; maximum frequency defaults to sr/2.
  mel_spect = librosa.feature.melspectrogram(y=y, sr=sr, n_fft = 2048, hop_length = hop_len, n_mels=224, fmin=20)
  # Convert power to dB relative to the loudest bin of this clip.
  log_mel_spect = librosa.power_to_db(mel_spect, ref=np.max)
  # Repeat the single channel three times along a new last axis.
  log_mel_spect_newdim = np.repeat(log_mel_spect[:,:, np.newaxis], 3, axis =2)
  horn_rows.append({'Class': "Horn",'Filename': filename,'Log Mel Spectrogram': log_mel_spect_newdim})

df_melspec_horn = pd.DataFrame(horn_rows, columns = ['Class', 'Filename', 'Log Mel Spectrogram'])

In [None]:
# Preview the first few horn rows to confirm the frame was filled.
df_melspec_horn.head()

Unnamed: 0,Class,Filename,Log Mel Spectrogram
0,Horn,horn (28).wav,"[[[-80.0, -80.0, -80.0], [-80.0, -80.0, -80.0]..."
1,Horn,horn (20).wav,"[[[-80.0, -80.0, -80.0], [-80.0, -80.0, -80.0]..."
2,Horn,horn (39).wav,"[[[-64.84104, -64.84104, -64.84104], [-60.9975..."
3,Horn,horn (19).wav,"[[[-74.74084, -74.74084, -74.74084], [-63.9060..."
4,Horn,horn (43).wav,"[[[-27.015419, -27.015419, -27.015419], [-28.3..."


In [None]:
# Stack the per-clip horn spectrograms into one NumPy array.
melspec_list = np.array(
    df_melspec_horn['Log Mel Spectrogram'].tolist()
)

In [None]:
# MobileNetv2 Input Shape
# (number of mel bins, number of features, number of channels)
# Check every horn spectrogram in one pass instead of printing one line per clip.
expected_shape = (224, 224, 3)
count = len(melspec_list)
mismatched = [idx for idx, item in enumerate(melspec_list) if item.shape != expected_shape]
assert not mismatched, (
    f"{len(mismatched)} of {count} spectrograms are not {expected_shape}; "
    f"first offending positions: {mismatched[:10]}"
)
print(f"{count} spectrograms, each of shape {expected_shape}")

(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 

In [None]:
# Hold out 20% of the horn clips for testing. Each class is split on its own,
# so every class gets the same 80/20 ratio.
# random_state pins the shuffle so a fresh run reproduces the same split.
df_input = df_melspec_horn[['Filename','Log Mel Spectrogram']]
train_horn, test_horn = train_test_split(df_input, test_size = 0.2, shuffle = True, random_state = 42)
train_horn

Unnamed: 0,Filename,Log Mel Spectrogram
293,horn (325).wav,"[[[-80.0, -80.0, -80.0], [-72.25369, -72.25369..."
239,horn (374).wav,"[[[-34.23175, -34.23175, -34.23175], [-37.2089..."
315,horn (321).wav,"[[[-37.045506, -37.045506, -37.045506], [-36.1..."
52,horn (230).wav,"[[[-21.015245, -21.015245, -21.015245], [-22.8..."
64,horn (192).wav,"[[[-14.522879, -14.522879, -14.522879], [-15.5..."
...,...,...
392,horn (455).wav,"[[[-49.756996, -49.756996, -49.756996], [-43.6..."
415,horn (436).wav,"[[[-58.35489, -58.35489, -58.35489], [-53.3003..."
409,horn (426).wav,"[[[-58.52715, -58.52715, -58.52715], [-66.2471..."
270,horn (367).wav,"[[[-55.817078, -55.817078, -55.817078], [-67.2..."


In [None]:
# Display the held-out horn rows produced by the split above.
test_horn

Unnamed: 0,Filename,Log Mel Spectrogram
433,horn (282).wav,"[[[-80.0, -80.0, -80.0], [-80.0, -80.0, -80.0]..."
349,horn (243).wav,"[[[-41.574165, -41.574165, -41.574165], [-24.7..."
181,horn (96).wav,"[[[-40.018932, -40.018932, -40.018932], [-41.6..."
12,horn (30).wav,"[[[-80.0, -80.0, -80.0], [-80.0, -80.0, -80.0]..."
275,horn (350).wav,"[[[-42.70248, -42.70248, -42.70248], [-37.5801..."
...,...,...
214,horn (405).wav,"[[[-26.634985, -26.634985, -26.634985], [-22.5..."
149,horn (72).wav,"[[[-31.487461, -31.487461, -31.487461], [-33.3..."
479,horn (15).wav,"[[[-44.652252, -44.652252, -44.652252], [-41.9..."
137,horn (53).wav,"[[[-30.61274, -30.61274, -30.61274], [-18.2317..."


### Music

In [None]:
# Empty frame with the three columns used for the music clips.
df_melspec_music = pd.DataFrame(
    columns=["Class", "Filename", "Log Mel Spectrogram"]
)

In [None]:
# Build one 3-channel log-mel "image" per music clip.
# - STFT window (n_fft) = 2048 samples.
# - Number of time frames ~= sample_rate / hop_length * duration, so hop_length is
#   chosen per clip to land on roughly 224 frames (the exact count depends on rounding).
# - The spectrum is pooled into 224 mel bands, i.e. bands spaced evenly on the
#   perceptual mel scale rather than evenly in Hz.
# Rows are collected in a list and converted to a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
music_rows = []
# sorted(): os.listdir returns entries in arbitrary order, which made the row order run-dependent.
for filename in sorted(os.listdir(pathname_music)):
  audiopath = os.path.join(pathname_music, filename)
  # sr is passed by keyword because librosa >= 0.10 no longer accepts it positionally.
  y, sr = librosa.load(audiopath, sr=44100)
  # Clip length in seconds. The previous get_duration(y)/2 gave the same number only
  # because get_duration's default sr (22050) happens to be half of 44100.
  duration = librosa.get_duration(y=y, sr=sr)
  hop_len = round(sr/(224/duration))
  # Minimum frequency 20 Hz; maximum frequency defaults to sr/2.
  mel_spect = librosa.feature.melspectrogram(y=y, sr=sr, n_fft = 2048, hop_length = hop_len, n_mels=224, fmin=20)
  # Convert power to dB relative to the loudest bin of this clip.
  log_mel_spect = librosa.power_to_db(mel_spect, ref=np.max)
  # Repeat the single channel three times along a new last axis.
  log_mel_spect_newdim = np.repeat(log_mel_spect[:,:, np.newaxis], 3, axis =2)
  music_rows.append({'Class': "Music",'Filename': filename,'Log Mel Spectrogram': log_mel_spect_newdim})

df_melspec_music = pd.DataFrame(music_rows, columns = ['Class', 'Filename', 'Log Mel Spectrogram'])

In [None]:
# Preview the first few music rows to confirm the frame was filled.
df_melspec_music.head()

Unnamed: 0,Class,Filename,Log Mel Spectrogram
0,Music,music (15).wav,"[[[-33.532093, -33.532093, -33.532093], [-52.8..."
1,Music,music (14).wav,"[[[-22.662294, -22.662294, -22.662294], [-44.8..."
2,Music,music (11).wav,"[[[-28.591068, -28.591068, -28.591068], [-47.6..."
3,Music,music (12).wav,"[[[-28.80003, -28.80003, -28.80003], [-45.9442..."
4,Music,music (10).wav,"[[[-49.95333, -49.95333, -49.95333], [-58.5572..."


In [None]:
# Stack the per-clip music spectrograms into one NumPy array.
melspec_list = np.array(
    df_melspec_music['Log Mel Spectrogram'].tolist()
)

In [None]:
# MobileNetv2 Input Shape
# (number of mel bins, number of features, number of channels)
# Check every music spectrogram in one pass instead of printing one line per clip.
expected_shape = (224, 224, 3)
count = len(melspec_list)
mismatched = [idx for idx, item in enumerate(melspec_list) if item.shape != expected_shape]
assert not mismatched, (
    f"{len(mismatched)} of {count} spectrograms are not {expected_shape}; "
    f"first offending positions: {mismatched[:10]}"
)
print(f"{count} spectrograms, each of shape {expected_shape}")

(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 

In [None]:
# Hold out 20% of the music clips for testing. Each class is split on its own,
# so every class gets the same 80/20 ratio.
# random_state pins the shuffle so a fresh run reproduces the same split.
df_input = df_melspec_music[['Filename','Log Mel Spectrogram']]
train_music, test_music = train_test_split(df_input, test_size = 0.2, shuffle = True, random_state = 42)
train_music

Unnamed: 0,Filename,Log Mel Spectrogram
224,music (105).wav,"[[[-29.0168, -29.0168, -29.0168], [-22.63326, ..."
271,music (52).wav,"[[[-35.202976, -35.202976, -35.202976], [-35.6..."
105,music (201).wav,"[[[-48.57589, -48.57589, -48.57589], [-46.6938..."
442,music (488).wav,"[[[-35.036453, -35.036453, -35.036453], [-34.7..."
115,music (187).wav,"[[[-34.6275, -34.6275, -34.6275], [-37.05587, ..."
...,...,...
465,music (472).wav,"[[[-43.93062, -43.93062, -43.93062], [-30.7873..."
475,music (471).wav,"[[[-25.745617, -25.745617, -25.745617], [-25.1..."
23,music (342).wav,"[[[-35.684135, -35.684135, -35.684135], [-47.5..."
11,music (8).wav,"[[[-28.724081, -28.724081, -28.724081], [-47.4..."


In [None]:
# Display the held-out music rows produced by the split above.
test_music

Unnamed: 0,Filename,Log Mel Spectrogram
137,music (285).wav,"[[[-23.118753, -23.118753, -23.118753], [-27.3..."
279,music (119).wav,"[[[-25.98876, -25.98876, -25.98876], [-28.8637..."
265,music (69).wav,"[[[-18.60705, -18.60705, -18.60705], [-26.7759..."
76,music (217).wav,"[[[-14.844997, -14.844997, -14.844997], [-13.4..."
418,music (404).wav,"[[[-42.039185, -42.039185, -42.039185], [-38.5..."
...,...,...
448,music (448).wav,"[[[-45.32652, -45.32652, -45.32652], [-42.8028..."
367,music (395).wav,"[[[-34.596947, -34.596947, -34.596947], [-33.5..."
311,music (124).wav,"[[[-6.4076962, -6.4076962, -6.4076962], [-4.42..."
139,music (194).wav,"[[[-45.00541, -45.00541, -45.00541], [-61.7152..."


### Shout

In [None]:
# Empty frame with the three columns used for the shout clips.
df_melspec_shout = pd.DataFrame(
    columns=["Class", "Filename", "Log Mel Spectrogram"]
)

In [None]:
# Build one 3-channel log-mel "image" per shout clip.
# - STFT window (n_fft) = 2048 samples.
# - Number of time frames ~= sample_rate / hop_length * duration, so hop_length is
#   chosen per clip to land on roughly 224 frames (the exact count depends on rounding).
# - The spectrum is pooled into 224 mel bands, i.e. bands spaced evenly on the
#   perceptual mel scale rather than evenly in Hz.
# Rows are collected in a list and converted to a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
shout_rows = []
# sorted(): os.listdir returns entries in arbitrary order, which made the row order run-dependent.
for filename in sorted(os.listdir(pathname_shout)):
  audiopath = os.path.join(pathname_shout, filename)
  # sr is passed by keyword because librosa >= 0.10 no longer accepts it positionally.
  y, sr = librosa.load(audiopath, sr=44100)
  # Clip length in seconds. The previous get_duration(y)/2 gave the same number only
  # because get_duration's default sr (22050) happens to be half of 44100.
  duration = librosa.get_duration(y=y, sr=sr)
  hop_len = round(sr/(224/duration))
  # Minimum frequency 20 Hz; maximum frequency defaults to sr/2.
  mel_spect = librosa.feature.melspectrogram(y=y, sr=sr, n_fft = 2048, hop_length = hop_len, n_mels=224, fmin=20)
  # Convert power to dB relative to the loudest bin of this clip.
  log_mel_spect = librosa.power_to_db(mel_spect, ref=np.max)
  # Repeat the single channel three times along a new last axis.
  log_mel_spect_newdim = np.repeat(log_mel_spect[:,:, np.newaxis], 3, axis =2)
  shout_rows.append({'Class': "Shout",'Filename': filename,'Log Mel Spectrogram': log_mel_spect_newdim})

df_melspec_shout = pd.DataFrame(shout_rows, columns = ['Class', 'Filename', 'Log Mel Spectrogram'])

In [None]:
# Preview the first few shout rows to confirm the frame was filled.
df_melspec_shout.head()

Unnamed: 0,Class,Filename,Log Mel Spectrogram
0,Shout,shout (251).wav,"[[[-48.21131, -48.21131, -48.21131], [-42.7343..."
1,Shout,shout (248).wav,"[[[-58.597908, -58.597908, -58.597908], [-45.8..."
2,Shout,shout (266).wav,"[[[-76.53251, -76.53251, -76.53251], [-80.0, -..."
3,Shout,shout (270).wav,"[[[-80.0, -80.0, -80.0], [-80.0, -80.0, -80.0]..."
4,Shout,shout (240).wav,"[[[-38.6855, -38.6855, -38.6855], [-38.668934,..."


In [None]:
# Stack the per-clip shout spectrograms into one NumPy array.
melspec_list = np.array(
    df_melspec_shout['Log Mel Spectrogram'].tolist()
)

In [None]:
# MobileNetv2 Input Shape
# (number of mel bins, number of features, number of channels)
# Check every shout spectrogram in one pass instead of printing one line per clip.
expected_shape = (224, 224, 3)
count = len(melspec_list)
mismatched = [idx for idx, item in enumerate(melspec_list) if item.shape != expected_shape]
assert not mismatched, (
    f"{len(mismatched)} of {count} spectrograms are not {expected_shape}; "
    f"first offending positions: {mismatched[:10]}"
)
print(f"{count} spectrograms, each of shape {expected_shape}")

(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 

In [None]:
# Hold out 20% of the shout clips for testing. Each class is split on its own,
# so every class gets the same 80/20 ratio.
# random_state pins the shuffle so a fresh run reproduces the same split.
df_input = df_melspec_shout[['Filename','Log Mel Spectrogram']]
train_shout, test_shout = train_test_split(df_input, test_size = 0.2, shuffle = True, random_state = 42)
train_shout

Unnamed: 0,Filename,Log Mel Spectrogram
434,shout (119).wav,"[[[-31.886131, -31.886131, -31.886131], [-32.8..."
338,shout (144).wav,"[[[-49.19461, -49.19461, -49.19461], [-53.5209..."
102,shout (355).wav,"[[[-32.439552, -32.439552, -32.439552], [-45.9..."
391,shout (93).wav,"[[[-34.62589, -34.62589, -34.62589], [-21.5155..."
396,shout (75).wav,"[[[-45.09734, -45.09734, -45.09734], [-59.3543..."
...,...,...
449,shout (225).wav,"[[[-35.96278, -35.96278, -35.96278], [-44.1042..."
342,shout (40).wav,"[[[-64.51155, -64.51155, -64.51155], [-69.0053..."
482,shout (21).wav,"[[[-45.40621, -45.40621, -45.40621], [-64.4320..."
356,shout (219).wav,"[[[-80.0, -80.0, -80.0], [-80.0, -80.0, -80.0]..."


In [None]:
# Display the held-out shout rows produced by the split above.
test_shout

Unnamed: 0,Filename,Log Mel Spectrogram
275,shout (190).wav,"[[[-74.65285, -74.65285, -74.65285], [-64.4224..."
336,shout (155).wav,"[[[-40.823666, -40.823666, -40.823666], [-41.0..."
205,shout (391).wav,"[[[-80.0, -80.0, -80.0], [-80.0, -80.0, -80.0]..."
381,shout (106).wav,"[[[-38.521156, -38.521156, -38.521156], [-49.5..."
282,shout (145).wav,"[[[-29.143269, -29.143269, -29.143269], [-33.3..."
...,...,...
491,shout (18).wav,"[[[-52.23402, -52.23402, -52.23402], [-63.3092..."
256,shout (491).wav,"[[[-41.33486, -41.33486, -41.33486], [-40.7047..."
287,shout (181).wav,"[[[-57.507534, -57.507534, -57.507534], [-70.2..."
222,shout (422).wav,"[[[-69.83048, -69.83048, -69.83048], [-39.1142..."


### Merge all train dataframes into one dataframe

In [None]:
# Concatenate the per-class training frames in class order
# (ambience, footsteps, horn, music, shout). The label cells further down rely on
# the filenames, so the original per-class row indexes are kept as they are.
# pd.concat replaces the chained DataFrame.append calls, which pandas 2.0 removed.
df_train = pd.concat([train_ambience, train_footsteps, train_horn, train_music, train_shout])
df_train

Unnamed: 0,Filename,Log Mel Spectrogram
367,ambience (454).wav,"[[[-18.808973, -18.808973, -18.808973], [-15.6..."
172,ambience (137).wav,"[[[-24.659868, -24.659868, -24.659868], [-23.1..."
391,ambience (437).wav,"[[[-10.943905, -10.943905, -10.943905], [-8.67..."
24,ambience (295).wav,"[[[-24.503147, -24.503147, -24.503147], [-33.1..."
378,ambience (474).wav,"[[[-16.491426, -16.491426, -16.491426], [-15.7..."
...,...,...
449,shout (225).wav,"[[[-35.96278, -35.96278, -35.96278], [-44.1042..."
342,shout (40).wav,"[[[-64.51155, -64.51155, -64.51155], [-69.0053..."
482,shout (21).wav,"[[[-45.40621, -45.40621, -45.40621], [-64.4320..."
356,shout (219).wav,"[[[-80.0, -80.0, -80.0], [-80.0, -80.0, -80.0]..."


In [None]:
# Drop the filename column so that only the spectrograms remain.
df_train_final = df_train.drop(columns=['Filename'])
df_train_final

Unnamed: 0,Log Mel Spectrogram
367,"[[[-18.808973, -18.808973, -18.808973], [-15.6..."
172,"[[[-24.659868, -24.659868, -24.659868], [-23.1..."
391,"[[[-10.943905, -10.943905, -10.943905], [-8.67..."
24,"[[[-24.503147, -24.503147, -24.503147], [-33.1..."
378,"[[[-16.491426, -16.491426, -16.491426], [-15.7..."
...,...
449,"[[[-35.96278, -35.96278, -35.96278], [-44.1042..."
342,"[[[-64.51155, -64.51155, -64.51155], [-69.0053..."
482,"[[[-45.40621, -45.40621, -45.40621], [-64.4320..."
356,"[[[-80.0, -80.0, -80.0], [-80.0, -80.0, -80.0]..."


### Merge all test dataframes into one dataframe

In [None]:
# Concatenate the per-class test frames in class order
# (ambience, footsteps, horn, music, shout). The label cells further down rely on
# the filenames, so the original per-class row indexes are kept as they are.
# pd.concat replaces the chained DataFrame.append calls, which pandas 2.0 removed.
df_test = pd.concat([test_ambience, test_footsteps, test_horn, test_music, test_shout])
df_test

Unnamed: 0,Filename,Log Mel Spectrogram
14,ambience (319).wav,"[[[-37.62004, -37.62004, -37.62004], [-32.8695..."
174,ambience (151).wav,"[[[-18.781565, -18.781565, -18.781565], [-21.1..."
473,ambience (350).wav,"[[[-9.855973, -9.855973, -9.855973], [-14.2419..."
149,ambience (164).wav,"[[[-20.290745, -20.290745, -20.290745], [-11.1..."
289,ambience (39).wav,"[[[-14.63257, -14.63257, -14.63257], [-15.7625..."
...,...,...
491,shout (18).wav,"[[[-52.23402, -52.23402, -52.23402], [-63.3092..."
256,shout (491).wav,"[[[-41.33486, -41.33486, -41.33486], [-40.7047..."
287,shout (181).wav,"[[[-57.507534, -57.507534, -57.507534], [-70.2..."
222,shout (422).wav,"[[[-69.83048, -69.83048, -69.83048], [-39.1142..."


In [None]:
# Drop the filename column so that only the spectrograms remain.
df_test_final = df_test.drop(columns=['Filename'])
df_test_final

Unnamed: 0,Log Mel Spectrogram
14,"[[[-37.62004, -37.62004, -37.62004], [-32.8695..."
174,"[[[-18.781565, -18.781565, -18.781565], [-21.1..."
473,"[[[-9.855973, -9.855973, -9.855973], [-14.2419..."
149,"[[[-20.290745, -20.290745, -20.290745], [-11.1..."
289,"[[[-14.63257, -14.63257, -14.63257], [-15.7625..."
...,...
491,"[[[-52.23402, -52.23402, -52.23402], [-63.3092..."
256,"[[[-41.33486, -41.33486, -41.33486], [-40.7047..."
287,"[[[-57.507534, -57.507534, -57.507534], [-70.2..."
222,"[[[-69.83048, -69.83048, -69.83048], [-39.1142..."


### Export train and test data into npy files

In [None]:
# Stack the training spectrograms into a single (n_clips, 224, 224, 3) array.
# The float64 buffer that used to be pre-allocated here with np.empty was never
# written to or read afterwards, so it has been removed.
train_list = np.array(df_train_final['Log Mel Spectrogram'].values.tolist())

print(train_list[2])  # one sample, as a sanity check of the values
print(train_list.shape)

[[[-10.943905  -10.943905  -10.943905 ]
  [ -8.679169   -8.679169   -8.679169 ]
  [ -9.979926   -9.979926   -9.979926 ]
  ...
  [-12.68231   -12.68231   -12.68231  ]
  [-12.330235  -12.330235  -12.330235 ]
  [-10.928455  -10.928455  -10.928455 ]]

 [[ -8.463714   -8.463714   -8.463714 ]
  [-12.72619   -12.72619   -12.72619  ]
  [-13.16904   -13.16904   -13.16904  ]
  ...
  [-16.621725  -16.621725  -16.621725 ]
  [-14.799957  -14.799957  -14.799957 ]
  [-15.5181675 -15.5181675 -15.5181675]]

 [[ -7.627155   -7.627155   -7.627155 ]
  [-22.197338  -22.197338  -22.197338 ]
  [-17.871119  -17.871119  -17.871119 ]
  ...
  [-22.145613  -22.145613  -22.145613 ]
  [-16.386484  -16.386484  -16.386484 ]
  [-20.662397  -20.662397  -20.662397 ]]

 ...

 [[-80.        -80.        -80.       ]
  [-80.        -80.        -80.       ]
  [-80.        -80.        -80.       ]
  ...
  [-80.        -80.        -80.       ]
  [-80.        -80.        -80.       ]
  [-80.        -80.        -80.       ]]

 [

In [None]:
# Rebuild the stacked training array and write it to Drive.
train_list = np.array(
    df_train_final['Log Mel Spectrogram'].tolist()
)
np.save(
    file="/content/drive/MyDrive/FYP Data/Train and Test/train_data.npy",
    arr=train_list,
)

In [None]:
# Read the saved training array back to confirm that it round-trips.
# The file holds a plain numeric array, so pickle support is not needed; leaving it
# off also makes the load fail loudly if an object array was saved by mistake.
train = np.load("/content/drive/MyDrive/FYP Data/Train and Test/train_data.npy", allow_pickle = False)
print(train.shape)

(2000, 224, 224, 3)


In [None]:
# Stack the test spectrograms into a single (n_clips, 224, 224, 3) array.
# The float64 buffer that used to be pre-allocated here with np.empty was never
# written to or read afterwards, so it has been removed.
test_list = np.array(df_test_final['Log Mel Spectrogram'].values.tolist())

print(test_list[2])  # one sample, as a sanity check of the values

[[[ -9.855973   -9.855973   -9.855973 ]
  [-14.241993  -14.241993  -14.241993 ]
  [ -7.6662726  -7.6662726  -7.6662726]
  ...
  [ -3.5162106  -3.5162106  -3.5162106]
  [-14.035719  -14.035719  -14.035719 ]
  [ -8.04125    -8.04125    -8.04125  ]]

 [[-14.035427  -14.035427  -14.035427 ]
  [-15.6176815 -15.6176815 -15.6176815]
  [ -9.444467   -9.444467   -9.444467 ]
  ...
  [ -3.909122   -3.909122   -3.909122 ]
  [-10.856025  -10.856025  -10.856025 ]
  [ -9.707303   -9.707303   -9.707303 ]]

 [[-13.853289  -13.853289  -13.853289 ]
  [-12.181538  -12.181538  -12.181538 ]
  [-11.086602  -11.086602  -11.086602 ]
  ...
  [ -6.1276245  -6.1276245  -6.1276245]
  [ -7.673792   -7.673792   -7.673792 ]
  [-13.380122  -13.380122  -13.380122 ]]

 ...

 [[-65.463745  -65.463745  -65.463745 ]
  [-62.774128  -62.774128  -62.774128 ]
  [-64.02305   -64.02305   -64.02305  ]
  ...
  [-65.04924   -65.04924   -65.04924  ]
  [-63.493507  -63.493507  -63.493507 ]
  [-64.52375   -64.52375   -64.52375  ]]

 [

In [None]:
# Rebuild the stacked test array and write it to Drive.
test_list = np.array(
    df_test_final['Log Mel Spectrogram'].tolist()
)
np.save(
    file="/content/drive/MyDrive/FYP Data/Train and Test/test_data.npy",
    arr=test_list,
)

In [None]:
# Read the saved test array back to confirm that it round-trips.
# The file holds a plain numeric array, so pickle support is not needed; leaving it
# off also makes the load fail loudly if an object array was saved by mistake.
test = np.load("/content/drive/MyDrive/FYP Data/Train and Test/test_data.npy", allow_pickle = False)
print(test.shape)

(500, 224, 224, 3)


### Obtain train and test labels

In [None]:
# Show the merged training frame again; its Filename column is what the labels below are derived from.
df_train

Unnamed: 0,Filename,Log Mel Spectrogram
367,ambience (454).wav,"[[[-18.808973, -18.808973, -18.808973], [-15.6..."
172,ambience (137).wav,"[[[-24.659868, -24.659868, -24.659868], [-23.1..."
391,ambience (437).wav,"[[[-10.943905, -10.943905, -10.943905], [-8.67..."
24,ambience (295).wav,"[[[-24.503147, -24.503147, -24.503147], [-33.1..."
378,ambience (474).wav,"[[[-16.491426, -16.491426, -16.491426], [-15.7..."
...,...,...
449,shout (225).wav,"[[[-35.96278, -35.96278, -35.96278], [-44.1042..."
342,shout (40).wav,"[[[-64.51155, -64.51155, -64.51155], [-69.0053..."
482,shout (21).wav,"[[[-45.40621, -45.40621, -45.40621], [-64.4320..."
356,shout (219).wav,"[[[-80.0, -80.0, -80.0], [-80.0, -80.0, -80.0]..."


In [None]:
# Start from an empty label list (the next cell resets it again before filling it).
train_label = list()

In [None]:
# Derive one integer class label per training row from its filename.
# Filenames look like "ambience (454).wav", so the first whitespace-separated
# word is the class name.
class_to_label = {"ambience": 0, "footsteps": 1, "horn": 2, "music": 3, "shout": 4}

train_label = []

for item in df_train['Filename']:
  filename = str(item)
  sound_class = filename.split()[0] # Only obtain first word
  if sound_class not in class_to_label:
    # Skipping the row silently would leave train_label shorter than the data
    # array and shift every later label onto the wrong clip.
    raise ValueError(f"Cannot derive a class label from filename {filename!r}")
  train_label.append(class_to_label[sound_class])

train_label

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [None]:
# Show the merged test frame again; its Filename column is what the labels below are derived from.
df_test

Unnamed: 0,Filename,Log Mel Spectrogram
14,ambience (319).wav,"[[[-37.62004, -37.62004, -37.62004], [-32.8695..."
174,ambience (151).wav,"[[[-18.781565, -18.781565, -18.781565], [-21.1..."
473,ambience (350).wav,"[[[-9.855973, -9.855973, -9.855973], [-14.2419..."
149,ambience (164).wav,"[[[-20.290745, -20.290745, -20.290745], [-11.1..."
289,ambience (39).wav,"[[[-14.63257, -14.63257, -14.63257], [-15.7625..."
...,...,...
491,shout (18).wav,"[[[-52.23402, -52.23402, -52.23402], [-63.3092..."
256,shout (491).wav,"[[[-41.33486, -41.33486, -41.33486], [-40.7047..."
287,shout (181).wav,"[[[-57.507534, -57.507534, -57.507534], [-70.2..."
222,shout (422).wav,"[[[-69.83048, -69.83048, -69.83048], [-39.1142..."


In [None]:
# Derive one integer class label per test row from its filename.
# Filenames look like "ambience (319).wav", so the first whitespace-separated
# word is the class name.
class_to_label = {"ambience": 0, "footsteps": 1, "horn": 2, "music": 3, "shout": 4}

test_label = []

for item in df_test['Filename']:
  filename = str(item)
  sound_class = filename.split()[0] # Only obtain first word
  if sound_class not in class_to_label:
    # Skipping the row silently would leave test_label shorter than the data
    # array and shift every later label onto the wrong clip.
    raise ValueError(f"Cannot derive a class label from filename {filename!r}")
  test_label.append(class_to_label[sound_class])

test_label

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,


In [None]:
# Turn the flat label lists into column vectors of shape (n, 1).
train_label_newdim = np.asarray(train_label)[:, np.newaxis]
test_label_newdim = np.asarray(test_label)[:, np.newaxis]

print(train_label_newdim)

[[0]
 [0]
 [0]
 ...
 [4]
 [4]
 [4]]


In [None]:
# Rebuild the (n, 1) label arrays and write both of them to Drive.
train_label_newdim = np.asarray(train_label)[:, np.newaxis]
test_label_newdim = np.asarray(test_label)[:, np.newaxis]
np.save(
    file="/content/drive/MyDrive/FYP Data/Train and Test/train_label.npy",
    arr=train_label_newdim,
)
np.save(
    file="/content/drive/MyDrive/FYP Data/Train and Test/test_label.npy",
    arr=test_label_newdim,
)

In [None]:
# Read both label files back to confirm that they round-trip with the expected shapes.
# The files hold plain integer arrays, so pickle support is not needed; leaving it
# off also makes the load fail loudly if an object array was saved by mistake.
train_labels = np.load("/content/drive/MyDrive/FYP Data/Train and Test/train_label.npy", allow_pickle = False)
test_labels = np.load("/content/drive/MyDrive/FYP Data/Train and Test/test_label.npy", allow_pickle = False)

print(train_labels.shape)
print(test_labels.shape)

(2000, 1)
(500, 1)
