<a href="https://colab.research.google.com/github/xychong/edgeaimonitoring/blob/main/Train_and_Test_Split.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import librosa, librosa.display
import numpy as np
import os
import IPython
import pandas as pd

# make plot outputs appear and be stored within notebook
%matplotlib inline

import sklearn
import matplotlib.pyplot as plt

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
pathname_ambience = "/content/drive/MyDrive/FYP Data/Data/Ambience/"
pathname_footsteps = "/content/drive/MyDrive/FYP Data/Data/Footsteps/"
pathname_horn = "/content/drive/MyDrive/FYP Data/Data/Horn/"
pathname_music = "/content/drive/MyDrive/FYP Data/Data/Music/"
pathname_shout = "/content/drive/MyDrive/FYP Data/Data/Shout/"

### Ambience

In [4]:
df_melspec_ambience = pd.DataFrame(columns = ['Class', 'Filename', 'Log Mel Spectrogram'])

In [5]:
# Window size = 2048
# Hop size = 512
# Compute FFT for each window
# Take entire frequency spectrum and separate into 345 evenly spaced frequencies (spaced based on distance heard by human ear)
# For each window, decompose magnitude of signal into its components, corresponding to the frequencies in melscale
for filename in os.listdir(pathname_ambience):
  audiopath = pathname_ambience + "/" + filename
  y, sr = librosa.load(audiopath, 44100)
  # Minimum Frequency 20 Hz, Maximum Frequency by default 44100/2
  mel_spect = librosa.feature.melspectrogram(y=y, sr=sr, n_fft = 2048, hop_length = 512, n_mels=345, fmin=20)
  log_mel_spect = librosa.power_to_db(mel_spect, ref=np.max)
  log_mel_spect_newdim = np.repeat(log_mel_spect[:,:, np.newaxis], 3, axis =2)
  df_melspec_ambience = df_melspec_ambience.append({'Class': "Ambience",'Filename': filename,'Log Mel Spectrogram': log_mel_spect_newdim}, ignore_index=True)

In [7]:
df_melspec_ambience.head()

Unnamed: 0,Class,Filename,Log Mel Spectrogram
0,Ambience,ambience (302).wav,"[[[-64.589325, -64.589325, -64.589325], [-44.5..."
1,Ambience,ambience (309).wav,"[[[-46.359592, -46.359592, -46.359592], [-47.0..."
2,Ambience,ambience (312).wav,"[[[-40.281746, -40.281746, -40.281746], [-47.0..."
3,Ambience,ambience (300).wav,"[[[-40.819183, -40.819183, -40.819183], [-45.8..."
4,Ambience,ambience (301).wav,"[[[-47.165558, -47.165558, -47.165558], [-53.2..."


In [8]:
melspec_list = np.array(df_melspec_ambience['Log Mel Spectrogram'].values.tolist())

In [9]:
# MobileNetv2 Input Shape
# (number of mel bins, number of features, number of channels)
count = 0
for item in melspec_list:
  count += 1
  print(item.shape)

(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 

In [12]:
df_input = df_melspec_ambience[['Filename','Log Mel Spectrogram']]
train_ambience, test_ambience = train_test_split(df_input, test_size = 0.2, shuffle = True)
train_ambience

Unnamed: 0,Filename,Log Mel Spectrogram
270,ambience (110).wav,"[[[-33.08299, -33.08299, -33.08299], [-38.4111..."
102,ambience (206).wav,"[[[-35.107437, -35.107437, -35.107437], [-40.7..."
197,ambience (130).wav,"[[[-23.944202, -23.944202, -23.944202], [-24.7..."
142,ambience (182).wav,"[[[-39.812046, -39.812046, -39.812046], [-44.6..."
53,ambience (271).wav,"[[[-28.620619, -28.620619, -28.620619], [-28.3..."
...,...,...
271,ambience (47).wav,"[[[-32.387253, -32.387253, -32.387253], [-37.8..."
293,ambience (6).wav,"[[[-40.24123, -40.24123, -40.24123], [-41.8383..."
112,ambience (203).wav,"[[[-66.08594, -66.08594, -66.08594], [-54.8495..."
116,ambience (184).wav,"[[[-56.168037, -56.168037, -56.168037], [-67.2..."


In [13]:
test_ambience

Unnamed: 0,Filename,Log Mel Spectrogram
167,ambience (132).wav,"[[[-44.501633, -44.501633, -44.501633], [-31.9..."
318,ambience (321).wav,"[[[-20.341702, -20.341702, -20.341702], [-23.5..."
72,ambience (252).wav,"[[[-24.07206, -24.07206, -24.07206], [-23.9069..."
3,ambience (300).wav,"[[[-40.819183, -40.819183, -40.819183], [-45.8..."
185,ambience (192).wav,"[[[-40.354637, -40.354637, -40.354637], [-45.9..."
...,...,...
314,ambience (12).wav,"[[[-45.51901, -45.51901, -45.51901], [-47.4332..."
42,ambience (261).wav,"[[[-27.42527, -27.42527, -27.42527], [-31.3088..."
126,ambience (209).wav,"[[[-64.01073, -64.01073, -64.01073], [-64.0619..."
316,ambience (1).wav,"[[[-28.40573, -28.40573, -28.40573], [-29.3921..."


### Footsteps

In [17]:
df_melspec_footsteps = pd.DataFrame(columns = ['Class', 'Filename', 'Log Mel Spectrogram'])

In [18]:
# Window size = 2048
# Hop size = 512
# Compute FFT for each window
# Take entire frequency spectrum and separate into 345 evenly spaced frequencies (spaced based on distance heard by human ear)
# For each window, decompose magnitude of signal into its components, corresponding to the frequencies in melscale
for filename in os.listdir(pathname_footsteps):
  audiopath = pathname_footsteps + "/" + filename
  y, sr = librosa.load(audiopath, 44100)
  # Minimum Frequency 20 Hz, Maximum Frequency by default 44100/2
  mel_spect = librosa.feature.melspectrogram(y=y, sr=sr, n_fft = 2048, hop_length = 512, n_mels=345, fmin=20)
  log_mel_spect = librosa.power_to_db(mel_spect, ref=np.max)
  log_mel_spect_newdim = np.repeat(log_mel_spect[:,:, np.newaxis], 3, axis =2)
  df_melspec_footsteps = df_melspec_footsteps.append({'Class': "Footsteps",'Filename': filename,'Log Mel Spectrogram': log_mel_spect_newdim}, ignore_index=True)

In [19]:
df_melspec_footsteps.head()

Unnamed: 0,Class,Filename,Log Mel Spectrogram
0,Footsteps,footsteps (3).wav,"[[[-46.995945, -46.995945, -46.995945], [-48.3..."
1,Footsteps,footsteps (340).wav,"[[[-29.431768, -29.431768, -29.431768], [-29.1..."
2,Footsteps,footsteps (4).wav,"[[[-50.14822, -50.14822, -50.14822], [-53.7723..."
3,Footsteps,footsteps (2).wav,"[[[-43.041107, -43.041107, -43.041107], [-32.6..."
4,Footsteps,footsteps (342).wav,"[[[-47.934456, -47.934456, -47.934456], [-28.1..."


In [20]:
melspec_list = np.array(df_melspec_footsteps['Log Mel Spectrogram'].values.tolist())

In [21]:
# MobileNetv2 Input Shape
# (number of mel bins, number of features, number of channels)
count = 0
for item in melspec_list:
  count += 1
  print(item.shape)

(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 

In [25]:
df_input = df_melspec_footsteps[['Filename','Log Mel Spectrogram']]
train_footsteps, test_footsteps = train_test_split(df_input, test_size = 0.2, shuffle = True)
train_footsteps

Unnamed: 0,Filename,Log Mel Spectrogram
19,footsteps (6).wav,"[[[-72.777664, -72.777664, -72.777664], [-71.9..."
187,footsteps (66).wav,"[[[-22.008776, -22.008776, -22.008776], [-25.0..."
153,footsteps (109).wav,"[[[-27.706394, -27.706394, -27.706394], [-25.5..."
53,footsteps (291).wav,"[[[-24.56182, -24.56182, -24.56182], [-25.7173..."
69,footsteps (251).wav,"[[[-74.04463, -74.04463, -74.04463], [-64.9349..."
...,...,...
90,footsteps (199).wav,"[[[-40.74266, -40.74266, -40.74266], [-48.1722..."
252,footsteps (142).wav,"[[[-70.783226, -70.783226, -70.783226], [-80.0..."
66,footsteps (279).wav,"[[[-26.684332, -26.684332, -26.684332], [-29.0..."
4,footsteps (342).wav,"[[[-47.934456, -47.934456, -47.934456], [-28.1..."


In [26]:
test_footsteps

Unnamed: 0,Filename,Log Mel Spectrogram
10,footsteps (1).wav,"[[[-23.64164, -23.64164, -23.64164], [-26.3956..."
238,footsteps (59).wav,"[[[-80.0, -80.0, -80.0], [-80.0, -80.0, -80.0]..."
222,footsteps (26).wav,"[[[-60.3381, -60.3381, -60.3381], [-57.767883,..."
251,footsteps (160).wav,"[[[-80.0, -80.0, -80.0], [-73.524345, -73.5243..."
116,footsteps (174).wav,"[[[-24.704842, -24.704842, -24.704842], [-23.5..."
...,...,...
141,footsteps (103).wav,"[[[-26.680237, -26.680237, -26.680237], [-31.0..."
235,footsteps (48).wav,"[[[-47.727676, -47.727676, -47.727676], [-39.7..."
321,footsteps (261).wav,"[[[-63.349228, -63.349228, -63.349228], [-69.1..."
317,footsteps (218).wav,"[[[-40.570045, -40.570045, -40.570045], [-39.7..."


### Horn

In [27]:
df_melspec_horn = pd.DataFrame(columns = ['Class', 'Filename', 'Log Mel Spectrogram'])

In [28]:
# Window size = 2048
# Hop size = 512
# Compute FFT for each window
# Take entire frequency spectrum and separate into 345 evenly spaced frequencies (spaced based on distance heard by human ear)
# For each window, decompose magnitude of signal into its components, corresponding to the frequencies in melscale
for filename in os.listdir(pathname_horn):
  audiopath = pathname_horn + "/" + filename
  y, sr = librosa.load(audiopath, 44100)
  # Minimum Frequency 20 Hz, Maximum Frequency by default 44100/2
  mel_spect = librosa.feature.melspectrogram(y=y, sr=sr, n_fft = 2048, hop_length = 512, n_mels=345, fmin=20)
  log_mel_spect = librosa.power_to_db(mel_spect, ref=np.max)
  log_mel_spect_newdim = np.repeat(log_mel_spect[:,:, np.newaxis], 3, axis =2)
  df_melspec_horn = df_melspec_horn.append({'Class': "Horn",'Filename': filename,'Log Mel Spectrogram': log_mel_spect_newdim}, ignore_index=True)

In [29]:
df_melspec_horn.head()

Unnamed: 0,Class,Filename,Log Mel Spectrogram
0,Horn,horn (31).wav,"[[[-80.0, -80.0, -80.0], [-71.910706, -71.9107..."
1,Horn,horn (12).wav,"[[[-80.0, -80.0, -80.0], [-80.0, -80.0, -80.0]..."
2,Horn,horn (20).wav,"[[[-80.0, -80.0, -80.0], [-80.0, -80.0, -80.0]..."
3,Horn,horn (32).wav,"[[[-47.88686, -47.88686, -47.88686], [-55.0599..."
4,Horn,horn (16).wav,"[[[-36.90676, -36.90676, -36.90676], [-42.9522..."


In [30]:
melspec_list = np.array(df_melspec_horn['Log Mel Spectrogram'].values.tolist())

In [31]:
# MobileNetv2 Input Shape
# (number of mel bins, number of features, number of channels)
count = 0
for item in melspec_list:
  count += 1
  print(item.shape)

(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 

In [32]:
df_input = df_melspec_horn[['Filename','Log Mel Spectrogram']]
train_horn, test_horn = train_test_split(df_input, test_size = 0.2, shuffle = True)
train_horn

Unnamed: 0,Filename,Log Mel Spectrogram
64,horn (198).wav,"[[[-36.339638, -36.339638, -36.339638], [-31.6..."
34,horn (3).wav,"[[[-22.522083, -22.522083, -22.522083], [-26.3..."
203,horn (96).wav,"[[[-33.647114, -33.647114, -33.647114], [-38.1..."
25,horn (228).wav,"[[[-32.11183, -32.11183, -32.11183], [-47.4142..."
116,horn (129).wav,"[[[-56.435143, -56.435143, -56.435143], [-39.2..."
...,...,...
190,horn (94).wav,"[[[-23.553993, -23.553993, -23.553993], [-23.9..."
9,horn (11).wav,"[[[-80.0, -80.0, -80.0], [-80.0, -80.0, -80.0]..."
228,horn (116).wav,"[[[-42.276814, -42.276814, -42.276814], [-48.2..."
171,horn (58).wav,"[[[-29.179647, -29.179647, -29.179647], [-30.5..."


In [33]:
test_horn

Unnamed: 0,Filename,Log Mel Spectrogram
174,horn (71).wav,"[[[-25.807938, -25.807938, -25.807938], [-35.4..."
88,horn (182).wav,"[[[-27.721958, -27.721958, -27.721958], [-35.0..."
58,horn (194).wav,"[[[-80.0, -80.0, -80.0], [-63.445045, -63.4450..."
165,horn (50).wav,"[[[-35.07379, -35.07379, -35.07379], [-26.1561..."
128,horn (138).wav,"[[[-28.90509, -28.90509, -28.90509], [-43.3943..."
182,horn (86).wav,"[[[-30.66608, -30.66608, -30.66608], [-31.8001..."
145,horn (49).wav,"[[[-37.700405, -37.700405, -37.700405], [-44.6..."
80,horn (162).wav,"[[[-29.292944, -29.292944, -29.292944], [-28.9..."
84,horn (164).wav,"[[[-67.983154, -67.983154, -67.983154], [-30.4..."
35,horn (219).wav,"[[[-35.43015, -35.43015, -35.43015], [-41.2921..."


### Music

In [35]:
df_melspec_music = pd.DataFrame(columns = ['Class', 'Filename', 'Log Mel Spectrogram'])

In [36]:
# Window size = 2048
# Hop size = 512
# Compute FFT for each window
# Take entire frequency spectrum and separate into 345 evenly spaced frequencies (spaced based on distance heard by human ear)
# For each window, decompose magnitude of signal into its components, corresponding to the frequencies in melscale
for filename in os.listdir(pathname_music):
  audiopath = pathname_music + "/" + filename
  y, sr = librosa.load(audiopath, 44100)
  # Minimum Frequency 20 Hz, Maximum Frequency by default 44100/2
  mel_spect = librosa.feature.melspectrogram(y=y, sr=sr, n_fft = 2048, hop_length = 512, n_mels=345, fmin=20)
  log_mel_spect = librosa.power_to_db(mel_spect, ref=np.max)
  log_mel_spect_newdim = np.repeat(log_mel_spect[:,:, np.newaxis], 3, axis =2)
  df_melspec_music = df_melspec_music.append({'Class': "Music",'Filename': filename,'Log Mel Spectrogram': log_mel_spect_newdim}, ignore_index=True)

In [37]:
df_melspec_music.head()

Unnamed: 0,Class,Filename,Log Mel Spectrogram
0,Music,music (8).wav,"[[[-41.944553, -41.944553, -41.944553], [-47.5..."
1,Music,music (5).wav,"[[[-37.44742, -37.44742, -37.44742], [-43.7889..."
2,Music,music (4).wav,"[[[-35.821857, -35.821857, -35.821857], [-41.9..."
3,Music,music (3).wav,"[[[-54.86283, -54.86283, -54.86283], [-60.7735..."
4,Music,music (7).wav,"[[[-30.553259, -30.553259, -30.553259], [-36.2..."


In [38]:
melspec_list = np.array(df_melspec_music['Log Mel Spectrogram'].values.tolist())

In [39]:
# MobileNetv2 Input Shape
# (number of mel bins, number of features, number of channels)
count = 0
for item in melspec_list:
  count += 1
  print(item.shape)

(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 

In [40]:
df_input = df_melspec_music[['Filename','Log Mel Spectrogram']]
train_music, test_music = train_test_split(df_input, test_size = 0.2, shuffle = True)
train_music

Unnamed: 0,Filename,Log Mel Spectrogram
198,music (88).wav,"[[[-49.229744, -49.229744, -49.229744], [-50.4..."
51,music (293).wav,"[[[-33.380367, -33.380367, -33.380367], [-39.3..."
206,music (90).wav,"[[[-59.356606, -59.356606, -59.356606], [-67.9..."
14,music (1).wav,"[[[-37.873493, -37.873493, -37.873493], [-44.3..."
34,music (318).wav,"[[[-36.54314, -36.54314, -36.54314], [-30.5995..."
...,...,...
208,music (95).wav,"[[[-38.572178, -38.572178, -38.572178], [-45.3..."
8,music (331).wav,"[[[-67.74857, -67.74857, -67.74857], [-76.6087..."
166,music (257).wav,"[[[-44.10531, -44.10531, -44.10531], [-44.2232..."
135,music (187).wav,"[[[-52.116886, -52.116886, -52.116886], [-58.1..."


In [41]:
test_music

Unnamed: 0,Filename,Log Mel Spectrogram
220,music (78).wav,"[[[-33.710365, -33.710365, -33.710365], [-35.4..."
215,music (105).wav,"[[[-51.340065, -51.340065, -51.340065], [-55.7..."
109,music (188).wav,"[[[-63.143223, -63.143223, -63.143223], [-68.8..."
145,music (246).wav,"[[[-34.592556, -34.592556, -34.592556], [-39.9..."
78,music (235).wav,"[[[-46.250435, -46.250435, -46.250435], [-42.3..."
...,...,...
144,music (273).wav,"[[[-34.72179, -34.72179, -34.72179], [-40.2422..."
272,music (64).wav,"[[[-32.79044, -32.79044, -32.79044], [-38.1264..."
89,music (202).wav,"[[[-41.263023, -41.263023, -41.263023], [-53.7..."
294,music (118).wav,"[[[-25.499447, -25.499447, -25.499447], [-30.8..."


### Shout

In [42]:
df_melspec_shout = pd.DataFrame(columns = ['Class', 'Filename', 'Log Mel Spectrogram'])

In [43]:
# Window size = 2048
# Hop size = 512
# Compute FFT for each window
# Take entire frequency spectrum and separate into 345 evenly spaced frequencies (spaced based on distance heard by human ear)
# For each window, decompose magnitude of signal into its components, corresponding to the frequencies in melscale
for filename in os.listdir(pathname_shout):
  audiopath = pathname_shout + "/" + filename
  y, sr = librosa.load(audiopath, 44100)
  # Minimum Frequency 20 Hz, Maximum Frequency by default 44100/2
  mel_spect = librosa.feature.melspectrogram(y=y, sr=sr, n_fft = 2048, hop_length = 512, n_mels=345, fmin=20)
  log_mel_spect = librosa.power_to_db(mel_spect, ref=np.max)
  log_mel_spect_newdim = np.repeat(log_mel_spect[:,:, np.newaxis], 3, axis =2)
  df_melspec_shout = df_melspec_shout.append({'Class': "Shout",'Filename': filename,'Log Mel Spectrogram': log_mel_spect_newdim}, ignore_index=True)

In [45]:
df_melspec_shout.head()

Unnamed: 0,Class,Filename,Log Mel Spectrogram
0,Shout,shout (14).wav,"[[[-59.581978, -59.581978, -59.581978], [-57.0..."
1,Shout,shout (17).wav,"[[[-59.914005, -59.914005, -59.914005], [-55.3..."
2,Shout,shout (21).wav,"[[[-80.0, -80.0, -80.0], [-68.91399, -68.91399..."
3,Shout,shout (13).wav,"[[[-80.0, -80.0, -80.0], [-80.0, -80.0, -80.0]..."
4,Shout,shout (8).wav,"[[[-73.3937, -73.3937, -73.3937], [-76.30673, ..."


In [46]:
melspec_list = np.array(df_melspec_shout['Log Mel Spectrogram'].values.tolist())

In [47]:
# MobileNetv2 Input Shape
# (number of mel bins, number of features, number of channels)
count = 0
for item in melspec_list:
  count += 1
  print(item.shape)

(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 345, 3)
(345, 

In [48]:
df_input = df_melspec_shout[['Filename','Log Mel Spectrogram']]
train_shout, test_shout = train_test_split(df_input, test_size = 0.2, shuffle = True)
train_shout

Unnamed: 0,Filename,Log Mel Spectrogram
148,shout (233).wav,"[[[-51.91977, -51.91977, -51.91977], [-57.9230..."
7,shout (27).wav,"[[[-75.17001, -75.17001, -75.17001], [-63.9785..."
94,shout (85).wav,"[[[-71.85879, -71.85879, -71.85879], [-66.5122..."
88,shout (100).wav,"[[[-76.02086, -76.02086, -76.02086], [-80.0, -..."
196,shout (169).wav,"[[[-72.453926, -72.453926, -72.453926], [-77.4..."
...,...,...
112,shout (200).wav,"[[[-80.0, -80.0, -80.0], [-80.0, -80.0, -80.0]..."
208,shout (150).wav,"[[[-66.7005, -66.7005, -66.7005], [-68.86181, ..."
164,shout (133).wav,"[[[-70.92355, -70.92355, -70.92355], [-65.4657..."
40,shout (53).wav,"[[[-58.36692, -58.36692, -58.36692], [-58.7806..."


In [49]:
test_shout

Unnamed: 0,Filename,Log Mel Spectrogram
134,shout (217).wav,"[[[-41.131283, -41.131283, -41.131283], [-46.8..."
228,shout (192).wav,"[[[-67.086655, -67.086655, -67.086655], [-71.0..."
237,shout (246).wav,"[[[-44.353935, -44.353935, -44.353935], [-53.4..."
83,shout (99).wav,"[[[-80.0, -80.0, -80.0], [-80.0, -80.0, -80.0]..."
121,shout (178).wav,"[[[-80.0, -80.0, -80.0], [-80.0, -80.0, -80.0]..."
74,shout (78).wav,"[[[-60.66146, -60.66146, -60.66146], [-52.6190..."
10,shout (12).wav,"[[[-49.288193, -49.288193, -49.288193], [-46.3..."
6,shout (22).wav,"[[[-67.130325, -67.130325, -67.130325], [-69.7..."
33,shout (62).wav,"[[[-80.0, -80.0, -80.0], [-80.0, -80.0, -80.0]..."
201,shout (164).wav,"[[[-78.67434, -78.67434, -78.67434], [-74.4371..."


### Merge all train dataframes into one dataframe

In [60]:
df_train_append1 = train_ambience.append(train_footsteps)
df_train_append2 = df_train_append1.append(train_horn)
df_train_append3 = df_train_append2.append(train_music)
df_train = df_train_append3.append(train_shout)
df_train

Unnamed: 0,Filename,Log Mel Spectrogram
270,ambience (110).wav,"[[[-33.08299, -33.08299, -33.08299], [-38.4111..."
102,ambience (206).wav,"[[[-35.107437, -35.107437, -35.107437], [-40.7..."
197,ambience (130).wav,"[[[-23.944202, -23.944202, -23.944202], [-24.7..."
142,ambience (182).wav,"[[[-39.812046, -39.812046, -39.812046], [-44.6..."
53,ambience (271).wav,"[[[-28.620619, -28.620619, -28.620619], [-28.3..."
...,...,...
112,shout (200).wav,"[[[-80.0, -80.0, -80.0], [-80.0, -80.0, -80.0]..."
208,shout (150).wav,"[[[-66.7005, -66.7005, -66.7005], [-68.86181, ..."
164,shout (133).wav,"[[[-70.92355, -70.92355, -70.92355], [-65.4657..."
40,shout (53).wav,"[[[-58.36692, -58.36692, -58.36692], [-58.7806..."


In [61]:
df_train_final = df_train.drop(['Filename'], axis = 1)
df_train_final

Unnamed: 0,Log Mel Spectrogram
270,"[[[-33.08299, -33.08299, -33.08299], [-38.4111..."
102,"[[[-35.107437, -35.107437, -35.107437], [-40.7..."
197,"[[[-23.944202, -23.944202, -23.944202], [-24.7..."
142,"[[[-39.812046, -39.812046, -39.812046], [-44.6..."
53,"[[[-28.620619, -28.620619, -28.620619], [-28.3..."
...,...
112,"[[[-80.0, -80.0, -80.0], [-80.0, -80.0, -80.0]..."
208,"[[[-66.7005, -66.7005, -66.7005], [-68.86181, ..."
164,"[[[-70.92355, -70.92355, -70.92355], [-65.4657..."
40,"[[[-58.36692, -58.36692, -58.36692], [-58.7806..."


### Merge all test dataframes into one dataframe

In [62]:
df_test_append1 = test_ambience.append(test_footsteps)
df_test_append2 = df_test_append1.append(test_horn)
df_test_append3 = df_test_append2.append(test_music)
df_test = df_test_append3.append(test_shout)
df_test

Unnamed: 0,Filename,Log Mel Spectrogram
167,ambience (132).wav,"[[[-44.501633, -44.501633, -44.501633], [-31.9..."
318,ambience (321).wav,"[[[-20.341702, -20.341702, -20.341702], [-23.5..."
72,ambience (252).wav,"[[[-24.07206, -24.07206, -24.07206], [-23.9069..."
3,ambience (300).wav,"[[[-40.819183, -40.819183, -40.819183], [-45.8..."
185,ambience (192).wav,"[[[-40.354637, -40.354637, -40.354637], [-45.9..."
...,...,...
202,shout (161).wav,"[[[-53.830147, -53.830147, -53.830147], [-57.7..."
256,shout (266).wav,"[[[-80.0, -80.0, -80.0], [-76.28508, -76.28508..."
59,shout (74).wav,"[[[-80.0, -80.0, -80.0], [-71.82796, -71.82796..."
75,shout (112).wav,"[[[-80.0, -80.0, -80.0], [-77.284454, -77.2844..."


In [63]:
df_test_final = df_test.drop(['Filename'], axis = 1)
df_test_final

Unnamed: 0,Log Mel Spectrogram
167,"[[[-44.501633, -44.501633, -44.501633], [-31.9..."
318,"[[[-20.341702, -20.341702, -20.341702], [-23.5..."
72,"[[[-24.07206, -24.07206, -24.07206], [-23.9069..."
3,"[[[-40.819183, -40.819183, -40.819183], [-45.8..."
185,"[[[-40.354637, -40.354637, -40.354637], [-45.9..."
...,...
202,"[[[-53.830147, -53.830147, -53.830147], [-57.7..."
256,"[[[-80.0, -80.0, -80.0], [-76.28508, -76.28508..."
59,"[[[-80.0, -80.0, -80.0], [-71.82796, -71.82796..."
75,"[[[-80.0, -80.0, -80.0], [-77.284454, -77.2844..."


### Export train and test data into npy files

In [73]:
train_data = np.empty(shape=(df_train_final.shape[0], 345, 345, 3))
train_list = np.array(df_train_final['Log Mel Spectrogram'].values.tolist())

print(train_list[2])

[[[-23.944202 -23.944202 -23.944202]
  [-24.715822 -24.715822 -24.715822]
  [-24.05683  -24.05683  -24.05683 ]
  ...
  [-21.062237 -21.062237 -21.062237]
  [-17.963087 -17.963087 -17.963087]
  [-14.818022 -14.818022 -14.818022]]

 [[-42.823982 -42.823982 -42.823982]
  [-31.546438 -31.546438 -31.546438]
  [-26.66728  -26.66728  -26.66728 ]
  ...
  [-46.49268  -46.49268  -46.49268 ]
  [-21.083515 -21.083515 -21.083515]
  [-18.341131 -18.341131 -18.341131]]

 [[-40.448177 -40.448177 -40.448177]
  [-32.0863   -32.0863   -32.0863  ]
  [-27.585508 -27.585508 -27.585508]
  ...
  [-25.724245 -25.724245 -25.724245]
  [-30.849173 -30.849173 -30.849173]
  [-28.875586 -28.875586 -28.875586]]

 ...

 [[-73.899284 -73.899284 -73.899284]
  [-79.15841  -79.15841  -79.15841 ]
  [-80.       -80.       -80.      ]
  ...
  [-80.       -80.       -80.      ]
  [-80.       -80.       -80.      ]
  [-80.       -80.       -80.      ]]

 [[-73.48602  -73.48602  -73.48602 ]
  [-78.830185 -78.830185 -78.830185]


In [74]:
np.save("/content/drive/MyDrive/FYP Data/Train and Test/train_data.npy", train_list)

In [75]:
train = np.load("/content/drive/MyDrive/FYP Data/Train and Test/train_data.npy", allow_pickle = True)
print(train.shape)

(1204, 345, 345, 3)


In [76]:
test_data = np.empty(shape=(df_test_final.shape[0], 345, 345, 3))
test_list = np.array(df_test_final['Log Mel Spectrogram'].values.tolist())

print(test_list[2])

[[[-24.07206  -24.07206  -24.07206 ]
  [-23.906939 -23.906939 -23.906939]
  [-25.452368 -25.452368 -25.452368]
  ...
  [-39.97793  -39.97793  -39.97793 ]
  [-31.852644 -31.852644 -31.852644]
  [-26.244059 -26.244059 -26.244059]]

 [[-26.092194 -26.092194 -26.092194]
  [-24.175066 -24.175066 -24.175066]
  [-13.649515 -13.649515 -13.649515]
  ...
  [-33.09472  -33.09472  -33.09472 ]
  [-24.535719 -24.535719 -24.535719]
  [-21.979366 -21.979366 -21.979366]]

 [[-47.068626 -47.068626 -47.068626]
  [-29.675768 -29.675768 -29.675768]
  [-23.72275  -23.72275  -23.72275 ]
  ...
  [-31.098347 -31.098347 -31.098347]
  [-32.971466 -32.971466 -32.971466]
  [-29.667307 -29.667307 -29.667307]]

 ...

 [[-80.       -80.       -80.      ]
  [-80.       -80.       -80.      ]
  [-80.       -80.       -80.      ]
  ...
  [-80.       -80.       -80.      ]
  [-80.       -80.       -80.      ]
  [-80.       -80.       -80.      ]]

 [[-80.       -80.       -80.      ]
  [-80.       -80.       -80.      ]


In [77]:
np.save("/content/drive/MyDrive/FYP Data/Train and Test/test_data.npy", test_list)

In [78]:
test = np.load("/content/drive/MyDrive/FYP Data/Train and Test/test_data.npy", allow_pickle = True)
print(test.shape)

(304, 345, 345, 3)


### Obtain train and test labels

In [79]:
df_train

Unnamed: 0,Filename,Log Mel Spectrogram
270,ambience (110).wav,"[[[-33.08299, -33.08299, -33.08299], [-38.4111..."
102,ambience (206).wav,"[[[-35.107437, -35.107437, -35.107437], [-40.7..."
197,ambience (130).wav,"[[[-23.944202, -23.944202, -23.944202], [-24.7..."
142,ambience (182).wav,"[[[-39.812046, -39.812046, -39.812046], [-44.6..."
53,ambience (271).wav,"[[[-28.620619, -28.620619, -28.620619], [-28.3..."
...,...,...
112,shout (200).wav,"[[[-80.0, -80.0, -80.0], [-80.0, -80.0, -80.0]..."
208,shout (150).wav,"[[[-66.7005, -66.7005, -66.7005], [-68.86181, ..."
164,shout (133).wav,"[[[-70.92355, -70.92355, -70.92355], [-65.4657..."
40,shout (53).wav,"[[[-58.36692, -58.36692, -58.36692], [-58.7806..."


In [90]:
# Creating an empty list
train_label = []

In [91]:
for item in df_train['Filename']:
  filename = str(item)
  sound_class = filename.split()[0] # Only obtain first word
  if sound_class == "ambience":
    train_label.append(0)
  if sound_class == "footsteps":
    train_label.append(1)
  if sound_class == "horn":
    train_label.append(2)
  if sound_class == "music":
    train_label.append(3)
  if sound_class == "shout":
    train_label.append(4)

train_label

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [93]:
df_test

Unnamed: 0,Filename,Log Mel Spectrogram
167,ambience (132).wav,"[[[-44.501633, -44.501633, -44.501633], [-31.9..."
318,ambience (321).wav,"[[[-20.341702, -20.341702, -20.341702], [-23.5..."
72,ambience (252).wav,"[[[-24.07206, -24.07206, -24.07206], [-23.9069..."
3,ambience (300).wav,"[[[-40.819183, -40.819183, -40.819183], [-45.8..."
185,ambience (192).wav,"[[[-40.354637, -40.354637, -40.354637], [-45.9..."
...,...,...
202,shout (161).wav,"[[[-53.830147, -53.830147, -53.830147], [-57.7..."
256,shout (266).wav,"[[[-80.0, -80.0, -80.0], [-76.28508, -76.28508..."
59,shout (74).wav,"[[[-80.0, -80.0, -80.0], [-71.82796, -71.82796..."
75,shout (112).wav,"[[[-80.0, -80.0, -80.0], [-77.284454, -77.2844..."


In [94]:
# Creating an empty list
test_label = []

In [95]:
for item in df_test['Filename']:
  filename = str(item)
  sound_class = filename.split()[0] # Only obtain first word
  if sound_class == "ambience":
    test_label.append(0)
  if sound_class == "footsteps":
    test_label.append(1)
  if sound_class == "horn":
    test_label.append(2)
  if sound_class == "music":
    test_label.append(3)
  if sound_class == "shout":
    test_label.append(4)

test_label

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,


In [98]:
train_label_newdim = np.expand_dims(train_label, axis=1)
test_label_newdim = np.expand_dims(test_label, axis=1)

print(train_label_newdim)

[[0]
 [0]
 [0]
 ...
 [4]
 [4]
 [4]]


In [99]:
np.save("/content/drive/MyDrive/FYP Data/Train and Test/train_label.npy", train_label_newdim)
np.save("/content/drive/MyDrive/FYP Data/Train and Test/test_label.npy", test_label_newdim)

In [100]:
train_labels = np.load("/content/drive/MyDrive/FYP Data/Train and Test/train_label.npy", allow_pickle = True)
test_labels = np.load("/content/drive/MyDrive/FYP Data/Train and Test/test_label.npy", allow_pickle = True)

print(train_labels.shape)
print(test_labels.shape)

(1204, 1)
(304, 1)
