<a href="https://colab.research.google.com/github/xychong/edgeaimonitoring/blob/main/Train_and_Test_Split.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import librosa, librosa.display
import numpy as np
import os
import IPython
import pandas as pd

# make plot outputs appear and be stored within notebook
%matplotlib inline

import sklearn
import matplotlib.pyplot as plt

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
pathname_ambience = "/content/drive/MyDrive/FYP Data/Data/Ambience/"
pathname_footsteps = "/content/drive/MyDrive/FYP Data/Data/Footsteps/"
pathname_horn = "/content/drive/MyDrive/FYP Data/Data/Horn/"
pathname_music = "/content/drive/MyDrive/FYP Data/Data/Music/"
pathname_shout = "/content/drive/MyDrive/FYP Data/Data/Shout/"

### Ambience

In [None]:
df_melspec_ambience = pd.DataFrame(columns = ['Class', 'Filename', 'Log Mel Spectrogram'])

In [None]:
# Window size = 2048
# Number of Features = sample_rate/hop_length * duration_of_audio
# Compute FFT for each window
# Take entire frequency spectrum and separate into 224 evenly spaced frequencies (spaced based on distance heard by human ear)
# For each window, decompose magnitude of signal into its components, corresponding to the frequencies in melscale
for filename in os.listdir(pathname_ambience):
  audiopath = pathname_ambience + "/" + filename
  y, sr = librosa.load(audiopath, 44100)
  # Minimum Frequency 20 Hz, Maximum Frequency by default 44100/2
  mel_spect = librosa.feature.melspectrogram(y=y, sr=sr, n_fft = 2048, hop_length = 788, n_mels=224, fmin=20)
  log_mel_spect = librosa.power_to_db(mel_spect, ref=np.max)
  log_mel_spect_newdim = np.repeat(log_mel_spect[:,:, np.newaxis], 3, axis =2)
  df_melspec_ambience = df_melspec_ambience.append({'Class': "Ambience",'Filename': filename,'Log Mel Spectrogram': log_mel_spect_newdim}, ignore_index=True)

In [None]:
df_melspec_ambience.head()

Unnamed: 0,Class,Filename,Log Mel Spectrogram
0,Ambience,ambience (302).wav,"[[[-39.611282, -39.611282, -39.611282], [-28.3..."
1,Ambience,ambience (309).wav,"[[[-31.329288, -31.329288, -31.329288], [-31.1..."
2,Ambience,ambience (312).wav,"[[[-27.22994, -27.22994, -27.22994], [-35.6758..."
3,Ambience,ambience (300).wav,"[[[-31.641083, -31.641083, -31.641083], [-39.7..."
4,Ambience,ambience (301).wav,"[[[-40.700012, -40.700012, -40.700012], [-33.5..."


In [None]:
melspec_list = np.array(df_melspec_ambience['Log Mel Spectrogram'].values.tolist())

In [None]:
# MobileNetv2 Input Shape
# (number of mel bins, number of features, number of channels)
count = 0
for item in melspec_list:
  count += 1
  print(item.shape)

(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 

In [None]:
df_input = df_melspec_ambience[['Filename','Log Mel Spectrogram']]
train_ambience, test_ambience = train_test_split(df_input, test_size = 0.2, shuffle = True)
train_ambience

Unnamed: 0,Filename,Log Mel Spectrogram
311,ambience (58).wav,"[[[-49.223248, -49.223248, -49.223248], [-52.0..."
242,ambience (74).wav,"[[[-27.694464, -27.694464, -27.694464], [-47.4..."
218,ambience (87).wav,"[[[-33.70532, -33.70532, -33.70532], [-35.0588..."
243,ambience (64).wav,"[[[-38.80634, -38.80634, -38.80634], [-42.3053..."
255,ambience (48).wav,"[[[-25.35271, -25.35271, -25.35271], [-39.4933..."
...,...,...
30,ambience (285).wav,"[[[-18.849943, -18.849943, -18.849943], [-38.5..."
279,ambience (34).wav,"[[[-10.911771, -10.911771, -10.911771], [-8.37..."
152,ambience (152).wav,"[[[-11.35974, -11.35974, -11.35974], [-14.2006..."
99,ambience (202).wav,"[[[-15.566402, -15.566402, -15.566402], [-33.8..."


In [None]:
test_ambience

Unnamed: 0,Filename,Log Mel Spectrogram
136,ambience (188).wav,"[[[-17.036678, -17.036678, -17.036678], [-35.4..."
26,ambience (286).wav,"[[[-15.34173, -15.34173, -15.34173], [-31.5507..."
127,ambience (181).wav,"[[[-15.934418, -15.934418, -15.934418], [-32.6..."
296,ambience (24).wav,"[[[-6.908004, -6.908004, -6.908004], [-13.8484..."
95,ambience (229).wav,"[[[-41.503044, -41.503044, -41.503044], [-59.5..."
...,...,...
278,ambience (26).wav,"[[[-14.129571, -14.129571, -14.129571], [-15.4..."
74,ambience (245).wav,"[[[-12.181629, -12.181629, -12.181629], [-18.1..."
186,ambience (128).wav,"[[[-32.548466, -32.548466, -32.548466], [-30.3..."
211,ambience (89).wav,"[[[-34.271206, -34.271206, -34.271206], [-41.7..."


### Footsteps

In [None]:
df_melspec_footsteps = pd.DataFrame(columns = ['Class', 'Filename', 'Log Mel Spectrogram'])

In [None]:
# Window size = 2048
# Number of Features = sample_rate/hop_length * duration_of_audio
# Compute FFT for each window
# Take entire frequency spectrum and separate into 224 evenly spaced frequencies (spaced based on distance heard by human ear)
# For each window, decompose magnitude of signal into its components, corresponding to the frequencies in melscale
for filename in os.listdir(pathname_footsteps):
  audiopath = pathname_footsteps + "/" + filename
  y, sr = librosa.load(audiopath, 44100)
  # Minimum Frequency 20 Hz, Maximum Frequency by default 44100/2
  mel_spect = librosa.feature.melspectrogram(y=y, sr=sr, n_fft = 2048, hop_length = 788, n_mels=224, fmin=20)
  log_mel_spect = librosa.power_to_db(mel_spect, ref=np.max)
  log_mel_spect_newdim = np.repeat(log_mel_spect[:,:, np.newaxis], 3, axis =2)
  df_melspec_footsteps = df_melspec_footsteps.append({'Class': "Footsteps",'Filename': filename,'Log Mel Spectrogram': log_mel_spect_newdim}, ignore_index=True)

In [None]:
df_melspec_footsteps.head()

Unnamed: 0,Class,Filename,Log Mel Spectrogram
0,Footsteps,footsteps (3).wav,"[[[-36.579136, -36.579136, -36.579136], [-46.3..."
1,Footsteps,footsteps (340).wav,"[[[-29.78241, -29.78241, -29.78241], [-25.9830..."
2,Footsteps,footsteps (4).wav,"[[[-49.480644, -49.480644, -49.480644], [-42.0..."
3,Footsteps,footsteps (2).wav,"[[[-29.42803, -29.42803, -29.42803], [-20.8777..."
4,Footsteps,footsteps (342).wav,"[[[-32.801277, -32.801277, -32.801277], [-23.4..."


In [None]:
melspec_list = np.array(df_melspec_footsteps['Log Mel Spectrogram'].values.tolist())

In [None]:
# MobileNetv2 Input Shape
# (number of mel bins, number of features, number of channels)
count = 0
for item in melspec_list:
  count += 1
  print(item.shape)

(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 

In [None]:
df_input = df_melspec_footsteps[['Filename','Log Mel Spectrogram']]
train_footsteps, test_footsteps = train_test_split(df_input, test_size = 0.2, shuffle = True)
train_footsteps

Unnamed: 0,Filename,Log Mel Spectrogram
283,footsteps (139).wav,"[[[-80.0, -80.0, -80.0], [-80.0, -80.0, -80.0]..."
35,footsteps (327).wav,"[[[-33.129536, -33.129536, -33.129536], [-20.0..."
247,footsteps (36).wav,"[[[-40.02742, -40.02742, -40.02742], [-39.9679..."
199,footsteps (50).wav,"[[[-37.784325, -37.784325, -37.784325], [-35.0..."
33,footsteps (318).wav,"[[[-24.362091, -24.362091, -24.362091], [-20.9..."
...,...,...
277,footsteps (259).wav,"[[[-35.53466, -35.53466, -35.53466], [-25.3456..."
287,footsteps (228).wav,"[[[-80.0, -80.0, -80.0], [-80.0, -80.0, -80.0]..."
112,footsteps (180).wav,"[[[-37.144707, -37.144707, -37.144707], [-32.4..."
164,footsteps (95).wav,"[[[-38.418343, -38.418343, -38.418343], [-44.7..."


In [None]:
test_footsteps

Unnamed: 0,Filename,Log Mel Spectrogram
310,footsteps (258).wav,"[[[-37.449356, -37.449356, -37.449356], [-36.6..."
147,footsteps (126).wav,"[[[-34.4319, -34.4319, -34.4319], [-29.792927,..."
14,footsteps (5).wav,"[[[-80.0, -80.0, -80.0], [-80.0, -80.0, -80.0]..."
66,footsteps (279).wav,"[[[-30.045052, -30.045052, -30.045052], [-24.2..."
3,footsteps (2).wav,"[[[-29.42803, -29.42803, -29.42803], [-20.8777..."
...,...,...
65,footsteps (277).wav,"[[[-66.61857, -66.61857, -66.61857], [-40.5233..."
291,footsteps (229).wav,"[[[-23.698544, -23.698544, -23.698544], [-31.4..."
316,footsteps (221).wav,"[[[-68.85472, -68.85472, -68.85472], [-54.7482..."
273,footsteps (158).wav,"[[[-56.837772, -56.837772, -56.837772], [-74.8..."


### Horn

In [None]:
df_melspec_horn = pd.DataFrame(columns = ['Class', 'Filename', 'Log Mel Spectrogram'])

In [None]:
# Window size = 2048
# Number of Features = sample_rate/hop_length * duration_of_audio
# Compute FFT for each window
# Take entire frequency spectrum and separate into 224 evenly spaced frequencies (spaced based on distance heard by human ear)
# For each window, decompose magnitude of signal into its components, corresponding to the frequencies in melscale
for filename in os.listdir(pathname_horn):
  audiopath = pathname_horn + "/" + filename
  y, sr = librosa.load(audiopath, 44100)
  # Minimum Frequency 20 Hz, Maximum Frequency by default 44100/2
  mel_spect = librosa.feature.melspectrogram(y=y, sr=sr, n_fft = 2048, hop_length = 788, n_mels=224, fmin=20)
  log_mel_spect = librosa.power_to_db(mel_spect, ref=np.max)
  log_mel_spect_newdim = np.repeat(log_mel_spect[:,:, np.newaxis], 3, axis =2)
  df_melspec_horn = df_melspec_horn.append({'Class': "Horn",'Filename': filename,'Log Mel Spectrogram': log_mel_spect_newdim}, ignore_index=True)

In [None]:
df_melspec_horn.head()

Unnamed: 0,Class,Filename,Log Mel Spectrogram
0,Horn,horn (31).wav,"[[[-64.84104, -64.84104, -64.84104], [-60.9975..."
1,Horn,horn (12).wav,"[[[-80.0, -80.0, -80.0], [-80.0, -80.0, -80.0]..."
2,Horn,horn (20).wav,"[[[-80.0, -80.0, -80.0], [-80.0, -80.0, -80.0]..."
3,Horn,horn (32).wav,"[[[-38.345306, -38.345306, -38.345306], [-52.4..."
4,Horn,horn (16).wav,"[[[-29.98085, -29.98085, -29.98085], [-47.9492..."


In [None]:
melspec_list = np.array(df_melspec_horn['Log Mel Spectrogram'].values.tolist())

In [None]:
# MobileNetv2 Input Shape
# (number of mel bins, number of features, number of channels)
count = 0
for item in melspec_list:
  count += 1
  print(item.shape)

(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 

In [None]:
df_input = df_melspec_horn[['Filename','Log Mel Spectrogram']]
train_horn, test_horn = train_test_split(df_input, test_size = 0.2, shuffle = True)
train_horn

Unnamed: 0,Filename,Log Mel Spectrogram
106,horn (154).wav,"[[[-27.370861, -27.370861, -27.370861], [-27.2..."
149,horn (41).wav,"[[[-21.32104, -21.32104, -21.32104], [-20.8654..."
158,horn (55).wav,"[[[-43.272346, -43.272346, -43.272346], [-26.5..."
114,horn (171).wav,"[[[-26.823004, -26.823004, -26.823004], [-18.2..."
164,horn (68).wav,"[[[-17.44281, -17.44281, -17.44281], [-22.7710..."
...,...,...
207,horn (98).wav,"[[[-15.159426, -15.159426, -15.159426], [-29.2..."
76,horn (176).wav,"[[[-21.461721, -21.461721, -21.461721], [-30.0..."
222,horn (28).wav,"[[[-35.883743, -35.883743, -35.883743], [-35.3..."
159,horn (53).wav,"[[[-78.651184, -78.651184, -78.651184], [-67.8..."


In [None]:
test_horn

Unnamed: 0,Filename,Log Mel Spectrogram
218,horn (106).wav,"[[[-34.32089, -34.32089, -34.32089], [-50.4862..."
63,horn (189).wav,"[[[-24.660583, -24.660583, -24.660583], [-32.6..."
39,horn (223).wav,"[[[-31.453293, -31.453293, -31.453293], [-21.5..."
4,horn (16).wav,"[[[-29.98085, -29.98085, -29.98085], [-47.9492..."
103,horn (151).wav,"[[[-80.0, -80.0, -80.0], [-69.18767, -69.18767..."
165,horn (50).wav,"[[[-21.189672, -21.189672, -21.189672], [-20.6..."
9,horn (11).wav,"[[[-74.74084, -74.74084, -74.74084], [-63.9060..."
90,horn (170).wav,"[[[-27.801619, -27.801619, -27.801619], [-23.9..."
54,horn (225).wav,"[[[-34.90619, -34.90619, -34.90619], [-25.8576..."
119,horn (153).wav,"[[[-27.46798, -27.46798, -27.46798], [-29.1413..."


### Music

In [None]:
df_melspec_music = pd.DataFrame(columns = ['Class', 'Filename', 'Log Mel Spectrogram'])

In [None]:
# Window size = 2048
# Number of Features = sample_rate/hop_length * duration_of_audio
# Compute FFT for each window
# Take entire frequency spectrum and separate into 224 evenly spaced frequencies (spaced based on distance heard by human ear)
# For each window, decompose magnitude of signal into its components, corresponding to the frequencies in melscale
for filename in os.listdir(pathname_music):
  audiopath = pathname_music + "/" + filename
  y, sr = librosa.load(audiopath, 44100)
  # Minimum Frequency 20 Hz, Maximum Frequency by default 44100/2
  mel_spect = librosa.feature.melspectrogram(y=y, sr=sr, n_fft = 2048, hop_length = 788, n_mels=224, fmin=20)
  log_mel_spect = librosa.power_to_db(mel_spect, ref=np.max)
  log_mel_spect_newdim = np.repeat(log_mel_spect[:,:, np.newaxis], 3, axis =2)
  df_melspec_music = df_melspec_music.append({'Class': "Music",'Filename': filename,'Log Mel Spectrogram': log_mel_spect_newdim}, ignore_index=True)

In [None]:
df_melspec_music.head()

Unnamed: 0,Class,Filename,Log Mel Spectrogram
0,Music,music (8).wav,"[[[-33.532093, -33.532093, -33.532093], [-52.8..."
1,Music,music (5).wav,"[[[-28.80003, -28.80003, -28.80003], [-45.9442..."
2,Music,music (4).wav,"[[[-28.591068, -28.591068, -28.591068], [-47.6..."
3,Music,music (3).wav,"[[[-49.95333, -49.95333, -49.95333], [-58.5572..."
4,Music,music (7).wav,"[[[-22.662294, -22.662294, -22.662294], [-44.8..."


In [None]:
melspec_list = np.array(df_melspec_music['Log Mel Spectrogram'].values.tolist())

In [None]:
# MobileNetv2 Input Shape
# (number of mel bins, number of features, number of channels)
count = 0
for item in melspec_list:
  count += 1
  print(item.shape)

(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 

In [None]:
df_input = df_melspec_music[['Filename','Log Mel Spectrogram']]
train_music, test_music = train_test_split(df_input, test_size = 0.2, shuffle = True)
train_music

Unnamed: 0,Filename,Log Mel Spectrogram
34,music (318).wav,"[[[-20.125336, -20.125336, -20.125336], [-17.6..."
268,music (36).wav,"[[[-5.9489584, -5.9489584, -5.9489584], [-6.40..."
273,music (38).wav,"[[[-16.798975, -16.798975, -16.798975], [-14.6..."
326,music (130).wav,"[[[-14.38833, -14.38833, -14.38833], [-12.2292..."
224,music (84).wav,"[[[-13.002763, -13.002763, -13.002763], [-17.4..."
...,...,...
137,music (168).wav,"[[[-41.603462, -41.603462, -41.603462], [-30.3..."
252,music (44).wav,"[[[-33.76593, -33.76593, -33.76593], [-28.4033..."
50,music (305).wav,"[[[-23.950249, -23.950249, -23.950249], [-47.4..."
270,music (51).wav,"[[[-21.224525, -21.224525, -21.224525], [-12.1..."


In [None]:
test_music

Unnamed: 0,Filename,Log Mel Spectrogram
203,music (94).wav,"[[[-7.9635572, -7.9635572, -7.9635572], [-21.6..."
96,music (191).wav,"[[[-5.9649467, -5.9649467, -5.9649467], [-11.1..."
125,music (228).wav,"[[[-23.99861, -23.99861, -23.99861], [-40.1932..."
130,music (170).wav,"[[[-10.847704, -10.847704, -10.847704], [-18.1..."
254,music (56).wav,"[[[-16.243456, -16.243456, -16.243456], [-0.94..."
...,...,...
15,music (336).wav,"[[[-26.13867, -26.13867, -26.13867], [-44.7370..."
198,music (88).wav,"[[[-46.954, -46.954, -46.954], [-47.89292, -47..."
13,music (329).wav,"[[[-23.531809, -23.531809, -23.531809], [-41.3..."
204,music (101).wav,"[[[-16.129473, -16.129473, -16.129473], [-17.7..."


### Shout

In [None]:
df_melspec_shout = pd.DataFrame(columns = ['Class', 'Filename', 'Log Mel Spectrogram'])

In [None]:
# Window size = 2048
# Number of Features = sample_rate/hop_length * duration_of_audio
# Compute FFT for each window
# Take entire frequency spectrum and separate into 224 evenly spaced frequencies (spaced based on distance heard by human ear)
# For each window, decompose magnitude of signal into its components, corresponding to the frequencies in melscale
for filename in os.listdir(pathname_shout):
  audiopath = pathname_shout + "/" + filename
  y, sr = librosa.load(audiopath, 44100)
  # Minimum Frequency 20 Hz, Maximum Frequency by default 44100/2
  mel_spect = librosa.feature.melspectrogram(y=y, sr=sr, n_fft = 2048, hop_length = 788, n_mels=224, fmin=20)
  log_mel_spect = librosa.power_to_db(mel_spect, ref=np.max)
  log_mel_spect_newdim = np.repeat(log_mel_spect[:,:, np.newaxis], 3, axis =2)
  df_melspec_shout = df_melspec_shout.append({'Class': "Shout",'Filename': filename,'Log Mel Spectrogram': log_mel_spect_newdim}, ignore_index=True)

In [None]:
df_melspec_shout.head()

Unnamed: 0,Class,Filename,Log Mel Spectrogram
0,Shout,shout (14).wav,"[[[-58.597908, -58.597908, -58.597908], [-45.8..."
1,Shout,shout (17).wav,"[[[-48.21131, -48.21131, -48.21131], [-42.7343..."
2,Shout,shout (21).wav,"[[[-71.04677, -71.04677, -71.04677], [-66.4646..."
3,Shout,shout (13).wav,"[[[-80.0, -80.0, -80.0], [-64.94917, -64.94917..."
4,Shout,shout (8).wav,"[[[-71.44896, -71.44896, -71.44896], [-73.7504..."


In [None]:
melspec_list = np.array(df_melspec_shout['Log Mel Spectrogram'].values.tolist())

In [None]:
# MobileNetv2 Input Shape
# (number of mel bins, number of features, number of channels)
count = 0
for item in melspec_list:
  count += 1
  print(item.shape)

(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 

In [None]:
df_input = df_melspec_shout[['Filename','Log Mel Spectrogram']]
train_shout, test_shout = train_test_split(df_input, test_size = 0.2, shuffle = True)
train_shout

Unnamed: 0,Filename,Log Mel Spectrogram
186,shout (129).wav,"[[[-80.0, -80.0, -80.0], [-80.0, -80.0, -80.0]..."
167,shout (156).wav,"[[[-45.880917, -45.880917, -45.880917], [-56.7..."
112,shout (200).wav,"[[[-80.0, -80.0, -80.0], [-80.0, -80.0, -80.0]..."
120,shout (212).wav,"[[[-63.193233, -63.193233, -63.193233], [-73.4..."
149,shout (242).wav,"[[[-55.540443, -55.540443, -55.540443], [-55.8..."
...,...,...
45,shout (23).wav,"[[[-63.91912, -63.91912, -63.91912], [-63.3348..."
217,shout (251).wav,"[[[-30.052586, -30.052586, -30.052586], [-42.3..."
6,shout (22).wav,"[[[-63.94262, -63.94262, -63.94262], [-68.0688..."
71,shout (70).wav,"[[[-71.33958, -71.33958, -71.33958], [-71.1295..."


In [None]:
test_shout

Unnamed: 0,Filename,Log Mel Spectrogram
99,shout (121).wav,"[[[-32.439552, -32.439552, -32.439552], [-45.9..."
211,shout (188).wav,"[[[-69.83048, -69.83048, -69.83048], [-39.1142..."
38,shout (35).wav,"[[[-80.0, -80.0, -80.0], [-61.813587, -61.8135..."
3,shout (13).wav,"[[[-80.0, -80.0, -80.0], [-64.94917, -64.94917..."
8,shout (7).wav,"[[[-80.0, -80.0, -80.0], [-71.70798, -71.70798..."
139,shout (176).wav,"[[[-80.0, -80.0, -80.0], [-75.41098, -75.41098..."
143,shout (155).wav,"[[[-80.0, -80.0, -80.0], [-80.0, -80.0, -80.0]..."
259,shout (224).wav,"[[[-59.671658, -59.671658, -59.671658], [-63.7..."
162,shout (136).wav,"[[[-26.75598, -26.75598, -26.75598], [-42.2383..."
56,shout (83).wav,"[[[-80.0, -80.0, -80.0], [-80.0, -80.0, -80.0]..."


### Merge all train dataframes into one dataframe

In [None]:
df_train_append1 = train_ambience.append(train_footsteps)
df_train_append2 = df_train_append1.append(train_horn)
df_train_append3 = df_train_append2.append(train_music)
df_train = df_train_append3.append(train_shout)
df_train

Unnamed: 0,Filename,Log Mel Spectrogram
311,ambience (58).wav,"[[[-49.223248, -49.223248, -49.223248], [-52.0..."
242,ambience (74).wav,"[[[-27.694464, -27.694464, -27.694464], [-47.4..."
218,ambience (87).wav,"[[[-33.70532, -33.70532, -33.70532], [-35.0588..."
243,ambience (64).wav,"[[[-38.80634, -38.80634, -38.80634], [-42.3053..."
255,ambience (48).wav,"[[[-25.35271, -25.35271, -25.35271], [-39.4933..."
...,...,...
45,shout (23).wav,"[[[-63.91912, -63.91912, -63.91912], [-63.3348..."
217,shout (251).wav,"[[[-30.052586, -30.052586, -30.052586], [-42.3..."
6,shout (22).wav,"[[[-63.94262, -63.94262, -63.94262], [-68.0688..."
71,shout (70).wav,"[[[-71.33958, -71.33958, -71.33958], [-71.1295..."


In [None]:
df_train_final = df_train.drop(['Filename'], axis = 1)
df_train_final

Unnamed: 0,Log Mel Spectrogram
311,"[[[-49.223248, -49.223248, -49.223248], [-52.0..."
242,"[[[-27.694464, -27.694464, -27.694464], [-47.4..."
218,"[[[-33.70532, -33.70532, -33.70532], [-35.0588..."
243,"[[[-38.80634, -38.80634, -38.80634], [-42.3053..."
255,"[[[-25.35271, -25.35271, -25.35271], [-39.4933..."
...,...
45,"[[[-63.91912, -63.91912, -63.91912], [-63.3348..."
217,"[[[-30.052586, -30.052586, -30.052586], [-42.3..."
6,"[[[-63.94262, -63.94262, -63.94262], [-68.0688..."
71,"[[[-71.33958, -71.33958, -71.33958], [-71.1295..."


### Merge all test dataframes into one dataframe

In [None]:
df_test_append1 = test_ambience.append(test_footsteps)
df_test_append2 = df_test_append1.append(test_horn)
df_test_append3 = df_test_append2.append(test_music)
df_test = df_test_append3.append(test_shout)
df_test

Unnamed: 0,Filename,Log Mel Spectrogram
136,ambience (188).wav,"[[[-17.036678, -17.036678, -17.036678], [-35.4..."
26,ambience (286).wav,"[[[-15.34173, -15.34173, -15.34173], [-31.5507..."
127,ambience (181).wav,"[[[-15.934418, -15.934418, -15.934418], [-32.6..."
296,ambience (24).wav,"[[[-6.908004, -6.908004, -6.908004], [-13.8484..."
95,ambience (229).wav,"[[[-41.503044, -41.503044, -41.503044], [-59.5..."
...,...,...
10,shout (12).wav,"[[[-49.30088, -49.30088, -49.30088], [-39.2809..."
67,shout (46).wav,"[[[-80.0, -80.0, -80.0], [-80.0, -80.0, -80.0]..."
129,shout (193).wav,"[[[-36.933594, -36.933594, -36.933594], [-52.4..."
46,shout (56).wav,"[[[-61.4837, -61.4837, -61.4837], [-53.710052,..."


In [None]:
df_test_final = df_test.drop(['Filename'], axis = 1)
df_test_final

Unnamed: 0,Log Mel Spectrogram
136,"[[[-17.036678, -17.036678, -17.036678], [-35.4..."
26,"[[[-15.34173, -15.34173, -15.34173], [-31.5507..."
127,"[[[-15.934418, -15.934418, -15.934418], [-32.6..."
296,"[[[-6.908004, -6.908004, -6.908004], [-13.8484..."
95,"[[[-41.503044, -41.503044, -41.503044], [-59.5..."
...,...
10,"[[[-49.30088, -49.30088, -49.30088], [-39.2809..."
67,"[[[-80.0, -80.0, -80.0], [-80.0, -80.0, -80.0]..."
129,"[[[-36.933594, -36.933594, -36.933594], [-52.4..."
46,"[[[-61.4837, -61.4837, -61.4837], [-53.710052,..."


### Export train and test data into npy files

In [None]:
train_data = np.empty(shape=(df_train_final.shape[0], 345, 345, 3))
train_list = np.array(df_train_final['Log Mel Spectrogram'].values.tolist())

print(train_list[2])

[[[-33.70532  -33.70532  -33.70532 ]
  [-35.058826 -35.058826 -35.058826]
  [-47.51974  -47.51974  -47.51974 ]
  ...
  [-46.602516 -46.602516 -46.602516]
  [-45.59874  -45.59874  -45.59874 ]
  [-37.363605 -37.363605 -37.363605]]

 [[-31.733587 -31.733587 -31.733587]
  [-31.994196 -31.994196 -31.994196]
  [-41.84696  -41.84696  -41.84696 ]
  ...
  [-46.712994 -46.712994 -46.712994]
  [-37.20219  -37.20219  -37.20219 ]
  [-38.9427   -38.9427   -38.9427  ]]

 [[-28.678644 -28.678644 -28.678644]
  [-29.68643  -29.68643  -29.68643 ]
  [-41.860386 -41.860386 -41.860386]
  ...
  [-46.61082  -46.61082  -46.61082 ]
  [-34.31494  -34.31494  -34.31494 ]
  [-38.61753  -38.61753  -38.61753 ]]

 ...

 [[-51.815067 -51.815067 -51.815067]
  [-50.57077  -50.57077  -50.57077 ]
  [-47.441227 -47.441227 -47.441227]
  ...
  [-53.085136 -53.085136 -53.085136]
  [-52.4543   -52.4543   -52.4543  ]
  [-51.660564 -51.660564 -51.660564]]

 [[-50.825424 -50.825424 -50.825424]
  [-51.278004 -51.278004 -51.278004]


In [None]:
np.save("/content/drive/MyDrive/FYP Data/Train and Test/train_data.npy", train_list)

In [None]:
train = np.load("/content/drive/MyDrive/FYP Data/Train and Test/train_data.npy", allow_pickle = True)
print(train.shape)

(1204, 224, 224, 3)


In [None]:
test_data = np.empty(shape=(df_test_final.shape[0], 345, 345, 3))
test_list = np.array(df_test_final['Log Mel Spectrogram'].values.tolist())

print(test_list[2])

[[[-15.934418 -15.934418 -15.934418]
  [-32.6702   -32.6702   -32.6702  ]
  [-52.54168  -52.54168  -52.54168 ]
  ...
  [-34.793354 -34.793354 -34.793354]
  [-36.05825  -36.05825  -36.05825 ]
  [-32.332226 -32.332226 -32.332226]]

 [[-15.220049 -15.220049 -15.220049]
  [-31.493887 -31.493887 -31.493887]
  [-42.9403   -42.9403   -42.9403  ]
  ...
  [-29.478262 -29.478262 -29.478262]
  [-29.720003 -29.720003 -29.720003]
  [-28.009275 -28.009275 -28.009275]]

 [[-15.594459 -15.594459 -15.594459]
  [-26.374794 -26.374794 -26.374794]
  [-39.681557 -39.681557 -39.681557]
  ...
  [-26.077738 -26.077738 -26.077738]
  [-23.898903 -23.898903 -23.898903]
  [-23.8827   -23.8827   -23.8827  ]]

 ...

 [[-80.       -80.       -80.      ]
  [-80.       -80.       -80.      ]
  [-80.       -80.       -80.      ]
  ...
  [-80.       -80.       -80.      ]
  [-80.       -80.       -80.      ]
  [-80.       -80.       -80.      ]]

 [[-80.       -80.       -80.      ]
  [-80.       -80.       -80.      ]


In [None]:
np.save("/content/drive/MyDrive/FYP Data/Train and Test/test_data.npy", test_list)

In [None]:
test = np.load("/content/drive/MyDrive/FYP Data/Train and Test/test_data.npy", allow_pickle = True)
print(test.shape)

(304, 224, 224, 3)


### Obtain train and test labels

In [None]:
df_train

Unnamed: 0,Filename,Log Mel Spectrogram
311,ambience (58).wav,"[[[-49.223248, -49.223248, -49.223248], [-52.0..."
242,ambience (74).wav,"[[[-27.694464, -27.694464, -27.694464], [-47.4..."
218,ambience (87).wav,"[[[-33.70532, -33.70532, -33.70532], [-35.0588..."
243,ambience (64).wav,"[[[-38.80634, -38.80634, -38.80634], [-42.3053..."
255,ambience (48).wav,"[[[-25.35271, -25.35271, -25.35271], [-39.4933..."
...,...,...
45,shout (23).wav,"[[[-63.91912, -63.91912, -63.91912], [-63.3348..."
217,shout (251).wav,"[[[-30.052586, -30.052586, -30.052586], [-42.3..."
6,shout (22).wav,"[[[-63.94262, -63.94262, -63.94262], [-68.0688..."
71,shout (70).wav,"[[[-71.33958, -71.33958, -71.33958], [-71.1295..."


In [None]:
# Creating an empty list
train_label = []

In [None]:
for item in df_train['Filename']:
  filename = str(item)
  sound_class = filename.split()[0] # Only obtain first word
  if sound_class == "ambience":
    train_label.append(0)
  if sound_class == "footsteps":
    train_label.append(1)
  if sound_class == "horn":
    train_label.append(2)
  if sound_class == "music":
    train_label.append(3)
  if sound_class == "shout":
    train_label.append(4)

train_label

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [None]:
df_test

Unnamed: 0,Filename,Log Mel Spectrogram
136,ambience (188).wav,"[[[-17.036678, -17.036678, -17.036678], [-35.4..."
26,ambience (286).wav,"[[[-15.34173, -15.34173, -15.34173], [-31.5507..."
127,ambience (181).wav,"[[[-15.934418, -15.934418, -15.934418], [-32.6..."
296,ambience (24).wav,"[[[-6.908004, -6.908004, -6.908004], [-13.8484..."
95,ambience (229).wav,"[[[-41.503044, -41.503044, -41.503044], [-59.5..."
...,...,...
10,shout (12).wav,"[[[-49.30088, -49.30088, -49.30088], [-39.2809..."
67,shout (46).wav,"[[[-80.0, -80.0, -80.0], [-80.0, -80.0, -80.0]..."
129,shout (193).wav,"[[[-36.933594, -36.933594, -36.933594], [-52.4..."
46,shout (56).wav,"[[[-61.4837, -61.4837, -61.4837], [-53.710052,..."


In [None]:
# Creating an empty list
test_label = []

In [None]:
for item in df_test['Filename']:
  filename = str(item)
  sound_class = filename.split()[0] # Only obtain first word
  if sound_class == "ambience":
    test_label.append(0)
  if sound_class == "footsteps":
    test_label.append(1)
  if sound_class == "horn":
    test_label.append(2)
  if sound_class == "music":
    test_label.append(3)
  if sound_class == "shout":
    test_label.append(4)

test_label

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,


In [None]:
train_label_newdim = np.expand_dims(train_label, axis=1)
test_label_newdim = np.expand_dims(test_label, axis=1)

print(train_label_newdim)

[[0]
 [0]
 [0]
 ...
 [4]
 [4]
 [4]]


In [None]:
np.save("/content/drive/MyDrive/FYP Data/Train and Test/train_label.npy", train_label_newdim)
np.save("/content/drive/MyDrive/FYP Data/Train and Test/test_label.npy", test_label_newdim)

In [None]:
train_labels = np.load("/content/drive/MyDrive/FYP Data/Train and Test/train_label.npy", allow_pickle = True)
test_labels = np.load("/content/drive/MyDrive/FYP Data/Train and Test/test_label.npy", allow_pickle = True)

print(train_labels.shape)
print(test_labels.shape)

(1204, 1)
(304, 1)
