In [1]:
import librosa
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import soundfile
import os
from scipy.io import wavfile
from sklearn.preprocessing import MinMaxScaler

In [2]:
def feature_chromagram(waveform, sample_rate):
    """Return a per-frame chromagram of ``waveform``, rescaled to [-255, 255].

    Args:
        waveform: 1-D audio signal handed to ``librosa.stft``.
        sample_rate: Sampling rate of ``waveform`` in Hz.

    Returns:
        2-D array with one row per STFT frame and 128 chroma bins per row.
        Each column is min-max scaled on its own, using only this clip's frames.
    """
    # Magnitude spectrogram, computed explicitly so it can be passed in as `S`.
    magnitude = np.abs(librosa.stft(waveform))
    # 128 chroma bins (instead of the usual 12) so the width matches the other features.
    chroma = librosa.feature.chroma_stft(S=magnitude, sr=sample_rate, n_chroma=128)
    # Transpose to (frames, bins) before scaling so that bins are the scaled columns.
    frames_by_bin = chroma.T
    scaler = MinMaxScaler((-255, 255))
    return scaler.fit_transform(frames_by_bin)

def feature_melspectrogram(waveform, sample_rate):
    """Return a per-frame mel spectrogram of ``waveform``, rescaled to [-255, 255].

    Args:
        waveform: 1-D audio signal.
        sample_rate: Sampling rate of ``waveform`` in Hz.

    Returns:
        2-D array with one row per frame and 128 mel bands per row.
        Each column is min-max scaled on its own, using only this clip's frames.
        No log/dB conversion is applied before scaling.
    """
    # An 8 kHz upper bound should be enough for most speech classification tasks.
    mel = librosa.feature.melspectrogram(y=waveform, sr=sample_rate, n_mels=128, fmax=8000)
    # Transpose to (frames, bands) before scaling so that bands are the scaled columns.
    frames_by_band = mel.T
    scaler = MinMaxScaler((-255, 255))
    return scaler.fit_transform(frames_by_band)

def feature_mfcc(waveform, sample_rate):
    """Return per-frame MFCCs of ``waveform``, rescaled to [-255, 255].

    Args:
        waveform: 1-D audio signal.
        sample_rate: Sampling rate of ``waveform`` in Hz.

    Returns:
        2-D array with one row per frame and 128 coefficients per row.
        Each column is min-max scaled on its own, using only this clip's frames.
    """
    # 128 coefficients are requested so the width matches the other two features.
    coefficients = librosa.feature.mfcc(y=waveform, sr=sample_rate, n_mfcc=128)
    # Transpose to (frames, coefficients) so that coefficients are the scaled columns.
    frames_by_coefficient = coefficients.T
    scaler = MinMaxScaler((-255, 255))
    return scaler.fit_transform(frames_by_coefficient)


In [3]:
def get_features(file, max_samples=441000):
    """Load one audio file and build its stacked feature matrix.

    Only the first channel and the first ``max_samples`` samples are used.
    The chromagram, mel spectrogram and MFCCs are computed on that excerpt
    and stacked depth-wise.

    Args:
        file: Path (or file-like object) accepted by ``soundfile.SoundFile``.
        max_samples: Maximum number of leading samples to keep. The default,
            441000, corresponds to 10 s at 44.1 kHz.

    Returns:
        3-D array of shape ``(frames, 128, 3)`` with channels ordered
        (chromagram, mel spectrogram, MFCC). Clips shorter than
        ``max_samples`` yield fewer frames.
    """
    # Read everything we need, then release the file handle before the
    # comparatively slow feature extraction starts.
    with soundfile.SoundFile(file) as audio:
        waveform = audio.read(dtype="float32")
        sample_rate = audio.samplerate

    # get the first channel of the audio only
    if waveform.ndim != 1:
        waveform = waveform[:max_samples, 0]
    else:
        waveform = waveform[:max_samples]

    # compute features of soundfile
    melspectrogram = feature_melspectrogram(waveform, sample_rate)
    mfc_coefficients = feature_mfcc(waveform, sample_rate)
    chromagram = feature_chromagram(waveform, sample_rate)

    # use np.dstack to stack our feature arrays depth wise to create a feature matrix
    return np.dstack((chromagram, melspectrogram, mfc_coefficients))

In [4]:
# Sanity check: extract the features of a single recording and display the
# resulting shape (the recorded output below shows (862, 128, 3)).
tmp = get_features('../audio/Hum/Frozen_hum/5806.wav')
tmp.shape

(862, 128, 3)

In [5]:
# Set before TensorFlow is imported so the variable is already present at import time.
# Per TensorFlow's logging convention, level '2' filters INFO and WARNING messages.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import tensorflow as tf
# NOTE(review): layers come from the standalone `keras` package while the model is
# built with `tf.keras` below — confirm both resolve to the same Keras installation.
import keras.layers as layers
from sklearn.model_selection import train_test_split


In [6]:
def transformer(file_list, label_list, expected_shape=(862, 128, 3), extractor=None):
    """Extract features for every file and keep those with the expected shape.

    Files whose extraction raises, or whose feature matrix does not have
    ``expected_shape``, are skipped together with their label.

    Args:
        file_list: Iterable of audio file paths.
        label_list: Iterable of labels, aligned with ``file_list``.
        expected_shape: Shape a feature matrix must have to be kept.
        extractor: Callable mapping a path to a feature array. Defaults to
            ``get_features``.

    Returns:
        Tuple ``(features, labels)`` of NumPy arrays containing only the
        accepted samples, in input order.
    """
    if extractor is None:
        extractor = get_features
    expected_shape = tuple(expected_shape)

    arr = []
    label = []
    for cnt, (file, l) in enumerate(zip(file_list, label_list)):
        # Lightweight progress indicator.
        if cnt % 300 == 0:
            print(cnt)
        try:
            # Extract once and reuse the result; extraction is the slow step.
            f = extractor(file)
            if f.shape == expected_shape:
                arr.append(f)
                label.append(l)
        except Exception as exc:
            # Best effort: report the failing file and why, then keep going.
            # `Exception` (not a bare except) lets Ctrl-C still interrupt the run.
            print(f'{file}: {exc!r}')
    return np.array(arr), np.array(label)

def alex(num_labels, input_shape):
    """Build and compile an AlexNet-inspired CNN classifier.

    The network has three convolution blocks (conv -> batch norm -> max pool
    -> ReLU), followed by two dense layers and a softmax output. It is
    compiled with Adam, categorical cross-entropy and an accuracy metric, so
    targets are expected to be one-hot encoded.

    Args:
        num_labels: Number of output classes.
        input_shape: Shape of one input sample (without the batch dimension).

    Returns:
        The compiled ``tf.keras`` Sequential model.
    """
    # NOTE: some layer names do not match what the layer is (the third conv is
    # called 'layer1_conv2d_3_1'; 'Dense_100'/'Dense_20' have 1000/200 units).
    # They are kept verbatim because layer names are stored with saved models.
    stack = [
        layers.InputLayer(input_shape),
        # Block 1: large strided kernel, halves the resolution again when pooling.
        layers.Conv2D(filters=64, kernel_size=(11, 11), strides=(4, 4), name='layer1_conv2d_11_4'),
        layers.BatchNormalization(name='layer1_normalize'),
        layers.MaxPool2D(pool_size=(2, 2), strides=(2, 2), name='layer1_maxpool'),
        layers.Activation('relu', name='layer1_relu'),
        # Block 2: 5x5 kernel, 'same' padding, stride-1 pooling.
        layers.Conv2D(filters=128, kernel_size=(5, 5), strides=(1, 1), padding='same', name='layer2_conv2d_5_1'),
        layers.BatchNormalization(name='layer2_normalize'),
        layers.MaxPool2D(pool_size=(2, 2), strides=(1, 1), name='layer2_maxpool'),
        layers.Activation('relu', name='layer2_relu'),
        # Block 3: 3x3 kernel, 'same' padding, stride-1 pooling.
        layers.Conv2D(filters=128, kernel_size=(3, 3), strides=(1, 1), padding='same', name='layer1_conv2d_3_1'),
        layers.BatchNormalization(name='layer3_normalize'),
        layers.MaxPool2D(pool_size=(2, 2), strides=(1, 1), name='layer3_maxpool'),
        layers.Activation('relu', name='layer3_relu'),
        # Classifier head.
        layers.Flatten(name='FC_layer'),
        layers.Dense(1000, activation='relu', name='Dense_100'),
        layers.Dense(200, activation='relu', name='Dense_20'),
        layers.Dense(num_labels, activation='softmax', name='Output'),
    ]
    model = tf.keras.models.Sequential(stack)
    model.compile(
        optimizer="Adam",
        loss=tf.keras.losses.CategoricalCrossentropy(),
        metrics=["accuracy"],
    )
    return model

In [11]:
def get_data_from_dir(dir):
    """Collect wav file paths and one-hot labels from per-class sub-directories.

    Every directory found below ``dir`` is treated as one class. Classes are
    numbered in sorted path order and files are listed in sorted order, so the
    result does not depend on the file system's listing order.

    Args:
        dir: Root directory whose sub-directories each hold one class.

    Returns:
        Tuple ``(dataset, label)``: a pandas Series of file paths and a
        ``uint8`` NumPy array of one-hot labels with one row per file.

    Raises:
        ValueError: If no ``.wav`` file is found below ``dir``.
    """
    # os.walk yields the root itself first; drop it, then sort the sub-directories.
    dir_list = sorted([entry[0] for entry in os.walk(dir)][1:])
    print(dir_list)

    frames = []
    for i, song_dir in enumerate(dir_list):
        # Match on the extension; a substring test would also accept e.g. 'wavelet.txt'.
        song_file = sorted(file for file in os.listdir(song_dir) if file.endswith('.wav'))
        if not song_file:
            # Nothing to add for this directory; its index simply stays unused.
            continue
        paths = [f'{song_dir}/{file}' for file in song_file]
        frames.append(pd.DataFrame({'file': paths, 'label': i}))

    if not frames:
        raise ValueError(f'no .wav files found below {dir!r}')

    # A fresh 0..n-1 index avoids duplicate index values across the per-class frames.
    arr = pd.concat(frames, axis=0, ignore_index=True)

    dataset = arr['file']

    # Pin the dtype: the pandas default changed from uint8 to bool in 2.0.
    label = pd.get_dummies(arr['label'], dtype=np.uint8)

    return dataset, label.to_numpy()
        
        

In [12]:
# Build the file-path series and the one-hot label matrix from the per-song folders.
# NOTE(review): the recorded output below lists './audio/Hum/...' paths, so this cell
# was last executed with a different root than the one written here — confirm the path.
dataset, label = get_data_from_dir('../audio/Hum/')


['./audio/Hum/Frozen_hum', './audio/Hum/Hakuna_hum', './audio/Hum/Mamma_hum', './audio/Hum/Panther_hum', './audio/Hum/Potter_hum', './audio/Hum/Rain_hum', './audio/Hum/Showman_hum', './audio/Hum/StarWars_hum']


In [13]:
# 70/30 split of the file paths, stratified on the labels; the fixed seed keeps
# the partition reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    dataset,
    label,
    test_size=0.3,
    stratify=label,
    random_state=4444,
)
# Report the size of each partition, one shape per line.
print(*(part.shape for part in (X_train, y_train, X_test, y_test)), sep='\n')

(3367,)
(3367, 8)
(1443,)
(1443, 8)


In [14]:
# Extract features for every training file; files that fail or have an
# unexpected shape are dropped by `transformer`.
X_trans, y_trans = transformer(X_train.to_numpy(), y_train)

# Cache the training arrays on disk so later cells can reload them without
# re-extracting. `open(..., 'wb')` does not create './data', so it must already exist.
with open('./data/train_8.npz', 'wb') as train_file:
    np.savez(train_file, x = X_trans, y = y_trans)

# Same procedure for the held-out test files.
X_test_trans, y_test_trans = transformer(X_test.to_numpy(), y_test)

with open('./data/test_8.npz', 'wb') as test_file:
    np.savez(test_file, x = X_test_trans, y = y_test_trans)

0
./audio/Hum/Mamma_hum/4981.wav


In [None]:
# Train and evaluate the 8-class model on the features extracted above.
# One extracted sample is used only to obtain the input shape for the network.
wave = get_features('../audio/Hum/Frozen_hum/5798.wav')
alex_model = alex(y_train.shape[-1], wave.shape)

# `use_multiprocessing` is intentionally not passed: it only applies to
# generator / Sequence inputs, and newer Keras versions no longer accept it.
alex_model.fit(X_trans, y_trans, epochs = 50, validation_split = 0.2)

alex_model.save('./alex_record/alex_8')

print(alex_model.evaluate(X_test_trans, y_test_trans))

# Recorded result (8 labels, normalized features): 99.96% train accuracy, 84.15% test accuracy

In [7]:
def labelEncode(y_tmp):
    """Re-encode one-hot labels using only the classes that actually occur.

    Each row is reduced to its class index (position of the row maximum) and
    then one-hot encoded again, so columns of classes that never occur are
    dropped and the remaining columns are ordered by class index.

    Args:
        y_tmp: 2-D array-like of one-hot (or score) rows, one row per sample.

    Returns:
        2-D ``uint8`` NumPy array with one column per distinct class present.
    """
    class_ids = np.argmax(y_tmp, axis=1)
    # Pin the dtype: the pandas default changed from uint8 to bool in 2.0.
    return pd.get_dummies(class_ids, dtype=np.uint8).to_numpy()

In [8]:
# Reload the cached 8-class feature/label arrays written by the extraction cell.
# The names are pre-declared so they exist even before the files are read.
X_train_loaded = None
y_train_loaded = None
X_test_loaded = None
y_test_loaded = None

# The arrays are pulled out of the archive while the underlying file is still open.
with open('./data/train_8.npz', 'rb') as train_file:
    train = np.load(train_file)
    X_train_loaded = train['x']
    y_train_loaded = train['y']

with open('./data/test_8.npz', 'rb') as test_file:
    test = np.load(test_file)
    X_test_loaded = test['x']
    y_test_loaded = test['y']

# Class index of every sample (column 0 of each frame), used to select song subsets below.
song_train = pd.DataFrame(np.argmax(y_train_loaded, axis=1))
song_test = pd.DataFrame(np.argmax(y_test_loaded, axis = 1))

In [9]:
# Positions of the samples that belong to songs 0 and 1.
train_2_loc = np.where(song_train[0].isin([0, 1]))[0]
test_2_loc = np.where(song_test[0].isin([0, 1]))[0]

# Slice features and labels down to that subset; the labels keep their
# original one-hot width.
song_2_X_train, song_2_y_train = X_train_loaded[train_2_loc], y_train_loaded[train_2_loc]
song_2_X_test, song_2_y_test = X_test_loaded[test_2_loc], y_test_loaded[test_2_loc]

# Cache the 2-song subset next to the full data set.
with open('./data/train_2.npz', 'wb') as train_file:
    np.savez(train_file, x=song_2_X_train, y=song_2_y_train)

with open('./data/test_2.npz', 'wb') as test_file:
    np.savez(test_file, x=song_2_X_test, y=song_2_y_test)


In [10]:
# Positions of the samples that belong to songs 0, 1 and 4.
train_3_loc = np.where(song_train[0].isin([0, 1, 4]))[0]
test_3_loc = np.where(song_test[0].isin([0, 1, 4]))[0]

# Slice features and labels down to that subset; the labels keep their
# original one-hot width.
song_3_X_train, song_3_y_train = X_train_loaded[train_3_loc], y_train_loaded[train_3_loc]
song_3_X_test, song_3_y_test = X_test_loaded[test_3_loc], y_test_loaded[test_3_loc]

# Cache the 3-song subset next to the full data set.
with open('./data/train_3.npz', 'wb') as train_file:
    np.savez(train_file, x=song_3_X_train, y=song_3_y_train)

with open('./data/test_3.npz', 'wb') as test_file:
    np.savez(test_file, x=song_3_X_test, y=song_3_y_test)

In [11]:
# Positions of the samples that belong to songs 0, 1, 3, 4 and 7.
# (A membership test is required here; subscripting the scalar class id with the
# list, as `x [0, 1, 3, 4, 7]` did, raises instead of selecting anything.)
train_5_loc = np.where(song_train[0].isin([0, 1, 3, 4, 7]))[0]
test_5_loc = np.where(song_test[0].isin([0, 1, 3, 4, 7]))[0]

# Slice features and labels down to that subset; the labels keep their
# original one-hot width.
song_5_X_train = X_train_loaded[train_5_loc]
song_5_y_train = y_train_loaded[train_5_loc]
song_5_X_test = X_test_loaded[test_5_loc]
song_5_y_test = y_test_loaded[test_5_loc]

# Cache the 5-song subset next to the full data set.
with open('./data/train_5.npz', 'wb') as train_file:
    np.savez(train_file, x = song_5_X_train, y = song_5_y_train)

with open('./data/test_5.npz', 'wb') as test_file:
    np.savez(test_file, x = song_5_X_test, y = song_5_y_test)

In [None]:
# Train, save and evaluate one model per cached label subset (2, 3, 5 and 8 songs).
# One extracted sample is used only to obtain the input shape for the networks.
wave = get_features('../audio/Hum/Frozen_hum/5798.wav')
for i in [2, 3, 5, 8]:

    alex_model = alex(i, wave.shape)

    # Read the cached arrays, then release the file before the long training run.
    with open(f'./data/train_{i}.npz', 'rb') as train_file:
        train = np.load(train_file)
        X_train = train['x']
        y_train = train['y']

    # The cached labels keep their original one-hot width; re-encode them so
    # only the classes present in this subset get a column.
    y_train = labelEncode(y_train)

    # `use_multiprocessing` is intentionally not passed: it only applies to
    # generator / Sequence inputs, and newer Keras versions no longer accept it.
    alex_model.fit(X_train, y_train, epochs = 50, validation_split = 0.2)

    alex_model.save(f'./alex_record/alex_{i}')

    with open(f'./data/test_{i}.npz', 'rb') as test_file:
        test = np.load(test_file)
        X_test = test['x']
        y_test = test['y']

    # Same re-encoding as for the training labels, via the shared helper.
    y_test = labelEncode(y_test)

    print(alex_model.evaluate(X_test, y_test))
                