https://github.com/keunwoochoi/keras_cropping_layer/blob/master/cnn_cropping.py

### Import libraries and modify notebook settings

In [16]:
# Import libraries
import os
import sys
import h5py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import librosa
from pysndfx import AudioEffectsChain
from IPython.display import Audio, display

from keras.utils import HDF5Matrix
from keras.utils import np_utils

# Modify notebook settings
%matplotlib inline

### Create paths to data folders and files

In [2]:
# Root of the project, relative to the directory the notebook runs from
proj_root = os.pardir

# Raw metadata table of the dataset: "UrbanSound8K.csv"
metadata_file = os.path.join(
    proj_root, "data", "raw", "UrbanSound8K", "metadata", "UrbanSound8K.csv"
)

# Folder that holds the raw audio, one "fold<N>" subfolder per fold
raw_audio_path = os.path.join(proj_root, "data", "raw", "UrbanSound8K", "audio")

# Subfolder with the raw audio files of fold 1
fold1_path = os.path.join(raw_audio_path, "fold1")

# Folder that will contain the interim data sets for modeling:
# /data/interim
interim_data_dir = os.path.join(proj_root, "data", "interim")

# Folder that will contain the interim trash data sets:
# /data/interim/trash
interim_trash_dir = os.path.join(interim_data_dir, "trash")

# Folder for the spectrogram arrays that we will generate:
# /data/interim/spectrogram_arrays
spectrogram_arrays_path = os.path.join(interim_data_dir, "spectrogram_arrays")


In [3]:
# Add the project's 'src' directory to the module search path so that
# the modules it contains can be imported from this notebook
src_dir = os.path.join(proj_root, "src")
sys.path.append(src_dir)

In [4]:
from utils.format_bytes_size import FormatBytesSize

# ...

In [5]:
# Load the test-split metadata table from the interim data folder;
# the first CSV column is used as the DataFrame index
new_file_name = 'metadata_test.csv'
new_file_path = os.path.join(interim_data_dir, new_file_name)
df_test = pd.read_csv(new_file_path, index_col=0)

# Number of rows (one per audio clip) in the test split
test_len = len(df_test.index)

In [6]:
# Load the train-split metadata table from the interim data folder;
# the first CSV column is used as the DataFrame index
new_file_name = 'metadata_train.csv'
new_file_path = os.path.join(interim_data_dir, new_file_name)
df_train = pd.read_csv(new_file_path, index_col=0)

# Number of rows (one per audio clip) in the train split
train_len = len(df_train.index)

## Process audio files [NEW]

In [7]:
# Sample rate (Hz) passed as `sr` to every librosa call below, so all
# clips are loaded and pitch-shifted at the same rate
global_sr = 22050

In [8]:
# Wider alternative (disabled):
#pitch_shift_list = [None, -3.5, -3.0, -2.5, -2.0, -1.5, -1.0, -0.5, 3.5, 3.0, 2.5, 2.0, 1.5, 1.0, 0.5]

# Values handed to librosa's pitch shift as `n_steps`;
# None means "store the clip unchanged"
pitch_shift_list = [
    None,
    -2.0, -1.5, -1.0, -0.5,
    2.0, 1.5, 1.0, 0.5,
]

pitch_shift_list_len = len(pitch_shift_list)

# Each training clip is stored once per entry of pitch_shift_list
total_tran_len = pitch_shift_list_len * train_len

In [9]:
# Length, in seconds, of one stored clip
seconds = 4

# One HDF5 chunk holds exactly one clip: (1, samples, 1)
chunk_tuple = (1, global_sr * seconds, 1)

# Waveform tensors: (clips, samples, 1)
X_test_shape_tuple = (test_len,) + chunk_tuple[1:]
X_train_shape_tuple = (total_tran_len,) + chunk_tuple[1:]

# Label tensors: (clips, 1)
y_test_shape_tuple = (test_len, 1)
y_train_shape_tuple = (total_tran_len, 1)

# Show every shape, one per line
for label, shape_tuple in (('chunk_tuple:\t\t', chunk_tuple),
                           ('X_test_shape_tuple:\t', X_test_shape_tuple),
                           ('X_train_shape_tuple:\t', X_train_shape_tuple),
                           ('y_test_shape_tuple:\t', y_test_shape_tuple),
                           ('y_train_shape_tuple:\t', y_train_shape_tuple)):
    print(label, shape_tuple)

chunk_tuple:		 (1, 88200, 1)
X_test_shape_tuple:	 (1374, 88200, 1)
X_train_shape_tuple:	 (51660, 88200, 1)
y_test_shape_tuple:	 (1374, 1)
y_train_shape_tuple:	 (51660, 1)


In [10]:
# Name and full path of the HDF5 file that stores the waveform tensors;
# it lives in the interim "trash" folder
hdf5_file_name = 'sample-level-augmented.hdf5'
hdf5_path = os.path.join(interim_trash_dir, hdf5_file_name)
# Bare expression: shows the path as the output of the notebook cell
hdf5_path

'../data/interim/trash/sample-level-augmented.hdf5'

In [11]:
# Create the HDF5 file (mode 'w') with four empty, gzip-compressed datasets
with h5py.File(hdf5_path, 'w') as hdf5_file:

    # Waveform tensors: float32, chunked with chunk_tuple
    for dset_name, dset_shape in (("X_test_dset", X_test_shape_tuple),
                                  ("X_train_dset", X_train_shape_tuple)):
        hdf5_file.create_dataset(dset_name,
                                 shape=dset_shape,
                                 dtype='float32',
                                 chunks=chunk_tuple,
                                 compression="gzip")

    # Label tensors: int8, no explicit chunk shape
    for dset_name, dset_shape in (("y_test_dset", y_test_shape_tuple),
                                  ("y_train_dset", y_train_shape_tuple)):
        hdf5_file.create_dataset(dset_name,
                                 shape=dset_shape,
                                 dtype='int8',
                                 compression="gzip")

In [12]:
# Populate X_test_dset and y_test_dset in hdf5_path

def _fit_to_length(signal, n_samples):
    """Return the 1-D `signal` cut or zero-padded at the end to `n_samples`."""
    fitted = np.zeros(n_samples, dtype=signal.dtype)
    kept = min(len(signal), n_samples)
    fitted[:kept] = signal[:kept]
    return fitted


def _show_progress(done, total):
    """Rewrite the console line with the row count and the HDF5 file size."""
    # Guard the percentage so an empty metadata table does not divide by zero
    percent = 100 * (done / total) if total else 0.0
    sys.stdout.write("\rCount:\t {:,}  of  ".format(done) +
                     "{:,}  ".format(total) +
                     "({:.1f}%) \t\tSpace: ".format(percent) +
                     FormatBytesSize(os.path.getsize(hdf5_path)))
    sys.stdout.flush()
    sys.stdout.write('\r')


# Row of the HDF5 datasets that the next clip is written to
count = 0

for _, row in df_test.iterrows():

    _show_progress(count, y_test_shape_tuple[0])

    # Folder of the fold this clip belongs to
    fold_name = 'fold' + str(row['fold'])
    fold_path = os.path.join(raw_audio_path,
                             fold_name)

    # Full path to the audio_file
    audio_file = row['slice_file_name']
    audio_path = os.path.join(fold_path,
                              audio_file)

    # Load the .wav audio_file at the common sample rate
    aud_array, _ = librosa.load(audio_path, sr=global_sr)

    classID = row['classID']

    # Write to the hdf5 file
    with h5py.File(hdf5_path, "r+") as f:
        # X_test: every row has a fixed number of samples, so longer clips
        # are truncated and shorter ones zero-padded; a shorter array
        # cannot be broadcast into the row and would make the write fail
        x_dset = f['X_test_dset']
        clip = _fit_to_length(aud_array, x_dset.shape[1])
        x_dset[count, :, :] = clip[:, np.newaxis]

        # y_test
        f['y_test_dset'][count, :] = classID

    count += 1

# Final progress line, after the last clip has been written
_show_progress(count, y_test_shape_tuple[0])

Count:	 1,374  of  1,374  (100.0%) 		Space: 429.28 MB

In [13]:
# Populate X_train_dset and y_train_dset in hdf5_path

def _fit_to_length(signal, n_samples):
    """Return the 1-D `signal` cut or zero-padded at the end to `n_samples`."""
    fitted = np.zeros(n_samples, dtype=signal.dtype)
    kept = min(len(signal), n_samples)
    fitted[:kept] = signal[:kept]
    return fitted


def _show_progress(done, total):
    """Rewrite the console line with the row count and the HDF5 file size."""
    # Guard the percentage so an empty metadata table does not divide by zero
    percent = 100 * (done / total) if total else 0.0
    sys.stdout.write("\rCount:\t {:,}  of  ".format(done) +
                     "{:,}  ".format(total) +
                     "({:.1f}%) \t\tSpace: ".format(percent) +
                     FormatBytesSize(os.path.getsize(hdf5_path)))
    sys.stdout.flush()
    sys.stdout.write('\r')


# Row of the HDF5 datasets that the next (augmented) clip is written to
count = 0

for _, row in df_train.iterrows():

    _show_progress(count, y_train_shape_tuple[0])

    # Folder of the fold this clip belongs to
    fold_name = 'fold' + str(row['fold'])
    fold_path = os.path.join(raw_audio_path,
                             fold_name)

    # Full path to the audio_file
    audio_file = row['slice_file_name']
    audio_path = os.path.join(fold_path,
                              audio_file)

    # Load the .wav audio_file at the common sample rate
    aud_array, _ = librosa.load(audio_path, sr=global_sr)

    classID = row['classID']

    # Open the hdf5 file once per source clip and write all of its
    # augmented versions, instead of reopening it for every row
    with h5py.File(hdf5_path, "r+") as f:
        x_dset = f['X_train_dset']
        y_dset = f['y_train_dset']

        for ps in pitch_shift_list:

            # Pitch shift (None keeps the clip as loaded). `sr` and
            # `n_steps` are passed by keyword: recent librosa releases
            # reject them as positional arguments
            if ps is None:
                aud_array_aug = aud_array
            else:
                aud_array_aug = librosa.effects.pitch_shift(aud_array,
                                                            sr=global_sr,
                                                            n_steps=ps)

            # X_train: every row has a fixed number of samples, so longer
            # clips are truncated and shorter ones zero-padded; a shorter
            # array cannot be broadcast into the row and would make the
            # write fail
            clip = _fit_to_length(aud_array_aug, x_dset.shape[1])
            x_dset[count, :, :] = clip[:, np.newaxis]

            # y_train: every augmented version keeps the clip's label
            y_dset[count, :] = classID

            count += 1

# Final progress line, after the last clip has been written
_show_progress(count, y_train_shape_tuple[0])

Count:	 51,660  of  51,660  (100.0%) 		Space: 16.16 GB

# Create objects for X_train, y_train, X_test, & y_test

In [14]:
# Wrap each dataset of the HDF5 file in a Keras HDF5Matrix,
# in the order X_train, y_train, X_test, y_test
X_train, y_train, X_test, y_test = (
    HDF5Matrix(hdf5_path, dset_name)
    for dset_name in ('X_train_dset',
                      'y_train_dset',
                      'X_test_dset',
                      'y_test_dset')
)

In [17]:
# 6. Preprocess class labels: turn the integer class IDs of both splits
# into categorical (one column per class) matrices
Y_train, Y_test = (np_utils.to_categorical(labels)
                   for labels in (y_train, y_test))

In [18]:
# Sanity check: show the shape of every input, label and one-hot tensor
for tensor in (X_train, X_test, y_train, y_test, Y_train, Y_test):
    print(tensor.shape)

(51660, 88200, 1)
(1374, 88200, 1)
(51660, 1)
(1374, 1)
(51660, 10)
(1374, 10)


# Model

# ... Have ResNext1D infer classes=Y_train.shape[1]

# make targets fuzzy