### Preparing data for the classification model.

A very small subset of violin and guitar samples from the Philharmonia instrument data set is used for this example.
Data generators will be explored in the future for more efficient loading.

Dataset source: [Philharmonia Data Set](https://philharmonia.co.uk/resources/sound-samples/)

In [1]:
import os
import pickle
from typing import List

import numpy as np
from librosa import load
from librosa.util import fix_length

from utils import constants as consts
from utils.audio_tools import create_audio_player
from utils.helpers import Data

# Root folder of the small data set. Contains two classes with 100 samples each.
PATH_TO_SET: str = '/data-sets/small_set'

# Class folder names. The position of a name in its list is the integer label
# assigned to every clip loaded from that folder:
#   guitar = 0
#   violin = 1
CLASSES: List[str] = [
    'guitar',
    'violin',
]
PATH_TO_TEST_CLASSES: List[str] = [
    'guitar_test',
    'violin_test',
]

Load each sample as a 1-second clip, so that every clip contains exactly as many audio samples as the sample rate.
For the time being, onset detection or other methods are __not__ being used for cutting up the audio files.

In [4]:
def load_data(path_to_set: str, classes: List[str]) -> List[Data]:
    """
    Load the first second of every audio file found in each class folder.

    Every file is decoded as mono at ``consts.SAMPLE_RATE`` and limited to one second,
    then padded so that each clip holds exactly ``consts.SAMPLE_RATE`` samples.

    :param path_to_set: Directory that contains one sub-folder per class.
    :param classes: Sub-folder names to read. The index of a name in this list is used
        as the integer label of every clip loaded from that folder.
    :return: One ``Data`` item per file, grouped by class in the order given by ``classes``.
        Within a class the order is whatever ``os.listdir`` returns, which is not guaranteed.
    """

    samples: List[Data] = []  # Little data structure for holding raw audio data and its label.

    for label, _class in enumerate(classes):
        class_dir: str = os.path.join(path_to_set, _class)

        for file in os.listdir(class_dir):
            # librosa.load returns (audio, sample_rate); only the audio is needed here.
            sample, _ = load(
                os.path.join(class_dir, file),
                duration=1.0,
                mono=True,
                sr=consts.SAMPLE_RATE
            )

            # Pad audio with 0's if it's less than a second.
            # `size` is passed by keyword because librosa >= 0.10 made it keyword-only;
            # the keyword form is also accepted by older releases.
            sample = fix_length(sample, size=consts.SAMPLE_RATE)
            samples.append(Data(sample, label))

    return samples


# Training and test clips share the same root folder but live in different class sub-folders.
train_data: List[Data] = load_data(path_to_set=PATH_TO_SET, classes=CLASSES)
test_data: List[Data] = load_data(path_to_set=PATH_TO_SET, classes=PATH_TO_TEST_CLASSES)

# Quick sanity check: print the label of the first test clip and render a player for it.
first_test_clip: Data = test_data[0]
print(f'Label: {CLASSES[first_test_clip.label]}')
create_audio_player(first_test_clip.raw_audio)  # Sample is a pinch harmonic.

Label: guitar


#### Serialize data with pickle.

In [3]:
# Storing to a folder that is a docker volume (not in git repo).
# Each data set is written to its own pickle file, one after the other.
for pickle_path, data_set in (
    ('../pickled-data/train_data.pickle', train_data),
    ('../pickled-data/test_data.pickle', test_data),
):
    with open(pickle_path, 'wb') as file_handler:
        pickle.dump(data_set, file_handler)