In [None]:
# 0. import packages

import sys
import string

import time
import numpy as np
import pandas as pd

import matplotlib.pyplot as plot

In [None]:
# 1.1 GPU setup (currently disabled; `torch` must be imported before re-enabling this cell)

# print ("cuda: ", torch.cuda.is_available())
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# print ("current device: ", device)
# print ("count: ", torch.cuda.device_count())

# if torch.cuda.is_available():
#     print ("device name: ", torch.cuda.get_device_name(0))
#     torch.cuda.set_device(0)

In [None]:
# 1.2 load groove dataset
import math

groove_csv = pd.read_csv('groove/info.csv')
print("groove dataset:", len(groove_csv))

# get train, test, and validation sets
# Each bucket is a plain list of row Series, so later cells can keep using
# attribute access (row.audio_filename, row.midi_filename, row.id).
train_csv = []
test_csv = []
validation_csv = []
split_buckets = {
    "train": train_csv,
    "test": test_csv,
    "validation": validation_csv,
}

for index, row in groove_csv.iterrows():
    # A missing audio_filename (NaN) stringifies to "nan"; such rows have no
    # audio to extract features from, so they are skipped.
    if str(row.audio_filename).lower() == "nan":
        continue
    split = row['split']
    # Rows whose split label is not one of the three known names are ignored.
    bucket = split_buckets.get(split)
    if bucket is not None:
        bucket.append(row)

print ("train: ", len(train_csv))
print ("test: ", len(test_csv))
print ("validation: ", len(validation_csv))

# Sanity check: show the MIDI path of the first training row. Guarded so that
# an empty training split does not abort the cell with an IndexError.
if train_csv:
    print (train_csv[0].midi_filename)

In [None]:
# convert audio files into tensors
from scipy import signal
import audiosegment
import librosa
import torch
import shutil
import os

def audio_to_melspec_tensor(wav_file_path, sample_rate=44_100):
    """Load an audio file and return its log-mel spectrogram as a torch tensor.

    The signal is mean-centred and pre-emphasised, transformed with a
    Hamming-windowed STFT, projected onto a 128-band mel filter bank and
    converted to decibels relative to the spectrogram's peak.

    Args:
        wav_file_path: path of the audio file, passed to ``librosa.load``.
        sample_rate: rate (Hz) the audio is loaded at; the FFT size and hop
            length are derived from it (25 ms frames, 10 ms hop).

    Returns:
        torch.Tensor built from the transposed log-mel spectrogram, i.e. one
        row per STFT frame and one column per mel band.

    Raises:
        ValueError: if the file contains no samples.
    """
    window_size = 0.025    # analysis frame length, in seconds
    window_stride = 0.01   # hop between frames, in seconds
    pre_emphasis = 0.97    # coefficient of the first-order pre-emphasis filter
    n_mels = 128
    n_dft = int(sample_rate * window_size)
    hop_length = int(sample_rate * window_stride)
    # librosa requires win_length <= n_fft. 1024 fits the default 44.1 kHz
    # rate (n_fft = 1102); clamp it so lower sample rates do not raise.
    win_length = min(1024, n_dft)

    # load in wav file
    y, _ = librosa.load(wav_file_path, sr=sample_rate)
    if y.size == 0:
        raise ValueError("audio file contains no samples: " + str(wav_file_path))
    # remove the mean of the signal, then apply pre-emphasis (first sample kept as-is)
    y = y - y.mean()
    y = np.append(y[0], y[1:] - pre_emphasis * y[:-1])

    # compute mel spectrogram
    # signal.windows.hamming is the same symmetric window the old
    # scipy.signal.hamming alias produced; the alias no longer exists in
    # recent SciPy releases.
    stft = librosa.stft(
        y,
        n_fft=n_dft,
        hop_length=hop_length,
        win_length=win_length,
        window=signal.windows.hamming,
    )
    spec = np.abs(stft) ** 2
    mel_basis = librosa.filters.mel(sr=sample_rate, n_fft=n_dft, n_mels=n_mels, fmin=20)
    melspec = np.dot(mel_basis, spec)
    logspec = librosa.power_to_db(melspec, ref=np.max)
    # put time on the first axis
    logspec = np.transpose(logspec)
    # plot.imshow(logspec.T, origin='lower', aspect='auto')
    # plot.show()

    # turn into tensor
    return torch.tensor(logspec)

def get_feats_and_labels_from_csv(csv_index):
    """Turn one dataset row into a list of log-mel feature tensors.

    The row's audio file is loaded, normalised to 16-bit mono, cut into
    12-second segments, and each segment is written to ``temp/`` and converted
    with ``audio_to_melspec_tensor``.

    Despite the name, only features are returned: label extraction from
    ``csv_index.midi_filename`` is not implemented yet.

    Args:
        csv_index: a dataset row exposing ``audio_filename`` (path relative to
            ``groove/``) and ``id`` (used to name the temporary segment files).

    Returns:
        list of tensors, one per 12-second segment, in playback order.
    """
    # load in wav file
    audio_file_path = "groove/" + csv_index.audio_filename
    wav_file = audiosegment.from_file(audio_file_path)
    # NOTE: both conversions below write the converted audio back over the
    # source file in the dataset folder.
    # convert sample width if not set to 2 (16 bits)
    if wav_file.sample_width != 2:
        wav_file = wav_file.set_sample_width(2)
        # print("\tnew sample_width: ", wav_file.sample_width)
        wav_file.export(audio_file_path, format="wav")
    # convert file from stereo to mono if channels > 1
    if wav_file.channels != 1:
        wav_file = wav_file.set_channels(1)
        wav_file.export(audio_file_path, format="wav")

    # cutting and padding
    predefined_length = 12  # segment length in seconds
    # Use the file's own frame rate everywhere: the sample count per segment,
    # the rebuilt segment and the spectrogram must all agree on it.
    frame_rate = wav_file.frame_rate
    target_len = predefined_length * frame_rate
    diced_wav_files = wav_file.dice(predefined_length, zero_pad=True)

    # the segment files are written here, so make sure the folder exists
    os.makedirs("temp", exist_ok=True)
    file_stem = str(csv_index.id).replace('/', '-')

    # get feature tensors
    feats_tensors_list = []
    for i, diced_file in enumerate(diced_wav_files):
        diced_array = diced_file.to_numpy_array()
        diced_file_len = len(diced_array)
        if diced_file_len < target_len:
            # pad with zeros if too short
            diced_array = np.pad(diced_array, (0, target_len - diced_file_len))
            diced_file = audiosegment.from_numpy_array(diced_array, framerate=frame_rate)
        elif diced_file_len > target_len:
            # trim if too long (a negative pad width would make np.pad raise)
            diced_file = audiosegment.from_numpy_array(diced_array[:target_len], framerate=frame_rate)
        # export temp wav file and convert to tensor
        diced_file_path = "temp/" + file_stem + "-" + str(i) + ".wav"
        diced_file.export(diced_file_path, format="wav")
        feats_tensors_list.append(audio_to_melspec_tensor(diced_file_path, frame_rate))

    # TODO: build the label tensor from csv_index.midi_filename
    # return tensors
    return feats_tensors_list #, label_tensor

# reset temp folder
# os.path.isfile() is False for a directory, so the previous check never
# fired; remove any leftover folder unconditionally and recreate it empty.
shutil.rmtree('temp/', ignore_errors=True)
os.makedirs('temp/', exist_ok=True)

# Extract features for every training row and report each segment's shape;
# `i` is the position of the row within the training split.
for i, index in enumerate(train_csv):
    feats_tensors = get_feats_and_labels_from_csv(index) # , label_tensor
    for tensor in feats_tensors:
        print (i, "\ttensor: ", tensor.shape)