In [1]:
import librosa
import numpy as np
import os
import speechpy

# Root folder that is walked (including subfolders) for .mp3 inputs.
DATA_DIR = "../data/mp3_samples"

In [2]:
# Get all mp3 files in dir and subdirs.
# Match on the file extension (case-insensitively) rather than on a substring,
# so names such as "take.mp3.bak" are skipped and "TAKE.MP3" is kept.
mp3s = []
for (dirpath, dirnames, filenames) in os.walk(DATA_DIR):
    mp3s += [
        os.path.join(dirpath, file)
        for file in filenames
        if file.lower().endswith(".mp3")
    ]
# os.walk gives no ordering guarantee; sort so that indexing into the list
# selects the same file on every run and every platform.
mp3s.sort()
print(mp3s)

['../data/mp3_samples\\Blue Moon Beat (Demo 12-1-18).mp3', '../data/mp3_samples\\Cadence Beat [Demo 6-2-20].mp3', '../data/mp3_samples\\Concert Hall Beat [Demo 6-2-20].mp3']


In [3]:
# Load mp3
# Fail with a readable message instead of a bare IndexError when the search
# above found nothing.
if not mp3s:
    raise FileNotFoundError("No .mp3 files found under " + DATA_DIR)

i = 0  # index of the clip to analyse
mp3 = mp3s[i]
# Resample to 44.1 kHz on load; x is the decoded signal, sr the resulting rate.
x , sr = librosa.load(mp3, sr=44100)
print(type(x), x.shape)
# ndarray.max() is vectorised; the builtin max() would walk every sample in
# Python, which is very slow for a multi-million-sample clip.
print(x.max(), sr)

<class 'numpy.ndarray'> (10446336,)
0.9999695 44100


In our work, Librosa was used to extract the following features from a given music clip: Mel-scaled Spectrogram, Constant-Q Transform (CQT), Mel-frequency cepstral coefficients (MFCCs), MFCCs delta and Chromagram, as detailed in Table 4. Each kind of feature was extracted at a sampling rate of 44,100 Hz, with a Hamming window size of 2048 samples (≈ 46 ms) and a hop size of 1024 samples (≈ 23 ms). The Mel Spectrogram and CQT features were transformed to log amplitude with $S' = \ln(10 \cdot S + \epsilon)$, where $S$ and $\epsilon$ denote the feature and an extremely small number, respectively. Then Cepstral Mean and
Variance Normalization (CMVN) [29, 35] was applied to the extracted features to minimize distortion caused by noise contamination. Finally, these normalized features were concatenated into a 324-dim feature, which was later used as the input of MusiCoder.

In [4]:
# Show the sample dtype of the decoded audio buffer.
print(x.dtype)

float32


In [5]:
# Extract feats
# Framing: Hamming window of 2048 samples with a hop of 1024 samples.
sr = 44100  # same rate that is passed to librosa.load for this clip
window = 'hamming'
win_length=2048
hop_length=1024

mel = librosa.feature.melspectrogram(y=x, sr=sr, hop_length=hop_length, win_length=win_length, window=window)
# Magnitude of the constant-Q transform. librosa.feature.chroma_cqt would fold
# every octave into pitch classes (a chromagram), which is a different feature
# from the CQT spectrum this variable is meant to hold.
# NOTE(review): 144 bins are laid out as 6 octaves x 24 bins per octave from the
# default fmin; confirm this layout against the reference setup.
cqt = np.abs(
    librosa.cqt(
        y=x,
        sr=sr,
        hop_length=hop_length,
        n_bins=144,
        bins_per_octave=24,
        window=window,
    )
)
mfcc = librosa.feature.mfcc(y=x, sr=sr, hop_length=hop_length, win_length=win_length, window=window)
# First-order differences of the MFCCs along the frame axis.
delta_mfcc = librosa.feature.delta(mfcc)
chroma = librosa.feature.chroma_stft(y=x, sr=sr, hop_length=hop_length, win_length=win_length, window=window)

# Every feature should report the same number of frames in its second axis.
print("mel:", mel.shape)
print("cqt:", cqt.shape)
print("mfcc:", mfcc.shape)
print("delta_mfcc:", delta_mfcc.shape)
print("chroma:", chroma.shape)

  if not j.flags.writeable or j.dtype not in (np.int32, np.int64):
  b = a[a_slice]


mel: (128, 10202)
cqt: (144, 10202)
mfcc: (20, 10202)
delta_mfcc: (20, 10202)
chroma: (12, 10202)


In [8]:
def log_scale(x, epsilon=1e-6):
    """Map a non-negative feature to log amplitude: ``ln(10 * x + epsilon)``.

    Args:
        x: Feature values (NumPy array, scalar, or any array-like).
        epsilon: Small positive offset that keeps the logarithm finite where
            ``x`` is zero. Defaults to ``1e-6``.

    Returns:
        The element-wise natural logarithm of ``10 * x + epsilon``.
    """
    # asarray lets plain lists/tuples through; without it ``10 * x`` would
    # repeat a list ten times instead of scaling its values.
    values = np.asarray(x)
    return np.log(10 * values + epsilon)

In [12]:
def _cmvn_over_time(feat):
    """Mean/variance-normalise each feature row of a (n_features, n_frames) array.

    speechpy's cmvn treats rows as observations (frames) and only subtracts the
    mean unless variance_normalization=True. The features here are laid out as
    (n_features, n_frames), so transpose on the way in and back on the way out.
    """
    return speechpy.processing.cmvn(feat.T, variance_normalization=True).T


# NOTE: this cell overwrites its own inputs, so re-run the feature-extraction
# cell before running it again; otherwise the log scaling is applied twice.
mel = _cmvn_over_time(log_scale(mel))
cqt = _cmvn_over_time(log_scale(cqt))
mfcc = _cmvn_over_time(mfcc)
delta_mfcc = _cmvn_over_time(delta_mfcc)
chroma = _cmvn_over_time(chroma)

# Shapes must be unchanged by the normalisation.
print("mel:", mel.shape)
print("cqt:", cqt.shape)
print("mfcc:", mfcc.shape)
print("delta_mfcc:", delta_mfcc.shape)
print("chroma:", chroma.shape)

mel: (128, 10202)
cqt: (144, 10202)
mfcc: (20, 10202)
delta_mfcc: (20, 10202)
chroma: (12, 10202)


  app.launch_new_instance()


In [13]:
# Check the dtype of the mel features after log scaling and CMVN.
mel.dtype

dtype('float32')