In [12]:
import os
import numpy as np
import pandas as pd
import scipy
import librosa
import shutil
import keras


import keras.backend as K
import tensorflow as tf
from scipy import signal


from keras import regularizers, optimizers
from keras.layers import *
from keras.models import *
from keras.applications import *
from keras.utils import *
from keras.callbacks import *
from sklearn.model_selection import *
# from sklearn.cross_validation import StratifiedKFold
os.environ['CUDA_VISIBLE_DEVICES']=''
import tensorflow as tf
import keras.backend.tensorflow_backend as KTF

# Let TensorFlow grow its GPU memory allocation on demand instead of
# reserving all device memory up front.
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))

# Register the session with the Keras TensorFlow backend.
session = tf.Session(config=config)
KTF.set_session(session)

In [13]:
# Locations of the audio clips and of the metadata CSV files.
DATA_DIR = '../audio-data/'
train_path = DATA_DIR + 'audio_train/'
test_path = DATA_DIR + 'audio_test/'

# The cell body used to be pasted twice, so both CSVs were read and indexed
# twice and the preview was printed twice; one pass yields the same state.
train = pd.read_csv(DATA_DIR + 'train.csv')
test = pd.read_csv(DATA_DIR + 'sample_submission.csv')
print(train.head())

# Map each distinct label (in order of first appearance) to an integer id.
LABELS = list(train.label.unique())
label_idx = {label: i for i, label in enumerate(LABELS)}

# Index both frames by file name; prepare_data() iterates over df.index.
train = train.set_index('fname')
test = test.set_index('fname')
train['label_idx'] = train.label.map(label_idx)

          fname         label  manually_verified
0  00044347.wav        Hi-hat                  0
1  001ca53d.wav     Saxophone                  1
2  002d256b.wav       Trumpet                  0
3  0033e230.wav  Glockenspiel                  1
4  00353774.wav         Cello                  1
          fname         label  manually_verified
0  00044347.wav        Hi-hat                  0
1  001ca53d.wav     Saxophone                  1
2  002d256b.wav       Trumpet                  0
3  0033e230.wav  Glockenspiel                  1
4  00353774.wav         Cello                  1


In [14]:
class Config(object):
    """Hyper-parameters and the derived input shape for the audio pipeline.

    The feature flags are checked in the order ``use_mfcc``, ``use_log_sp``,
    then ``use_log_mel_sp`` / ``use_cqt``; the first one that is true decides
    ``dim``. With no flag set, ``dim`` describes the raw waveform.

    Attributes set here:
        audio_length: number of samples per clip
            (``sampling_rate * audio_duration``, as an int).
        dim: per-example input shape without the batch axis.
        All constructor arguments are stored under their own names.
    """

    def __init__(self,
                 sampling_rate=44100, audio_duration=2, n_classes=41,
                 use_mfcc=False, n_folds=10, learning_rate=0.0001,
                 max_epochs=50, n_mfcc=20, use_log_sp=False,
                 use_mixup=False, alpha=0.2, use_log_mel_sp=False, use_cqt=False):
        self.sampling_rate = sampling_rate
        self.audio_duration = audio_duration
        self.n_classes = n_classes
        self.use_log_sp = use_log_sp
        self.use_mfcc = use_mfcc
        self.use_mixup = use_mixup
        self.use_log_mel_sp = use_log_mel_sp
        self.use_cqt = use_cqt
        self.alpha = alpha
        self.n_mfcc = n_mfcc
        self.n_folds = n_folds
        self.learning_rate = learning_rate
        self.max_epochs = max_epochs

        # Always an int so it can be used for slicing and array shapes, even
        # when audio_duration is fractional (e.g. 2.5 seconds).
        self.audio_length = int(round(self.sampling_rate * self.audio_duration))

        # Frame count used for the MFCC / log-mel / CQT shapes:
        # one frame per 512 samples, plus one.
        n_frames = 1 + int(np.floor(self.audio_length / 512))

        if self.use_mfcc:
            self.dim = (self.n_mfcc, n_frames, 1)
        elif self.use_log_sp:
            # (time steps, frequency bins, channel). The time-step count is
            # derived from the integer audio_length; for whole-second
            # durations it equals audio_duration * 100 - 1.
            time_steps = self.audio_length * 100 // self.sampling_rate - 1
            self.dim = (time_steps, self.sampling_rate // 100 + 1, 1)
        elif self.use_log_mel_sp or self.use_cqt:
            self.dim = (self.n_mfcc, n_frames, 1)
        else:
            # Raw waveform: (samples, channel).
            self.dim = (self.audio_length, 1)
            





In [15]:
def log_specgram(audio, sample_rate, window_size=20,
                 step_size=10, eps=1e-10):
    """Compute a log-power spectrogram of a 1-D signal.

    Args:
        audio: 1-D array of samples.
        sample_rate: sampling rate of ``audio`` in Hz.
        window_size: analysis window length in milliseconds.
        step_size: hop between consecutive windows in milliseconds.
        eps: small constant added before the logarithm to avoid ``log(0)``.

    Returns:
        ``(freqs, times, log_spec)`` where ``log_spec`` is float32 with shape
        ``(len(times), len(freqs))``. With the default 20 ms / 10 ms settings
        and a sample rate that is a multiple of 100, that is
        ``(seconds * 100 - 1, sample_rate / 100 + 1)``.

    Raises:
        ValueError: if the step is not positive or is longer than the window.
    """
    nperseg = int(round(window_size * sample_rate / 1e3))
    hop = int(round(step_size * sample_rate / 1e3))
    if not 0 < hop <= nperseg:
        raise ValueError(
            "step_size must be positive and no larger than window_size "
            "(got window_size=%r, step_size=%r)" % (window_size, step_size))
    # scipy expects the overlap between windows, not the hop. The step used
    # to be passed as the overlap directly, which is only correct when the
    # step is exactly half the window.
    noverlap = nperseg - hop
    freqs, times, spec = signal.spectrogram(audio,
                                            fs=sample_rate,
                                            window='hann',
                                            nperseg=nperseg,
                                            noverlap=noverlap,
                                            detrend=False)
    # Transpose so that time is the first axis.
    return freqs, times, np.log(spec.T.astype(np.float32) + eps)

def audio_norm(data):
    """Min-max scale ``data`` and shift it to the range [-0.5, 0.5).

    A small constant (1e-6) is added to the value span so that constant
    input does not divide by zero; such input maps to -0.5 everywhere.
    """
    lowest = np.min(data)
    span = np.max(data) - lowest
    return (data - lowest) / (span + 1e-6) - 0.5

In [18]:
%%time
def prepare_data(df, config, data_dir, dtype=np.float64):
    """Load every clip listed in ``df.index`` and turn it into model input.

    Each file is loaded at ``config.sampling_rate``, cropped or zero-padded to
    ``config.audio_length`` samples, converted to the feature selected by the
    ``config.use_*`` flags (checked in the order mfcc, log_sp, log_mel_sp,
    cqt; raw waveform if none is set) and given a trailing channel axis.

    Args:
        df: DataFrame whose index holds the audio file names.
        config: object providing ``dim``, ``audio_length``, ``sampling_rate``,
            ``audio_duration``, ``n_mfcc`` and the ``use_*`` flags.
        data_dir: directory that contains the audio files.
        dtype: dtype of the returned array. The default (float64) matches the
            previous behaviour; ``np.float32`` halves the memory footprint.

    Returns:
        Array of shape ``(len(df),) + config.dim``.

    Note:
        Crop and padding offsets come from the unseeded global NumPy RNG, so
        results differ between runs unless ``np.random.seed`` is set first.
    """
    input_length = config.audio_length
    n_items = df.shape[0]
    # Allocate exactly config.dim per example. The previous hard-coded
    # (dim[0], dim[1], 1) layout produced a 4-D buffer in raw-waveform mode
    # that the 1-D signal could not be assigned into.
    X = np.empty(shape=(n_items,) + tuple(config.dim), dtype=dtype)

    for i, fname in enumerate(df.index):
        # os.path.join tolerates a data_dir given without a trailing slash.
        file_path = os.path.join(data_dir, str(fname))
        data, _ = librosa.core.load(file_path, sr=config.sampling_rate,
                                    duration=config.audio_duration,
                                    res_type="kaiser_fast")

        # Random crop / random zero-padding to exactly input_length samples.
        # NOTE(review): load() is already limited to config.audio_duration
        # seconds, so the crop branch presumably never triggers - confirm.
        if len(data) > input_length:
            offset = np.random.randint(len(data) - input_length)
            data = data[offset:offset + input_length]
        elif len(data) < input_length:
            pad_total = input_length - len(data)
            offset = np.random.randint(pad_total)
            data = np.pad(data, (offset, pad_total - offset), "constant")

        # The signal is passed as y= because newer librosa releases no longer
        # accept it positionally; the keyword form works on old ones too.
        if config.use_mfcc:
            data = librosa.feature.mfcc(y=data, sr=config.sampling_rate,
                                        n_mfcc=config.n_mfcc)
        elif config.use_log_sp:
            _, _, data = log_specgram(data, config.sampling_rate)
        elif config.use_log_mel_sp:
            mel_spec = librosa.feature.melspectrogram(y=data, sr=config.sampling_rate,
                                                      n_mels=config.n_mfcc)
            data = librosa.core.power_to_db(mel_spec)
        elif config.use_cqt:
            data = librosa.feature.chroma_cqt(y=data, sr=config.sampling_rate,
                                              n_chroma=config.n_mfcc)

        # Every representation, including the raw waveform, gets a trailing
        # channel axis so that it matches config.dim.
        X[i] = np.expand_dims(data, axis=-1)

        done = i + 1
        if done % 1000 == 0:
            print('%d/%d' % (done, n_items))
    return X

# Log-spectrogram features from 5-second clips at 44.1 kHz.
config = Config(use_log_sp=True, sampling_rate=44100, audio_duration=5, n_classes=41, n_mfcc=200)
print(config.dim)
X_train = prepare_data(train, config, '../audio-data/audio_train/')
X_test = prepare_data(test, config, '../audio-data/audio_test/')

# One-hot encode the integer labels. This assignment used to be commented
# out, so the final print only worked when y_train was left over from an
# earlier kernel session and raised NameError after Restart & Run All.
y_train = to_categorical(train.label_idx, num_classes=config.n_classes)

# Persist the features: extraction takes several minutes (see the timing
# output of this cell), and the arrays can be reloaded later with np.load().
np.save("X_44100x5_ls_train.npy", X_train)
np.save("X_44100x5_ls_test.npy", X_test)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)




(499, 442, 1)
1000/9473
2000/9473
3000/9473
4000/9473
5000/9473
6000/9473
7000/9473
8000/9473
9000/9473
1000/9400
2000/9400
3000/9400
4000/9400
5000/9400
6000/9400
7000/9400
8000/9400
9000/9400
(9473, 499, 442, 1)
(9400, 499, 442, 1)
(9473, 41)
CPU times: user 7min 8s, sys: 54.6 s, total: 8min 3s
Wall time: 14min 58s
