In [2]:
import os 
from scipy.io import wavfile
import pandas as pd 
import numpy as np
from tqdm import tqdm
from python_speech_features import mfcc, logfbank
from tensorflow.keras.utils import to_categorical
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras import Sequential
from tensorflow.keras.layers import  Conv2D, LSTM, MaxPool2D, Dropout, Dense, Flatten, TimeDistributed

In [3]:
# Load the clip metadata table and key its rows by audio file name.
df = pd.read_csv('instruments.csv').set_index('fname')

In [4]:
# Annotate every clip with its duration in seconds
# (number of audio samples divided by the sampling rate).
for f in df.index:
    rate, signal = wavfile.read('clean/' + f)
    df.at[f, 'length'] = len(signal) / rate

# Sorted list of the distinct instrument labels found in the metadata.
classes = list(np.unique(df['label']))

# Mean clip duration (seconds) for each instrument label.
class_dist = df.groupby('label')['length'].mean()

In [5]:
# Training examples are 0.1-second windows drawn at random from the clips.
# The number of windows to draw is twice the number of 0.1 s slices that fit
# in the total duration of the data set.
total_seconds = df['length'].sum()
n_samples = 2 * int(total_seconds / 0.1)

# Sampling probability per instrument, proportional to that instrument's
# mean clip duration.
prob_dist = class_dist.div(class_dist.sum())

# One example draw of an instrument label according to prob_dist.
choices = np.random.choice(class_dist.index, p=prob_dist)

In [6]:
# Single place to tune the feature-extraction settings used by the notebook.
class Config:
    """Bundle of feature-extraction and model-selection settings.

    Attributes:
        mode:  model family selector (default 'conv').
        nfilt: value forwarded as ``nfilt`` to the MFCC computation.
        nfft:  value forwarded as ``nfft`` to the MFCC computation.
        nfeat: value forwarded as ``numcep`` to the MFCC computation.
        rate:  nominal sampling rate in Hz.
        step:  window length in audio samples, one tenth of ``rate``
               truncated to an integer.
    """

    def __init__(self, mode='conv', nfilt=26, nfeat=13, nfft=512, rate=16000):
        self.mode = mode
        self.nfilt, self.nfft, self.nfeat = nfilt, nfft, nfeat
        self.rate = rate
        # A tenth of a second worth of samples at the nominal rate.
        self.step = int(rate / 10)

In [7]:
def build_rand_feat():
    """Build a randomly sampled, min-max normalised MFCC training set.

    Draws ``n_samples`` windows of ``config.step`` audio samples. For every
    window an instrument label is drawn according to ``prob_dist``, a file
    carrying that label is picked uniformly from ``df``, and a random window
    of that file (read from ``clean/``) is converted to MFCC features.

    Relies on the notebook-level names ``n_samples``, ``class_dist``,
    ``prob_dist``, ``df``, ``classes`` and ``config``.

    Returns:
        X: feature array scaled to [0, 1] using the global minimum and
           maximum over all drawn windows. In 'conv' mode each sample is the
           transposed MFCC matrix with a trailing channel axis of size 1;
           otherwise each sample is the MFCC matrix as returned by ``mfcc``.
        y: one-hot label matrix with 10 columns, indexed by the position of
           the label in ``classes``.

    Raises:
        ValueError: if a selected clip holds fewer than ``config.step``
            samples, so no full window can be cut from it.
    """
    X = []
    y = []
    _min, _max = float('inf'), -float('inf')

    # The file list of each instrument does not change between draws, so it
    # is computed once instead of filtering the DataFrame on every iteration.
    files_by_class = {label: df[df.label == label].index
                      for label in class_dist.index}

    for _ in tqdm(range(n_samples)):
        # Pick an instrument according to the sampling distribution ...
        rand_class = np.random.choice(class_dist.index, p=prob_dist)
        # ... then one of its files uniformly at random, and load it.
        file = np.random.choice(files_by_class[rand_class])
        rate, wav = wavfile.read('clean/'+file)

        # Valid window starts are 0 .. len(wav) - step inclusive. randint's
        # upper bound is exclusive, hence the +1; without it the last window
        # is never drawn and a clip of exactly `step` samples raises.
        n_starts = wav.shape[0] - config.step + 1
        if n_starts < 1:
            raise ValueError(
                "clip '{}' has {} samples, fewer than the window length {}".format(
                    file, wav.shape[0], config.step))
        rand_index = np.random.randint(0, n_starts)
        sample = wav[rand_index:rand_index+config.step]

        # MFCCs of the window, transposed so features run along the first axis.
        X_sample = mfcc(sample, rate, numcep=config.nfeat, nfilt=config.nfilt, nfft=config.nfft).T
        # Track the global extrema for the normalisation below.
        _min = min(np.amin(X_sample), _min)
        _max = max(np.amax(X_sample), _max)
        X.append(X_sample if config.mode == 'conv' else X_sample.T)
        # The file was selected by its label, so the label is rand_class;
        # store its integer class id.
        y.append(classes.index(rand_class))

    X, y = np.array(X), np.array(y)
    # Min-max scale with the extrema observed over every drawn window.
    X = (X - _min)/(_max - _min)
    if config.mode == 'conv':
        # Conv2D expects an explicit channel axis.
        X = X.reshape(X.shape[0], X.shape[1], X.shape[2], 1)
    # In 'time' mode X already has three axes, so no reshape is needed.

    # The width is fixed at 10 to match the Dense(10) output layer of the models.
    y = to_categorical(y, num_classes=10)
    return X, y

In [8]:
# Select the convolutional pipeline ('conv'); the other supported value is
# 'time', which selects the recurrent model. All other settings keep the
# Config defaults.
config = Config(mode='conv')

In [9]:
def get_conv_model():
    """Assemble, summarise and compile the 2-D convolutional classifier.

    Four 3x3, stride-1, same-padded ReLU convolutions (16, 32, 64 and 128
    filters) are followed by 2x2 max-pooling, 50% dropout, and a dense head
    of 128 and 64 ReLU units ending in a 10-way softmax. The input shape is
    read from the notebook-level ``input_shape``.

    Returns:
        The compiled model (categorical cross-entropy loss, Adam optimizer,
        accuracy metric).
    """
    conv_opts = dict(activation='relu', strides=(1, 1), padding='same')

    # Only the first layer carries the input shape.
    stack = [Conv2D(16, (3, 3), input_shape=input_shape, **conv_opts)]
    stack.extend(Conv2D(n_filters, (3, 3), **conv_opts) for n_filters in (32, 64, 128))
    stack.extend([
        MaxPool2D((2, 2)),
        Dropout(0.5),
        Flatten(),
        Dense(128, activation='relu'),
        Dense(64, activation='relu'),
        Dense(10, activation='softmax'),
    ])

    model = Sequential(stack)
    model.summary()
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    return model

In [10]:
def get_recurrent_model():
    """Assemble, summarise and compile the LSTM-based classifier.

    Two sequence-returning LSTM layers of 128 units and 50% dropout feed a
    stack of time-distributed ReLU dense layers (64, 32, 16, 8 units); the
    result is flattened into a 10-way softmax. The input shape is read from
    the notebook-level ``input_shape``.

    Returns:
        The compiled model (categorical cross-entropy loss, Adam optimizer,
        accuracy metric).
    """
    # Input batches for the recurrent layers are shaped (n, time, feat).
    stack = [
        LSTM(128, return_sequences=True, input_shape=input_shape),
        LSTM(128, return_sequences=True),
        Dropout(0.5),
    ]
    # The same dense layer is applied at every time step, narrowing each time.
    stack += [TimeDistributed(Dense(units, activation='relu')) for units in (64, 32, 16, 8)]
    stack += [Flatten(), Dense(10, activation='softmax')]

    model = Sequential(stack)
    model.summary()
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    return model

In [11]:
# Build the data set, then create the network that matches config.mode.
# Any other mode value leaves X, y, y_flat, input_shape and model untouched.
if config.mode in ('conv', 'time'):
    X, y = build_rand_feat()
    # Integer class id of every sample, recovered from the one-hot rows.
    y_flat = y.argmax(axis=1)

    if config.mode == 'conv':
        # Conv2D input: the two feature axes plus a single channel.
        input_shape = (*X.shape[1:3], 1)
        model = get_conv_model()
    else:
        # LSTM input: the two feature axes only.
        input_shape = tuple(X.shape[1:3])
        model = get_recurrent_model()

100%|███████████████████████████████████████████████████████████████████████████| 26410/26410 [01:29<00:00, 293.50it/s]


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 13, 9, 16)         160       
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 13, 9, 32)         4640      
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 13, 9, 64)         18496     
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 13, 9, 128)        73856     
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 6, 4, 128)         0         
_________________________________________________________________
dropout (Dropout)            (None, 6, 4, 128)         0         
_________________________________________________________________
flatten (Flatten)            (None, 3072)              0

In [12]:
# Weight each class inversely to its frequency in the drawn samples, so that
# over-represented instruments do not dominate the gradient updates.
# `classes` and `y` are passed by keyword: recent scikit-learn releases
# reject them as positional arguments.
label_ids = np.unique(y_flat)
class_weight = compute_class_weight('balanced', classes=label_ids, y=y_flat)

# Keras documents `class_weight` as a dict mapping class index -> weight, so
# the weight array is converted before being handed to fit().
# NOTE(review): newer Keras versions may also require the keys to cover
# 0..n_classes-1 without gaps -- confirm every class is present in y_flat.
class_weight_by_id = {int(label_id): float(weight)
                      for label_id, weight in zip(label_ids, class_weight)}

model.fit(X, y, epochs=10, batch_size=32, shuffle=True, class_weight=class_weight_by_id)

Train on 26410 samples
Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x192a8d0a438>