In [1]:
import pandas as pd
import os
import torch
import librosa
import librosa.display
import pywt
import csv
import pickle
import keras

import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.layers import *
from keras.callbacks import EarlyStopping

In [2]:
FOLDER_PATH = './dataset/fr/clips/'
FEATURES_FILE = 'mfcc_features.pkl'

In [3]:
def padding(data, axis, length):
    pad_width = [(0, 0)] * len(data.shape)
    pad_width[axis] = (0, max(0, length - data.shape[axis]))
    padded_data = np.pad(data, pad_width, mode='constant', constant_values=0)
    return padded_data

def get_features(df_in, max_len):
    features = []
    labels = []

    for index in range(0, len(df_in)):
        filename = FOLDER_PATH + df_in.path[index]
        label = df_in.sentence[index]

        # load the file
        y, sr = librosa.load(filename)
        
        # Trim the audio file to remove leading/trailing silence
        y, _ = librosa.effects.trim(y)

        # Compute MFCC
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)  # n_mfcc: nombre de coefficients MFCC à conserver
        
        print(mfccs.shape)
        
        # Pad/truncate MFCCs
        if (mfccs.shape[1] < max_len):
            pad_width = max_len - mfccs.shape[1]
            mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
        else:
            mfccs = mfccs[:, :max_len]
        
        features.append(mfccs.T)
        labels.append(label)

    return (features, labels)

def preprocess_data(X, y):
    print(np.mean(X))
    print(np.std(X))
    
    X = np.array((X - np.mean(X)) / np.std(X))
    y = np.array(y)
    return X, y

def split_data(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123, stratify=y)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=123)
    return X_train, X_test, X_val, y_train, y_test, y_val

In [4]:
data = pd.read_csv('./dataset/fr/test.tsv', delimiter='\t')
data = data[['path', 'sentence']]

In [5]:
max_len = 200

if os.path.exists(FEATURES_FILE):
    with open(FEATURES_FILE, 'rb') as f:
        X, y = pickle.load(f)
else:
    X, y = get_features(data, max_len)
    with open(FEATURES_FILE, 'wb') as f:
        pickle.dump((X, y), f)

In [6]:
X, y = preprocess_data(X, y)

-17.961946
107.19872


In [7]:
def create_model(input_shape):
    model = keras.Sequential()

    # Couche Conv1D
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=input_shape))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.5))

    # Couche LSTM pour traiter chaque trame
    model.add(LSTM(64, return_sequences=True))
    model.add(LSTM(32))
    
    # Couche Dense
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.5))

    # Couche de sortie
    model.add(Dense(4, activation='softmax'))

    # Choix de l'optimiseur avec un taux d'apprentissage adapté
    opt = keras.optimizers.Adam(learning_rate=0.01)

    model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

    return model


In [8]:
# Divisez les données
X_train, X_test, X_val, y_train, y_test, y_val = split_data(X, y)

# Maintenant, supposons que X a la forme (nombre_d'échantillons, nombre_de_trames, n_mfcc)
num_samples, num_frames, n_mfcc = X_train.shape
input_shape = (num_frames, n_mfcc)

# Convertir les étiquettes en format catégorique
num_classes = 4
y_train = to_categorical(y_train - 1, num_classes)
y_val = to_categorical(y_val - 1, num_classes)
y_test = to_categorical(y_test - 1, num_classes)

# Créez le modèle
model = create_model(input_shape)
model.summary()

# Ajoutez le critère d'arrêt précoce à la liste des callbacks pour l'entraînement
history = model.fit(X_train, y_train, epochs=300, batch_size=512, validation_data=(X_val, y_val))

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 198, 64)           2560      
                                                                 
 max_pooling1d (MaxPooling1D  (None, 99, 64)           0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 99, 64)            0         
                                                                 
 lstm (LSTM)                 (None, 99, 64)            33024     
                                                                 
 lstm_1 (LSTM)               (None, 32)                12416     
                                                                 
 dense (Dense)               (None, 256)               8448      
                                                        

In [9]:
# Évaluez le modèle
TrainLoss, Trainacc = model.evaluate(X_train, y_train)
TestLoss, Testacc = model.evaluate(X_test, y_test)
y_pred = model.predict(X_test)



In [10]:
print('Confusion_matrix: ', tf.math.confusion_matrix(np.argmax(y_test, axis=1), np.argmax(y_pred, axis=1)))

# Sauvegardez le modèle
model.save("model.h5")

Confusion_matrix:  tf.Tensor(
[[ 90   0   0   4]
 [  4 104   1   0]
 [  0   0 102   2]
 [  6   1   0  95]], shape=(4, 4), dtype=int32)
