In [1]:
import pandas as pd
import os
import torch
import librosa
import librosa.display
import pywt
import csv
import pickle
import keras
import glob

import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.layers import *
from keras.callbacks import EarlyStopping

In [2]:
FOLDER_PATH = './dataset/fr/clips/'
FEATURES_FILE = 'transformer_features.pkl'

In [3]:
def padding(data, axis, length):
    pad_width = [(0, 0)] * len(data.shape)
    pad_width[axis] = (0, max(0, length - data.shape[axis]))
    padded_data = np.pad(data, pad_width, mode='constant', constant_values=0)
    return padded_data

def get_features(df_in, max_len):
    features = []
    labels = []

    for index in range(0, len(df_in)):
        filename = FOLDER_PATH + df_in.path[index]
        label = df_in.sentence[index]

        # load the file
        y, sr = librosa.load(filename)
        
        # Trim the audio file to remove leading/trailing silence
        y, _ = librosa.effects.trim(y)

        # Compute MFCC
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)  # n_mfcc: nombre de coefficients MFCC à conserver
        
        print(mfccs.shape)
        
        # Pad/truncate MFCCs
        if (mfccs.shape[1] < max_len):
            pad_width = max_len - mfccs.shape[1]
            mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
        else:
            mfccs = mfccs[:, :max_len]
        
        features.append(mfccs.T)
        labels.append(label)

    return (features, labels)

def preprocess_data(X, y):
    print(np.mean(X))
    print(np.std(X))
    
    X = np.array((X - np.mean(X)) / np.std(X))
    y = np.array(y)
    return X, y

def split_data(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123, stratify=y)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=123)
    return X_train, X_test, X_val, y_train, y_test, y_val

In [4]:
files = glob.glob('./dataset/fr/*.tsv')

dfs = []  # an empty list to store the data frames
for file in files:
    df = pd.read_csv(file, delimiter='\t')  # read each csv file
    dfs.append(df)
    
data = pd.concat(dfs, ignore_index=True)
data = data[['path', 'sentence']]

In [5]:
max_len = 300

if os.path.exists(FEATURES_FILE):
    with open(FEATURES_FILE, 'rb') as f:
        X, y = pickle.load(f)
else:
    X, y = get_features(data, max_len)
    with open(FEATURES_FILE, 'wb') as f:
        pickle.dump((X, y), f)

(13, 39)
(13, 26)
(13, 76)
(13, 111)
(13, 105)
(13, 103)
(13, 99)
(13, 101)
(13, 90)
(13, 102)
(13, 97)
(13, 93)
(13, 125)
(13, 143)
(13, 30)
(13, 39)
(13, 100)
(13, 105)
(13, 125)
(13, 83)
(13, 149)
(13, 136)
(13, 89)
(13, 107)
(13, 130)
(13, 88)
(13, 78)
(13, 123)
(13, 114)
(13, 107)
(13, 99)
(13, 130)
(13, 128)
(13, 135)
(13, 140)
(13, 140)
(13, 85)
(13, 101)
(13, 112)
(13, 104)
(13, 125)
(13, 101)
(13, 104)
(13, 29)
(13, 34)
(13, 161)
(13, 97)
(13, 140)
(13, 117)
(13, 94)
(13, 95)
(13, 66)
(13, 74)
(13, 89)
(13, 82)
(13, 158)
(13, 135)
(13, 274)
(13, 84)
(13, 203)
(13, 111)
(13, 106)
(13, 110)
(13, 121)
(13, 167)
(13, 82)
(13, 148)
(13, 127)
(13, 140)
(13, 83)
(13, 75)
(13, 50)
(13, 138)
(13, 100)
(13, 79)
(13, 86)
(13, 101)
(13, 112)
(13, 91)
(13, 80)
(13, 219)
(13, 153)
(13, 92)
(13, 70)
(13, 84)
(13, 85)
(13, 94)
(13, 109)
(13, 103)
(13, 88)
(13, 92)
(13, 84)
(13, 83)
(13, 86)
(13, 209)
(13, 312)
(13, 147)
(13, 103)
(13, 97)
(13, 134)
(13, 110)
(13, 120)
(13, 118)
(13, 82)
(13, 

In [6]:
X, y = preprocess_data(X, y)

-10.683214
83.52029


In [7]:
from tensorflow.keras import layers

def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0):
    # Normalisation par couches
    x = layers.LayerNormalization(epsilon=1e-6)(inputs)
    # Attention multi-têtes
    x = layers.MultiHeadAttention(key_dim=head_size, num_heads=num_heads, dropout=dropout)(x, x)
    x = layers.Dropout(dropout)(x)
    res = x + inputs

    # Normalisation par couches
    x = layers.LayerNormalization(epsilon=1e-6)(res)
    # Feed Forward
    x = layers.Conv1D(filters=ff_dim, kernel_size=1, activation='relu')(x)
    x = layers.Dropout(dropout)(x)
    x = layers.Conv1D(filters=inputs.shape[-1], kernel_size=1)(x)
    return x + res

def create_transformer(input_shape, num_classes, num_layers=2):
    inputs = keras.Input(shape=input_shape)
    x = inputs
    for _ in range(num_layers):
        x = transformer_encoder(x, head_size=64, num_heads=4, ff_dim=256, dropout=0.5)

    x = layers.GlobalAveragePooling1D()(x)
    outputs = layers.Dense(num_classes, activation='softmax')(x)

    model = keras.Model(inputs=inputs, outputs=outputs)
    
    # Choix de l'optimiseur avec un taux d'apprentissage adapté
    opt = keras.optimizers.Adam(learning_rate=0.01)
    model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
    return model


In [11]:
# Divisez les données
X_train, X_test, X_val, y_train, y_test, y_val = split_data(X, y)

# Maintenant, supposons que X a la forme (nombre_d'échantillons, nombre_de_trames, n_mfcc)
num_samples, num_frames, n_mfcc = X_train.shape
input_shape = (num_frames, n_mfcc)
print(input_shape)

# Convertir les étiquettes en format catégorique
num_classes = 4
y_train = to_categorical(y_train - 1, num_classes)
y_val = to_categorical(y_val - 1, num_classes)
y_test = to_categorical(y_test - 1, num_classes)

# Créez le modèle
model = create_transformer(input_shape, num_classes)
model.summary()

# Ajoutez le critère d'arrêt précoce à la liste des callbacks pour l'entraînement
history = model.fit(X_train, y_train, epochs=35, batch_size=512, validation_data=(X_val, y_val))

(300, 13)
Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 300, 13)]    0           []                               
                                                                                                  
 layer_normalization_4 (LayerNo  (None, 300, 13)     26          ['input_2[0][0]']                
 rmalization)                                                                                     
                                                                                                  
 multi_head_attention_2 (MultiH  (None, 300, 13)     14093       ['layer_normalization_4[0][0]',  
 eadAttention)                                                    'layer_normalization_4[0][0]']  
                                                                                  

KeyboardInterrupt: 

In [9]:
# Évaluez le modèle
TrainLoss, Trainacc = model.evaluate(X_train, y_train)
TestLoss, Testacc = model.evaluate(X_test, y_test)
y_pred = model.predict(X_test)



In [10]:
print('Confusion_matrix: ', tf.math.confusion_matrix(np.argmax(y_test, axis=1), np.argmax(y_pred, axis=1)))

# Sauvegardez le modèle
model.save("transformer_model.h5")

Confusion_matrix:  tf.Tensor(
[[603  36  11  41]
 [  2 742   1   1]
 [  3   6 711   7]
 [ 29 173  17 504]], shape=(4, 4), dtype=int32)
