## Import des libraries

In [None]:
import numpy as np
import torch
import torch.nn as nn
import os
from scipy.io import wavfile

## Téléchargement du dataset

In [None]:
!gdown 1Q3sbordKpx65ExOvqtM4fif7rDXhIXwS
!unzip datasetaudio.zip

[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
  inflating: train/y_train/2324-a.wav  
  inflating: train/y_train/1031-b.wav  
  inflating: train/y_train/3347-b.wav  
  inflating: train/y_train/1872-b.wav  
  inflating: train/y_train/407-a.wav  
  inflating: train/y_train/2361-a.wav  
  inflating: train/y_train/1074-b.wav  
  inflating: train/y_train/895-a.wav  
  inflating: train/y_train/1837-b.wav  
  inflating: train/y_train/3302-b.wav  
  inflating: train/y_train/421-b.wav  
  inflating: train/y_train/2347-b.wav  
  inflating: train/y_train/1483-a.wav  
  inflating: train/y_train/2796-b.wav  
  inflating: train/y_train/1052-a.wav  
  inflating: train/y_train/3324-a.wav  
  inflating: train/y_train/1811-a.wav  
  inflating: train/y_train/464-b.wav  
  inflating: train/y_train/2302-b.wav  
  inflating: train/y_train/1017-a.wav  
  inflating: train/y_train/1854-a.wav  
  inflating: train/y_train/3361-a.wav  
  inflating: train/y_train/798-b

Sinon, le lien est ici: https://drive.google.com/file/d/1Q3sbordKpx65ExOvqtM4fif7rDXhIXwS/view?usp=sharing

## Load le dataset

### Load le dataset de train
Pour load un fichier .wav, on utilise la fonction `wavfile.read()` de `scipy.io` qui nous retourne un tuple `(samplerate, data)`. `data` est un array numpy de dimension 1, contenant les valeurs du signal audio. Le nombre d'échantillons par seconde pour nos données est de 4000, donc chaque fichier .wav contient 6000 valeurs car la durée du signal est de 1.5 secondes. `data` est donc de shape `(6000,)`.

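Il y a 4096 fichiers .wav dans le dossier `train/x_train` et 2048 fichiers .wav dans le dossier `test/x_test`. On peut donc créer un array numpy de shape `(4096, 6000)` pour les données d'entraînement et un array numpy de shape `(2048, 6000)` pour les données de test. Attention : la cellule de chargement du test ci-dessous ne lit que les 512 premiers fichiers (`N_TEST = 512`) et produit donc un array de shape `(512, 6000)`.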

In [None]:
N_TRAIN = 4096

# wavfile.read returns (samplerate, data); index 1 keeps only the sample array.
# Each comprehension stacks the N_TRAIN waveforms of one kind into a single array.
X_train = np.array(
    [wavfile.read(f"train/x_train/{i}.wav")[1] for i in range(N_TRAIN)]
)
Y1_train = np.array(
    [wavfile.read(f"train/y_train/{i}-a.wav")[1] for i in range(N_TRAIN)]
)
Y2_train = np.array(
    [wavfile.read(f"train/y_train/{i}-b.wav")[1] for i in range(N_TRAIN)]
)

# Sanity check: the three arrays are expected to share the same shape.
for stacked in (X_train, Y1_train, Y2_train):
    print(stacked.shape)

(4096, 6000)
(4096, 6000)
(4096, 6000)


### Load le dataset de test

In [None]:
N_TEST = 512

# Keep only the sample array (element 1) of each (samplerate, data) pair and
# stack the N_TEST waveforms into a single array.
X_test = np.array(
    [wavfile.read(f"test/x_test/{i}.wav")[1] for i in range(N_TEST)]
)

print(X_test.shape)

(512, 6000)


## Lecture des données

In [None]:
# IPython's display helpers provide the inline audio player used in the cells below.
import IPython.display as ipd

# Playback rate in samples per second (the dataset description above states 4000).
SAMPLERATE = 4000

In [None]:

# Play the first training input (loaded from train/x_train/0.wav).
ipd.Audio(X_train[0], rate=SAMPLERATE)

In [None]:
# Play the first "-a" target (loaded from train/y_train/0-a.wav).
ipd.Audio(Y1_train[0], rate=SAMPLERATE)

In [None]:
# Play the first "-b" target (loaded from train/y_train/0-b.wav).
ipd.Audio(Y2_train[0], rate=SAMPLERATE)

## Batch les données

In [None]:
batch_size = 16

# Group consecutive clips into batches of `batch_size`; -1 lets numpy infer the
# number of batches from the array size.
batched_shape = (-1, batch_size, 6000)
X_train_reshaped = X_train.reshape(batched_shape)
Y1_train_reshaped = Y1_train.reshape(batched_shape)
Y2_train_reshaped = Y2_train.reshape(batched_shape)

for batched in (X_train_reshaped, Y1_train_reshaped, Y2_train_reshaped):
    print(batched.shape)

(256, 16, 6000)
(256, 16, 6000)
(256, 16, 6000)


## Convertir les données en torch.tensor

In [None]:
# Wrap each batched numpy array as a float tensor, in the same order as the
# names on the left-hand side.
X_train_torch, Y1_train_torch, Y2_train_torch = (
    torch.from_numpy(batched_array).float()
    for batched_array in (X_train_reshaped, Y1_train_reshaped, Y2_train_reshaped)
)

## Faire la même chose pour le dataset de test

In [None]:
# Same batching and float conversion as for the training inputs.
test_batched_shape = (-1, batch_size, 6000)
X_test_reshaped = X_test.reshape(test_batched_shape)
X_test_torch = torch.from_numpy(X_test_reshaped).float()

print(X_test_torch.shape)

torch.Size([32, 16, 6000])


## Créer le modèle

In [105]:

class Encoder(nn.Module):
  """Convolutional encoder that also returns its intermediate feature maps.

  Four stride-2 convolutions each halve the time axis while widening the
  channels (1 -> 16 -> 32 -> 64 -> 128). Three stride-1 layers then process the
  deepest features (128 -> 256 -> 256 -> 128) without changing their length.
  Every layer is followed by LeakyReLU(0.2).
  """

  def __init__(self):
    super().__init__()
    # Trailing numbers are the output lengths for a 6000-sample input.
    self.conv1d_1 = nn.Conv1d(1, 16, kernel_size=4, stride=2, padding=1)  # 3000
    self.conv1d_2 = nn.Conv1d(16, 32, kernel_size=4, stride=2, padding=1)  # 1500
    self.conv1d_3 = nn.Conv1d(32, 64, kernel_size=4, stride=2, padding=1)  # 750
    self.conv1d_4 = nn.Conv1d(64, 128, kernel_size=4, stride=2, padding=1)  # 375
    self.conv1d_4bis = nn.Conv1d(128, 256, kernel_size=3, stride=1, padding=1)  # 375
    self.conv1d_44 = nn.Conv1d(256, 256, kernel_size=3, stride=1, padding=1)  # 375
    # Stride-1 transposed convolution: keeps the length, brings channels back to 128.
    self.conv1d_4bisbis = nn.ConvTranspose1d(256, 128, kernel_size=3, stride=1, padding=1)  # 375
    self.ReLU = nn.LeakyReLU(0.2)

  def forward(self, x):
    """Encode `x` of shape (batch, 1, length).

    Returns:
      The list `[c1, c2, c3, c4]`: the activated outputs of the first three
      downsampling layers, followed by the deepest feature map (after the
      fourth downsampling layer and the three stride-1 layers).
    """
    activate = self.ReLU

    skip_1 = activate(self.conv1d_1(x))
    skip_2 = activate(self.conv1d_2(skip_1))
    skip_3 = activate(self.conv1d_3(skip_2))

    deepest = activate(self.conv1d_4(skip_3))
    for layer in (self.conv1d_4bis, self.conv1d_44, self.conv1d_4bisbis):
      deepest = activate(layer(deepest))

    return [skip_1, skip_2, skip_3, deepest]


class Decoder(nn.Module):
  """Upsampling decoder that consumes the encoder's four feature maps.

  Each of the first three stages doubles the time axis with a stride-2
  transposed convolution, concatenates the encoder feature map of matching
  length along the channel axis, and merges the result with a stride-1
  convolution. A last transposed convolution produces a single output channel.
  No activation is applied between layers.
  """

  def __init__(self):
    super().__init__()
    # Trailing numbers are the output lengths for a 375-sample deepest feature map.
    self.Tconv1d_1 = nn.ConvTranspose1d(128, 64, kernel_size=4, stride=2, padding=1)  # 750
    self.conv1d_1 = nn.Conv1d(128, 64, kernel_size=3, stride=1, padding=1)

    self.Tconv1d_2 = nn.ConvTranspose1d(64, 32, kernel_size=4, stride=2, padding=1)  # 1500
    self.conv1d_2 = nn.Conv1d(64, 32, kernel_size=3, stride=1, padding=1)

    self.Tconv1d_3 = nn.ConvTranspose1d(32, 16, kernel_size=4, stride=2, padding=1)  # 3000
    self.conv1d_3 = nn.Conv1d(32, 16, kernel_size=3, stride=1, padding=1)

    self.Tconv1d_4 = nn.ConvTranspose1d(16, 1, kernel_size=4, stride=2, padding=1)  # 6000

  def forward(self, c1, c2, c3, c4):
    """Decode from the deepest map `c4`, using `c3`, `c2`, `c1` as skip inputs.

    Returns:
      A tensor with one channel, i.e. shape (batch, 1, length).
    """
    # (upsampling layer, merging layer, skip input), from deepest to shallowest.
    stages = (
      (self.Tconv1d_1, self.conv1d_1, c3),
      (self.Tconv1d_2, self.conv1d_2, c2),
      (self.Tconv1d_3, self.conv1d_3, c1),
    )

    z = c4
    for upsample, merge, skip in stages:
      z = merge(torch.cat([upsample(z), skip], dim=1))

    return self.Tconv1d_4(z)


class Baseline(nn.Module):
  """Encoder/decoder model returning one estimate and its residual.

  The network predicts a single signal from the input; the second output is
  the input minus that prediction.
  """

  def __init__(self):
    super().__init__()
    self.encoder = Encoder()
    self.decoder = Decoder()

  def forward(self, x):
    """Run the model on `x` of shape (batch, length).

    Returns:
      A pair `(out, x - out)` of tensors shaped like `x`.
    """
    # Insert the channel axis expected by the 1-D convolutions.
    features = self.encoder(x.unsqueeze(1))
    # Drop the single output channel to get back to (batch, length).
    out = self.decoder(*features).squeeze(1)
    residual = x - out
    return out, residual

In [106]:
# Uncomment the next line to install torchinfo if it is missing from the environment.
#!pip install torchinfo
from torchinfo import summary

# Show a layer-by-layer summary (output shapes, parameter counts) of a fresh
# Baseline for an input of shape (batch_size, 6000).
summary(Baseline(), input_size=(batch_size,6000))

Layer (type:depth-idx)                   Output Shape              Param #
Baseline                                 [16, 6000]                --
├─Encoder: 1-1                           [16, 16, 3000]            --
│    └─Conv1d: 2-1                       [16, 16, 3000]            80
│    └─LeakyReLU: 2-2                    [16, 16, 3000]            --
│    └─Conv1d: 2-3                       [16, 32, 1500]            2,080
│    └─LeakyReLU: 2-4                    [16, 32, 1500]            --
│    └─Conv1d: 2-5                       [16, 64, 750]             8,256
│    └─LeakyReLU: 2-6                    [16, 64, 750]             --
│    └─Conv1d: 2-7                       [16, 128, 375]            32,896
│    └─LeakyReLU: 2-8                    [16, 128, 375]            --
│    └─Conv1d: 2-9                       [16, 256, 375]            98,560
│    └─LeakyReLU: 2-10                   [16, 256, 375]            --
│    └─ConvTranspose1d: 2-11             [16, 128, 375]            98,4

## Boucle d'entraînement

In [109]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# reduction='none' keeps the element-wise squared errors so that they can be
# averaged per clip before taking the minimum below.
loss_fn = nn.MSELoss(reduction='none')
model = Baseline().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

N_EPOCHS = 35  # number of full passes over the training batches

# The epoch and batch loops use distinct variable names: they previously both
# used `i`, so the batch index overwrote the epoch counter.
for epoch in range(N_EPOCHS):
    for batch_idx in range(len(X_train_torch)):
        optimizer.zero_grad()
        X = X_train_torch[batch_idx].to(device)
        Y1 = Y1_train_torch[batch_idx].to(device)
        Y2 = Y2_train_torch[batch_idx].to(device)

        # Y2_pred is X - Y1_pred by construction (see Baseline.forward), so only
        # Y1_pred enters the loss.
        Y1_pred, Y2_pred = model(X)

        # Per-clip MSE of the prediction against each of the two references.
        t1 = torch.mean(loss_fn(Y1_pred, Y1), dim=1)
        t2 = torch.mean(loss_fn(Y1_pred, Y2), dim=1)

        # For every clip keep the smaller of the two errors, then average over
        # the batch: the prediction may match either reference.
        loss = torch.mean(torch.min(t1, t2))

        loss.backward()
        optimizer.step()
        if batch_idx % 10 == 0:
            print(f"Loss {loss.item():.4f}")

Loss 0.0669
Loss 0.0224
Loss 0.0138
Loss 0.0146
Loss 0.0152
Loss 0.0121
Loss 0.0124
Loss 0.0134
Loss 0.0149
Loss 0.0106
Loss 0.0133
Loss 0.0128
Loss 0.0122
Loss 0.0135
Loss 0.0125
Loss 0.0133
Loss 0.0125
Loss 0.0119
Loss 0.0126
Loss 0.0135
Loss 0.0123
Loss 0.0134
Loss 0.0133
Loss 0.0107
Loss 0.0122
Loss 0.0104
Loss 0.0122
Loss 0.0132
Loss 0.0105
Loss 0.0132
Loss 0.0126
Loss 0.0113
Loss 0.0112
Loss 0.0126
Loss 0.0143
Loss 0.0098
Loss 0.0131
Loss 0.0125
Loss 0.0119
Loss 0.0138
Loss 0.0124
Loss 0.0130
Loss 0.0125
Loss 0.0117
Loss 0.0124
Loss 0.0133
Loss 0.0120
Loss 0.0131
Loss 0.0131
Loss 0.0106
Loss 0.0121
Loss 0.0102
Loss 0.0121
Loss 0.0133
Loss 0.0105
Loss 0.0129
Loss 0.0125
Loss 0.0110
Loss 0.0110
Loss 0.0124
Loss 0.0141
Loss 0.0096
Loss 0.0130
Loss 0.0124
Loss 0.0117
Loss 0.0135
Loss 0.0120
Loss 0.0128
Loss 0.0123
Loss 0.0116
Loss 0.0126
Loss 0.0132
Loss 0.0119
Loss 0.0124
Loss 0.0130
Loss 0.0106
Loss 0.0121
Loss 0.0103
Loss 0.0119
Loss 0.0131
Loss 0.0106
Loss 0.0129
Loss 0.0123
Loss

## Prédiction

In [110]:
model.eval()

predictions = np.array([])
predictions = predictions.reshape(0, 2, 6000)

for i in range(len(X_test_torch)):
    X = X_test_torch[i].to(device)
    with torch.no_grad():
        Y1_pred, Y2_pred = model(X)
        Y_pred = torch.stack([Y1_pred, Y2_pred], dim=1)
    predictions = np.concatenate([predictions, Y_pred.cpu().numpy()])

np.save("predictions.npy", predictions)
!zip predictions.zip predictions.npy

updating: predictions.npy (deflated 46%)
