In [2]:
import torch
torch.zeros(1).cuda()

tensor([0.], device='cuda:0')

In [3]:
import torch
from pathlib import Path
import numpy as np
from torchvision import transforms
import torchaudio
import torch.nn.functional as F
import torch.nn as nn
import matplotlib.pyplot as plt
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import os
import librosa
import librosa.display

In [4]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)

In [8]:
class AudioUtil:
    """Static class for audio processing helper functions."""
    
    @staticmethod
    def open(audio_file: str):
        """Load an audio file. Return the signal as a tensor and the sample rate"""
        sig, sr = librosa.load(audio_file, sr=256000)
        return (sig, sr)
    
    @staticmethod
    def get_audio_duration(sig, sr):
        """Return the duration of an audio signal in seconds"""
        return librosa.get_duration(sig, sr)
    
    @staticmethod
    def mel_spectro_gram(sig: np.array, sr: int, n_mels=32, n_fft=1024):
        """Generate a Spectrogram"""
        # get mel spectrogram
        spec = librosa.feature.melspectrogram(y=sig, sr=sr)
        spec = librosa.amplitude_to_db(spec)
        return spec
    
    @staticmethod
    def get_audio_specs_size(spec):
        """Return the size of a spectrogram image"""
        return spec.shape
    
    @staticmethod
    def plot_mel_spectro_gram(spec: np.array, sr: int):
        """Plot a Spectrogram"""
        # plot mel spectrogram
        fig, ax = plt.subplots()
        S_dB = librosa.power_to_db(spec, ref=np.max)
        img = librosa.display.specshow(S_dB, x_axis='time',
                                y_axis='mel', sr=sr,
                                ax=ax)
        fig.colorbar(img, ax=ax, format='%+2.0f dB')
        ax.set(title='Mel-frequency spectrogram')
        

In [9]:
download_path = Path.cwd() / ".dataset"

# Read metadata file
labels_file = download_path / "Y_train_ofTdMHi.csv"
df = pd.read_csv(labels_file)

# Construct file path by concatenating fold and file name
df["relative_path"] = str(download_path) + "/X_train/" + df["id"]
df.drop(columns=["id"], inplace=True)
df.rename(columns={"pos_label": "label"}, inplace=True)
# invert relative_path and label columns positions
df = df[["relative_path", "label"]]
print(f"There are ** {len(df)} ** audio files in the dataset.")
df.head()


There are ** 23168 ** audio files in the dataset.


Unnamed: 0,relative_path,label
0,c:\Users\Tristan Gonçalves\Documents\spe_IA_cl...,0.0
1,c:\Users\Tristan Gonçalves\Documents\spe_IA_cl...,1.0
2,c:\Users\Tristan Gonçalves\Documents\spe_IA_cl...,1.0
3,c:\Users\Tristan Gonçalves\Documents\spe_IA_cl...,1.0
4,c:\Users\Tristan Gonçalves\Documents\spe_IA_cl...,1.0


In [10]:
audio_util = AudioUtil()
sig, sr = audio_util.open(df.loc[0, "relative_path"])
spec = audio_util.mel_spectro_gram(sig, sr)
spectrograme_shape = audio_util.get_audio_specs_size(spec)

  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)


In [11]:
def save_spectrograms(nb_files = len(df)):
    audio_util = AudioUtil()
    audio_specs = np.empty((0, spectrograme_shape[0], spectrograme_shape[1]))
    label_files = np.empty(0)
    print("Starting spectrograms generation...")
    for line_num in range(nb_files):
        print(f"Generating spectrogram {line_num+1}/{nb_files}", end='\r')
        sig, sr = audio_util.open(df.loc[line_num, "relative_path"])
        spec = audio_util.mel_spectro_gram(sig, sr)
        audio_specs = np.append(audio_specs, [spec] , axis=0)
        label_files = np.append(label_files, df.loc[line_num, "label"])
    print("Spectrograms generated !", end='\n\n')

    print("Saving spectrograms...")
    os.mkdir("numpy_data") if not os.path.exists("numpy_data") else None
    np.save(os.path.join("numpy_data", "audio_specs.npy"), audio_specs)
    np.save(os.path.join("numpy_data", "label_files.npy"), label_files)
    print("Spectrograms saved !")
    print("Global shape : ", audio_specs.shape)

save_spectrograms(10000)

Starting spectrograms generation...
Spectrograms generated !0000/10000

Saving spectrograms...
Spectrograms saved !
Global shape :  (10000, 128, 101)


In [12]:
def get_spectrograms_from_file(file_path):
    audio_specs = np.load(file_path)
    return audio_specs

def get_labels_from_file(file_path):
    label_files = np.load(file_path)
    return label_files

spectro = get_spectrograms_from_file(os.path.join(os.getcwd(),"numpy_data", "audio_specs.npy"))
labels = get_labels_from_file(os.path.join(os.getcwd(),"numpy_data", "label_files.npy"))

print(spectro.shape)
print(labels.shape)


(10000, 128, 101)
(10000,)


In [13]:
size_train = int(0.8 * len(spectro))
size_test = len(spectro) - size_train

train_spectro = spectro[:size_train]
train_labels = labels[:size_train]

test_spectro = spectro[size_train:]
test_labels = labels[size_train:]

print(f"Number of spectrograms for training : {len(train_spectro)}")
print(f"Number of spectrograms for testing : {len(test_spectro)}")

Number of spectrograms for training : 8000
Number of spectrograms for testing : 2000


In [14]:
class MyDataset(Dataset):
    def __init__(self, spectro, labels, transform=None):
        self.spectro = spectro
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.spectro)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        spectro = self.spectro[idx]
        label = self.labels[idx].astype(np.int64)

        if self.transform:
            spectro = self.transform(spectro)

        return spectro, label

training_dataset = MyDataset(train_spectro, train_labels)
testing_dataset = MyDataset(test_spectro, test_labels)

In [15]:
train_generator = torch.utils.data.DataLoader(training_dataset, batch_size=128, shuffle=True)
val_generator = torch.utils.data.DataLoader(testing_dataset, batch_size=128, shuffle=False)

In [16]:
class AudioClassifier(nn.Module):
    def __init__(self, input_channels, num_classes):
        super(AudioClassifier, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=input_channels, out_channels=256, kernel_size=(3, 3), padding=1)
        self.conv2 = nn.Conv2d(in_channels=256, out_channels=128, kernel_size=(3, 3), padding=1)
        self.conv3 = nn.Conv2d(in_channels=128, out_channels=64, kernel_size=(3, 3), padding=1)
        # self.pool = nn.MaxPool2d(kernel_size=(2, 2))
        # self.fc1 = nn.Linear(64 * 31 * 2, 128)  
        self.fc1 = nn.Linear(827392 , 128)
        self.fc2 = nn.Linear(128, num_classes)
        self.Flatten = nn.Flatten()

    def forward(self, x):
        x = x.unsqueeze(1)
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = self.Flatten(x)
        # x = x.view(64 * 31 * 2)  # Adjust the input size based on your mel spectrogram size
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [17]:
# Create the model and put it on the GPU if available
input_channels = 1
num_classes = 2
model = AudioClassifier(input_channels, num_classes).double() # .to(torch.float32)
print(model)

# device = "cpu"
# Load model from pth file
# model.load_state_dict(torch.load("model.pth"))
model = model.to(device)
# Check that it is on Cuda
next(model.parameters()).device

AudioClassifier(
  (conv1): Conv2d(1, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(256, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3): Conv2d(128, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (fc1): Linear(in_features=827392, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=2, bias=True)
  (Flatten): Flatten(start_dim=1, end_dim=-1)
)


device(type='cuda', index=0)

In [18]:
criterion = nn.BCELoss() # binary crossentropy loss
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [19]:
import time
from tqdm import tqdm
## Training loop

num_epochs = 10
train_loader = train_generator
val_loader = val_generator

for epoch in range(num_epochs):
    # Set the model to training mode
    model.train()
    
    # Initialize the progress bar
    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}', leave=False)

    for inputs, labels in progress_bar:
        optimizer.zero_grad()  # Zero the gradients

        outputs = model(inputs.double().to(device)).to(device)  # Forward pass
        loss = criterion(outputs.to(device), labels.to(device)).to(device)  # Compute the loss
        loss.backward()  # Backward pass
        optimizer.step()  # Update weights

        # Update the progress bar description with the current loss
        progress_bar.set_postfix({'Loss': loss.item()})

    # Close the progress bar
    progress_bar.close()

    # Validation (optional)
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        val_loss = 0.0
        correct = 0
        total = 0
        for inputs, labels in val_loader:
            outputs = model(inputs.double().to(device))
            val_loss += criterion(outputs.to(device), labels.to(device)).item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels.to(device)).sum().item()

        accuracy = correct / total
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Validation Accuracy: {100 * accuracy:.2f}%')

# Save the model
torch.save(model.state_dict(), "model.pth")

                                                                         

Epoch [1/10], Loss: 0.5240, Validation Accuracy: 4.40%


                                                                              

KeyboardInterrupt: 

In [None]:

for inputs, labels in train_loader:
    # Check the shape of inputs
    print(inputs.shape)


torch.Size([80, 128, 101])
