<a href="https://colab.research.google.com/github/utkarsh-mishra19/pytorch/blob/main/Audio_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
!pip install opendatasets
import opendatasets as od
od.download("https://www.kaggle.com/datasets/mohammedalrajeh/quran-recitations-for-audio-classification")

Skipping, found downloaded files in "./quran-recitations-for-audio-classification" (use force=True to force download)


In [16]:
from os import times
import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import DataLoader,Dataset
import librosa
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import time
from skimage.transform import resize

device = "cuda" if torch.cuda.is_available() else "cpu"


In [17]:
data_df = pd.read_csv("/content/quran-recitations-for-audio-classification/files_paths.csv")


print(data_df["Class"].unique())
data_df.head()

['Mohammed_Aluhaidan' 'Yasser_Aldossary' 'Maher_Almuaiqly'
 'Nasser_Alqutami' 'AbdulBari_Althubaity' 'Bander_Balilah'
 'Ali_Alhothaify' 'Saud_Alshuraim' 'Mohammed_Ayoub' 'AbdulRahman_Alsudais'
 'Saad_Alghamdi' 'Abdullah_Albuaijan']


Unnamed: 0,FilePath,Class
0,./Dataset/Mohammed_Aluhaidan/lohaidan_171.wav,Mohammed_Aluhaidan
1,./Dataset/Mohammed_Aluhaidan/lohaidan_159.wav,Mohammed_Aluhaidan
2,./Dataset/Mohammed_Aluhaidan/lohaidan_401.wav,Mohammed_Aluhaidan
3,./Dataset/Mohammed_Aluhaidan/lohaidan_367.wav,Mohammed_Aluhaidan
4,./Dataset/Mohammed_Aluhaidan/lohaidan_373.wav,Mohammed_Aluhaidan


In [18]:
base_dir = "/content/quran-recitations-for-audio-classification/Dataset"
data_df['FilePath'] = data_df['FilePath'].apply(lambda x: os.path.join(base_dir, x.split('/')[-2], x.split('/')[-1]))
data_df.head()

Unnamed: 0,FilePath,Class
0,/content/quran-recitations-for-audio-classific...,Mohammed_Aluhaidan
1,/content/quran-recitations-for-audio-classific...,Mohammed_Aluhaidan
2,/content/quran-recitations-for-audio-classific...,Mohammed_Aluhaidan
3,/content/quran-recitations-for-audio-classific...,Mohammed_Aluhaidan
4,/content/quran-recitations-for-audio-classific...,Mohammed_Aluhaidan


In [19]:
print(data_df.shape)

(6687, 2)


In [20]:
label_encoder = LabelEncoder()
label_encoder.fit_transform(data_df["Class"])
data_train = data_df.sample(frac = 0.7,random_state=7)
data_test = data_df.drop(data_train.index)
data_val = data_test.sample(frac = 0.5,random_state=7)
data_test = data_test.drop(data_val.index)
print(data_train.shape,data_val.shape,data_test.shape)

(4681, 2) (1003, 2) (1003, 2)


In [21]:
class CustomDataset(Dataset):
    def __init__(self, dataframe, label_encoder, device="cpu"):
        self.dataframe = dataframe.reset_index(drop=True)
        self.label_encoder = label_encoder
        self.device = device
        self.labels = torch.tensor(
            self.label_encoder.transform(self.dataframe["Class"]),
            dtype=torch.long
        )

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        file_path = self.dataframe.iloc[idx]["FilePath"]
        label = self.labels[idx]

        # compute spectrogram on the fly
        audio = self.get_spectrogram(file_path)
        audio = torch.tensor(audio, dtype=torch.float32).unsqueeze(0)  # (1, H, W)

        return audio.to(self.device), label.to(self.device)

    def get_spectrogram(self, file_path):
        sr = 22050
        duration = 5
        img_height, img_width = 128, 256

        signal, sr = librosa.load(file_path, sr=sr, duration=duration)
        spec = librosa.feature.melspectrogram(
            y=signal, sr=sr, n_fft=2048, hop_length=512, n_mels=128
        )
        spec_db = librosa.power_to_db(spec, ref=np.max)

        # resize to fixed shape
        spec_resized = resize(spec_db, (img_height, img_width), anti_aliasing=True)
        return spec_resized

In [22]:
train_dataset = CustomDataset(dataframe=data_train, label_encoder=label_encoder, device=device)
val_dataset   = CustomDataset(dataframe=data_val, label_encoder=label_encoder, device=device)
test_dataset  = CustomDataset(dataframe=data_test, label_encoder=label_encoder, device=device)

In [23]:
LR = 1e-4
BATCH_SIZE = 16
EPOCHS = 25

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [24]:
class Net(nn.Module):
  def __init__(self):
    super().__init__()
    self.conv1 = nn.Conv2d(1, 16, kernel_size = 3, padding = 1) # First Convolution layer
    self.conv2 = nn.Conv2d(16, 32, kernel_size = 3, stride = 1, padding = 1) # Second Convolution layer
    self.conv3 = nn.Conv2d(32, 64, kernel_size = 3, stride = 1, padding = 1) # Third Convolution layer
    self.pooling = nn.MaxPool2d(2,2) # The pooling layer, we will be using the same layer after each conv2d.
    self.relu = nn.ReLU() # ReLU Activation function

    self.flatten = nn.Flatten() # Flatten and vectorize the output feature maps that somes from the final convolution layer.
    self.linear1 = nn.Linear((64 * 16 * 32), 4096) # Traditional Dense (Linear)
    self.linear2 = nn.Linear(4096, 1024) # Traditional Dense (Linear)
    self.linear4 = nn.Linear(1024, 512) # Traditional Dense (Linear)
    self.output = nn.Linear(512, len(data_df['Class'].unique())) # Output Linear Layer

    self.dropout = nn.Dropout(0.5)

  def forward(self, x):
    x = self.conv1(x) # -> Outputs: (16, (128, 256))
    x = self.pooling(x)# -> Outputs: (16, (64, 128))
    x = self.relu(x)
    x = self.dropout(x)
    x = self.conv2(x) # -> Outputs: (32, (64, 128))
    x = self.pooling(x) # -> Outputs: (32, (32, 64))
    x = self.relu(x)
    x = self.dropout(x)
    x = self.conv3(x) # -> Outputs: (64, (32, 64))
    x = self.pooling(x) # -> Outputs: (64, (16, 32))
    x = self.relu(x)
    x = self.dropout(x)
    x = x.view(x.size(0), -1)

    x = self.flatten(x)

    x = self.linear1(x)
    x = self.dropout(x)

    x = self.linear2(x)
    x = self.dropout(x)


    x = self.linear4(x)
    x = self.dropout(x)

    x = self.output(x)

    return x

model = Net().to(device) # Create an instance of the model and move it to the GPU Device


print(model)



Net(
  (conv1): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pooling): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (relu): ReLU()
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear1): Linear(in_features=32768, out_features=4096, bias=True)
  (linear2): Linear(in_features=4096, out_features=1024, bias=True)
  (linear4): Linear(in_features=1024, out_features=512, bias=True)
  (output): Linear(in_features=512, out_features=12, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


In [25]:
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=LR)


In [26]:
for epoch in range(EPOCHS):
  total_acc_train = 0
  total_acc_val = 0
  total_loss_train = 0
  total_loss_val = 0
  for inputs,labels in train_loader:
    output = model(inputs)
    train_loss = criterion(output,labels)
    total_loss_train += train_loss.item()
    train_loss.backward()
    train_acc = (torch.argmax(output,axis = 1)==labels).sum().item()
    total_acc_train += train_acc
    optimizer.step()
    optimizer.zero_grad()
  with torch.no_grad():
    for inputs,labels in val_loader:
      output = model(inputs)
      val_loss = criterion(output,labels)
      total_loss_val+=val_loss.item()
      val_acc = (torch.argmax(output)==labels).sum().item()
      total_acc_val+=val_acc

  signal, sr = librosa.load(file_path, sr=sr, duration=duration)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


FileNotFoundError: [Errno 2] No such file or directory: '/content/quran-recitations-for-audio-classification/Dataset/Yasser_Aldossary/Yasser_202.wav'