In [1]:
import torch
from torch.utils.data import Dataset
import torchaudio
import matplotlib.pyplot as plt
import pandas as pd
from os import listdir
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader

In [2]:
# os.listdir returns directory entries in arbitrary order. Sorting keeps the
# row positions of everything derived from this list identical between runs.
audio_files = sorted(listdir('drive/MyDrive/mp3s/'))

def get_species(mp3_dir):
  """Derive a species name from an audio file name.

  Everything from the first '.' onward is discarded, then any run of
  trailing decimal digits is stripped from what remains.

  Args:
    mp3_dir: File name string (e.g. an entry returned by listdir).

  Returns:
    The leading part of the name with trailing digits removed.
  """
  stem, _dot, _remainder = mp3_dir.partition('.')
  return stem.rstrip('0123456789')

# One species name per audio file, in the same order as audio_files.
species_list = [get_species(file_name) for file_name in audio_files]

# Fit the encoder that maps species names to integer class labels.
label_encoder = LabelEncoder()
label_encoder.fit(species_list)

In [3]:
# One row per audio file: species name, its encoded label, and the file name.
encoded_labels = label_encoder.transform(species_list)
test_frame = pd.DataFrame(
    {
        'species': species_list,
        'labels': encoded_labels,
        'audio_file': audio_files,
    }
)

In [61]:
class AudioDataSet(Dataset):
  """Dataset yielding fixed-length log-mel spectrograms with their labels.

  Each item is a ``(mel_spec, label)`` pair. The audio file named in the row's
  ``audio_file`` column is loaded, cut or zero-padded to ``audio_length``
  seconds, resampled to ``resample_rate``, converted to a mel spectrogram and
  log2-scaled. A leading dimension of size 1 is added to the spectrogram.

  Args:
    file_df: DataFrame with at least ``labels`` and ``audio_file`` columns.
    audio_dir: Prefix that is concatenated directly with ``audio_file`` to form
      the path, so it has to end with a path separator.
    audio_length: Clip length in seconds.
    resample_rate: Rate the clip is resampled to before the mel transform.
  """

  # Constant added to every waveform sample before the mel transform.
  _WAVEFORM_OFFSET = 0.001

  def __init__(self, file_df, audio_dir, audio_length = 10, resample_rate = 48000):
    self.file_df = file_df
    self.audio_dir = audio_dir
    self.audio_length = audio_length
    self.resample_rate = resample_rate
    # NOTE(review): the transform is configured with resample_rate/2 although
    # the waveform is resampled to resample_rate -- confirm this is intended.
    self.transform = torchaudio.transforms.MelSpectrogram(self.resample_rate/2,
                                                          n_fft=1024)

  def __len__(self):
    """Number of rows in the file table."""
    return len(self.file_df)

  def __getitem__(self, idx):
    """Return ``(mel_spec, label)`` for the row at position ``idx``."""
    # Positional lookup: samplers hand out positions 0..len-1, which only match
    # index labels when the frame happens to have a default RangeIndex.
    row = self.file_df.iloc[idx]

    wav, sample_rate = self._load_waveform(self.audio_dir + row['audio_file'])
    # int() keeps the sample count usable as a slice bound and tensor size even
    # when audio_length is not an integer.
    n_samples = int(self.audio_length * sample_rate)
    # Only the first row of the loaded tensor is used (presumably channel 0).
    wav = self._crop_or_pad(wav[0], n_samples)

    return self._log_mel(wav, sample_rate), row['labels']

  def _load_waveform(self, path):
    """Load an audio file, returning the ``(waveform, sample_rate)`` pair."""
    return torchaudio.load(path)

  @staticmethod
  def _crop_or_pad(wav, n_samples):
    """Cut a 1-D waveform to ``n_samples`` or right-pad it with zeros."""
    wav = wav[:n_samples]
    deficit = n_samples - wav.size(0)
    if deficit > 0:
      # new_zeros keeps the padding on the waveform's dtype and device.
      wav = torch.cat((wav, wav.new_zeros(deficit)))
    return wav

  def _log_mel(self, wav, sample_rate):
    """Resample a waveform and turn it into a log2 mel spectrogram."""
    wav = torchaudio.functional.resample(wav, sample_rate, self.resample_rate)
    wav = wav + self._WAVEFORM_OFFSET

    mel_spec = self.transform(wav)
    # A bin with zero energy would become -inf under log2. Flooring at the
    # smallest normal float keeps the result finite and leaves every other
    # value unchanged.
    floor = torch.finfo(mel_spec.dtype).tiny
    mel_spec = mel_spec.clamp(min=floor).log2()
    return torch.unsqueeze(mel_spec, 0)


In [62]:
# Wrap the file table in the dataset; the cell displays the number of items.
test_set = AudioDataSet(file_df=test_frame, audio_dir='drive/MyDrive/mp3s/')
len(test_set)

2140

In [63]:
# Batches of 64 items, drawn in a freshly shuffled order on every pass.
loader = DataLoader(
    test_set,
    batch_size=64,
    shuffle=True,
)

In [64]:
# Inspect the spectrogram size of the first item.
first_spec, _first_label = test_set[0]
first_spec.shape

torch.Size([1, 128, 938])

In [96]:
import torch.nn.functional as F
class AudioModel(torch.nn.Module):
  """Small CNN classifier over single-channel spectrogram inputs.

  Five blocks of (3x3 single-channel convolution, ReLU, 2x2 max-pool) are
  followed by flattening, batch normalisation, one hidden linear layer with
  ReLU, and a final linear layer producing ``n_species`` raw scores.

  Args:
    n_species: Number of output classes.
    input_shape: ``(height, width)`` of the input spectrograms. It is used to
      derive the flattened feature width after the convolutional stack. The
      default ``(128, 938)`` yields 54 features.

  Raises:
    ValueError: If ``input_shape`` is too small to survive the five conv/pool
      blocks.

  NOTE(review): the recorded output of this model on one batch shows the same
  logits in every displayed row. Worth checking whether the single-channel
  conv + ReLU stack collapses to a constant on these inputs.
  """

  def __init__(self, n_species, input_shape = (128, 938)):
    super().__init__()
    self.n_species = n_species

    flat_features = self._flattened_size(input_shape)

    self.conv_layer1 = torch.nn.Conv2d(1, 1, 3)
    self.conv_layer2 = torch.nn.Conv2d(1, 1, 3)
    self.conv_layer3 = torch.nn.Conv2d(1, 1, 3)
    self.conv_layer4 = torch.nn.Conv2d(1, 1, 3)
    self.conv_layer5 = torch.nn.Conv2d(1, 1, 3)

    self.batchnorm = torch.nn.BatchNorm1d(flat_features)
    self.linear_layer = torch.nn.Linear(flat_features, flat_features)
    self.classification_layer = torch.nn.Linear(flat_features, self.n_species)

  @staticmethod
  def _flattened_size(input_shape):
    """Feature count left after five (3x3 conv, 2x2 max-pool) blocks."""
    height, width = input_shape
    for _ in range(5):
      # An unpadded 3x3 conv removes 2 from each side length; a 2x2 pool with
      # stride 2 then halves it, rounding down.
      height = (height - 2) // 2
      width = (width - 2) // 2
      if height < 1 or width < 1:
        raise ValueError(
            f"input_shape {tuple(input_shape)} is too small for five conv/pool blocks")
    return height * width

  def n_features(self, x):
    """Number of elements per sample, i.e. the product of all non-batch dims."""
    num_features = 1
    for dim in x.size()[1:]:
      num_features *= dim
    return num_features

  def forward(self, x):
    """Map a ``(batch, 1, height, width)`` tensor to ``(batch, n_species)`` scores."""
    conv_stack = (self.conv_layer1, self.conv_layer2, self.conv_layer3,
                  self.conv_layer4, self.conv_layer5)
    for conv in conv_stack:
      x = F.max_pool2d(F.relu(conv(x)), (2, 2))

    # Collapse everything except the batch dimension.
    x = torch.flatten(x, 1)
    x = self.batchnorm(x)
    x = F.relu(self.linear_layer(x))
    x = self.classification_layer(x)
    return x

In [97]:
# Size the output layer from the fitted encoder so it always matches the
# number of distinct species found in the data.
model = AudioModel(len(label_encoder.classes_))

In [98]:
# Smoke test: push the spectrograms of one batch through the model.
first_batch = next(iter(loader))
model(first_batch[0])

tensor([[-0.0446,  0.0786,  0.1160,  ..., -0.0550, -0.0040,  0.0203],
        [-0.0446,  0.0786,  0.1160,  ..., -0.0550, -0.0040,  0.0203],
        [-0.0446,  0.0786,  0.1160,  ..., -0.0550, -0.0040,  0.0203],
        ...,
        [-0.0446,  0.0786,  0.1160,  ..., -0.0550, -0.0040,  0.0203],
        [-0.0446,  0.0786,  0.1160,  ..., -0.0550, -0.0040,  0.0203],
        [-0.0446,  0.0786,  0.1160,  ..., -0.0550, -0.0040,  0.0203]],
       grad_fn=<AddmmBackward0>)