In [8]:
import warnings

import cv2
import librosa
import numpy as np
import soundfile as sf
import torch
import torch.nn.functional as F
import torch.utils.data as data
from fastprogress import progress_bar

In [9]:
class Dataset(data.Dataset):
    """Map-style dataset that converts raw waveforms into mel-spectrogram images.

    Each item is a ``float32`` array of shape ``(1, 128, 199)`` with values in
    ``[0, 1]``: the mel power spectrogram of one waveform, standardised and
    rescaled to 8-bit by :meth:`norm_func`, resized with OpenCV and divided
    by 255.

    Args:
        audio_list: Indexable collection of waveforms, one entry per item.
    """

    # (width, height) handed to cv2.resize; the resulting array is (height, width).
    IMAGE_SIZE = (199, 128)

    def __init__(self, audio_list):
        self.audio_list = audio_list
        self.n_dft = 2048   # FFT window length passed to librosa as n_fft
        self.n_mels = 128   # number of mel bands
        self.n_hop = 242    # hop length in samples between STFT frames
        self.asr = 48000    # sample rate (Hz) used to build the mel filterbank

    def __len__(self):
        """Return the number of waveforms held by the dataset."""
        return len(self.audio_list)

    def __getitem__(self, idx: int):
        """Return the normalised mel-spectrogram image for waveform ``idx``.

        Args:
            idx: Position of the waveform in ``self.audio_list``.

        Returns:
            ``np.ndarray`` of dtype ``float32`` and shape ``(1, 128, 199)``.
        """
        # Read from the instance, not from a notebook-level global, so the
        # dataset works with whatever list it was constructed with.
        audio = self.audio_list[idx]
        # `y` is passed by keyword: recent librosa releases reject positional audio.
        melspec = librosa.feature.melspectrogram(
            y=audio, sr=self.asr, n_fft=self.n_dft, hop_length=self.n_hop,
            win_length=None, center=True, power=2.0, n_mels=self.n_mels,
            norm='slaney', htk=True)
        # Standardise and rescale to uint8 [0, 255]; the resize must operate on
        # this normalised image for the final division by 255 to give [0, 1].
        image = self.norm_func(melspec)
        image = cv2.resize(image, self.IMAGE_SIZE)
        # Add a leading channel axis: (H, W) -> (1, H, W).
        image = image[np.newaxis, :, :]
        return (image / 255.0).astype(np.float32)

    def norm_func(self, X: np.ndarray, mean=None, std=None,
                  norm_max=None, norm_min=None, eps=1e-6):
        """Standardise ``X`` and rescale it to an 8-bit image.

        Args:
            X: Input array.
            mean: Value subtracted from ``X``; defaults to ``X.mean()``.
            std: Divisor applied after centring; defaults to the standard
                deviation of the centred array.
            norm_max: Upper clipping bound in standardised units; defaults to
                the maximum of the standardised array.
            norm_min: Lower clipping bound in standardised units; defaults to
                the minimum of the standardised array.
            eps: Added to ``std`` to avoid division by zero, and used as the
                threshold below which a value range counts as empty.

        Returns:
            ``np.uint8`` array with the shape of ``X``. All zeros when the
            standardised data or the requested range is (near-)constant.
        """
        # Compare against None explicitly: `x or default` would silently
        # discard legitimate falsy arguments such as mean=0.0 or norm_min=0.
        mean = X.mean() if mean is None else mean
        X = X - mean
        std = X.std() if std is None else std
        Xstd = X / (std + eps)
        _min, _max = Xstd.min(), Xstd.max()
        norm_max = _max if norm_max is None else norm_max
        norm_min = _min if norm_min is None else norm_min
        # The second test keeps the rescale below from dividing by ~0 when the
        # caller supplies (nearly) equal bounds.
        if (_max - _min) > eps and (norm_max - norm_min) > eps:
            V = np.clip(Xstd, norm_min, norm_max)
            V = 255 * (V - norm_min) / (norm_max - norm_min)
            V = V.astype(np.uint8)  # truncates toward zero
        else:
            V = np.zeros_like(Xstd, dtype=np.uint8)
        return V

In [10]:
# Load the example clip. soundfile returns the samples together with the
# file's sample rate; keep the rate (instead of discarding it into `_`, which
# also shadows IPython's last-output variable) so it can be checked below.
audio, sample_rate = sf.read('/content/Fanfare60.wav')
audio_list = [audio]
dataset = Dataset(audio_list=audio_list)
# The dataset builds its mel filterbank for `dataset.asr` Hz; a clip recorded
# at a different rate would be mapped onto the wrong frequency bands.
if sample_rate != dataset.asr:
    warnings.warn(
        f"Audio sample rate is {sample_rate} Hz but the dataset assumes "
        f"{dataset.asr} Hz; resample the clip or adjust `dataset.asr`."
    )
loader = data.DataLoader(dataset, batch_size=10, shuffle=False)

In [11]:
class Model(torch.nn.Module):
    """Convolutional feature extractor for single-channel images.

    The input is batch-normalised, then passed through four stages of two
    3x3 convolutions, each followed by batch norm and ReLU, with channel
    widths 64, 128, 256 and 512. The first three stages end in a 2x2 max-pool.
    A final 3x3 convolution (no norm or activation) and a (4, 8) max-pool
    follow, and the result is flattened per sample.

    For a ``(N, 1, 128, 199)`` input the output has shape ``(N, 6144)``.
    """

    def __init__(self):
        super().__init__()
        nn = torch.nn

        # Input normalisation.
        self.BN1 = nn.BatchNorm2d(1)

        # Stage 1: 1 -> 64 channels.
        self.C1 = nn.Conv2d(1, 64, 3, padding=1)
        self.BN2 = nn.BatchNorm2d(64)
        self.C2 = nn.Conv2d(64, 64, 3, padding=1)
        self.BN3 = nn.BatchNorm2d(64)
        self.maxpool1 = nn.MaxPool2d(2, 2)

        # Stage 2: 64 -> 128 channels.
        self.C3 = nn.Conv2d(64, 128, 3, padding=1)
        self.BN4 = nn.BatchNorm2d(128)
        self.C4 = nn.Conv2d(128, 128, 3, padding=1)
        self.BN5 = nn.BatchNorm2d(128)
        self.maxpool2 = nn.MaxPool2d(2, 2)

        # Stage 3: 128 -> 256 channels.
        self.C5 = nn.Conv2d(128, 256, 3, padding=1)
        self.BN6 = nn.BatchNorm2d(256)
        self.C6 = nn.Conv2d(256, 256, 3, padding=1)
        self.BN7 = nn.BatchNorm2d(256)
        self.maxpool3 = nn.MaxPool2d(2, 2)

        # Stage 4: 256 -> 512 channels (no pooling directly after it).
        self.C7 = nn.Conv2d(256, 512, 3, padding=1)
        self.BN8 = nn.BatchNorm2d(512)
        self.C8 = nn.Conv2d(512, 512, 3, padding=1)
        self.BN9 = nn.BatchNorm2d(512)

        # Head: plain convolution followed by a coarse max-pool.
        self.C9 = nn.Conv2d(512, 512, 3, padding=1)
        self.maxpool4 = nn.MaxPool2d(kernel_size=(4, 8))

    @staticmethod
    def _conv_bn_relu(conv, norm, features):
        """Apply ``conv``, then ``norm``, then ReLU to ``features``."""
        return F.relu(norm(conv(features)))

    def forward(self, x):
        """Return the flattened feature vector for each sample in ``x``."""
        step = self._conv_bn_relu
        h = self.BN1(x)

        h = step(self.C1, self.BN2, h)
        h = step(self.C2, self.BN3, h)
        h = self.maxpool1(h)

        h = step(self.C3, self.BN4, h)
        h = step(self.C4, self.BN5, h)
        h = self.maxpool2(h)

        h = step(self.C5, self.BN6, h)
        h = step(self.C6, self.BN7, h)
        h = self.maxpool3(h)

        h = step(self.C7, self.BN8, h)
        h = step(self.C8, self.BN9, h)

        h = self.maxpool4(self.C9(h))
        return torch.flatten(h, start_dim=1)

In [12]:
# Instantiate the network. No checkpoint is loaded in this cell, so the layers
# keep the parameters they were initialised with.
model = Model()

In [13]:
# Prediction only: eval() makes the BatchNorm layers use (and stop updating)
# their running statistics, and no_grad() avoids building an autograd graph.
# Call model.train() again before any training step.
model.eval()
with torch.no_grad():
    # The loop variable is named `batch` so it does not overwrite the raw
    # waveform stored in the notebook-level `audio` variable.
    for batch in progress_bar(loader):
        pred = model(batch)

In [14]:
# Shape of the model output for the last batch processed by the loop above.
pred.shape

torch.Size([1, 6144])