In [20]:
import sys
import librosa
import numpy as np
import matplotlib.pyplot as plt
import glob
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchaudio
import torchaudio.transforms as T

# Dataloader
from torch.utils.data import Dataset, DataLoader, Subset
from sklearn.model_selection import KFold
import pandas as pd

# Models
# from '../models import CNN

from IPython.display import Audio

In [11]:
# Show all wav files
directory = '../Dataset/Audio Dataset/Cats and Dogs/data'
files = sorted(glob.glob(directory+'/*'))
print(files)

['../Dataset/Audio Dataset/Cats and Dogs/data/cat_1.wav', '../Dataset/Audio Dataset/Cats and Dogs/data/cat_10.wav', '../Dataset/Audio Dataset/Cats and Dogs/data/cat_100.wav', '../Dataset/Audio Dataset/Cats and Dogs/data/cat_101.wav', '../Dataset/Audio Dataset/Cats and Dogs/data/cat_102.wav', '../Dataset/Audio Dataset/Cats and Dogs/data/cat_103.wav', '../Dataset/Audio Dataset/Cats and Dogs/data/cat_105.wav', '../Dataset/Audio Dataset/Cats and Dogs/data/cat_106.wav', '../Dataset/Audio Dataset/Cats and Dogs/data/cat_107.wav', '../Dataset/Audio Dataset/Cats and Dogs/data/cat_108.wav', '../Dataset/Audio Dataset/Cats and Dogs/data/cat_109.wav', '../Dataset/Audio Dataset/Cats and Dogs/data/cat_11.wav', '../Dataset/Audio Dataset/Cats and Dogs/data/cat_110.wav', '../Dataset/Audio Dataset/Cats and Dogs/data/cat_112.wav', '../Dataset/Audio Dataset/Cats and Dogs/data/cat_113.wav', '../Dataset/Audio Dataset/Cats and Dogs/data/cat_114.wav', '../Dataset/Audio Dataset/Cats and Dogs/data/cat_115.wav', 

### Sample Cat Audio

In [12]:
print(files[0])
Audio(files[0])

../Dataset/Audio Dataset/Cats and Dogs/data/cat_1.wav


### Sample Dog Audio

In [13]:
print(files[200])
Audio(files[200])

../Dataset/Audio Dataset/Cats and Dogs/data/dog_barking_3.wav


### Preparing dataset

In [14]:
# Labelling the wav file and its classification
df = pd.DataFrame(columns=['idx', 'audio_file', 'classification'])
for idx, audio_file in enumerate(files):
    audio_path = audio_file.split('/')[-1]
    classification = audio_path.split('_')[0]
    row = pd.DataFrame([{"idx":idx, "audio_file": audio_file, "classification": 1 if classification=='dog' else 0}])
    df = pd.concat([df, row])

pd.set_option('display.max_rows', None)    # Show all rows
pd.set_option('display.max_columns', None)  
print(df)
# pd.reset_option()

   idx                                         audio_file classification
0    0  ../Dataset/Audio Dataset/Cats and Dogs/data/ca...              0
0    1  ../Dataset/Audio Dataset/Cats and Dogs/data/ca...              0
0    2  ../Dataset/Audio Dataset/Cats and Dogs/data/ca...              0
0    3  ../Dataset/Audio Dataset/Cats and Dogs/data/ca...              0
0    4  ../Dataset/Audio Dataset/Cats and Dogs/data/ca...              0
0    5  ../Dataset/Audio Dataset/Cats and Dogs/data/ca...              0
0    6  ../Dataset/Audio Dataset/Cats and Dogs/data/ca...              0
0    7  ../Dataset/Audio Dataset/Cats and Dogs/data/ca...              0
0    8  ../Dataset/Audio Dataset/Cats and Dogs/data/ca...              0
0    9  ../Dataset/Audio Dataset/Cats and Dogs/data/ca...              0
0   10  ../Dataset/Audio Dataset/Cats and Dogs/data/ca...              0
0   11  ../Dataset/Audio Dataset/Cats and Dogs/data/ca...              0
0   12  ../Dataset/Audio Dataset/Cats and Dogs/data

### Creating custom dataloader class

In [15]:
class CatsAndDogsDataset(Dataset):
    def __init__(self, target_length, n_fft, hop, dataframe):
        self.df = dataframe
        self.n_fft = n_fft
        self.hop = hop
        self.target_length = target_length
 
    def __getitem__(self, index):
        audio_path = self.df.iloc[index]['audio_file']
        signal, sr = torchaudio.load(audio_path)
        modified_signal = self._process_signal(signal)
        classification = self.df.iloc[index]['classification']
        return modified_signal, classification
    
    def __len__(self):
        return len(self.df)
    
    def _process_signal(self, signal):
        # Right pad the audio files that are shorter than target, cut if longer.
        if len(signal) > self.target_length:
            print(signal.shape)
            signal = signal[:, :self.target_length]
            print(signal.shape)
        elif len(signal) < self.target_length:
            signal = F.pad(signal, [0, self.target_length-signal.shape[1]])
            
        spectrogram = T.Spectrogram(n_fft=self.n_fft, hop_length=self.hop)(signal)
        return spectrogram


### Train and Test functions

In [16]:
def train_model(model, lr, epochs, dataset, dataframe, loss_fn, batch_size):
    # Optimizer
    optimizer = optim.Adam(model.parameters(), lr=lr)
    # train_loader = DataLoader(dataset, batch_size=batch_size)
    kf = KFold(n_splits=2, shuffle=True, random_state=10)

    for fold, (train_idx, val_idx) in enumerate(kf.split(dataframe)):
        print(f"Fold: {fold}")
        train_subset = Subset(dataset, train_idx)
        # val_subset = Subset(dataset, val_idx)

        train_loader = DataLoader(train_subset, batch_size=32, shuffle=True)
        # val_loader = DataLoader(val_subset, batch_size=32, shuffle=False)

        # Training loop for this fold
        losses = []
        for epoch in range(epochs):
            model.train()
            for mfccs, labels in train_loader:
                optimizer.zero_grad()
                output = model(mfccs)
                loss = loss_fn(output, labels)
                loss.backward()
                optimizer.step()
            print(loss.item())
            losses.append(loss.item())
        
        plt.figure(figsize=(16,8))
        plt.plot(np.arange(epochs), losses)
        plt.title(f'Training Loss over epochs for fold {fold}')
        plt.xlabel('Loss')
        plt.ylabel('Epochs')
        plt.show()
            

### Trying out different models

In [21]:
# Baseline CNN
class CNN(nn.Module):
    def __init__(self, n_fft, hop):
        super(CNN, self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(
                in_channels=1,
                out_channels=16,
                kernel_size=3,
                stride=1,
                padding=2,
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(
                in_channels=16,
                out_channels=32,
                kernel_size=3,
                stride=1,
                padding=2,
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(
                in_channels=32,
                out_channels=64,
                kernel_size=3,
                stride=1,
                padding=2,
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        self.conv4 = nn.Sequential(
            nn.Conv2d(
                in_channels=64,
                out_channels=128,
                kernel_size=3,
                stride=1,
                padding=2,
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        self.flatten = nn.Flatten()
        self.linear = nn.Linear(116480, 2)
        self.softmax = nn.Softmax()

    def forward(self, input):
        # print(f"Input shape: {input.shape}")
        x = self.conv1(input)
        # print(f"After conv1: {x.shape}")
        x = self.conv2(x)
        # print(f"After conv2: {x.shape}")
        x = self.conv3(x)
        # print(f"After conv3: {x.shape}")
        x = self.conv4(x)
        # print(f"After conv4: {x.shape}")
        x = self.flatten(x)
        # print(f"After flatten: {x.shape}")
        logits = self.linear(x)
        # print(f"After linear: {x.shape}")
        predictions = self.softmax(logits)
        # print(f"After softmax: {x.shape}")
        return predictions

In [25]:
# Hyperparameters
target_length = 200000  # Check data_exploration.ipynb. average length is approximately this number
target_sr = 16000  # All audio are at 16K Hz
n_fft = 2048 # Power 2
hop = 1024 # Typically 0.5/0.75 of n_fft
train_dataframe = df.sample(frac=0.8, random_state=42) # From above
test_dataframe = df.loc[~df.idx.isin(train_dataframe.idx)]
# train_dataframe = dataframe[]
lr = 0.001
epochs = 50
batch_size = 32

# # Create dataloader class
train_dataset = CatsAndDogsDataset(target_length=target_length, n_fft=n_fft, hop=hop, dataframe=train_dataframe)
test_dataset = CatsAndDogsDataset(target_length=target_length, n_fft=n_fft, hop=hop, dataframe=test_dataframe)

# Create model
cnn_dict = torch.load('../models/CNN/cnn.pth')
cnn = CNN(n_fft=n_fft, hop=hop)
cnn.load_state_dict(cnn_dict)


<All keys matched successfully>

In [26]:
def test_model(model, test_dataframe):
    labels = {
        0: "Cat",
        1: "Dog"
    }
    model.eval()
    with torch.no_grad():
        correct = 0
        for idx in range(len(test_dataframe)):
            row = test_dataframe.iloc[idx]
            audio_path = row['audio_file']
            signal, sr = torchaudio.load(audio_path)
            if len(signal) > 200000:
                print(signal.shape)
                signal = signal[:, :200000]
                print(signal.shape)
            elif len(signal) < 200000:
                signal = F.pad(signal, [0, 200000-signal.shape[1]])
            
            spectrogram = T.Spectrogram(n_fft=2048, hop_length=1024)(signal)
            classification = row['classification']
            prediction = model(spectrogram.unsqueeze(0))
            if labels[torch.argmax(prediction,dim=1).item()] == labels[classification]:
                correct += 1
            print(f"Prediction: {labels[torch.argmax(prediction,dim=1).item()]}, Actual: {labels[classification]}")

    print(len(train_dataframe))
    print(f"Total accuracy: {correct/len(test_dataframe)}")
            


test_model(cnn, test_dataframe)

  predictions = self.softmax(logits)


Prediction: Cat, Actual: Cat
Prediction: Cat, Actual: Cat
Prediction: Cat, Actual: Cat
Prediction: Cat, Actual: Cat
Prediction: Cat, Actual: Cat
Prediction: Cat, Actual: Cat
Prediction: Cat, Actual: Cat
Prediction: Cat, Actual: Cat
Prediction: Cat, Actual: Cat
Prediction: Cat, Actual: Cat
Prediction: Cat, Actual: Cat
Prediction: Cat, Actual: Cat
Prediction: Cat, Actual: Cat
Prediction: Cat, Actual: Cat
Prediction: Cat, Actual: Cat
Prediction: Cat, Actual: Cat
Prediction: Cat, Actual: Cat
Prediction: Cat, Actual: Cat
Prediction: Cat, Actual: Cat
Prediction: Cat, Actual: Cat
Prediction: Cat, Actual: Cat
Prediction: Cat, Actual: Cat
Prediction: Cat, Actual: Cat
Prediction: Cat, Actual: Cat
Prediction: Cat, Actual: Cat
Prediction: Cat, Actual: Cat
Prediction: Cat, Actual: Cat
Prediction: Cat, Actual: Cat
Prediction: Cat, Actual: Cat
Prediction: Cat, Actual: Cat
Prediction: Cat, Actual: Cat
Prediction: Cat, Actual: Cat
Prediction: Cat, Actual: Cat
Prediction: Dog, Actual: Dog
Prediction: Ca

In [42]:

# len(train_dataframe)
# len(df)
len(test_dataframe)

55