In [1]:
import os.path

import torch
import torch.optim as optim
from torch import nn

from core.dataset_handler import DatasetHandler
from core.sound_classifier import SoundClassifier

In [2]:
# Resolve both raw-audio directories to absolute paths; the relative paths are
# interpreted against the current working directory.
speech_path, sounds_path = map(
    os.path.abspath,
    ('../datasets/speech', '../datasets/sounds'),
)

# Configure the handler before reading anything. The meaning of the numeric
# arguments (250, 100) is defined by DatasetHandler and is not visible here --
# NOTE(review): confirm against core.dataset_handler before tuning them.
dataset_handler = DatasetHandler()
dataset_handler.set_speech_params(speech_path, 'mp3', 250)
dataset_handler.set_sounds_params(sounds_path, 'wav', 100)
dataset_handler.set_target_spectrogram_shape((64, 128))
dataset_handler.set_batch_size(32)

# read_datasets() returns a pair that is unpacked into the training and test
# loaders used by the later cells.
loaders = dataset_handler.read_datasets()
train_loader, test_loader = loaders

print('Finished Data Preparation')

Processed 1000 speech samples with background sounds.
Processed 1000 speech samples without background sounds.
Processed 1000 environmental sound samples.
Finished Data Preparation


In [5]:
# Training hyperparameters, gathered in one place so they are easy to tune.
SEED = 42
NUM_EPOCHS = 30
LEARNING_RATE = 0.001
WEIGHT_DECAY = 1e-6

# Seed PyTorch's global RNG before the model is built so that weight
# initialisation (and any later randomness drawn from the global generator)
# is repeatable across "Restart Kernel -> Run All".
torch.manual_seed(SEED)

# 64 * 128 equals the element count of the (64, 128) target spectrogram shape
# set in the data-preparation cell -- presumably the flattened input size that
# SoundClassifier expects; confirm against core.sound_classifier.
model = SoundClassifier(64 * 128)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(
    model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
)

# Make training mode explicit rather than relying on the module default.
model.train()

for epoch in range(NUM_EPOCHS):
    running_loss = 0.0
    for inputs, labels in train_loader:
        # Gradients accumulate by default, so clear them before each step.
        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    # Report the mean per-batch loss for this epoch.
    print(f"Epoch {epoch + 1}, Loss: {running_loss / len(train_loader)}")

print('Finished Training')

Epoch 1, Loss: 0.10294275305544337
Epoch 2, Loss: 0.0859392684896011
Epoch 3, Loss: 0.06447867205987375
Epoch 4, Loss: 0.0757234081812203
Epoch 5, Loss: 0.06080119969633718
Epoch 6, Loss: 0.06324945238942746
Epoch 7, Loss: 0.059974186098358286
Epoch 8, Loss: 0.05269342844820737
Epoch 9, Loss: 0.05859625888677935
Epoch 10, Loss: 0.0512810198799707
Epoch 11, Loss: 0.04574579782007883
Epoch 12, Loss: 0.04821821768845742
Epoch 13, Loss: 0.04356157996458933
Epoch 14, Loss: 0.047396373535351206
Epoch 15, Loss: 0.046754635632193335
Epoch 16, Loss: 0.04557840727890531
Epoch 17, Loss: 0.042627858206008874
Epoch 18, Loss: 0.040384229809666675
Epoch 19, Loss: 0.04228926615323871
Epoch 20, Loss: 0.040983844405661024
Epoch 21, Loss: 0.03712891456166593
Epoch 22, Loss: 0.04673700354081423
Epoch 23, Loss: 0.04506859319439779
Epoch 24, Loss: 0.03879142104958495
Epoch 25, Loss: 0.03961212448620548
Epoch 26, Loss: 0.03877459025476128
Epoch 27, Loss: 0.04246601503381195
Epoch 28, Loss: 0.0392243855431055

In [6]:
# Switch to inference behaviour before measuring accuracy: layers such as
# dropout or batch normalisation (if the model contains any) behave
# differently in training mode and would distort the result.
model.eval()

correct = 0
total = 0
with torch.no_grad():  # evaluation needs no gradient tracking
    for inputs, labels in test_loader:
        outputs = model(inputs)
        # Predicted class = index of the highest score along dimension 1.
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

# Fail with a clear message instead of a bare ZeroDivisionError below.
if total == 0:
    raise ValueError('test_loader yielded no samples; cannot compute accuracy')

# Integer (floored) percentage, as in the original report format.
print(f'Accuracy of the network on test files: {100 * correct // total} %')

Accuracy of the network on test files: 98 %


In [7]:
# Persist only the model's state_dict; the file is written relative to the
# current working directory.
weights = model.state_dict()
torch.save(weights, 'sound_classifier_weights.pth')
print('Model saved')

Model saved
