In [1]:
import torch
import numpy as np
import json
import os

import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.models as models
import scaleogram as scg

import pytest
import time
import torch.nn.functional as F
from scipy.io import wavfile

In [12]:
def get_signal(sound_path: str):
    """Load a WAV file from disk.

    Args:
        sound_path: Path of the WAV file to read.

    Returns:
        The ``(samplerate, samples)`` pair produced by ``scipy.io.wavfile.read``.
    """
    # wavfile.read already yields (rate, data) in the order callers expect.
    rate_and_samples = wavfile.read(sound_path)
    return rate_and_samples

def from_string_to_label(object_name:str):
    """Translate a class name into its integer label.

    Args:
        object_name: One of 'big_drone', 'bird', 'free_space', 'human',
            'small_copter'.

    Returns:
        The label index (0-4) for a known name, or -1 for any other value.
    """
    # Position in this tuple is the numeric label.
    class_names = ('big_drone', 'bird', 'free_space', 'human', 'small_copter')
    name_to_index = {name: index for index, name in enumerate(class_names)}

    return name_to_index.get(object_name, -1)

def convert_gray2rgb(image):
    """Replicate a 2-D grayscale array into three identical channels.

    Args:
        image: 2-D array; unpacking its shape fails for any other rank.

    Returns:
        A new ``uint8`` array of shape ``(rows, cols, 3)`` whose three
        channels each hold a copy of ``image``.
    """
    rows, cols = image.shape
    rgb = np.empty((rows, cols, 3), dtype=np.uint8)
    # Broadcast the single plane across the trailing channel axis.
    rgb[...] = image[:, :, np.newaxis]

    return rgb

def normalize_scaleogram(coefs):
    """Min-max scale coefficients to the 0-255 range as an 8-bit image.

    The minimum of ``coefs`` maps to 0 and the maximum to 255; values in
    between are scaled linearly and truncated toward zero.

    Args:
        coefs: Array-like of real-valued coefficients.

    Returns:
        ``uint8`` array with the same shape as ``coefs``. When every element
        of ``coefs`` is equal, an all-zero array is returned.
    """
    coefs = np.asarray(coefs)
    min_coefs = np.min(coefs)
    value_range = np.max(coefs) - min_coefs

    # A constant input would otherwise divide 0 by 0 and yield NaNs.
    if value_range == 0:
        return np.zeros(coefs.shape, dtype=np.uint8)

    scaled = ((coefs - min_coefs) / value_range) * 255

    # Cast straight to uint8: values up to 255 do not fit in int8, so going
    # through int8 relied on platform-dependent wrap-around.
    normalized_image = scaled.astype(np.uint8)

    return normalized_image

def load_json(file_path):
    """Read a JSON document from ``file_path`` and return the parsed object."""
    with open(file_path, 'r') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def get_scaleogram(sound_path, spectrum=None, wavelet=None, scales=None):
    """Compute the continuous wavelet transform of a WAV file.

    Args:
        sound_path: Path of the WAV file, read through ``get_signal``.
        spectrum: Which view of the coefficients to return: ``'amp'``
            (absolute value), ``'real'`` or ``'imag'``. Any other value
            returns the coefficients unchanged.
        wavelet: Wavelet passed on to ``scg.periods2scales`` and ``scg.CWT``.
        scales: Scales for the transform. When ``None`` or empty, scales are
            derived from periods log-spaced between 2 and 1000.

    Returns:
        Tuple ``(coefs, scales_freq)`` taken from the ``scg.CWT`` result,
        with ``coefs`` transformed according to ``spectrum``.
    """
    sample_rate, signal = get_signal(sound_path)

    # Compare against None / emptiness explicitly: `not scales` raises
    # ValueError for an ndarray holding more than one element.
    if scales is None or np.size(scales) == 0:
        scales = scg.periods2scales(np.logspace(np.log10(2), np.log10(1000)), wavelet)

    # Duration is sample count divided by sample rate.
    signal_length = signal.shape[0] / sample_rate
    # Named time_axis so the imported `time` module is not shadowed.
    time_axis = np.linspace(0, signal_length, signal.shape[0])
    cwt = scg.CWT(time=time_axis, signal=signal, scales=scales, wavelet=wavelet)

    if spectrum == 'amp':
        return np.abs(cwt.coefs), cwt.scales_freq
    elif spectrum == 'real':
        return np.real(cwt.coefs), cwt.scales_freq
    elif spectrum == 'imag':
        return np.imag(cwt.coefs), cwt.scales_freq
    return cwt.coefs, cwt.scales_freq

In [13]:
# Preprocessing applied to each scaleogram image before it is fed to the model:
# only a conversion to a torch tensor, with no resizing or normalisation.
to_tensor_step = transforms.ToTensor()
transform = transforms.Compose([to_tensor_step])

In [16]:
num_classes = 5

# Prefer the second GPU as the notebook originally did, but fall back to the
# first GPU or the CPU so the cell also runs on machines without "cuda:1".
if torch.cuda.is_available():
    device = torch.device("cuda:1" if torch.cuda.device_count() > 1 else "cuda:0")
else:
    device = torch.device("cpu")

# No ImageNet download needed: load_state_dict below (strict by default)
# replaces every parameter and buffer with the values from the checkpoint.
model = models.resnet18(pretrained=False)
model.fc = nn.Linear(model.fc.in_features, num_classes)

# map_location='cpu' lets the checkpoint load regardless of the device it was
# saved from; the model is moved to `device` afterwards.
# NOTE: torch.load unpickles the file - only load checkpoints you trust.
model.load_state_dict(torch.load('resnet18_cmor_model.pth', map_location='cpu'))

model.to(device)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [17]:
# Wavelet name handed to the scaleogram computation; presumably PyWavelets'
# "cmorB-C" complex Morlet notation (bandwidth 1.2, centre frequency 3).
wavelet = "cmor1.2-3"

def test_audio(model, sound_path, transform, wavelet):
    """Classify one audio file from its wavelet scaleogram.

    The amplitude scaleogram is cropped, subsampled, normalised to 8 bits,
    expanded to three channels and passed through ``model``. The time spent
    building the scaleogram image is printed.

    Args:
        model: Torch module used for inference; it is switched to eval mode.
        sound_path: Path of the WAV file to classify.
        transform: Callable turning the H x W x 3 ``uint8`` image into a
            tensor (a batch dimension is added afterwards).
        wavelet: Wavelet forwarded to ``get_scaleogram``.

    Returns:
        Tuple ``(predicted_class, probabilities)``: the index of the largest
        model output and the softmax of the outputs as a NumPy array.
    """
    start = time.time()
    coefs, _ = get_scaleogram(sound_path, spectrum='amp', wavelet=wavelet)
    # Skip the first quarter of the rows and keep every 96th column, truncated
    # to integers (rows/columns are presumably scales/time samples - confirm
    # against the layout returned by scaleogram's CWT).
    coefs = np.int32(coefs[len(coefs) // 4:, ::96])
    coefs = normalize_scaleogram(coefs)
    coefs_rgb = convert_gray2rgb(coefs)
    end = time.time()
    print("\nCalculating scaleogram for: {:.3f} seconds".format(end - start))
    coefs_tensor = transform(coefs_rgb).unsqueeze(0)

    # Send the input to the device the model actually lives on instead of
    # relying on a notebook-global `device` variable.
    model_device = next(model.parameters()).device

    model.eval()

    with torch.no_grad():
        outputs = model(coefs_tensor.to(model_device))
        probabilities = F.softmax(outputs, dim=1)

    _, predicted = torch.max(outputs, 1)
    predicted_class = predicted.item()

    return predicted_class, probabilities.squeeze().cpu().numpy()

In [18]:
# Run the classifier on every file in ./test and report prediction and timing.
test_data = os.path.join(os.getcwd(), 'test')
# os.listdir returns entries in arbitrary order; sort so the printed results
# come out in the same order on every run.
for sound_data in sorted(os.listdir(test_data)):
    sound_path = os.path.join(test_data, sound_data)
    # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints), which cannot
    # be read as audio.
    if not os.path.isfile(sound_path):
        continue
    start = time.time()
    pred_class, probabilities = test_audio(model, sound_path, transform, wavelet)
    end = time.time()
    
    print(pred_class, probabilities)
    print("\nProcessing for: {:.3f} seconds".format(end - start))




Calculating scaleogram for: 0.151 seconds
1 [4.9500555e-09 1.0000000e+00 3.6139938e-14 2.1430164e-08 4.0605416e-10]

Processing for: 0.158 seconds

Calculating scaleogram for: 0.119 seconds
4 [6.7559633e-08 3.5100004e-07 1.3896968e-10 4.2742151e-05 9.9995673e-01]

Processing for: 0.126 seconds

Calculating scaleogram for: 0.119 seconds
1 [5.0333835e-04 9.6132636e-01 4.4731816e-04 3.7719853e-02 3.1667378e-06]

Processing for: 0.126 seconds

Calculating scaleogram for: 0.121 seconds
3 [2.3857026e-06 1.0635946e-06 2.9985259e-08 9.9999559e-01 8.8750954e-07]

Processing for: 0.130 seconds

Calculating scaleogram for: 0.120 seconds
3 [2.6499245e-07 4.7847095e-08 3.7782797e-08 9.9999964e-01 2.9120978e-08]

Processing for: 0.127 seconds

Calculating scaleogram for: 0.118 seconds
2 [8.8950305e-09 5.9533656e-11 9.9999237e-01 1.6093721e-10 7.6353372e-06]

Processing for: 0.125 seconds

Calculating scaleogram for: 0.117 seconds
3 [4.2261627e-07 1.7533871e-08 4.2247277e-09 9.9999952e-01 1.5262950e