In [1]:
from torchaudio.datasets import LIBRISPEECH
import torchaudio
from IPython.display import Audio, display
import pyroomacoustics as pra
from pyroomacoustics.doa import spher2cart, circ_dist
import numpy as np
from typing import Tuple
from torch import Tensor
import torch
import random
from tqdm import tqdm
from librosa.effects import split
import cfg
from data import remove_silence, LibriSpeechLocationsDataset, RoomSimulator, cartesian_to_polar


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# create datasets
print(f"len(source_locs_train): {len(cfg.source_locs)}")
dataset = LibriSpeechLocationsDataset(cfg.source_locs, split="test-clean")
print(f"len(dataset): {len(dataset)}")


len(source_locs_train): 2620
len(dataset): 2620


In [3]:
# # Get a sample of created dataset
# print('Total data set size: ' + str(len(dataset))) 
# (waveform, sample_rate, transcript, speaker_id, utterance_number), pos, seed = dataset[0]
# transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=cfg.fs)
# transformed = transform(waveform)
# # write code to play transformed with Audio
# Audio(transformed.numpy(), rate=cfg.fs)

In [4]:
# remove silence and keep only waveforms longer than MIN_SIG_LEN seconds
valid_idx = [i if len(remove_silence(waveform, frame_length=cfg.sig_len)) > cfg.fs * cfg.MIN_SIG_LEN else None for i, ((waveform, sample_rate,transcript, speaker_id, utterance_number), pos, seed) in enumerate(dataset)]
inds = [i for i in valid_idx if i is not None]

In [5]:
print(f"len(valid_idx): {len(valid_idx)} -  valid_idx: {valid_idx}")
print(f"len(inds): {len(inds)} - valid inds: {inds}")

len(valid_idx): 2620 -  valid_idx: [0, 1, 2, None, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, None, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, None, 39, 40, None, 42, 43, 44, 45, 46, 47, None, 49, 50, 51, 52, None, 54, 55, None, 57, 58, 59, 60, 61, None, 63, 64, 65, 66, 67, 68, 69, None, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, None, 95, 96, 97, 98, None, 100, 101, 102, 103, 104, 105, None, 107, 108, 109, 110, 111, 112, 113, None, 115, 116, 117, 118, 119, 120, 121, None, None, 124, None, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, None, 139, 140, 141, 142, None, 144, 145, 146, 147, None, 149, None, 151, 152, 153, 154, None, 156, 157, 158, 159, 160, 161, 162, 163, None, 165, 166, None, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, None, 203, 204, 205, 206, 207

In [6]:

dataset = torch.utils.data.dataset.Subset(dataset, inds)
print('Total data set size after removing silence: ' + str(len(dataset)))


Total data set size after removing silence: 2296


In [7]:
# (waveform, sample_rate, transcript, speaker_id, utterance_number), pos, seed = dataset[0]
# transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=cfg.fs)
# transformed = transform(waveform)
# print(f"waveform.shape: {waveform.shape}")
# print(f"sample_rate: {sample_rate}")
# print(f"transcript: {transcript}")
# print(f"speaker_id: {speaker_id}")
# print(f"utterance_number: {utterance_number}")
# print(f"pos: {pos}")
# print(f"seed: {seed}")
# # write code to play transformed with Audio
# Audio(transformed.numpy(), rate=cfg.fs)
# print(f"transformed.shape: {transformed.shape}")


In [8]:
train_loader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True, pin_memory=False)


In [9]:
# pbar_update = cfg.batch_size
# with tqdm(total=len(dataset)) as pbar:
#     for batch_idx, sample in enumerate(train_loader):
#         (waveform, sample_rate, transcript, speaker_id, utterance_number), pos, seed = sample
#         print(f"batch_idx: {batch_idx}")
#         print(f"waveform.shape: {waveform.shape}")
#         print(f"sample_rate: {sample_rate}")
#         print(f"transcript: {transcript}")
#         print(f"speaker_id: {speaker_id}")
#         print(f"utterance_number: {utterance_number}")
#         print(f"pos: {pos}")
#         print(f"seed: {seed}")
#         pbar.update(pbar_update)

In [10]:

room_simulator = RoomSimulator()



In [11]:
dataset_loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=cfg.batch_size,
    shuffle=True,
    collate_fn=room_simulator,
)


In [24]:
# with tqdm(total=len(dataset_loader)) as pbar:
for batch_idx, (microphone_signals, source_locs, mic_locs) in enumerate(dataset_loader):
        print(f"batch_idx: {batch_idx}")
        print(f"source_locs.shape: {source_locs[0].shape}")
        source_locs = source_locs[0].squeeze(0)
        print(f"source_locs: {source_locs}")
        print("-----------------")
        print(f"microphone_signals.shape: {microphone_signals.shape}")
        microphone_signals = microphone_signals.squeeze(0).squeeze(0)
        print(f"microphone_signals.shape: {microphone_signals.shape}")
        print(f"mic_locs: {mic_locs}")
        print(f"mic_locs.shape: {mic_locs.shape}")
        Audio(microphone_signals[1].numpy(), rate=cfg.fs)
        display(Audio(microphone_signals[0].numpy(), rate=cfg.fs))
        display(Audio(microphone_signals[1].numpy(), rate=cfg.fs))

        microphone_signals = microphone_signals.numpy().T
        print(f"----- microphone_signals.shape: {microphone_signals.shape}")
        X = pra.transform.stft.analysis(microphone_signals, cfg.nfft, cfg.nfft // 2)
        print(f"X.shape: {X.shape}")
        X = X.transpose([2, 1, 0])

        # algo_names = ['SRP', 'MUSIC', 'FRIDA', 'TOPS']
        algo_names = ['SRP', 'MUSIC', 'TOPS','NormMUSIC', 'WAVES']

        spatial_resp = dict()

        # Conversione alle coordinate polari
        r, theta, phi = cartesian_to_polar(source_locs[0] - (cfg.dx/2), source_locs[1] - (cfg.dy/2), source_locs[2]-(cfg.dz/2))
        print(f"theta:{theta}")

        azimuth = [theta]


        # loop through algos
        for algo_name in algo_names:
                # Construct the new DOA object
                # the max_four parameter is necessary for FRIDA only
                doa = pra.doa.algorithms[algo_name](mic_locs, cfg.fs, cfg.nfft, c=cfg.c, num_src=1, max_four=4)

                # this call here perform localization on the frames in X
                doa.locate_sources(X, freq_range=cfg.freq_range)
                
                # store spatial response
                if algo_name == 'FRIDA':
                        spatial_resp[algo_name] = np.abs(doa._gen_dirty_img())
                else:
                        spatial_resp[algo_name] = doa.grid.values
                        
                # normalize   
                min_val = spatial_resp[algo_name].min()
                max_val = spatial_resp[algo_name].max()
                spatial_resp[algo_name] = (spatial_resp[algo_name] - min_val) / (max_val - min_val)
                print(algo_name)
                print("  Recovered azimuth:", doa.azimuth_recon / np.pi * 180.0, "degrees")
                print("  Real azimuth:", theta / np.pi * 180.0, "degrees")
                print("  Error:", circ_dist(theta, doa.azimuth_recon) / np.pi * 180.0, "degrees")
        break

[[1.4  1.6 ]
 [1.25 1.25]
 [1.8  1.8 ]]
batch_idx: 0
source_locs.shape: torch.Size([1, 3])
source_locs: tensor([1.7766, 2.1747, 0.2195])
-----------------
microphone_signals.shape: torch.Size([1, 1, 2, 64972])
microphone_signals.shape: torch.Size([2, 64972])
mic_locs: [[1.4  1.6 ]
 [1.25 1.25]
 [1.8  1.8 ]]
mic_locs.shape: (3, 2)


----- microphone_signals.shape: (64972, 2)
X.shape: (508, 129, 2)
theta:1.2801533937454224
SRP
  Recovered azimuth: [264.] degrees
  Real azimuth: tensor(73.3474) degrees
  Error: tensor([169.3474], dtype=torch.float64) degrees
MUSIC
  Recovered azimuth: [22.] degrees
  Real azimuth: tensor(73.3474) degrees
  Error: tensor([51.3474], dtype=torch.float64) degrees
TOPS
  Recovered azimuth: [77.] degrees
  Real azimuth: tensor(73.3474) degrees
  Error: tensor([3.6526], dtype=torch.float64) degrees
NormMUSIC
  Recovered azimuth: [278.] degrees
  Real azimuth: tensor(73.3474) degrees
  Error: tensor([155.3474], dtype=torch.float64) degrees
WAVES
  Recovered azimuth: [110.] degrees
  Real azimuth: tensor(73.3474) degrees
  Error: tensor([36.6526], dtype=torch.float64) degrees
