<a href="https://colab.research.google.com/github/unicamp-dl/IA025_2022S1/blob/main/Final_project/Karen_Rosero/preprocessing_datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sound classification and localization using transformers
## Notebook for preprocessing the ANSYN, REAL, and L3DAS21 datasets

### Author: Karen Rosero

O objetivo deste notebook é mudar a frequência de amostragem dos áudios para 16000 Hz, visando a compatibilidade com o modelo pré-treinado wav2vec2. Também é modificado o formato das etiquetas: os tempos de início e de fim de cada evento sonoro são convertidos para índices de amostras.
Os sons contidos em cada áudio são separados e armazenados em diferentes arquivos.
Os diretórios dos arquivos devem ser modificados para a correta utilização deste notebook.

Importo bibliotecas

In [None]:
import os 
import numpy as np
import shutil
import torch
import torchaudio
import torchaudio.functional as F
import torchaudio.transforms as T
import csv

## 1. Pré-processando a base de dados ANSYN

A base de dados *TUT Sound Events 2018 - Ambisonic, Anechoic and Synthetic Impulse Response Dataset (ANSYN)* pode ser obtida no link: https://zenodo.org/record/1237703#.YthZoPvQ9Qw

Declaração de variáveis

In [None]:
# ---------------------------------------------------------------------------
# ANSYN configuration
# ---------------------------------------------------------------------------

# Folder listed by the main loop to find the source recordings.
wav_fold = '/home/lab_acustica/Documentos/ANSYN_Dataset/ov1_split3/wav_ov1_split3_30db/'

# Folder with the description (label) CSV that matches each recording.
label_fold = '/home/lab_acustica/Documentos/ANSYN_Dataset/ov1_split3/desc_ov1_split3/'

# Destination folder for the files written by the main loop.
new_path = '/home/lab_acustica/Documentos/ANSYN_Dataset/wav_separate_sounds_ov1/des_ov1s3/'

# Target sampling rate in Hz.
# NOTE(review): nothing in the visible code reads this name; the processing
# code uses the literal 16000 instead.
tg_sr = 16000

# Sound-event class name -> class code. The codes are kept as two-character
# strings because they are also embedded in the output file names.
_unique_classes = {
    'clearthroat': '02',
    'cough': '08',
    'doorslam': '09',
    'drawer': '01',
    'keyboard': '06',
    'keysDrop': '04',
    'knock': '00',
    'laughter': '10',
    'pageturn': '07',
    'phone': '03',
    'speech': '05',
}

Funções de processamento

In [None]:
def _read_desc_file(desc_filename, sample_rate=16000):
    """Parse an ANSYN description CSV into per-event label lists.

    The first line is treated as a header and skipped. Every following
    non-blank line is split on commas and read positionally as
    ``file name, start time, end time, elevation, azimuth``.

    Args:
        desc_filename: Path of the description CSV file.
        sample_rate: Rate in Hz used to turn the start/end times into sample
            indices (times are presumably in seconds - confirm against the
            dataset documentation). Defaults to 16000.

    Returns:
        dict with equally long lists under the keys ``'class'`` (code looked
        up in ``_unique_classes``), ``'start'`` (sample index, rounded down),
        ``'end'`` (sample index, rounded up), ``'ele'`` and ``'azi'`` (ints).

    Raises:
        KeyError: If the class name derived from a row is not a key of
            ``_unique_classes``.
        ValueError: If a numeric column cannot be parsed.
    """
    desc_file = {'class': [], 'start': [], 'end': [], 'ele': [], 'azi': []}

    # ``with`` guarantees the handle is closed even when a row fails to parse.
    with open(desc_filename, 'r') as fid:
        next(fid, None)  # skip the header; an empty file yields no events
        for line in fid:
            line = line.strip()
            if not line:
                # Tolerate blank (e.g. trailing) lines instead of crashing.
                continue
            fields = line.split(',')
            # Class name = first column up to its first '.', minus the last
            # three characters of that stem.
            class_name = fields[0].split('.')[0][:-3]
            desc_file['class'].append(_unique_classes[class_name])
            desc_file['start'].append(int(np.floor(float(fields[1]) * sample_rate)))
            desc_file['end'].append(int(np.ceil(float(fields[2]) * sample_rate)))
            desc_file['ele'].append(int(fields[3]))
            desc_file['azi'].append(int(fields[4]))

    return desc_file

def process_audio(audio_path, desc_file, target_sr=16000):
    """Load a recording, resample it and cut out one segment per event.

    Args:
        audio_path: Path of the audio file, loaded with ``torchaudio.load``.
        desc_file: Dict holding equally long ``'start'`` and ``'end'`` lists
            of sample indices, expressed at the resampled rate.
        target_sr: Sampling rate in Hz the audio is resampled to before it is
            sliced. Defaults to 16000.

    Returns:
        list with one tensor per event, each being ``audio[:, start:end]`` of
        the resampled signal (the first axis is kept whole).
    """
    audio, sr = torchaudio.load(audio_path)
    audio = T.Resample(sr, target_sr)(audio)
    # Slice only the last axis so every segment keeps all rows of the tensor.
    return [
        audio[:, start:end]
        for start, end in zip(desc_file['start'], desc_file['end'])
    ]

Loop principal de processamento. Salva os arquivos de áudio e as etiquetas no novo formato e com novos nomes.

In [None]:
def _ansyn_name_prefix(file):
    """Return the ``'tst_NNN'`` / ``'tra_NNN'`` prefix for *file*, or None.

    The recording index is read from a fixed offset in the file name and its
    width is deduced from the total name length; it is left-padded with
    zeros to three characters. Names that match no rule return None.
    """
    # (substring selecting the split, output tag, index offset, accepted
    #  name lengths for 1, 2, ... index characters). 'test' is checked first.
    rules = (
        ('test', 'tst', 5, (22, 23)),
        ('train', 'tra', 6, (23, 24, 25)),
    )
    for marker, tag, offset, lengths in rules:
        if marker in file and len(file) in lengths:
            width = len(file) - lengths[0] + 1
            return tag + '_' + file[offset:offset + width].rjust(3, '0')
    return None


# Split every selected recording into one audio file and one label file per
# sound event.
for file in os.listdir(wav_fold):
    prefix = _ansyn_name_prefix(file)
    if prefix is None:
        continue

    # The label file has the recording's name with its last three characters
    # replaced by 'csv'.
    dicc_labels = _read_desc_file(label_fold + file[:-3] + "csv")
    signal_list = process_audio(wav_fold + file, dicc_labels)

    for i, signal in enumerate(signal_list):
        class_code = dicc_labels["class"][i]
        new_name = prefix + '_' + str(i) + '_' + "ov1" + '_s3_' + str(class_code)

        # The explicit '.wav' extension lets torchaudio infer the output
        # format, and every selected recording now gets its audio saved.
        # NOTE(review): audio and labels share `new_path` here - confirm
        # whether a separate audio folder is wanted.
        torchaudio.save(new_path + new_name + '.wav', signal, 16000)

        with open(new_path + new_name + '.csv', 'w') as _fid:
            _fid.write('{},{},{}\n'.format(
                int(class_code),
                int(dicc_labels["ele"][i]),
                int(dicc_labels["azi"][i]),
            ))

## 2. Pré-processando a base de dados REAL

A base de dados *TUT Sound Events 2018 - Ambisonic, Reverberant and Real-life Impulse Response Dataset (REAL)* pode ser obtida no link: https://zenodo.org/record/1237793#.YthaOPvQ9Qw

Declaração de variáveis

In [None]:
# ---------------------------------------------------------------------------
# REAL configuration
# ---------------------------------------------------------------------------

# Folder listed by the main loop to find the source recordings.
wav_fold = '/home/lab_acustica/Documentos/REAL_Dataset/ov1_split1/wav_ov1_split1_30db/'

# Folder with the description (label) CSV that matches each recording.
label_fold = '/home/lab_acustica/Documentos/REAL_Dataset/ov1_split1/desc_ov1_split1/'

# Destination folder for the per-event label CSV files.
new_path = '/home/lab_acustica/Documentos/REAL_Dataset/wav_separate_sounds_ov1/des_ov1s1/'

# Destination folder for the per-event '.wav' files.
wav_new_path = '/home/lab_acustica/Documentos/REAL_Dataset/wav_separate_sounds_ov1/wav_ov1s1/'

# Target sampling rate in Hz.
# NOTE(review): nothing in the visible code reads this name; the processing
# code uses the literal 16000 instead.
tg_sr = 16000

# Class identifier found in the event file name -> class index.
_unique_classes = {
    '1': 0,
    '3': 1,
    '4': 2,
    '5': 3,
    '6': 4,
    '7': 5,
    '8': 6,
    '9': 7,
}

Funções de processamento

In [None]:
def _read_desc_file(desc_filename, sample_rate=16000):
    """Parse a REAL description CSV into per-event label lists.

    The first line is treated as a header and skipped. Every following
    non-blank line is split on commas and read positionally as
    ``file name, start time, end time, elevation, azimuth``.

    Args:
        desc_filename: Path of the description CSV file.
        sample_rate: Rate in Hz used to turn the start/end times into sample
            indices (times are presumably in seconds - confirm against the
            dataset documentation). Defaults to 16000.

    Returns:
        dict with equally long lists under the keys ``'class'`` (index looked
        up in ``_unique_classes``), ``'start'`` (sample index, rounded down),
        ``'end'`` (sample index, rounded up, plus one), ``'ele'`` and
        ``'azi'`` (ints).

    Raises:
        KeyError: If the class identifier derived from a row is not a key of
            ``_unique_classes``.
        IndexError: If the file name in a row contains no ``'-'``.
        ValueError: If a numeric column cannot be parsed.
    """
    desc_file = {'class': [], 'start': [], 'end': [], 'ele': [], 'azi': []}

    # ``with`` guarantees the handle is closed even when a row fails to parse.
    with open(desc_filename, 'r') as fid:
        next(fid, None)  # skip the header; an empty file yields no events
        for line in fid:
            line = line.strip()
            if not line:
                # Tolerate blank (e.g. trailing) lines instead of crashing.
                continue
            fields = line.split(',')
            # Class identifier = second '-'-separated token of the first
            # column, taken up to its first '.'.
            class_id = fields[0].split('.')[0].split('-')[1]
            desc_file['class'].append(_unique_classes[class_id])
            desc_file['start'].append(int(np.floor(float(fields[1]) * sample_rate)))
            # One extra sample is added on top of the rounded-up end index.
            desc_file['end'].append(int(np.ceil(float(fields[2]) * sample_rate)) + 1)
            desc_file['ele'].append(int(fields[3]))
            desc_file['azi'].append(int(fields[4]))

    return desc_file

def process_audio(audio_path, desc_file, target_sr=16000):
    """Load a recording, resample it and return one slice per labelled event.

    Args:
        audio_path: Path of the audio file, loaded with ``torchaudio.load``.
        desc_file: Dict holding equally long ``'start'`` and ``'end'`` lists
            of sample indices, expressed at the resampled rate.
        target_sr: Sampling rate in Hz the audio is resampled to before it is
            sliced. Defaults to 16000.

    Returns:
        list with one tensor per event, each being ``audio[:, start:end]`` of
        the resampled signal (the first axis is kept whole).
    """
    audio, sr = torchaudio.load(audio_path)
    resampled = T.Resample(sr, target_sr)(audio)
    bounds = zip(desc_file['start'], desc_file['end'])
    # Only the last axis is sliced; every row of the tensor is kept.
    return [resampled[:, begin:stop] for begin, stop in bounds]

Loop principal de processamento. Salva os arquivos de áudio e as etiquetas no novo formato e com novos nomes.

In [None]:
def _real_name_prefix(file):
    """Return the ``'tst_NNN'`` / ``'tra_NNN'`` prefix for *file*, or None.

    The recording index is read from a fixed offset in the file name and its
    width is deduced from the total name length; it is left-padded with
    zeros to three characters. Names that match no rule return None.
    """
    # (substring selecting the split, output tag, index offset, accepted
    #  name lengths for 1, 2, ... index characters). 'test' is checked first.
    rules = (
        ('test', 'tst', 5, (22, 23)),
        ('train', 'tra', 6, (23, 24, 25)),
    )
    for marker, tag, offset, lengths in rules:
        if marker in file and len(file) in lengths:
            width = len(file) - lengths[0] + 1
            return tag + '_' + file[offset:offset + width].rjust(3, '0')
    return None


# Split every selected recording into one label CSV (in `new_path`) and one
# '.wav' file (in `wav_new_path`) per sound event.
for file in os.listdir(wav_fold):
    prefix = _real_name_prefix(file)
    if prefix is None:
        continue

    # The label file has the recording's name with its last three characters
    # replaced by 'csv'.
    dicc_labels = _read_desc_file(label_fold + file[:-3] + "csv")
    signal_list = process_audio(wav_fold + file, dicc_labels)

    for i, signal in enumerate(signal_list):
        class_idx = dicc_labels["class"][i]
        new_name = prefix + '_' + str(i) + '_' + "ov1" + '_s1_' + str(class_idx)

        with open(new_path + new_name + '.csv', 'w') as _fid:
            _fid.write('{},{},{}\n'.format(
                int(class_idx),
                int(dicc_labels["ele"][i]),
                int(dicc_labels["azi"][i]),
            ))

        # Audio goes only to `wav_new_path`, with an explicit extension so
        # torchaudio can infer the format. The extra extension-less save into
        # `new_path` that some branches used to perform has been dropped.
        torchaudio.save(wav_new_path + new_name + '.wav', signal, 16000)

## 3. Pré-processando a base de dados L3DAS21

A base de dados *L3DAS21: MACHINE LEARNING FOR 3D AUDIO SIGNAL PROCESSING Task for 3D Sound Event Localization and Detection* pode ser obtida no link: https://zenodo.org/record/4642005#.YthaZfvQ9Qw

Declaração de variáveis

In [None]:
# ---------------------------------------------------------------------------
# L3DAS21 configuration
# ---------------------------------------------------------------------------

# Dataset root; the main loop lists its sub-folders.
dataset_fold = '/content/drive/MyDrive/Especial/IA025/Projeto_IA025/L3DAS21_Dataset/'

# Output root; the main loop writes into its 'wav_files/' and 'des_files/'
# sub-folders.
new_path = '/content/drive/MyDrive/Especial/IA025/Projeto_IA025/L3DAS21_Dataset/L3DAS21_separate_sounds_ov1/'

# Target sampling rate in Hz.
# NOTE(review): nothing in the visible code reads this name; the processing
# code uses the literal 16000 instead.
tg_sr = 16000

# Sound-event class name -> class index.
_unique_classes = {
    'Chink_and_clink': 0,
    'Computer_keyboard': 1,
    'Cupboard_open_or_close': 2,
    'Drawer_open_or_close': 3,
    'Female_speech_and_woman_speaking': 4,
    'Finger_snapping': 5,
    'Keys_jangling': 6,
    'Knock': 7,
    'Laughter': 8,
    'Male_speech_and_man_speaking': 9,
    'Printer': 10,
    'Scissors': 11,
    'Telephone': 12,
    'Writing': 13,
}

Funções de processamento

In [None]:
def _read_desc_file(desc_filename, sample_rate=16000):
    """Parse an L3DAS21 label CSV into per-event label lists.

    The first line is treated as a header and skipped. Every following
    non-blank line is split on commas; column 1 and 2 are the start and end
    times, column 3 the class name and columns 4-6 the x, y, z coordinates
    (column 0 is not used).

    Args:
        desc_filename: Path of the label CSV file.
        sample_rate: Rate in Hz used to turn the start/end times into sample
            indices (times are presumably in seconds - confirm against the
            dataset documentation). Defaults to 16000.

    Returns:
        dict with equally long lists under the keys ``'class'`` (index looked
        up in ``_unique_classes``), ``'start'`` (sample index, rounded down),
        ``'end'`` (sample index, rounded up), ``'x'``, ``'y'`` and ``'z'``
        (floats).

    Raises:
        KeyError: If a class name is not a key of ``_unique_classes``.
        ValueError: If a numeric column cannot be parsed.
    """
    desc_file = {
        'class': [], 'start': [], 'end': [], 'x': [], 'y': [], 'z': []
    }

    # ``with`` guarantees the handle is closed even when a row fails to parse.
    with open(desc_filename, 'r') as fid:
        next(fid, None)  # skip the header; an empty file yields no events
        for line in fid:
            line = line.strip()
            if not line:
                # Tolerate blank (e.g. trailing) lines instead of crashing.
                continue
            fields = line.split(',')
            desc_file['class'].append(_unique_classes[fields[3]])
            desc_file['start'].append(int(np.floor(float(fields[1]) * sample_rate)))
            desc_file['end'].append(int(np.ceil(float(fields[2]) * sample_rate)))
            desc_file['x'].append(float(fields[4]))
            desc_file['y'].append(float(fields[5]))
            desc_file['z'].append(float(fields[6]))

    return desc_file

def process_audio(audio_path, desc_file, target_sr=16000):
    """Load a recording, resample it and extract each labelled event.

    Args:
        audio_path: Path of the audio file, loaded with ``torchaudio.load``.
        desc_file: Dict holding equally long ``'start'`` and ``'end'`` lists
            of sample indices, expressed at the resampled rate.
        target_sr: Sampling rate in Hz the audio is resampled to before it is
            sliced. Defaults to 16000.

    Returns:
        list with one tensor per event, each being ``audio[:, start:end]`` of
        the resampled signal (the first axis is kept whole).
    """
    waveform, sr = torchaudio.load(audio_path)
    waveform = T.Resample(sr, target_sr)(waveform)
    # Pair each start with its end and slice along the last axis only.
    return [
        waveform[:, first:last]
        for first, last in zip(desc_file['start'], desc_file['end'])
    ]


Loop principal de processamento. Salva os arquivos de áudio e as etiquetas no novo formato e com novos nomes.

In [None]:
# Split every L3DAS21 recording into one '.wav' file and one label CSV per
# sound event. Folders whose name contains 'dev' produce 'tst_*' files,
# folders whose name contains 'train' produce 'tra_*' files; any other entry
# of `dataset_fold` is ignored.
for fold in os.listdir(dataset_fold):
    if 'dev' in fold:
        tag = 'tst'
    elif 'train' in fold:
        tag = 'tra'
    else:
        continue

    labels_dir = dataset_fold + fold + '/labels/'
    for filename in os.listdir(labels_dir):
        # Label name without its first six and last four characters; the
        # audio file is this stem followed by '_A.wav'.
        stem = filename[6:-4]
        if len(stem) not in (12, 13):
            # Only stems of 12 or 13 characters (1- or 2-character recording
            # index) ever produced output, so skip the rest before reading
            # the labels or loading any audio.
            continue

        dicc_labels = _read_desc_file(labels_dir + filename)
        wav_name = dataset_fold + fold + '/data/' + stem + '_A.wav'
        signal_list = process_audio(wav_name, dicc_labels)

        # Recording index = last 1 or 2 characters of the stem, left-padded
        # with zeros to two characters.
        index_width = len(stem) - 11
        prefix = tag + '_' + stem[-index_width:].rjust(2, '0')
        # The characters at positions 15 and 11 of the label file name are
        # used as the 'ov' and 's' fields of the new name.
        suffix = '_' + "ov" + filename[15] + '_s' + filename[11]

        for i, signal in enumerate(signal_list):
            new_name = prefix + '_' + str(i) + suffix
            print(new_name)
            torchaudio.save(new_path + 'wav_files/' + new_name + '.wav', signal, 16000)
            with open(new_path + 'des_files/' + new_name + '.csv', 'w') as _fid:
                _fid.write('{},{},{},{}\n'.format(
                    int(dicc_labels["class"][i]),
                    dicc_labels["x"][i],
                    dicc_labels["y"][i],
                    dicc_labels["z"][i],
                ))