<a href="https://colab.research.google.com/github/unicamp-dl/IA025_2022S1/blob/main/Final_project/Karen_Rosero/wav2vec2_Preprocess_ANSYN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sound detection and classification using transformers
## Data processing notebook

## Projeto final da disciplina IA025
## Autora: Karen Rosero

Importo bibliotecas

In [None]:
import os 
import matplotlib.pyplot as plt
import random
import numpy as np
import pandas as pd
import shutil
import torch
import torchaudio
import torchaudio.functional as F
import torchaudio.transforms as T
from scipy.io.wavfile import write
from sklearn.model_selection import train_test_split
import csv
import librosa 

## 1. Pré-processando a base de dados ANSYN

#### O objetivo deste processo é mudar a frequência de amostragem de 44100Hz para 16000Hz, visando a compatibilidade com o modelo pré-treinado wav2vec2. Também é modificado o formato das etiquetas, que indicava o tempo de começo e de finalização de cada evento de som, para um formato baseado em quadros (frames). Tanto os áudios como as etiquetas foram fixados em 30s, seja com padding ou cortando amostras adicionais.

In [None]:
# Root folder of the source dataset; the conversion cell below scans it for
# sub-folders whose names contain 'wav' (audio) or 'desc' (annotations).
path_fold = '/home/lab_acustica/Documentos/ANSYN_Pilot'

Funções para modificar os sinais de áudio

In [None]:
def process_audio(signal, sr, new_sr, max_len_s=30):
    """Resample a waveform and force it to a fixed duration.

    Args:
        signal: 2-D tensor indexed as ``[channel, sample]`` (the second
            dimension is the one that is cut or padded).
        sr: sampling rate of ``signal`` in Hz.
        new_sr: sampling rate of the returned waveform in Hz.
        max_len_s: duration of the returned waveform in seconds
            (defaults to the 30 s used throughout this notebook).

    Returns:
        Tensor with ``new_sr * max_len_s`` samples per channel: longer inputs
        are truncated, shorter inputs are right-padded with zeros.
    """
    # Resampling to the same rate is a no-op, so skip building the kernel.
    if sr != new_sr:
        signal = T.Resample(sr, new_sr)(signal)

    target_len = int(round(new_sr * max_len_s))
    n_samples = signal.shape[1]
    if n_samples > target_len:
        # Cut the extra samples at the end.
        signal = signal[:, :target_len]
    elif n_samples < target_len:
        # Right-pad the last dimension with zeros.
        signal = torch.nn.functional.pad(signal, (0, target_len - n_samples))
    return signal

Funções para modificar o formato das etiquetas

In [None]:
# ---- Label-frame configuration -------------------------------------------
tg_sr = 16000                    # target sampling rate (Hz)
_label_hop_len_s = 0.1           # hop between consecutive label frames (s)
_max_audio_len_s = 30            # fixed clip duration (s)

# Hop expressed in samples, and the resulting number of label frames per second.
_label_hop_len = int(tg_sr * _label_hop_len_s)
_label_frame_res = tg_sr / float(_label_hop_len)
_nb_label_frames_1s = int(_label_frame_res)

# Number of label frames that cover one full clip.
_total_samples = tg_sr * _max_audio_len_s
_max_label_frames = int(np.ceil(_total_samples / float(_label_hop_len)))

# Sound-event class name -> integer class index.
_unique_classes = dict(
    clearthroat=2,
    cough=8,
    doorslam=9,
    drawer=1,
    keyboard=6,
    keysDrop=4,
    knock=0,
    laughter=10,
    pageturn=7,
    phone=3,
    speech=5,
)

# Destination folders for the converted audio files and label files.
new_audio_fold = '/home/lab_acustica/Documentos/ANSYN_Pilot/audio/'
new_label_fold = '/home/lab_acustica/Documentos/ANSYN_Pilot/label/'

def _read_desc_file(desc_filename, frame_res=None):
    """Parse one event-description CSV into per-event lists.

    The first line is treated as a header and skipped. Every following
    non-blank line must hold, comma separated: source sample file name,
    onset time, offset time, elevation, azimuth (extra columns are ignored).

    Args:
        desc_filename: path of the description file.
        frame_res: label frames per time unit used to convert onset/offset
            into frame indices. Defaults to the module-level
            ``_label_frame_res`` (frames per second, so times are presumably
            in seconds -- confirm against the dataset).

    Returns:
        Dict with keys ``'class'``, ``'start'``, ``'end'``, ``'ele'`` and
        ``'azi'``; each value is a list with one entry per event. ``start`` is
        floored and ``end`` is ceiled to whole label frames.
    """
    if frame_res is None:
        frame_res = _label_frame_res

    desc_file = {'class': [], 'start': [], 'end': [], 'ele': [], 'azi': []}
    # `with` guarantees the handle is released even if a row fails to parse.
    with open(desc_filename, 'r') as fid:
        next(fid, None)  # skip the header; tolerate a completely empty file
        for line in fid:
            line = line.strip()
            if not line:
                continue  # ignore blank lines (e.g. a trailing newline)
            split_line = line.split(',')
            # Class name = file stem without its last three characters.
            desc_file['class'].append(split_line[0].split('.')[0][:-3])
            desc_file['start'].append(int(np.floor(float(split_line[1]) * frame_res)))
            desc_file['end'].append(int(np.ceil(float(split_line[2]) * frame_res)))
            desc_file['ele'].append(int(float(split_line[3])))
            desc_file['azi'].append(int(float(split_line[4])))
    return desc_file

def _get_se_labels(_desc_file, max_label_frames=None, unique_classes=None):
    """Turn per-event onset/offset frames into per-frame active-class indices.

    Args:
        _desc_file: dict with parallel lists under ``'class'``, ``'start'``
            and ``'end'`` (as produced by ``_read_desc_file``).
        max_label_frames: number of label frames in the output. Defaults to
            the module-level ``_max_label_frames``.
        unique_classes: mapping from class name to column index. Defaults to
            the module-level ``_unique_classes``.

    Returns:
        List with one entry per frame; each entry is the tuple returned by
        ``np.where`` on that frame's activity row, i.e. ``(indices,)`` where
        ``indices`` holds the active class indices in ascending order.

    Raises:
        KeyError: if an event class is missing from ``unique_classes``.
    """
    if max_label_frames is None:
        max_label_frames = _max_label_frames
    if unique_classes is None:
        unique_classes = _unique_classes

    # Frame-by-class binary activity matrix.
    se_label = np.zeros((max_label_frames, len(unique_classes)))
    events = zip(_desc_file['class'], _desc_file['start'], _desc_file['end'])
    for se_class, start_frame, end_frame in events:
        end_frame = min(end_frame, max_label_frames)
        # The end frame itself is marked active (inclusive upper bound).
        se_label[start_frame:end_frame + 1, unique_classes[se_class]] = 1

    return [np.where(frame_row) for frame_row in se_label]
 
def write_new_format(label_name, se_label):
    """Write per-frame class labels as CSV, one line per frame.

    Each line is ``frame_index[,class_index...]``: the frame index alone when
    nothing is active, followed by every active class index otherwise. Any
    number of simultaneous classes is written, so no frame is ever skipped and
    the line number always matches the frame index.

    Args:
        label_name: path of the CSV file to create (overwritten if present).
        se_label: sequence with one entry per frame; ``entry[0]`` is an
            iterable of active class indices (e.g. an ``np.where`` result).
    """
    with open(label_name, 'w') as _fid:
        for frame_idx, active in enumerate(se_label):
            fields = [str(int(frame_idx))]
            fields.extend(str(int(class_idx)) for class_idx in active[0])
            _fid.write(','.join(fields) + '\n')

def process_labels(orig_name, new_name):
    """Convert one description file into the per-frame label CSV format.

    Reads ``orig_name`` with ``_read_desc_file``, derives the per-frame active
    classes with ``_get_se_labels`` and writes them to ``new_name`` through
    ``write_new_format``.
    """
    frame_labels = _get_se_labels(_read_desc_file(orig_name))
    write_new_format(new_name, frame_labels)


Esta célula salva a nova versão da base de dados modificada

In [None]:
def _converted_stem(file, fold):
    """Build the new base name ``<tst|tra>_<NNN>_<overlap>_s<split>``.

    Assumes file names shaped like ``test_<n>_...`` / ``train_<n>_...`` and
    folder names shaped like ``<wav|desc>_<overlap>_split<k>[_...]`` (the
    layout the original fixed-offset slicing relied on -- confirm against the
    dataset). Returns ``None`` for anything that does not follow the pattern.
    """
    file_parts = file.split('_')
    fold_parts = fold.split('_')
    prefix = {'test': 'tst', 'train': 'tra'}.get(file_parts[0])
    if prefix is None or len(file_parts) < 2 or not file_parts[1].isdigit():
        return None
    if len(fold_parts) < 3 or not fold_parts[2].startswith('split'):
        return None
    split_id = fold_parts[2][len('split'):]
    # Zero-pad the recording number to three digits (0 -> 000, 12 -> 012).
    return '{}_{}_{}_s{}'.format(prefix, file_parts[1].zfill(3), fold_parts[1], split_id)


# The destination folders must exist before anything can be saved into them.
os.makedirs(new_audio_fold, exist_ok=True)
os.makedirs(new_label_fold, exist_ok=True)

for fold in sorted(os.listdir(path_fold)):
    fold_path = os.path.join(path_fold, fold)
    fold_kind = fold.split('_')[0]
    if fold_kind not in ('wav', 'desc') or not os.path.isdir(fold_path):
        continue
    for file in sorted(os.listdir(fold_path)):
        stem = _converted_stem(file, fold)
        if stem is None:
            continue
        src = os.path.join(fold_path, file)
        if fold_kind == 'wav':
            # Resample / fix the length of the audio and save the new file.
            signal, sr = torchaudio.load(src)
            signal = process_audio(signal, sr, tg_sr)
            torchaudio.save(new_audio_fold + stem + '.wav', signal, tg_sr)
        else:
            # Convert the onset/offset annotation into per-frame labels.
            process_labels(src, new_label_fold + stem + '.csv')
        

## 2. Carregando os dados da nova versão da base ANSYN 30s

### 2.1. Separo os dados em treinamento, validação e teste

In [None]:
# Folder holding the converted per-frame label CSVs; its file names are used
# below to build the train/test lists.
path_new = "/home/lab_acustica/Documentos/ANSYN_Dataset/label"

In [None]:
# Collect file stems (name without the 4-character extension), routing each
# file by the tag in its name: 'tra' -> training pool, otherwise 'tst' -> test.
_label_files = os.listdir(path_new)
train_total_list = [name[:-4] for name in _label_files if 'tra' in name]
X_test = [name[:-4] for name in _label_files if 'tra' not in name and 'tst' in name]

In [None]:
# 70/30 split with torch. The two lengths must add up exactly to the dataset
# size, so the validation length is derived from the training length instead
# of being truncated independently.
_n_train = int(len(train_total_list) * 0.7)
train_set, val_set = torch.utils.data.random_split(
    train_total_list, [_n_train, len(train_total_list) - _n_train]
)

In [None]:
# Reproducible 70/30 train/validation split of the training pool (fixed seed).
X_train, X_val = train_test_split(train_total_list,  test_size=0.3, random_state=42, shuffle = True)
# Report the size of each subset.
print(len(X_train), len(X_val), len(X_test))

1512 648 540


In [None]:
# Show the first file stem of each subset as a quick sanity check.
print(X_train[0], X_val[0], X_test[0])

tra_220_ov1_s1 tra_216_ov2_s3 tst_046_ov2_s2


### 2.2. Dataloader para sinais de 30s

In [None]:
class ANSYN_Dataset(torch.utils.data.Dataset):
    """Dataset of full-length clips paired with their per-frame labels.

    Each item is looked up by file stem: ``<audio_path><stem>.wav`` for the
    waveform and ``<label_path><stem>.csv`` for the labels.
    """

    def __init__(self, filenames):
        """Store the list of file stems (names without extension)."""
        self.filenames = filenames
        self.audio_path = "/home/lab_acustica/Documentos/ANSYN_Dataset/audio/"
        self.label_path = "/home/lab_acustica/Documentos/ANSYN_Dataset/label/"

    def normalize_layer(self, feats):
        """Normalize the whole tensor to zero mean / unit variance.

        ``layer_norm`` is applied over the full shape of ``feats``, so all
        elements share a single mean and variance. No gradient is tracked.
        """
        with torch.no_grad():
            feats = torch.nn.functional.layer_norm(feats, feats.shape)
        return feats

    def read_label(self, labelfile):
        """Read one label CSV and return the active classes of every frame.

        Each non-empty row is ``frame_index[,class...]``; the frame index is
        dropped and the remaining fields are returned as a tuple of strings
        (an empty tuple when nothing is active).

        Returns:
            List with one tuple per non-empty row, in file order.
        """
        # `(line[1],)`-style tuples are built via slicing: calling tuple() on
        # a bare string would split multi-digit classes such as '10' into
        # separate characters.
        with open(labelfile, newline='') as label_fh:
            return [tuple(row[1:]) for row in csv.reader(label_fh) if row]

    def __len__(self):
        """Number of clips in the dataset."""
        return len(self.filenames)

    def __getitem__(self, index):
        """Return ``{"Audio": normalized waveform, "Class": per-frame labels}``."""
        stem = self.filenames[index]
        feats, _ = torchaudio.load(self.audio_path + stem + '.wav')
        feats = self.normalize_layer(feats)
        target = self.read_label(self.label_path + stem + '.csv')
        return {"Audio": feats, "Class": target}

In [None]:
# Build the full-length-clip datasets for the training and validation stems.
train_dataset = ANSYN_Dataset(X_train)                           
val_dataset =  ANSYN_Dataset(X_val)                              

In [None]:
# Report how many clips each dataset holds.
print('Número de amostras de treinamento:', len(train_dataset))
print('Número de amostras de validação:', len(val_dataset))

Número de amostras de treinamento: 1512
Número de amostras de validação: 648


In [None]:
# Inspect the normalized waveform of the first training item.
train_dataset[0]["Audio"]

tensor([[-0.0148,  0.0102,  0.0176,  ...,  0.0011,  0.0011,  0.0011],
        [-0.0353, -0.0142, -0.0224,  ...,  0.0011,  0.0011,  0.0011],
        [ 0.0088,  0.0058, -0.0155,  ...,  0.0011,  0.0011,  0.0011],
        [ 0.0269,  0.0086, -0.0007,  ...,  0.0011,  0.0011,  0.0011]])

In [None]:
# Inspect the per-frame class tuples of the first training item.
train_dataset[0]["Class"]

[(),
 (),
 (),
 ('3',),
 ('3',),
 ('3',),
 ('3',),
 ('3',),
 ('3',),
 ('3',),
 ('3',),
 ('3',),
 ('3',),
 ('3',),
 ('3',),
 ('3',),
 ('3',),
 ('3',),
 ('3',),
 ('3',),
 ('3',),
 ('3',),
 ('3',),
 ('3',),
 ('3',),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 ('4',),
 ('4',),
 ('4',),
 ('4',),
 ('4',),
 (),
 (),
 (),
 (),
 (),
 ('7',),
 ('7',),
 ('7',),
 ('7',),
 ('7',),
 ('7',),
 ('7',),
 ('7',),
 ('7',),
 ('7',),
 ('7',),
 ('7',),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 ('1',),
 ('1',),
 ('1',),
 ('1',),
 ('1',),
 ('1',),
 ('1',),
 ('1',),
 ('1',),
 ('1',),
 ('1',),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 ('5',),
 ('5',),
 ('5',),
 ('5',),
 ('5',),
 ('5',),
 ('5',),
 ('5',),
 ('5',),
 ('5',),
 ('5',),
 ('5',),
 ('5',),
 ('5',),
 ('5',),
 ('5',),
 ('5',),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 ('6',),
 ('6',),
 ('6',),
 ('6',),
 ('6',),
 ('6',),
 ('6',),
 ('6',),
 ('6',),
 ('6',),
 ('6',),
 ('6',),
 ('6',),
 ('6',),
 ('6',),
 (),
 (),
 (),
 (),
 ('7',),
 ('7',),
 ('7',),
 ('7',),
 ('7',),
 ('7',)

# 3. Processando a base de dados em trechos de 200ms 

#### Considerando que precisamos predizer o que acontece no som a cada 200ms, processamos a base e salvamos os trechos de 200ms para economizar memória RAM no treinamento.

### 3.1. Processando os arquivos de áudio

In [None]:
# Run this cell only once: it writes 150 chunk files for every 'ov1' recording.
audio_src = '/home/lab_acustica/Documentos/ANSYN_Dataset/audio/'
file_dest = '/home/lab_acustica/Documentos/ANSYN_Dataset/audio_200ms_ov1/'
os.makedirs(file_dest, exist_ok=True)  # saving fails if the folder is missing

files = os.listdir(audio_src)
count = 0
for audios in files:
    if 'ov1' not in audios:
        continue
    signal, sr = torchaudio.load(audio_src + audios)
    # Split along the sample axis into 150 consecutive chunks.
    audio_30s = torch.tensor_split(signal, 150, dim=1)
    if count % 100 == 0:
        print('Files processed:', count)
    for j, chunk in enumerate(audio_30s):
        name = file_dest + audios[:-4] + '_' + str(j) + '.wav'
        # Keep the sample rate the file was actually loaded with.
        torchaudio.save(name, chunk, sr)
    count = count + 1

Files processed: 0
Files processed: 100
Files processed: 200
Files processed: 300
Files processed: 400
Files processed: 500
Files processed: 600
Files processed: 700
Files processed: 800


### 3.2. Separando em treino, validação e teste

In [None]:
# Folder holding the audio chunks written by the splitting cell above.
path_200ms = '/home/lab_acustica/Documentos/ANSYN_Dataset/audio_200ms_ov1/'

In [None]:
# Collect chunk stems (name without the 4-character extension), routing each
# file by the tag in its name: 'tra' -> training pool, otherwise 'tst' -> test.
_chunk_files = os.listdir(path_200ms)
train_total_list = [name[:-4] for name in _chunk_files if 'tra' in name]
X_test = [name[:-4] for name in _chunk_files if 'tra' not in name and 'tst' in name]

In [None]:
# 70/30 split with torch. The two lengths must add up exactly to the dataset
# size, so the validation length is derived from the training length instead
# of being truncated independently.
_n_train = int(len(train_total_list) * 0.7)
train_set, val_set = torch.utils.data.random_split(
    train_total_list, [_n_train, len(train_total_list) - _n_train]
)

In [None]:
# Reproducible 70/30 train/validation split of the chunk-level training pool.
X_train, X_val = train_test_split(train_total_list,  test_size=0.3, random_state=42, shuffle = True)
# Report the size of each subset.
print(len(X_train), len(X_val), len(X_test))

75600 32400 27000


In [None]:
# Show the first chunk stem of each subset as a quick sanity check.
print(X_train[0], X_val[0], X_test[0])

tra_045_ov1_s3_106 tra_065_ov1_s1_1 tst_032_ov1_s2_5


Crio um dataframe para cada grupo de dados 

In [None]:
# Wrap the training chunk stems in a single-column DataFrame.
df_train = pd.DataFrame(X_train, columns=['Filename'])
print(df_train.shape)
df_train.head()

(75600, 1)


Unnamed: 0,Filename
0,tra_045_ov1_s3_106
1,tra_165_ov1_s2_17
2,tra_102_ov1_s1_139
3,tra_105_ov1_s2_131
4,tra_023_ov1_s3_90


In [None]:
# Wrap the validation chunk stems in a single-column DataFrame.
df_val = pd.DataFrame(X_val, columns=['Filename'])
print(df_val.shape)
df_val.head()

(32400, 1)


Unnamed: 0,Filename
0,tra_065_ov1_s1_1
1,tra_096_ov1_s3_1
2,tra_126_ov1_s1_9
3,tra_067_ov1_s1_61
4,tra_115_ov1_s3_56


In [None]:
# Wrap the test chunk stems in a single-column DataFrame.
df_test = pd.DataFrame(X_test, columns=['Filename'])
print(df_test.shape)
df_test.head()

(27000, 1)


Unnamed: 0,Filename
0,tst_032_ov1_s2_5
1,tst_051_ov1_s1_45
2,tst_026_ov1_s2_64
3,tst_027_ov1_s3_53
4,tst_005_ov1_s2_44


### 3.3. Processando as labels em arquivos separados a cada 200ms

In [None]:
# Code for the ov1 subset only (at most one active class per frame).
label_src = '/home/lab_acustica/Documentos/ANSYN_Dataset/label/'
label_dest = '/home/lab_acustica/Documentos/ANSYN_Dataset/label_200ms_ov1/'
os.makedirs(label_dest, exist_ok=True)  # open(..., 'w') needs the folder

for label in os.listdir(label_src):
    if 'ov1' not in label:
        continue
    with open(label_src + label, 'r') as src_fh:
        for line in csv.reader(src_fh):
            # Only even frame indices are kept: frame 2k becomes chunk k.
            # Rows listing more than one class are skipped, as before.
            if not line or int(line[0]) % 2 != 0 or len(line) > 2:
                continue
            chunk_csv = label_dest + label[:-4] + '_' + str(int(line[0]) // 2) + '.csv'
            with open(chunk_csv, 'w') as _fid:
                # An empty file marks a chunk with no active class.
                if len(line) == 2:
                    _fid.write('{}\n'.format(int(line[1])))



Baseado nos dataframes de treino, validação e teste, abro o arquivo csv correspondente a cada trecho e salvo as classes em uma lista

In [None]:
# Folder with one label CSV per audio chunk (an empty file means no class).
labels_path = '/home/lab_acustica/Documentos/ANSYN_Dataset/label_200ms_ov1/'

In [None]:
# One class per test chunk, in DataFrame order. Exactly one entry is appended
# per file so the list always lines up with df_test; an empty label file is
# encoded as '-1' (no active class).
test_classes = []
for filename in df_test['Filename']:
    with open(labels_path + filename + '.csv', 'r', newline='') as label_fh:
        first_row = next((row for row in csv.reader(label_fh) if row), None)
    test_classes.append(str(-1) if first_row is None else first_row[0])
print(len(test_classes))

27000


In [None]:
# Attach the collected classes as a new column and preview the result.
df_test['Class'] = test_classes
df_test.head()

Unnamed: 0,Filename,Class
0,tst_032_ov1_s2_5,7
1,tst_051_ov1_s1_45,-1
2,tst_026_ov1_s2_64,5
3,tst_027_ov1_s3_53,3
4,tst_005_ov1_s2_44,2


In [None]:
# One class per validation chunk, in DataFrame order. Exactly one entry is
# appended per file so the list always lines up with df_val; an empty label
# file is encoded as '-1' (no active class).
val_classes = []
for filename in df_val['Filename']:
    with open(labels_path + filename + '.csv', 'r', newline='') as label_fh:
        first_row = next((row for row in csv.reader(label_fh) if row), None)
    val_classes.append(str(-1) if first_row is None else first_row[0])
print(len(val_classes))

32400


In [None]:
# Attach the collected classes as a new column and preview the result.
df_val['Class'] = val_classes
df_val.head()

Unnamed: 0,Filename,Class
0,tra_065_ov1_s1_1,1
1,tra_096_ov1_s3_1,0
2,tra_126_ov1_s1_9,1
3,tra_067_ov1_s1_61,3
4,tra_115_ov1_s3_56,1


In [None]:
# One class per training chunk, in DataFrame order. Exactly one entry is
# appended per file so the list always lines up with df_train; an empty label
# file is encoded as '-1' (no active class).
train_classes = []
for filename in df_train['Filename']:
    with open(labels_path + filename + '.csv', 'r', newline='') as label_fh:
        first_row = next((row for row in csv.reader(label_fh) if row), None)
    train_classes.append(str(-1) if first_row is None else first_row[0])
print(len(train_classes))

75600


In [None]:
# Attach the collected classes as a new column and preview the result.
df_train['Class'] = train_classes
df_train.head()

Unnamed: 0,Filename,Class
0,tra_045_ov1_s3_106,4
1,tra_165_ov1_s2_17,0
2,tra_102_ov1_s1_139,2
3,tra_105_ov1_s2_131,4
4,tra_023_ov1_s3_90,0


Finalmente, tenho um dataframe para cada grupo de dados, que será enviado como parâmetro para o respectivo dataloader

### 3.4. Dataloader para base de dados dividida em 200ms

In [None]:
class ANSYN_Dataset_20ms(torch.utils.data.Dataset):
    """Dataset of short audio chunks with one class label each.

    Note: despite the ``_20ms`` suffix in the name, ``audio_path`` points at
    the ``audio_200ms_ov1`` folder.
    """

    def __init__(self, filenames):
        """Store the lookup table.

        Args:
            filenames: DataFrame with a ``'Filename'`` column (chunk stems
                without extension) and a ``'Class'`` column.
        """
        self.filenames = filenames
        self.audio_path = "/home/lab_acustica/Documentos/ANSYN_Dataset/audio_200ms_ov1/"

    def normalize_layer(self, feats):
        """Normalize the whole tensor to zero mean / unit variance.

        ``layer_norm`` is applied over the full shape of ``feats``, so all
        elements share a single mean and variance. No gradient is tracked.
        """
        with torch.no_grad():
            feats = torch.nn.functional.layer_norm(feats, feats.shape)
        return feats

    def __len__(self):
        """Number of chunks (rows of the DataFrame)."""
        return len(self.filenames)

    def __getitem__(self, index):
        """Return ``{"Audio": normalized waveform, "Class": stored label}``.

        Rows are addressed by position (``.iloc``), so the dataset also works
        when the DataFrame index is not the default 0..n-1 range (e.g. after
        filtering or shuffling).
        """
        row = self.filenames.iloc[index]
        feats, _ = torchaudio.load(self.audio_path + row['Filename'] + '.wav')
        feats = self.normalize_layer(feats)
        return {"Audio": feats, "Class": row['Class']}

In [None]:
# Build the chunk-level datasets from the three DataFrames.
train_dataset = ANSYN_Dataset_20ms(df_train)                           
val_dataset =  ANSYN_Dataset_20ms(df_val)  
test_dataset = ANSYN_Dataset_20ms(df_test)

In [None]:
# Report how many chunks each dataset holds.
print('Número de amostras de treinamento:', len(train_dataset))
print('Número de amostras de validação:', len(val_dataset))
print('Número de amostras de teste:', len(test_dataset))

Número de amostras de treinamento: 75600
Número de amostras de validação: 32400
Número de amostras de teste: 27000


In [None]:
# Inspect the normalized waveform of the first training chunk.
train_dataset[0]["Audio"]

tensor([[ 0.0256, -0.4013,  0.8544,  ..., -0.0391, -0.0111,  0.1783],
        [-0.0034, -0.0026,  0.0420,  ..., -0.0236, -0.0217, -0.0097],
        [-0.0157,  0.5152, -0.8648,  ...,  0.0439,  0.0293, -0.2325],
        [ 0.0245, -0.5398,  1.0974,  ..., -0.0475, -0.0875,  0.3067]])

In [None]:
# Inspect the class label of the first training chunk (stored as a string).
train_dataset[0]["Class"]

'4'