### Preprocesamiento y preparación del dataset.

En primer lugar, los audios se convierten a formato mp3 para que tengan el mismo formato que las grabaciones del call center. Después se aplica un filtro de 300 a 3300 Hz, porque es el rango de frecuencias en el que trabajan los códecs de audio más utilizados para telefonía IP. Por último, a los audios se les calculan los features.
Los casos positivos se incrementan agregando ruido y desplazándolos en tiempo.

#### Imports

In [2]:
import glob
import os
import random
import re
import statistics

import librosa
import numpy as np
from sklearn.model_selection import train_test_split

from audio_process import butter_bandpass_filter, feature_extraction, augment_data
from database import Database



#### Armado del dataset completo

- Se eliminan los audios de menos de 0,5 segundos

In [3]:

def process_dataset(directory, name, n_mfcc, n_start, augment=True):
    """Extract features from the mp3 files of one dataset and store them.

    Every file matching ``directory + '*.mp3'`` is labelled (1 = anger,
    0 = anything else) from the third ``_``-separated field of its file
    name, its features are computed with ``feature_extraction`` and the
    result is posted to the ``"Dataset_bruto"`` database. Files already
    present in the database and clips shorter than 0.5 s are skipped.

    Args:
        directory: Glob prefix of the dataset folder; ``'*.mp3'`` is
            appended to it.
        name: Dataset name, prepended to every stored file name.
        n_mfcc: Forwarded to ``feature_extraction``.
        n_start: Index (in sorted path order) of the first file to process.
        augment: When true, the clips returned by ``augment_data`` are
            processed and stored as well.

    Raises:
        Exception: Any error raised while processing a file is re-raised
            after printing the path of the offending file.
    """
    db = Database("Dataset_bruto")
    f0min = 300
    f0max = 3300
    min_duration = 0.5  # seconds
    anger_tags = ('05', 'anger.mp3', 'ANG')
    # Paired positionally with the clips returned by augment_data
    # (assumes augment_data yields them in this order -- TODO confirm).
    augmentation_names = ['clipping', 'time_stretch', 'pitch_shift', 'reverb']

    def extract(signal, sample_rate):
        return feature_extraction(
            signal, sample_rate, f0min, f0max, n_mfcc, unit="Hertz"
        )

    def is_skip(features):
        # 'skip' is the sentinel checked for on feature_extraction's result.
        # A plain `features != 'skip'` is ambiguous when the features are
        # array-like, so test the type first.
        return isinstance(features, str) and features == 'skip'

    pattern = directory + '*.mp3'
    print(pattern)
    # glob returns paths in arbitrary order; sorting makes n_start refer to
    # the same files on every run.
    files = sorted(glob.glob(pattern))[n_start:]
    try:
        for n, path in enumerate(files):
            if n % 1000 == 0:
                print(n)

            file_name = os.path.basename(path).replace("-", "_")
            if db.select_by_id(file_name, like=True):
                continue  # already stored

            emotion = 1 if file_name.split('_')[2] in anger_tags else 0
            file_name = f'{name}_{file_name}'

            # librosa's loader is `load`; it has no `load_mp3`.
            audio, sr = librosa.load(path)
            if len(audio) / sr < min_duration:
                continue

            features = extract(audio, sr)
            if not is_skip(features):
                db.post(file_name, emotion, features, augmentation='')

            if not augment:
                continue
            augmented_audios = augment_data(audio, sr)
            for augmented_audio, aug in zip(augmented_audios, augmentation_names):
                features = extract(augmented_audio, sr)
                if is_skip(features):
                    print(features)
                else:
                    db.post(file_name, emotion, features, augmentation=aug)
    except Exception:
        # Report which file failed, then propagate with the original traceback.
        print(path)
        raise


In [6]:
# Glob prefix of each raw dataset folder.
Meld = "Data/Meld/*/"
Enterface = "Data/Enterface/*/"
Crema = "Data/CREMA-D/"
IEMOCAP = "Data/IEMOCAP/"

# Dataset names and their folders, in matching order.
names = ['Meld', 'Enterface', 'Crema', 'IEMOCAP']
datasets = [Meld, Enterface, Crema, IEMOCAP]

# Process every dataset from its first file with 16 MFCCs.
for dataset_directory, name in zip(datasets, names):
    process_dataset(dataset_directory, name, n_mfcc=16, n_start=0)


/home/francoj/Documentos/Reconocimiento de emociones/tesis/Data/Meld/*/*.mp3
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
/home/francoj/Documentos/Reconocimiento de emociones/tesis/Data/Enterface/*/*.mp3
0
1000
/home/francoj/Documentos/Reconocimiento de emociones/tesis/Data/CREMA-D/*.mp3
0
1000
2000
3000
4000
5000
6000
7000
/home/francoj/Documentos/Reconocimiento de emociones/tesis/Data/IEMOCAP/*.mp3
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000


In [None]:
# Set label = 1 on every document whose _id matches a dataset's anger
# pattern, then print how many documents of that dataset carry label 1.
db = Database("Dataset_bruto")

# (pattern matching the whole dataset, pattern matching its anger clips)
anger_patterns = [
    ("Enterface.*", "Enterface_.._.._05.*"),
    ("Meld.*", "Meld_.*_anger.*"),
    ("Crema.*", "Crema_.*_ANG.*"),
    ("IEMOCAP.*", "IEMOCAP_.*_ang.*"),
]

for dataset_pattern, anger_pattern in anger_patterns:
    # One server-side update that touches only `label`, instead of fetching
    # each document and writing the whole stale copy back.
    db.collection.update_many(
        {"_id": re.compile(anger_pattern)},
        {"$set": {"label": 1}},
    )
    print(
        db.collection.count_documents(
            {"_id": re.compile(dataset_pattern), "label": 1}
        )
    )


In [None]:
# Delete every document in which at least one feature field equals NaN.
db.collection.delete_many(
    {"$or": [{field: np.nan} for field in db.feature_names]}
)

In [None]:
# Delete every document whose _id matches the "xxx" pattern.
db.collection.delete_many({"_id": re.compile(".*xxx.*")})

In [None]:
# Hold out 40 % of the documents matching {"augmented": False} and split
# them evenly between the validation and test collections.
results = list(db.collection.find({"augmented": False}))
n = int(len(results) * 0.4)
n_sample = n - (n % 2)  # round down to an even count so both halves match

# Sample without replacement.
test_val = np.random.choice(results, size=n_sample, replace=False)

half = len(test_val) // 2
test = list(test_val[:half])
val = list(test_val[half:])

db_val = Database("Dataset_validation")
db_test = Database("Dataset_test")
db_val.collection.insert_many(val)
db_test.collection.insert_many(test)