# Introduction

Do not spend too much time trying to get very tiny metrics improvement. Once you have a model with a correct predictive power, you should better spend time explaining your data cleaning & preparation pipeline as well as explanations & visualizations of the results.

The goal is to see your fit with our company culture & engineering needs, spending 50h on an over-complicated approach will not give you bonus points compared to a simple, yet effective, to-the-point solution.

## About the data

The dataset you will be working with is called Emo-DB and can be found [here](http://emodb.bilderbar.info/index-1280.html).

It is a database containing samples of emotional speech in German. It contains samples labeled with one of 7 different emotions: Anger, Boredom, Disgust, Fear, Happiness, Sadness and Neutral. 

Please download the full database and refer to the documentation to understand how the samples are labeled (see "Additional information")
   
The goal of this project is to develop a model which is able to **classify samples of emotional speech**. Feel free to use any available library you would need, but beware of re-using someone else's code without mentionning it!

## Deliverable

The end-goal is to deliver us a zip file containing:
* This report filled with your approach, in the form of an **iPython Notebook**.
* A **5-10 slides PDF file**, containing a technical presentation covering the important aspects of your work
* A Dockerfile which defines a container for the project. The container should handle everything (download the data, run the code, etc...). When running the container it should expose the jupyter notebook on one port and expose a Flask API on another one. The Flask app contains two endpoints:
  - One for training the model
  - One for querying the last trained model with an audio file of our choice in the dataset
* A README.md which should contain the commands to build and run the docker container, as well as how to perform the queries to the API. 
* Any necessary .py, .sh or other files needed to run your code.

# Libraries Loading

In [1]:
import glob
from scipy.io import wavfile
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from IPython.display import clear_output
from pathlib import Path
import numpy as np
from flask import render_template
import traceback
from datetime import datetime

import pickle
import pandas as pd
from IPython.core.debugger import set_trace
from speechpy.feature import mfcc

# Data Preparation & Cleaning

In [3]:
DATA_DIR = "./data"
WAV_DIR = os.path.join(DATA_DIR,"wav")
MFCC_DIR  = os.path.join(DATA_DIR,"mfcc")

DE2EN = {'W':'A', #Wut-Anger
         'L':'B', #Langeweile-Bordom
         'E':'D', #Ekel-Disgust
         'A':'F', #Angst-Fear
         'F':'H', #Freude-Happiness
         'T':'S',
         'N':'N'} #Traueer-Sadness
EN2DE = {value:key for key,value in DE2EN.items()}
EN2NUM = {item[1]:num for item,num in zip(DE2EN.items(),range(len(DE2EN)))}
NUM2EN = {value:key for key,value in EN2NUM.items()}
FULL_EM = {'A':'Anger',
          'B': 'Bordom',
          'D':'Disgust',
          'F':'Fear',
          'H':'Happiness',
          'S':'Sadness',
          'N':'Neutral'}

DE2NUM = {item[0]:num for item,num in zip(DE2EN.items(),range(len(DE2EN)))}

In [40]:
def zeropadd(data,mode='max'):
    if mode == 'max':
        new_len = max([x.shape[0] for x in data])
    else:
        new_len = int(np.round(np.mean([x.shape[0] for x in data])))
    def padd(x):
        diff = abs(new_len - x.shape[0])
        shift = diff %2
        diff //=2
        if x.shape[0] < new_len:
            return np.pad(x,(diff,diff+shift),'constant')
        else:
            return x[diff:-(diff+shift)]
    data_padded = np.zeros((len(data),new_len))
    for i,x in enumerate(data):
        data_padded[i] = padd(x)
    return data_padded

def load_wav_data(wav_dir=WAV_DIR):
    data,sfs,targets,file_names = [],[],[],[]
    for root, dirs, files in os.walk(wav_dir, topdown=False):
        for file in files:
            sf,audio_data = wavfile.read(os.path.join(root,file))
            data.append(audio_data)
            sfs.append(sf)
            target = DE2NUM[file[5].capitalize()]
            targets.append(target)
            file_names.append(file.split(".")[0])
    data = zeropadd(data,mode='mean')
    file_names = np.array(file_names)
    sfs = np.array(sfs)
    targets = np.array(targets)
    order = np.argsort(file_names)
    return file_names[order],sfs[order],data[order],targets[order]

def get_mfcc(data,sfs):
    ret = np.array([mfcc(x,sf,num_cepstral=39) for x,sf in zip(data,sfs)])
    return np.expand_dims(ret,axis=1)

def save_mfcc_data(file_names,data,targets):
    Path("data/mfcc").mkdir(parents=False, exist_ok=True)
    for file,smple,target in zip(file_names,data,targets):
        file_name = file +".pkl"
        with open(os.path.join("data/mfcc",file_name),'wb') as f:
            save_data = (smple,target)
            pickle.dump(save_data,f)
    print("Saving done!")

def load_mfcc_data(mfcc_dir):
    data = []
    targets = []
    filenames = []
    for root, dirs, files in os.walk(mfcc_dir, topdown=False):
        for file in files:
            with open(os.path.join(root,file),'rb') as f:
                temp = pickle.load(f)
            data.append(temp[0])
            targets.append(temp[1])
            filenames.append(file.split('.')[0])
    data = np.array(data)
    targets = np.array(targets)
    filenames = np.array(filenames)
    order = np.argsort(filenames)
    return filenames[order],data[order],targets[order]

# Feature Engineering & Modeling

In [31]:
def dummy_model(filename):
    data = wavfile.read(os.path.join(WAV_DIR,filename))[1]
    return filename[5]

In [32]:
class CNN_classif(nn.Module):
    def __init__(self):
        super(CNN_classif,self).__init__()
        self.convblock1 = nn.Sequential(
                                nn.Conv2d(1,8,kernel_size=13),
                                nn.BatchNorm2d(8),
                                nn.ReLU())
        self.convblock2 = nn.Sequential(
                                nn.Conv2d(8,8,kernel_size=13),
                                nn.BatchNorm2d(8),
                                nn.ReLU(),
                                nn.MaxPool2d(kernel_size=(2,1)))
        self.convblock3 = nn.Sequential(
                                nn.Conv2d(8,8,kernel_size=13),
                                nn.BatchNorm2d(8),
                                nn.ReLU())
        self.convblock4 = nn.Sequential(
                                nn.Conv2d(8,8,kernel_size=2),
                                nn.BatchNorm2d(8),
                                nn.ReLU(),
                                nn.MaxPool2d(kernel_size=(2,1)))
        self.linblock = nn.Sequential(
                                nn.Flatten(),
                                nn.Linear(896,64),
                                nn.ReLU(),
                                nn.Dropout(0.2),
                                nn.Linear(64,7)
        )        
    def forward(self,x):
        #set_trace()
        x = self.convblock1(x)
        x = self.convblock2(x)
        x = self.convblock3(x)
        x = self.convblock4(x)
        x = self.linblock(x)
        return x

In [79]:
from keras.layers import LSTM as KERAS_LSTM

Using TensorFlow backend.


In [None]:
KERAS_LSTM()

In [117]:
class LSTM_classif(nn.Module):
    def __init__(self):
        super(LSTM_classif,self).__init__()
        #self.lstm = nn.LSTM(num_layers=128,input_size = (int(275),int(39)),
                           #hidden_size=128)
        self.lstm = nn.LSTM([int(3),int(2)],3)
        self.out_layer = nn.Sequential(nn.Dropout(0.5),
                                       nn.Linear(1,32),
                                       nn.ReLU(),
                                       nn.Linear(32,16),
                                       nn.Tanh())
    

In [118]:
lstm = LSTM_classif()

TypeError: new(): argument 'size' must be tuple of ints, but found element of type list at pos 2

In [89]:
%debug

> [0;32m/home/ymentha/anaconda3/envs/ML/lib/python3.7/site-packages/torch/nn/modules/rnn.py[0m(69)[0;36m__init__[0;34m()[0m
[0;32m     67 [0;31m                [0mlayer_input_size[0m [0;34m=[0m [0minput_size[0m [0;32mif[0m [0mlayer[0m [0;34m==[0m [0;36m0[0m [0;32melse[0m [0mhidden_size[0m [0;34m*[0m [0mnum_directions[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     68 [0;31m[0;34m[0m[0m
[0m[0;32m---> 69 [0;31m                [0mw_ih[0m [0;34m=[0m [0mParameter[0m[0;34m([0m[0mtorch[0m[0;34m.[0m[0mTensor[0m[0;34m([0m[0mgate_size[0m[0;34m,[0m [0mlayer_input_size[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     70 [0;31m                [0mw_hh[0m [0;34m=[0m [0mParameter[0m[0;34m([0m[0mtorch[0m[0;34m.[0m[0mTensor[0m[0;34m([0m[0mgate_size[0m[0;34m,[0m [0mhidden_size[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     71 [0;31m                [0mb_ih[0m [0;34m=[0m [0mParamete

ipdb>  gate_size


512


ipdb>  layer_input_size


(275, 39)


ipdb>  q


In [33]:
def train_model(model, inputs, targets,nb_epochs=5):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr = 1e-4)
    batch_size = 20
    for e in range(nb_epochs):
        clear_output(wait=True)
        outputs = model(inputs)
        _, predicted = outputs.max(1)
        accuracy = (predicted == targets).sum().item() / inputs.shape[0] * 100
        print("Progression:{} % Accuracy: {:.2f}% ".format(e/nb_epochs*100,accuracy))
        for train_batch,target_batch in zip(inputs.split(batch_size),
                                targets.split(batch_size)):
            output_batch = model(train_batch)
            loss = criterion(output_batch,target_batch)
            
            model.zero_grad()
            loss.backward()
            optimizer.step()

In [34]:
def run_model(data_f,targets,nb_epochs=5):
    """
    train and save the modle
    """
    model = CNN_classif()
    data_f = torch.Tensor(data_f)
    targets = torch.Tensor(targets).long()
    train_model(model,data_f,targets.long(),nb_epochs)
    name = datetime.now().strftime("%m_%d_%H%M")
    torch.save(model.state_dict(), "./models/{}".format(name))
    return True


In [35]:
def load_most_recent(model,model_dir):
    filename = max([file  for root, dirs, files in os.walk('./models/', topdown=False) for file in files])
    model2.load_state_dict(torch.load(os.path.join(model_dir,filename)))

In [41]:
file_names,sfs,data,targets = load_wav_data()
data_f = get_mfcc(data,sfs)
save_mfcc_data(file_names,data_f,targets)
file_names,data_f,targets = load_mfcc_data('./data/mfcc')

Saving done!


In [73]:
model = CNN_classif()

In [69]:
model.load_state_dict(torch.load("./models/04_16_1614"))

<All keys matched successfully>

In [74]:
model.eval()

CNN_classif(
  (convblock1): Sequential(
    (0): Conv2d(1, 8, kernel_size=(13, 13), stride=(1, 1))
    (1): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
  )
  (convblock2): Sequential(
    (0): Conv2d(8, 8, kernel_size=(13, 13), stride=(1, 1))
    (1): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=(2, 1), stride=(2, 1), padding=0, dilation=1, ceil_mode=False)
  )
  (convblock3): Sequential(
    (0): Conv2d(8, 8, kernel_size=(13, 13), stride=(1, 1))
    (1): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
  )
  (convblock4): Sequential(
    (0): Conv2d(8, 8, kernel_size=(2, 2), stride=(1, 1))
    (1): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=(2, 1), stride=(2, 1), padding=0, dilation=1, ceil_mode=False)
  )
  (linblock): Sequent

In [78]:
model(torch.Tensor(data_f[:1]))

tensor([[-0.0087, -0.0942,  0.0515, -0.1189, -0.1157, -0.0567,  0.0173]],
       grad_fn=<AddmmBackward>)

In [84]:
data_f[:1].shape

(1, 1, 275, 39)

In [21]:
run_model(data_f,targets,nb_epochs=3)

# Results & Visualizations

## Sleeping Code

In [283]:
import tensorflow as tf

loss  = tf.keras.losses.BinaryCrossentropy()

loss([0., 0., 1., 1.], [1., 1., 1., 0.])

loss2 = nn.NLLLoss()
input2 = torch.Tensor([[1.0,0.0],[1.0,0.0], [0.0,1.0],[0.0,1.0]])
input2.requires_grad_ = True
target2 = torch.Tensor([1,1,1,0]).long()
loss2(input2, target2)

loss3 = nn.BCELoss(reduction='mean')
input3 = torch.Tensor([0., 0., 1., 1.])
target3 = torch.Tensor([1., 1., 1., 0.])
loss3(input3,target3 )

In [6]:
def load_pd_data(wav_dir=WAV_DIR):
    for root, dirs, files in os.walk(wav_dir, topdown=False):
        paths = [os.path.join(root,file) for file in files]
        data = []
        for file in files:
            audio_data = wavfile.read(os.path.join(root,file))[1]
            speaker_id,text_id,emotion_en = parse_filename(file)
            row = [speaker_id,text_id,emotion_en,audio_data]
            data.append(row)
    res = pd.DataFrame(data,columns=["speaker_id","text_id","emotion","data"])
    return res.join(speaker_data,on="speaker_id")

In [4]:
def parse_filename(filename):
    """
    parses the attributes of a given sample based on its filename
    """
    speaker_id = int(filename[:2])
    text_id = filename[2:5]
    emotion_de = filename[5]
    #emotion_en = DE2EN[emotion_de]
    #return speaker_id,text_id,emotion_en
    return text_id

In [5]:
speaker_data = [[3  , 'male',  31],
                [8  , 'female',34 ],
                [9  , 'female',21 ],
                [10 , 'male',  32 ],
                [11 , 'male',  26 ],
                [12 , 'male',  30] ,
                [13 , 'female',32], 
                [14 , 'female',35] ,
                [15 , 'male',  25] ,
                [16 , 'female',31]]
speaker_data = pd.DataFrame(speaker_data,columns = ['speaker_id','sex','age'])
speaker_data.set_index('speaker_id',inplace=True)