[View in Colaboratory](https://colab.research.google.com/github/youqad/Neurorobotics_Intent-Recognition/blob/master/TP_Intent_Recognition.ipynb)

# TP 4: Intent recognition

# *Exercise 1*: Automatic detection of speaker’s intention from supra-segmental features

The aim of this exercice is to develop a human feedback classifier : positive (approval) / negative (prohibition). This classifier might be used to teach robots and/or to guide
robot’s learning.

In [0]:
import urllib.request
import numpy as np
import pandas as pd
from google.colab import files as google_files

In [0]:
def list_from_URL(file_URL, function_applied=None):
  lines_bytes = urllib.request.urlopen(file_URL).readlines()
  lines = []

  for line in lines_bytes:
    line = line.decode("utf-8").rstrip()
    
    if function_applied is not None:
      line = function_applied(line)
    
    lines.append(line)
   
  return lines

## 1. Extraction of prosodic features ($f_0$ and energy)

In [0]:
# # /!\ NO NEED TO EXECUTE THIS CELL AGAIN !!!
# 
# 
# filenames = list_from_URL('https://raw.githubusercontent.com/youqad/Neurorobotics_Intent-Recognition/master/filenames.txt')
# filenames = list(set(filenames))
# 
# files = []
# indices = []
# 
# for file in filenames:
# 
#     URL_f0 = 'https://raw.githubusercontent.com/youqad/Neurorobotics_Intent-Recognition/master/data_files/{}.f0'.format(file)
#     file_dicts = [{key:val for key, val in zip(['time', 'f0'], map(float, l.split()))} for l in list_from_URL(URL_f0)]
# 
#     URL_en = 'https://raw.githubusercontent.com/youqad/Neurorobotics_Intent-Recognition/master/data_files/{}.en'.format(file)
#     for l, d in zip(list_from_URL(URL_en), file_dicts):
#       d["file"] = file
#       d["en"] = float(l.split()[1])
#       d["label"] = file[-2:]
# 
#     files.extend(file_dicts)
# 
# # How `files` looks like:
# # files = [ 
# #           {"file": "cy0001at", "time": 0.02, "f0": 0., "en": 0.},
# #           {"file": "cy0001at", "time": 1.28, "f0": 0., "en": 0.},
# #           ...
# #           {"file": "li1450at", "time": 0.02, "f0": 0., "en": 0.},
# #           {"file": "li1450at", "time": 1.56, "f0": 404., "en": 65.}
# #         ]
# 
# pd.DataFrame(files).to_csv('data.csv', encoding='utf-8', index=False) # To reuse it next time
# google_files.download('data.csv')

In [18]:
# loading training data
df = pd.read_csv('https://raw.githubusercontent.com/youqad/Neurorobotics_Intent-Recognition/master/data.csv').set_index('file')

df1 = df.loc[df['label'] != 'at']
df1.head()

Unnamed: 0_level_0,en,f0,label,time
file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
li1377pw,0.0,0.0,pw,0.02
li1377pw,39.0,0.0,pw,0.04
li1377pw,40.0,0.0,pw,0.06
li1377pw,39.0,0.0,pw,0.08
li1377pw,39.0,0.0,pw,0.1


## 2. Extraction of functionals (statistics) : mean, maximum, range, variance, median, first quartile, third quartile, mean absolute of local derivate

In [19]:
print(df1.columns.values)

#df.groupby('file').mean().head()
#df1.groupby('file').max().head()
#df1.groupby('file').var().head()
#df1.groupby('file').median().head()
df1.groupby('file').quantile([.25, .75]).head()

['en' 'f0' 'label' 'time']


Unnamed: 0_level_0,Unnamed: 1_level_0,en,f0,time
file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cy0007pw,0.25,41.0,0.0,0.525
cy0007pw,0.75,66.0,189.5,1.535
cy0008pw,0.25,41.0,0.0,0.27
cy0008pw,0.75,64.5,192.0,0.77
cy0009pw,0.25,40.75,0.0,0.395


In [20]:
list_features  = ['mean', 
                  'max',
                  ('range', lambda x: max(x)-min(x)),
                  'var',
                  'median',
                  ('1st_quantile', lambda x: x.quantile(.25)),
                  ('3rd_quantile', lambda x: x.quantile(.75)),
                  ('mean_absolute_local_derivate', lambda x: abs(x.diff()).mean())
                 ]

df1.groupby('file')['f0','en'].agg(list_features).head()

Unnamed: 0_level_0,f0,f0,f0,f0,f0,f0,f0,f0,en,en,en,en,en,en,en,en
Unnamed: 0_level_1,mean,max,range,var,median,1st_quantile,3rd_quantile,mean_absolute_local_derivate,mean,max,range,var,median,1st_quantile,3rd_quantile,mean_absolute_local_derivate
file,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
cy0007pw,92.284314,257.0,257.0,10372.542128,0.0,0.0,189.5,13.683168,52.313725,71.0,71.0,228.455057,52.0,41.0,66.0,2.970297
cy0008pw,78.431373,250.0,250.0,9930.090196,0.0,0.0,192.0,26.44,47.72549,70.0,70.0,321.963137,43.0,41.0,64.5,3.96
cy0009pw,69.065789,243.0,243.0,8927.182281,0.0,0.0,182.25,12.853333,49.473684,74.0,74.0,260.839298,42.0,40.75,66.0,3.52
cy0010pw,29.196078,221.0,221.0,4696.178994,0.0,0.0,0.0,15.267327,46.04902,77.0,77.0,165.789652,42.0,41.0,50.75,3.306931
cy0011pw,110.74359,230.0,230.0,9290.400932,172.0,0.0,192.5,7.506494,53.653846,71.0,71.0,258.125375,62.0,41.25,66.0,2.337662


## 3. Check functionals for both voiced (i.e. $f_0\neq 0$) and unvoiced segments. Which segments are suited for the approach ?

In [21]:
voiced = df1.loc[df1['f0']!=0].groupby('file')['f0','en'].agg(list_features)
voiced.head()

Unnamed: 0_level_0,f0,f0,f0,f0,f0,f0,f0,f0,en,en,en,en,en,en,en,en
Unnamed: 0_level_1,mean,max,range,var,median,1st_quantile,3rd_quantile,mean_absolute_local_derivate,mean,max,range,var,median,1st_quantile,3rd_quantile,mean_absolute_local_derivate
file,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
cy0007pw,200.276596,257.0,90.0,675.987049,191.0,182.5,213.0,5.869565,65.914894,71.0,16.0,17.775208,66.0,63.5,70.0,1.652174
cy0008pw,200.0,250.0,83.0,538.421053,198.5,179.5,210.0,10.368421,61.0,70.0,70.0,242.736842,66.0,61.5,68.0,5.842105
cy0009pw,194.407407,243.0,77.0,446.94302,190.0,180.0,209.0,7.192308,67.333333,74.0,20.0,17.923077,68.0,66.0,70.0,2.884615
cy0010pw,186.125,221.0,67.0,465.316667,178.5,171.25,204.25,6.466667,65.75,77.0,25.0,50.466667,64.0,62.0,70.75,4.0
cy0011pw,191.955556,230.0,66.0,314.816162,190.0,179.0,204.0,4.113636,65.288889,71.0,19.0,14.710101,65.0,63.0,68.0,0.954545


In [22]:
unvoiced = df1.loc[df1['f0']==0].groupby('file')['en'].agg(list_features)
unvoiced.head()

Unnamed: 0_level_0,mean,max,range,var,median,1st_quantile,3rd_quantile,mean_absolute_local_derivate
file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
cy0007pw,40.690909,58.0,58.0,113.550842,41.0,40.5,43.5,3.740741
cy0008pw,39.16129,58.0,58.0,189.606452,42.0,41.0,43.0,5.2
cy0009pw,39.632653,56.0,56.0,119.570578,41.0,40.0,42.0,3.625
cy0010pw,42.383721,68.0,68.0,101.439261,41.0,40.0,43.0,3.105882
cy0011pw,37.787879,51.0,51.0,150.922348,41.0,40.0,42.0,4.0625


## 4. Build two databases by randomly extracting examples : Learning database ($60\%$) and Test database

In [0]:
# Question 3

## Explanation: A sound can be either voiced, like a vowel, or unvoiced, like a consonant. We want to see which of them contains more intention
 
## ps: change "data" by the correct name

def train_test(df=df1, train_percentage=.6, seed=1):
  
  voiced = df.loc[df['f0']!=0].groupby('file')['f0','en'].agg(list_features)
  unvoiced = df.loc[df['f0']==0].groupby('file')['en'].agg(list_features)

  X, Y = {}, {}

  X['voiced'], Y['voiced'] = {}, {}
  X['unvoiced'], Y['unvoiced'] = {}, {}


  X['voiced']['all'] = np.array(df.groupby('file')['f0','en'].agg(list_features))
  Y['voiced']['all'] = np.array(df.loc[df['f0']!=0].groupby(['file']).min().label.values)

  X['unvoiced']['all'] = np.array(unvoiced)
  Y['unvoiced']['all'] = np.array(df.loc[df['f0']==0].groupby(['file']).min().label.values)
  
  np.random.seed(seed)
  
  for type in ['voiced', 'unvoiced']:
    n = len(X[type]['all'])
    ind_rand = np.random.randint(n, size=int(train_percentage*n)) # random indices
    train_mask = np.zeros(n, dtype=bool)
    train_mask[ind_rand] = True
    X[type]['train'], X[type]['test'] = X[type]['all'][train_mask],  X[type]['all'][~train_mask]
    Y[type]['train'], Y[type]['test'] = Y[type]['all'][train_mask],  Y[type]['all'][~train_mask]
  
  return X, Y

X1, Y1 = train_test()

In [24]:
X1['voiced']['train'].shape

(178, 16)

## 5. Train a classifer (k-NN method)

In [32]:
# Scikit Learn's kNN classifier:
# Just to test, but we will implement it ourselves of course!
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

def sklearn_knn(k, X, Y):
  for type in ['voiced', 'unvoiced']:
    kNN = KNeighborsClassifier(n_neighbors=k)
    kNN.fit(X[type]['train'], Y[type]['train'])

    print("Accuracy score for {}: {:.2f}".format(type, accuracy_score(Y[type]['test'],
                                                                  kNN.predict(X[type]['test']))))
sklearn_knn(3, X1, Y1)

Accuracy score for voiced: 0.91
Accuracy score for unvoiced: 0.61


In [33]:
# Our own implementation!
from scipy.spatial.distance import cdist
from sklearn.metrics import confusion_matrix
from collections import Counter

def kNN(k, X, Y, labels=["pw", "ap"]):
    # auxiliary function: label prediction (by majority vote)
    # based on the nearest neighbors
    def predicted_label(ind_neighbors):
        label_neighbors = tuple(Y['train'][ind_neighbors])
        return Counter(label_neighbors).most_common(1)[0][0]
    
    # Pairwise distances between test and train data points
    dist_matrix = cdist(X['test'], X['train'], 'euclidean')
    y_predicted = []

    for i in range(len(X['test'])):
        ind_k_smallest = np.argpartition(dist_matrix[i, :], k)[:k]
        y_predicted.append(predicted_label(ind_k_smallest))
    
    # Confusion matrix: C[i, j] is the number of observations 
    # known to be in group i but predicted to be in group j
    return confusion_matrix(Y['test'], np.array(y_predicted), labels=labels)

print(kNN(3, X1['voiced'], Y1['voiced']))
print(kNN(3, X1['unvoiced'], Y1['unvoiced']))

[[92  8]
 [ 9 86]]
[[54 57]
 [20 68]]


# *Exercice 2*: Detection of multiple intents

We consider the following intents : "Approval", "Prohibition" and "Attention"

## 1. Extract the prosodic features ($f_0$ and energy) and their functionals

In [34]:
# Easy-peasy! All the work has been done before: all we have to do now is to use 
# the DataFrame `df` instead of `df1`

df.groupby('file')['f0','en'].agg(list_features).head()

Unnamed: 0_level_0,f0,f0,f0,f0,f0,f0,f0,f0,en,en,en,en,en,en,en,en
Unnamed: 0_level_1,mean,max,range,var,median,1st_quantile,3rd_quantile,mean_absolute_local_derivate,mean,max,range,var,median,1st_quantile,3rd_quantile,mean_absolute_local_derivate
file,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
cy0001at,110.609375,402.0,402.0,27607.511657,0.0,0.0,331.0,23.777778,47.296875,73.0,73.0,327.323165,41.5,40.0,67.0,3.809524
cy0002at,105.640449,430.0,430.0,27108.528345,0.0,0.0,251.0,15.636364,47.337079,76.0,76.0,269.11236,41.0,40.0,62.0,3.272727
cy0005at,110.609375,402.0,402.0,27607.511657,0.0,0.0,331.0,23.777778,47.296875,73.0,73.0,327.323165,41.5,40.0,67.0,3.809524
cy0006at,105.640449,430.0,430.0,27108.528345,0.0,0.0,251.0,15.636364,47.337079,76.0,76.0,269.11236,41.0,40.0,62.0,3.272727
cy0007pw,92.284314,257.0,257.0,10372.542128,0.0,0.0,189.5,13.683168,52.313725,71.0,71.0,228.455057,52.0,41.0,66.0,2.970297


## 2. Develop a classifier for these three classes

In [0]:
X, Y = train_test(df=df)

In [38]:
sklearn_knn(3, X, Y)

Accuracy score for voiced: 0.74
Accuracy score for unvoiced: 0.45


In [39]:
kNN(3, X['voiced'], Y['voiced'])
kNN(3, X['unvoiced'], Y['unvoiced'])

array([[55, 31],
       [24, 46]])