In [5]:
# Importation des librairies

import numpy as np 
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
import sys
import os
sys.path.append(os.path.dirname(os.getcwd()))

from utils_project import load_saved_file, read_feel, process_label, process_nlp, vectorize, apply_vectorization
from evaluate import load_test_set, evaluate_model
from config_project import *

Using TensorFlow backend.


In [6]:
# Lecture des données
feel = read_feel(FEEL_PATH, limit=LIMIT)
data_test = load_test_set(TEST_SET_PATH)

In [7]:
# Si True alors on prédit sur chaque mot puis on agrège la prédiction
PREDICT_WORD = True

In [8]:
# Traitement des données
feel = process_label(feel)
feel['nlp_sentence'] = process_nlp(feel['sentence'], process_type='stem')
feel['nlp_sentence'] = feel['nlp_sentence'].map(lambda sentence : " ".join(sentence))
feel = feel.drop(['sentence'], axis=1)

data_test['nlp_sentence'] = process_nlp(data_test['phrase'], process_type='stem')
data_test['nlp_sentence'] = data_test['nlp_sentence'].map(lambda sentence : " ".join(sentence))
feel.head()

Unnamed: 0,emotion,nlp_sentence
0,no_emotion,a endroit
1,fear,a hat
2,surprise,a hat
3,sadness,a part
4,fear,a pic


In [9]:
max_features = feel['nlp_sentence'].count()
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(feel['nlp_sentence'].values)
X_train = tokenizer.texts_to_sequences(feel['nlp_sentence'].values)
X_train = pad_sequences(X_train)

print('train shape : {}'.format((len(X_train), len(X_train[0]))))

train shape : (19444, 4)


In [10]:
# Définition du modèle de deep learning à utiliser
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length = X_train.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(7, activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 4, 128)            2488832   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 4, 128)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_1 (Dense)              (None, 7)                 1379      
Total params: 2,745,011
Trainable params: 2,745,011
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Split the output
df_Y = pd.get_dummies(feel['emotion'])
Y = df_Y.values
labels = list(df_Y.columns)
map_label = {}
for i, val in enumerate(labels):
    map_label[i] = val
print(Y)
print(map_label)

[[0 0 0 ... 1 0 0]
 [0 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 1]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 1 0 0]]
{0: 'anger', 1: 'disgust', 2: 'fear', 3: 'joy', 4: 'no_emotion', 5: 'sadness', 6: 'surprise'}


In [12]:
# Fit the model
batch_size = 32
model.fit(X_train, Y, epochs = 10, batch_size=batch_size, verbose = 2)

Epoch 1/10
 - 29s - loss: 1.6223 - acc: 0.4228
Epoch 2/10
 - 28s - loss: 1.2281 - acc: 0.5192
Epoch 3/10
 - 28s - loss: 1.0724 - acc: 0.5416
Epoch 4/10
 - 29s - loss: 0.9651 - acc: 0.5837
Epoch 5/10
 - 28s - loss: 0.8885 - acc: 0.6096
Epoch 6/10
 - 28s - loss: 0.8303 - acc: 0.6188
Epoch 7/10
 - 28s - loss: 0.7866 - acc: 0.6243
Epoch 8/10
 - 28s - loss: 0.7537 - acc: 0.6281
Epoch 9/10
 - 28s - loss: 0.7255 - acc: 0.6304
Epoch 10/10
 - 31s - loss: 0.7016 - acc: 0.6371


<keras.callbacks.History at 0x7fc134b22780>

In [13]:
def most_common(lst):
    lst = [el for el in lst if el != 'no_emotion']
    if len(lst) == 0:
        return 'no_emotion'
    else:
        return max(set(lst), key=lst.count)
    
def predict(x_test, tokenizer, model, predict_word):
    
    predictions = []
    
    for sentence in x_test:
        if predict_word:
            list_emotion_word = []
            for word in sentence.split(" "):
                txt = tokenizer.texts_to_sequences([word])
                txt = pad_sequences(txt, maxlen=4)
                sentiment = model.predict(txt, batch_size=1, verbose = 2)[0]
                list_emotion_word.append(map_label[np.argmax(sentiment)])
            predictions.append(most_common(list_emotion_word))
        else:
            txt = tokenizer.texts_to_sequences([sentence])
            txt = pad_sequences(txt, maxlen=4)
            sentiment = model.predict(txt, batch_size=1, verbose = 2)[0]
            predictions.append(map_label[np.argmax(sentiment)])
    
    return predictions

In [16]:
# Calcul du score final
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
pd.set_option('display.max_colwidth', -1)

y_test = y_test = data_test['emotion']

predictions = predict(data_test['nlp_sentence'], tokenizer, model, predict_word=PREDICT_WORD)

accuracy = accuracy_score(y_test, predictions)
conf_mat = confusion_matrix(y_test, predictions)

print(accuracy)
print(conf_mat)

0.26143790849673204
[[ 3  3  4  1  4  8  1]
 [ 1  9  4  6  3  5  1]
 [ 0  1  5  8  4  4  1]
 [ 0  2  4 11  7  4  9]
 [ 0  0  0  0  0  0  0]
 [ 1  1  1  2  1 11  2]
 [ 0  3  2  7  2  6  1]]
