## Emotion analysis and classification of short comments using machine learning techniques
+ Code developed by: Douglas Maia dos Santos
+ Github acess: https://github.com/m-dougl/emotion-analysis

##### Importing libraries for proper code functioning
The purpose of this code is to test the impacto of the oversampling function on the models that were implemented in "main.ipynb"

In [1]:

from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import VotingClassifier, BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier 
from sklearn.ensemble import StackingClassifier, AdaBoostClassifier
                              
from nltk.tokenize import RegexpTokenizer
from collections import Counter
from unicodedata import normalize
from imblearn.over_sampling import SMOTE
import pandas as pd
import nltk
import numpy as np
import string
import emotion_analysis
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')
nltk.download('rslp')
nltk.download('stopwords')
plt.style.use('seaborn')

[nltk_data] Downloading package rslp to /home/hit-
[nltk_data]     notebook3/nltk_data...
[nltk_data]   Package rslp is already up-to-date!
[nltk_data] Downloading package stopwords to /home/hit-
[nltk_data]     notebook3/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package rslp to /home/hit-
[nltk_data]     notebook3/nltk_data...
[nltk_data]   Package rslp is already up-to-date!
[nltk_data] Downloading package stopwords to /home/hit-
[nltk_data]     notebook3/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Pre processing part

In [2]:
df = emotion_analysis.open_dataset('dataset.xlsx', 'xlsx')
df.Emoção = df.Emoção.str.lower()

In [3]:
df.Comentarios = df.Comentarios.apply(emotion_analysis.remove_characters)
df.Comentarios = df.Comentarios.apply(emotion_analysis.remove_accents)
df.Comentarios = df.Comentarios.apply(emotion_analysis.tokenize)
df.Comentarios = df.Comentarios.apply(emotion_analysis.remove_stopwords)
df.Comentarios = df.Comentarios.apply(emotion_analysis.untokenize)
df.Comentarios = df.Comentarios.apply(emotion_analysis.stemming)

In [4]:
X, y = df.Comentarios, df.Emoção
X = emotion_analysis.vectorizer(X, 'tfidf')

In [5]:
train_size = .8
X_train, X_test, y_train, y_test = emotion_analysis.dataset_split(X, y, train_size)

In [6]:
n_fold = 10
NB  = emotion_analysis.cv_train(classifier_name='NB',  X=X_train, y= y_train, n_fold=n_fold)
SVM = emotion_analysis.cv_train(classifier_name='SVM', X=X_train, y= y_train, n_fold=n_fold)
KNN = emotion_analysis.cv_train(classifier_name='KNN', X=X_train, y= y_train, n_fold=n_fold)
'''
pred_nb  = NB.predict(X_test)   # Naive Bayes prediction
pred_svm = SVM.predict(X_test)  # SVM prediction
pred_knn = KNN.predict(X_test)  # KNN prediction
'''

Naive Bayes best parameters: {'alpha': 100, 'fit_prior': False}
Naive Bayes best accuracy in 10 folds: 64.5054945054945
Support Vector Machine best parameters: {'C': 1, 'gamma': 1, 'kernel': 'sigmoid'}
Support Vector Machine best accuracy in 10 folds: 60.87912087912088
K-Nearest Neighbors best parameters: {'algorithm': 'auto', 'n_neighbors': 6}
K-Nearest Neighbors best accuracy in 10 folds: 55.16483516483517


'\npred_nb  = NB.predict(X_test)   # Naive Bayes prediction\npred_svm = SVM.predict(X_test)  # SVM prediction\npred_knn = KNN.predict(X_test)  # KNN prediction\n'

In [7]:
SVM = SVC(C=1000, gamma= 1e-3, kernel = 'rbf').fit(X_train, y_train)
NB  = MultinomialNB(alpha = 1, fit_prior= False).fit(X_train, y_train)
KNN = KNeighborsClassifier(algorithm='auto', n_neighbors=5).fit(X_train, y_train)
estimators = [('svm', SVM),
              ('nb', NB),
              ('knn', KNN)]

### Ensemble: Votting Classifier

In [8]:
voting = VotingClassifier(estimators = estimators,
                          voting     = 'hard',
                          verbose    = True).fit(X_train, y_train)

[Voting] ...................... (1 of 3) Processing svm, total=   0.0s
[Voting] ....................... (2 of 3) Processing nb, total=   0.0s
[Voting] ...................... (3 of 3) Processing knn, total=   0.0s


In [9]:
pred_voting = voting.predict(X_test)

In [10]:
accuracy_voting  = accuracy_score(y_test, pred_voting)
precision_voting = precision_score(y_test, pred_voting, average='macro')
recall_voting    = recall_score(y_test, pred_voting, average='macro')
fscore_voting    = f1_score(y_test, pred_voting, average='macro')
print('='*20)
print('Voting Classifier Metrics:')
print(f'Accuracy: {accuracy_voting}')
print(f'Precision: {precision_voting}')
print(f'Recall: {recall_voting}')
print(f'F1-Score: {fscore_voting}')
print('='*20)

Voting Classifier Metrics:
Accuracy: 0.5428571428571428
Precision: 0.5760869565217391
Recall: 0.49444444444444446
F1-Score: 0.5040935672514619


### Ensemble: Bagging Classifier

In [11]:
# svm
bagging_svm = BaggingClassifier(base_estimator = SVM,
                            n_estimators   = 500,
                            bootstrap=True,
                            verbose=True).fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.0s finished


In [12]:
# naive bayes
bagging_nb = BaggingClassifier(base_estimator = NB,
                            n_estimators   = 500,
                            bootstrap=True,
                            verbose=True).fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.6s finished


In [13]:
# knn
bagging_knn = BaggingClassifier(base_estimator = KNN,
                            n_estimators   = 500,
                            bootstrap=True,
                            verbose=True).fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.6s finished


In [14]:
bagging = [bagging_svm.predict(X_test),
           bagging_nb.predict(X_test),
           bagging_knn.predict(X_test)]

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s finished


In [15]:
bg_accuracy  = []
bg_precision = []
bg_recall    = []
bg_fscore    = []

labels = ['bg_SVM', 'bg_NB', 'bg_KNN']
for k in range(len(bagging)):
    bg_accuracy.append(accuracy_score(y_test, bagging[k]))
    bg_precision.append(precision_score(y_test, bagging[k], average = 'macro'))
    bg_recall.append(recall_score(y_test, bagging[k], average = 'macro'))
    bg_fscore.append(f1_score(y_test, bagging[k], average = 'macro'))
print('Bagging Classifiers Metrics:')
print('='*40)
for k in range(len(bagging)):
    print(f'{labels[k]} accuracy: {bg_accuracy[k]}')
print('-'*40)
for k in range(len(bagging)):
    print(f'{labels[k]} Precision: {bg_precision[k]}')
print('-'*40)
for k in range(len(bagging)):
    print(f'{labels[k]} Recall: {bg_recall[k]}')
print('-'*40)
for k in range(len(bagging)):
    print(f'{labels[k]} F1-Score: {bg_fscore[k]}')
print('='*40)

Bagging Classifiers Metrics:
bg_SVM accuracy: 0.5714285714285714
bg_NB accuracy: 0.6571428571428571
bg_KNN accuracy: 0.4857142857142857
----------------------------------------
bg_SVM Precision: 0.5499999999999999
bg_NB Precision: 0.6657097288676237
bg_KNN Precision: 0.46158730158730155
----------------------------------------
bg_SVM Recall: 0.5222222222222223
bg_NB Recall: 0.6194444444444445
bg_KNN Recall: 0.41944444444444445
----------------------------------------
bg_SVM F1-Score: 0.5242091242091242
bg_NB F1-Score: 0.630401993573349
bg_KNN F1-Score: 0.400956937799043


### Ensemble: RandomForestClassifier

In [16]:
random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)

rf_predict = random_forest.predict(X_test)

accuracy_rf  = accuracy_score(y_test, rf_predict)
precision_rf = precision_score(y_test, rf_predict, average='macro')
recall_rf    = recall_score(y_test, rf_predict, average='macro')
fscore_rf    = f1_score(y_test, rf_predict, average='macro')
print('='*20)
print('Random Forest Classifier Metrics:')
print(f'Accuracy: {accuracy_rf}')
print(f'Precision: {precision_rf}')
print(f'Recall: {recall_rf}')
print(f'F1-Score: {fscore_rf}')
print('='*20)

Random Forest Classifier Metrics:
Accuracy: 0.5714285714285714
Precision: 0.4583333333333333
Recall: 0.4833333333333334
F1-Score: 0.4447154471544716


### Ensemble: Gradient Tree boosting

In [17]:
gradient = GradientBoostingClassifier()
gradient.fit(X_train, y_train)

gradient_predict = gradient.predict(X_test)

accuracy_gradient  = accuracy_score(y_test, gradient_predict)
precision_gradient = precision_score(y_test, gradient_predict, average='macro')
recall_gradient    = recall_score(y_test, gradient_predict, average='macro')
fscore_gradient    = f1_score(y_test, gradient_predict, average='macro')
print('='*20)
print('Gradient Boosting Classifier Metrics:')
print(f'Accuracy: {accuracy_gradient}')
print(f'Precision: {precision_gradient}')
print(f'Recall: {recall_gradient}')
print(f'F1-Score: {fscore_gradient}')
print('='*20)

Gradient Boosting Classifier Metrics:
Accuracy: 0.6285714285714286
Precision: 0.6698872785829307
Recall: 0.5666666666666668
F1-Score: 0.5715045188729398


### Ensemble: Stacking Classifier

In [18]:
SVM = SVC(C=1000, gamma= 1e-3, kernel = 'rbf').fit(X_train, y_train)
NB  = MultinomialNB(alpha = 1, fit_prior= False).fit(X_train, y_train)
KNN = KNeighborsClassifier(algorithm='auto', n_neighbors=5).fit(X_train, y_train)
estimators = [('svm', SVM),
              ('nb', NB),
              ('knn', KNN)]

In [19]:
stacking = StackingClassifier(estimators = estimators,
                              final_estimator= GradientBoostingClassifier(),
                              verbose    = True).fit(X_train, y_train)

pred_stacking = stacking.predict(X_test)

accuracy_stacking  = accuracy_score(y_test, pred_stacking)
precision_stacking = precision_score(y_test, pred_stacking, average='macro')
recall_stacking    = recall_score(y_test, pred_stacking, average='macro')
fscore_stacking    = f1_score(y_test, pred_voting, average='macro')
print('='*20)
print('Stacking Classifier Metrics:')
print(f'Accuracy: {accuracy_stacking}')
print(f'Precision: {precision_stacking}')
print(f'Recall: {recall_stacking}')
print(f'F1-Score: {fscore_stacking}')
print('='*20)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished


Stacking Classifier Metrics:
Accuracy: 0.5142857142857142
Precision: 0.49427609427609426
Recall: 0.48333333333333334
F1-Score: 0.5040935672514619


### Neural Network Test

In [20]:
from sklearn.neural_network import MLPClassifier

nn = MLPClassifier(hidden_layer_sizes = (500,5),
                   activation         = 'relu',
                   solver             = 'adam',
                   alpha              = 1e-4,
                   batch_size         = 10,
                   learning_rate      = 'adaptive',
                   verbose            = True)
nn.fit(X_train, y_train)

Iteration 1, loss = 1.15909083
Iteration 2, loss = 1.09165743
Iteration 3, loss = 0.99230641
Iteration 4, loss = 0.86676601
Iteration 5, loss = 0.72419439
Iteration 6, loss = 0.59304085
Iteration 7, loss = 0.48627982
Iteration 8, loss = 0.41003334
Iteration 9, loss = 0.34230641
Iteration 10, loss = 0.28531676
Iteration 11, loss = 0.23531892
Iteration 12, loss = 0.19191103
Iteration 13, loss = 0.15817284
Iteration 14, loss = 0.13254473
Iteration 15, loss = 0.11081918
Iteration 16, loss = 0.09463672
Iteration 17, loss = 0.08210177
Iteration 18, loss = 0.07239446
Iteration 19, loss = 0.06558020
Iteration 20, loss = 0.05970599
Iteration 21, loss = 0.05490843
Iteration 22, loss = 0.05175539
Iteration 23, loss = 0.04793102
Iteration 24, loss = 0.04623894
Iteration 25, loss = 0.04411951
Iteration 26, loss = 0.04269187
Iteration 27, loss = 0.03989013
Iteration 28, loss = 0.03776504
Iteration 29, loss = 0.03622000
Iteration 30, loss = 0.03493841
Iteration 31, loss = 0.03537935
Iteration 32, los

In [21]:
pred_nn = nn.predict(X_test)

In [22]:
accuracy_nn  = accuracy_score(y_test, pred_nn)
precision_nn = precision_score(y_test, pred_nn, average='macro')
recall_nn    = recall_score(y_test, pred_nn, average='macro')
fscore_nn    = f1_score(y_test, pred_voting, average='macro')
print('='*20)
print('nn Classifier Metrics:')
print(f'Accuracy: {accuracy_nn}')
print(f'Precision: {precision_nn}')
print(f'Recall: {recall_nn}')
print(f'F1-Score: {fscore_nn}')
print('='*20)

nn Classifier Metrics:
Accuracy: 0.4857142857142857
Precision: 0.5488888888888889
Recall: 0.41944444444444445
F1-Score: 0.5040935672514619
