In [1]:

from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import VotingClassifier, BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier 
from sklearn.ensemble import StackingClassifier, AdaBoostClassifier
                              
from nltk.tokenize import RegexpTokenizer
from collections import Counter
from unicodedata import normalize
from imblearn.over_sampling import SMOTE
import pandas as pd
import nltk
import numpy as np
import string
import emotion_analysis
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')
nltk.download('rslp')
nltk.download('stopwords')
plt.style.use('seaborn')

[nltk_data] Downloading package rslp to /home/hit-
[nltk_data]     notebook3/nltk_data...
[nltk_data]   Package rslp is already up-to-date!
[nltk_data] Downloading package stopwords to /home/hit-
[nltk_data]     notebook3/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package rslp to /home/hit-
[nltk_data]     notebook3/nltk_data...
[nltk_data]   Package rslp is already up-to-date!
[nltk_data] Downloading package stopwords to /home/hit-
[nltk_data]     notebook3/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = emotion_analysis.open_dataset('dataset.xlsx', 'xlsx')
df.Emoção = df.Emoção.str.lower()

In [3]:
df.Comentarios = df.Comentarios.apply(emotion_analysis.remove_characters)
df.Comentarios = df.Comentarios.apply(emotion_analysis.remove_accents)
df.Comentarios = df.Comentarios.apply(emotion_analysis.tokenize)
df.Comentarios = df.Comentarios.apply(emotion_analysis.remove_stopwords)
df.Comentarios = df.Comentarios.apply(emotion_analysis.untokenize)
df.Comentarios = df.Comentarios.apply(emotion_analysis.stemming)

In [4]:
X, y = df.Comentarios, df.Emoção
X = emotion_analysis.vectorizer(X, 'tfidf')
train_size = .8
X_train, X_test, y_train, y_test = emotion_analysis.dataset_split(X, y, train_size)

In [5]:
oversample = SMOTE(random_state=0)
X_train_ressampled, y_train_ressampled, = oversample.fit_resample(X_train, y_train)

In [6]:
NB  = emotion_analysis.cv_train(classifier_name='NB',  X=X_train_ressampled, y= y_train_ressampled, n_fold=10)
SVM = emotion_analysis.cv_train(classifier_name='SVM', X=X_train_ressampled, y= y_train_ressampled, n_fold=10)
KNN = emotion_analysis.cv_train(classifier_name='KNN', X=X_train_ressampled, y= y_train_ressampled, n_fold=10)

pred_nb  = NB.predict(X_test)   # Naive Bayes prediction
pred_svm = SVM.predict(X_test)  # SVM prediction
pred_knn = KNN.predict(X_test)  # KNN prediction

Naive Bayes best parameters: {'alpha': 0.1, 'fit_prior': False}
Naive Bayes best accuracy in 10 folds: 82.23684210526315
Support Vector Machine best parameters: {'C': 100, 'gamma': 0.001, 'kernel': 'linear'}
Support Vector Machine best accuracy in 10 folds: 87.36842105263158
K-Nearest Neighbors best parameters: {'algorithm': 'auto', 'n_neighbors': 3}
K-Nearest Neighbors best accuracy in 10 folds: 74.47368421052632


In [7]:
NB  = MultinomialNB(alpha = 1, fit_prior= True).fit(X_train_ressampled, y_train_ressampled)
SVM = SVC(C=1, gamma = 1e-4, kernel='linear').fit(X_train_ressampled, y_train_ressampled)
KNN = KNeighborsClassifier(algorithm='auto', n_neighbors=3).fit(X_train_ressampled, y_train_ressampled)

## Voting classifier

In [8]:
estimators = [('svm', SVM),
              ('nb', NB),
              ('knn', KNN)]

voting = VotingClassifier(estimators = estimators,
                          voting     = 'hard',
                          verbose    = True).fit(X_train_ressampled, y_train_ressampled)

[Voting] ...................... (1 of 3) Processing svm, total=   0.0s
[Voting] ....................... (2 of 3) Processing nb, total=   0.0s
[Voting] ...................... (3 of 3) Processing knn, total=   0.0s


In [9]:
pred_voting = voting.predict(X_test)
accuracy_voting  = accuracy_score(y_test, pred_voting)
precision_voting = precision_score(y_test, pred_voting, average='macro')
recall_voting    = recall_score(y_test, pred_voting, average='macro')
fscore_voting    = f1_score(y_test, pred_voting, average='macro')
print('='*20)
print('Voting Classifier Metrics:')
print(f'Accuracy: {accuracy_voting}')
print(f'Precision: {precision_voting}')
print(f'Recall: {recall_voting}')
print(f'F1-Score: {fscore_voting}')
print('='*20)

Voting Classifier Metrics:
Accuracy: 0.6571428571428571
Precision: 0.6364389233954452
Recall: 0.5711739241151005
F1-Score: 0.5833333333333333


## Bagging Classifier

In [10]:
# svm
bagging_svm = BaggingClassifier(base_estimator = SVM,
                            n_estimators   = 500,
                            bootstrap=True,
                            verbose=True).fit(X_train_ressampled, y_train_ressampled)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.8s finished


In [11]:
# naive bayes
bagging_nb = BaggingClassifier(base_estimator = NB,
                            n_estimators   = 500,
                            bootstrap=True,
                            verbose=True).fit(X_train_ressampled, y_train_ressampled)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.6s finished


In [12]:
# knn
bagging_knn = BaggingClassifier(base_estimator = KNN,
                            n_estimators   = 500,
                            bootstrap=True,
                            verbose=True).fit(X_train_ressampled, y_train_ressampled)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.3s finished


In [13]:
bagging = [bagging_svm.predict(X_test),
           bagging_nb.predict(X_test),
           bagging_knn.predict(X_test)]

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.9s finished


In [14]:
bg_accuracy  = []
bg_precision = []
bg_recall    = []
bg_fscore    = []

labels = ['bg_SVM', 'bg_NB', 'bg_KNN']
for k in range(len(bagging)):
    bg_accuracy.append(accuracy_score(y_test, bagging[k]))
    bg_precision.append(precision_score(y_test, bagging[k], average = 'macro'))
    bg_recall.append(recall_score(y_test, bagging[k], average = 'macro'))
    bg_fscore.append(f1_score(y_test, bagging[k], average = 'macro'))
print('Bagging Classifiers Metrics:')
print('='*40)
for k in range(len(bagging)):
    print(f'{labels[k]} accuracy: {bg_accuracy[k]}')
print('-'*40)
for k in range(len(bagging)):
    print(f'{labels[k]} Precision: {bg_precision[k]}')
print('-'*40)
for k in range(len(bagging)):
    print(f'{labels[k]} Recall: {bg_recall[k]}')
print('-'*40)
for k in range(len(bagging)):
    print(f'{labels[k]} F1-Score: {bg_fscore[k]}')
print('='*40)

Bagging Classifiers Metrics:
bg_SVM accuracy: 0.6857142857142857
bg_NB accuracy: 0.6857142857142857
bg_KNN accuracy: 0.4
----------------------------------------
bg_SVM Precision: 0.6888888888888888
bg_NB Precision: 0.6381766381766382
bg_KNN Precision: 0.46420940170940167
----------------------------------------
bg_SVM Recall: 0.5907817672523555
bg_NB Recall: 0.6228673287496816
bg_KNN Recall: 0.40132416603004833
----------------------------------------
bg_SVM F1-Score: 0.6065678303841862
bg_NB F1-Score: 0.6188311688311688
bg_KNN F1-Score: 0.3993748223927252


## Stacking Classifier

In [15]:
stacking = StackingClassifier(estimators = estimators,
                              final_estimator= SVC(),
                              verbose    = True).fit(X_train_ressampled, y_train_ressampled)

pred_stacking = stacking.predict(X_test)

accuracy_stacking  = accuracy_score(y_test, pred_stacking)
precision_stacking = precision_score(y_test, pred_stacking, average='macro')
recall_stacking    = recall_score(y_test, pred_stacking, average='macro')
fscore_stacking    = f1_score(y_test, pred_voting, average='macro')
print('='*20)
print('Stacking Classifier Metrics:')
print(f'Accuracy: {accuracy_stacking}')
print(f'Precision: {precision_stacking}')
print(f'Recall: {recall_stacking}')
print(f'F1-Score: {fscore_stacking}')
print('='*20)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished


Stacking Classifier Metrics:
Accuracy: 0.6857142857142857
Precision: 0.6888888888888888
Recall: 0.5907817672523555
F1-Score: 0.5833333333333333
