# 1. Pre-processing

#### 1.1. Text Pre-processing

The dataset is already lowercased and lacks punctuation. We will tokenize the text and remove stopwords, as well as apply lemmatization to the tokens.

In [26]:
import pandas as pd
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import gensim
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

from sklearn.feature_extraction import text

#words to keep: no couldnt cry not cant cannot nor except nobody off but serious enough nothing alone down only without


data = pd.read_json('./data/data.jsonl', lines=True)


def pre_process_data(dataset):
    #tokenize
    dataset['text'] = dataset['text'].apply(nltk.word_tokenize)

    #remove stop words
    my_stop_words = text.ENGLISH_STOP_WORDS
    #print(my_stop_words)
    words_to_keep = frozenset(['no', 'couldnt', 'cry', 'not', 'cant', 'cannot', 'nor', 'except', 'nobody', 'off', 'but', 'serious', 'enough', 'nothing', 'alone', 'down', 'only', 'without','hereby'])
    my_stop_words = my_stop_words - words_to_keep
    
    dataset['text'] = dataset['text'].apply(lambda x: [word for word in x if word not in my_stop_words])

    #lemmatize
    lemmatizer = WordNetLemmatizer()
    dataset['text'] = dataset['text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
    
    return dataset

data = pre_process_data(data)
data.head()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\inesc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\inesc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\inesc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,text,label
0,"[feel, awful, s, job, position, succeed, just,...",0
1,"[im, alone, feel, awful]",0
2,"[ive, probably, mentioned, but, really, feel, ...",1
3,"[feeling, little, low, day]",0
4,"[beleive, sensitive, people, feeling, tend, co...",2


In [27]:
test_data = pd.read_json('./data/test.jsonl', lines=True)
train_data = pd.read_json('./data/train.jsonl', lines=True)
validation_data = pd.read_json('./data/validation.jsonl', lines=True)

test_data = pre_process_data(test_data)
train_data = pre_process_data(train_data)
validation_data = pre_process_data(validation_data)
print(test_data['text'][13])

print(test_data["text"])

['just', 'feel', 'extremely', 'comfortable', 'group', 'people', 'dont', 'need', 'hide']
0        [im, feeling, rotten, im, not, ambitious, right]
1                      [im, updating, blog, feel, shitty]
2       [make, separate, don, t, want, feel, like, m, ...
3       [left, bouquet, red, yellow, tulip, arm, feeli...
4                            [feeling, little, vain, did]
                              ...                        
1995    [just, feeling, like, unkind, doing, wrong, th...
1996    [im, feeling, little, cranky, negative, doctor...
1997    [feel, useful, people, give, great, feeling, a...
1998    [im, feeling, comfortable, derby, feel, start,...
1999    [feel, weird, meet, w, people, text, but, like...
Name: text, Length: 2000, dtype: object


# 2. Vectorization
(secção possívelmente temporária, mas queria experimentar as cenas de tf_idf depois do pré-processamento) -- matos
não acho que convenha ser temporaria, dado que efetivamente melhora os resultados ihihihi, e é uma prática comum e recomendada pelo que estivemos a ver -- ines


### TF-IDF

#### Hypertunning for tf-idf

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer




def tfidfVectorizer(data, train_data, validation_data, test_data):
    vectorizer = TfidfVectorizer(stop_words=list(my_stop_words), ngram_range=(1,2), max_features=40000)
    # good idea to use two-grams??
    # print(X.shape)

    x_train = vectorizer.fit_transform(train_data['text'].apply(lambda x: ' '.join(x)))

    x_val = vectorizer.transform(validation_data['text'].apply(lambda x: ' '.join(x)))
    x_test = vectorizer.transform(test_data['text'].apply(lambda x: ' '.join(x)))

    y_train = train_data['label']
    y_val = validation_data['label']
    y_test = test_data['label']
    
    # print(vectorizer.get_feature_names_out())

    return x_train, x_val, x_test, y_train, y_val, y_test



### BOW model
Count vectorizer which is an implementation of the BOW model.

The disadvantage of the BOW model is it does not consider the sequence of words, and as language does involve sequence and context, sometimes the BOW model might not be a good fit for the best-case scenario.

In [29]:
from sklearn.feature_extraction.text import CountVectorizer

def countVectorizer(data, train_data, validation_data, test_data):
    # Initialize CountVectorizer
    vectorizer = CountVectorizer(stop_words=my_stop_words, ngram_range=(1,2), max_features=40000)

    x_train= vectorizer.fit_transform(train_data['text'].apply(lambda x: ' '.join(x)))

    x_val = vectorizer.transform(validation_data['text'].apply(lambda x: ' '.join(x)))
    x_test = vectorizer.transform(test_data['text'].apply(lambda x: ' '.join(x)))

    y_train = train_data['label']
    y_val = validation_data['label']
    y_test = test_data['label']

    return x_train, x_val, x_test, y_train, y_val, y_test



### Doc 2 Vec

In [30]:
# import optuna
# from gensim.models import Doc2Vec
# from gensim.models.doc2vec import TaggedDocument
# from sklearn.metrics import accuracy_score
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import train_test_split
# import numpy as np

# y_train = train_data['label']
# y_val = validation_data['label']

# def objective(trial):
#     vector_size = trial.suggest_int("vector_size", 50, 300)
#     window = trial.suggest_int("window", 3, 15)
#     min_count = trial.suggest_int("min_count", 1, 10)
#     epochs = trial.suggest_int("epochs", 10, 50)
    
#     # Train Doc2Vec model
#     tagged_data = [TaggedDocument(words=doc, tags=[str(label)]) for doc, label in zip(train_data['text'], train_data['label'])]
#     doc2vec_model = Doc2Vec(vector_size=vector_size, window=window, min_count=min_count, epochs=epochs)
#     doc2vec_model.build_vocab(tagged_data)
#     doc2vec_model.train(tagged_data, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)
    
#     # Prepare document vectors for training and test sets
#     X_train_vecs  = [doc2vec_model.infer_vector(doc) for doc in train_data['text']]
#     X_val_vecs = [doc2vec_model.infer_vector(doc) for doc in validation_data['text']]
    
#     # Train Logistic Regression classifier
#     classifier = LogisticRegression(max_iter=1000)
#     classifier.fit(X_train_vecs, y_train)
    
#     # Evaluate on test set
#     y_pred = classifier.predict(X_val_vecs)
#     accuracy = accuracy_score(y_val, y_pred)
    
#     return accuracy

# # Define the study object and optimize the objective function
# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=100)

# # Print the best hyperparameters found
# best_params = study.best_params
# print("Best hyperparameters:", best_params)

# # Train the final model with the best hyperparameters
# best_vector_size = best_params["vector_size"]
# best_window = best_params["window"]
# best_min_count = best_params["min_count"]
# best_epochs = best_params["epochs"]

# # Train Doc2Vec model with best hyperparameters
# tagged_data = [TaggedDocument(words=doc.split(), tags=[i]) for i, doc in enumerate(train_data["text"])]
# best_doc2vec_model = Doc2Vec(vector_size=best_vector_size, window=best_window, min_count=best_min_count, epochs=best_epochs)
# best_doc2vec_model.build_vocab(tagged_data)
# best_doc2vec_model.train(tagged_data, total_examples=best_doc2vec_model.corpus_count, epochs=best_doc2vec_model.epochs)

# # Prepare document vectors for training and test sets with the best Doc2Vec model
# X_train_vecs  = [best_doc2vec_model.infer_vector(doc) for doc in train_data['text']]
# X_val_vecs = [best_doc2vec_model.infer_vector(doc) for doc in validation_data['text']]

# # Train Logistic Regression classifier on the final Doc2Vec vectors
# final_classifier = LogisticRegression(max_iter=1000)
# final_classifier.fit(X_train_vecs, y_train)

# # Evaluate on test set
# final_accuracy = final_classifier.score(X_val_vecs, y_val)
# print("Final accuracy on test set with the best model:", final_accuracy)


In [31]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn import utils

def train_doc2vec(train_data):
    max_epochs = 16
    vec_size = 88
    alpha = 0.025
    window = 4
    min_count = 7

    tagged_data = [TaggedDocument(words=doc, tags=[str(label)]) for doc, label in zip(train_data['text'], train_data['label'])]

    # antes tinha workers definidos
    model = Doc2Vec(vector_size=vec_size, window=window, min_count=min_count, epochs=max_epochs)
    
    model.build_vocab(tagged_data)


    model.train(tagged_data, total_examples=model.corpus_count, epochs=50)


    model.save("d2v_best_stopwords.model")
    print("Model Saved")

    return model

train_doc2vec(train_data)

Model Saved


<gensim.models.doc2vec.Doc2Vec at 0x18a96038730>

In [32]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn import utils
from gensim.models.doc2vec import Doc2Vec

def doc2vec(data, train_data, validation_data, test_data):

    # training the doc2vec model
    #model = train_doc2vec(train_data)
    model = Doc2Vec.load("d2v_best_stopwords.model")
    
    x_train = [model.infer_vector(doc) for doc in train_data['text']]
    x_val = [model.infer_vector(doc) for doc in validation_data['text']]
    x_test = [model.infer_vector(doc) for doc in test_data['text']]

    # pca = PCA(n_components=3)
    # x_train = pca.fit_transform(x_train)
    # x_val = pca.transform(x_val)
    # x_test = pca.transform(x_test)

    y_train = train_data['label']
    y_val = validation_data['label']
    y_test = test_data['label']

    return x_train, x_val, x_test, y_train, y_val, y_test

### Word 2 Vec

In [33]:
from gensim.models import Word2Vec
import numpy as np

def lala(model, data):
    vectors = []
    for doc in data['text']:
        document = []
        for word in doc:
            if word in model.wv:
                document.append(model.wv[word])
        vectors.append(document)
    return vectors

# test [d,c,c]  | label
########################
## test 
## [ [] [] [] ] | label

def wordEmbeddingsVectorizer(data):

    data_model = Word2Vec(data["text"], vector_size=3, window=10, min_count=2, workers=10, sg=1)
    
    x_train = lala(data_model, train_data)
    x_val =lala(data_model, validation_data)
    x_test = lala(data_model, test_data)

    y_train = train_data['label']
    y_val = validation_data['label']
    y_test = test_data['label']

    return x_train, x_val, x_test, y_train, y_val, y_test



In [34]:
# Choose vectorizer (featurizer)
#x_train, x_val, x_test, y_train, y_val, y_test = tfidfVectorizer(data, train_data, validation_data, test_data)
#x_train, x_val, x_test, y_train, y_val, y_test = countVectorizer(data, train_data, validation_data, test_data)
#x_train, x_val, x_test, y_train, y_val, y_test = wordEmbeddingsVectorizer(data)
x_train, x_val, x_test, y_train, y_val, y_test = doc2vec(data, train_data, validation_data, test_data)

#print(x_train)

## Apply SMOTE
smote = SMOTE(random_state=42, sampling_strategy='auto', k_neighbors=10)
x_train, y_train = smote.fit_resample(x_train, y_train)



# 3. Model Training

#### 3.1. Model Selection
Aqui também só estava a querer espetar modelos para começar a ver o que dá que ainda não sei que features vão ser usadas:
tf-idf, word embeddings, ???, features mais feitas à mão?


Isto pelos vistos é uma cena, que não implementei (ainda..)

"The validation set uses a subset of the training data to provide an unbiased evaluation of a model. The validation data set contrasts with training and test sets in that it is an intermediate phase used for choosing the best model and optimizing it. It is in this phase that hyperparameter tuning occurs."

Wikipedia:
The basic process of using a validation data set for model selection (as part of training data set, validation data set, and test data set) is:

Since our goal is to find the network having the best performance on new data, the simplest approach to the comparison of different networks is to evaluate the error function using data which is independent of that used for training. Various networks are trained by minimization of an appropriate error function defined with respect to a training data set. The performance of the networks is then compared by evaluating the error function using an independent validation set, and the network having the smallest error with respect to the validation set is selected. This approach is called the hold out method. Since this procedure can itself lead to some overfitting to the validation set, the performance of the selected network should be confirmed by measuring its performance on a third independent set of data called a test set.

An application of this process is in early stopping, where the candidate models are successive iterations of the same network, and training stops when the error on the validation set grows, choosing the previous model (the one with minimum error).

##### 3.1.1. Logistic Regression

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

logreg_classifier = LogisticRegression(max_iter=1000, )
logreg_classifier.fit(x_train, y_train)
y_pred = logreg_classifier.predict(x_val)
print(accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))

0.81
              precision    recall  f1-score   support

           0       0.88      0.85      0.86       550
           1       0.92      0.81      0.86       704
           2       0.67      0.78      0.72       178
           3       0.77      0.78      0.77       275
           4       0.70      0.79      0.74       212
           5       0.50      0.74      0.60        81

    accuracy                           0.81      2000
   macro avg       0.74      0.79      0.76      2000
weighted avg       0.83      0.81      0.81      2000

[[466   9   9  28  25  13]
 [ 22 573  49  18  19  23]
 [  7  19 139   6   6   1]
 [ 17  12   6 215  17   8]
 [ 11   4   4  11 167  15]
 [  7   5   2   3   4  60]]


##### 3.1.2. Multinomial Naive Bayes

In [36]:

# mnb_classifier = MultinomialNB()
# mnb_classifier.fit(x_train, y_train)
# y_pred = mnb_classifier.predict(x_val)
# print(accuracy_score(y_val, y_pred))
# print(classification_report(y_val, y_pred))
# print(confusion_matrix(y_val, y_pred))



ValueError: Negative values in data passed to MultinomialNB (input X)

## SVM


In [37]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

svm_classifier = SVC(kernel='linear', probability=True, random_state=42)
svm_classifier.fit(x_train, y_train)
y_pred = svm_classifier.predict(x_val)

# Evaluate the performance
accuracy = accuracy_score(y_val, y_pred)
report = classification_report(y_val, y_pred)
conf_matrix = confusion_matrix(y_val, y_pred)

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Classification Report:\n", report)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.8025
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.84      0.85       550
           1       0.91      0.82      0.86       704
           2       0.65      0.78      0.71       178
           3       0.76      0.77      0.76       275
           4       0.70      0.75      0.72       212
           5       0.52      0.73      0.61        81

    accuracy                           0.80      2000
   macro avg       0.73      0.78      0.75      2000
weighted avg       0.81      0.80      0.81      2000

Confusion Matrix:
 [[460  14   8  32  31   5]
 [ 27 577  50  19  16  15]
 [ 10  17 139   6   3   3]
 [ 19  15   7 211  15   8]
 [ 10   5   7   8 159  23]
 [  7   7   2   2   4  59]]


## KNN

In [38]:
from sklearn.neighbors import KNeighborsClassifier

knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(x_train, y_train)
y_pred = knn_classifier.predict(x_val)

# Evaluate the performance
accuracy = accuracy_score(y_val, y_pred)
report = classification_report(y_val, y_pred)
conf_matrix = confusion_matrix(y_val, y_pred)

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Classification Report:\n", report)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.6795
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.73      0.75       550
           1       0.86      0.70      0.77       704
           2       0.48      0.63      0.54       178
           3       0.59      0.62      0.60       275
           4       0.54      0.58      0.56       212
           5       0.39      0.70      0.50        81

    accuracy                           0.68      2000
   macro avg       0.60      0.66      0.62      2000
weighted avg       0.71      0.68      0.69      2000

Confusion Matrix:
 [[401  21  15  54  38  21]
 [ 42 494  86  33  29  20]
 [ 11  23 113  10  11  10]
 [ 42  19  11 170  21  12]
 [ 22  13   9  18 124  26]
 [  8   3   3   4   6  57]]


### Boosting Algorithms

Testing with some boosting algorithms

##### XGBOOST

In [39]:
import xgboost as xgb
# XGBoost Classifier
xgb_classifier = xgb.XGBClassifier()
xgb_classifier.fit(x_train, y_train)
y_pred_xgb = xgb_classifier.predict(x_val)

# Evaluate the performance for XGBoost
accuracy_xgb = accuracy_score(y_val, y_pred_xgb)
report_xgb = classification_report(y_val, y_pred_xgb)
conf_matrix_xgb = confusion_matrix(y_val, y_pred_xgb)

# Print the evaluation metrics for XGBoost
print("XGBoost Classifier Evaluation:")
print("Accuracy:", accuracy_xgb)
print("Classification Report:\n", report_xgb)
print("Confusion Matrix:\n", conf_matrix_xgb)

XGBoost Classifier Evaluation:
Accuracy: 0.7895
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.86      0.83       550
           1       0.83      0.88      0.85       704
           2       0.76      0.58      0.66       178
           3       0.78      0.69      0.73       275
           4       0.71      0.72      0.72       212
           5       0.57      0.48      0.52        81

    accuracy                           0.79      2000
   macro avg       0.74      0.70      0.72      2000
weighted avg       0.79      0.79      0.79      2000

Confusion Matrix:
 [[474  22   5  22  20   7]
 [ 28 620  22  12  15   7]
 [ 17  45 104   6   5   1]
 [ 33  33   4 189  13   3]
 [ 24  13   1   9 153  12]
 [ 14  15   0   5   8  39]]


##### Lightgbm

In [40]:
import lightgbm as lgb

lgb_classifier = lgb.LGBMClassifier()
lgb_classifier.fit(x_train, y_train)
y_pred_lgb = lgb_classifier.predict(x_val)

# Evaluate the performance for LightGBM
accuracy_lgb = accuracy_score(y_val, y_pred_lgb)
report_lgb = classification_report(y_val, y_pred_lgb)
conf_matrix_lgb = confusion_matrix(y_val, y_pred_lgb)

# Print the evaluation metrics for LightGBM
print("\nLightGBM Classifier Evaluation:")
print("Accuracy:", accuracy_lgb)
print("Classification Report:\n", report_lgb)
print("Confusion Matrix:\n", conf_matrix_lgb)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.034909 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22440
[LightGBM] [Info] Number of data points in the train set: 32172, number of used features: 88
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759

LightGBM Classifier Evaluation:
Accuracy: 0.7845
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.85      0.82       550
           1       0.82      0.89      0.85       704
           2       0.75      0.54      0.63       178
           3       0.77      0.68      0.72       275
           4       0.72      0.73      0.

In [41]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost Classifier
adaboost_classifier = AdaBoostClassifier()
adaboost_classifier.fit(x_train, y_train)
y_pred_adaboost = adaboost_classifier.predict(x_val)

# Evaluate the performance for AdaBoost
accuracy_adaboost = accuracy_score(y_val, y_pred_adaboost)
report_adaboost = classification_report(y_val, y_pred_adaboost)
conf_matrix_adaboost = confusion_matrix(y_val, y_pred_adaboost)

# Print the evaluation metrics for AdaBoost
print("AdaBoost Classifier Evaluation:")
print("Accuracy:", accuracy_adaboost)
print("Classification Report:\n", report_adaboost)
print("Confusion Matrix:\n", conf_matrix_adaboost)


AdaBoost Classifier Evaluation:
Accuracy: 0.6675
Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.69      0.71       550
           1       0.79      0.77      0.78       704
           2       0.57      0.52      0.55       178
           3       0.57      0.53      0.55       275
           4       0.50      0.66      0.57       212
           5       0.38      0.44      0.41        81

    accuracy                           0.67      2000
   macro avg       0.59      0.60      0.59      2000
weighted avg       0.68      0.67      0.67      2000

Confusion Matrix:
 [[381  43   8  56  54   8]
 [ 41 539  44  33  23  24]
 [ 17  45  93   5  12   6]
 [ 41  35  13 147  33   6]
 [ 31  13   1  13 139  15]
 [ 13   8   4   4  16  36]]


###  Bagging Algorithm

In [42]:
from sklearn.ensemble import BaggingClassifier, VotingClassifier

model1 = xgb_classifier
model2 = svm_classifier
model3 = logreg_classifier

voting_clf = VotingClassifier(estimators=[
    ('model1', model1), 
    ('model2', model2),
    ('model3', model3)
], voting='soft')
# Fit the ensemble model
voting_clf.fit(x_train, y_train)

# Predict the validation set
y_pred_voting = voting_clf.predict(x_val)

# Evaluate the performance for the ensemble model
accuracy_voting = accuracy_score(y_val, y_pred_voting)
report_voting = classification_report(y_val, y_pred_voting)
conf_matrix_voting = confusion_matrix(y_val, y_pred_voting)

# Print the evaluation metrics for the ensemble model
print("Voting Classifier Evaluation:")
print("Accuracy:", accuracy_voting)
print("Classification Report:\n", report_voting)
print("Confusion Matrix:\n", conf_matrix_voting)


Voting Classifier Evaluation:
Accuracy: 0.817
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.86      0.86       550
           1       0.91      0.84      0.87       704
           2       0.69      0.76      0.72       178
           3       0.78      0.78      0.78       275
           4       0.70      0.77      0.73       212
           5       0.57      0.70      0.63        81

    accuracy                           0.82      2000
   macro avg       0.75      0.79      0.77      2000
weighted avg       0.82      0.82      0.82      2000

Confusion Matrix:
 [[472  10   8  27  27   6]
 [ 26 592  42  15  17  12]
 [  9  21 135   6   5   2]
 [ 17  17   5 215  15   6]
 [ 14   5   5   8 163  17]
 [  8   7   1   3   5  57]]


# 5. Model Evaluation

In [44]:
# evaluate with test set
y_pred = logreg_classifier.predict(x_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

y_pred_logreg = logreg_classifier.predict(x_test)

0.8015
              precision    recall  f1-score   support

           0       0.91      0.85      0.88       581
           1       0.91      0.78      0.84       695
           2       0.58      0.77      0.66       159
           3       0.78      0.79      0.79       275
           4       0.73      0.76      0.75       224
           5       0.39      0.83      0.53        66

    accuracy                           0.80      2000
   macro avg       0.72      0.80      0.74      2000
weighted avg       0.83      0.80      0.81      2000

[[492  17   8  30  21  13]
 [ 11 545  74  11  23  31]
 [  6  17 122   5   4   5]
 [ 17   9   4 218  13  14]
 [ 12   5   1  12 171  23]
 [  1   4   0   3   3  55]]


In [None]:
# y_pred = mnb_classifier.predict(x_test)
# print(accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred))

In [45]:
y_pred = svm_classifier.predict(x_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


0.786
              precision    recall  f1-score   support

           0       0.89      0.84      0.86       581
           1       0.90      0.79      0.84       695
           2       0.55      0.72      0.62       159
           3       0.74      0.76      0.75       275
           4       0.71      0.75      0.73       224
           5       0.39      0.68      0.50        66

    accuracy                           0.79      2000
   macro avg       0.70      0.76      0.72      2000
weighted avg       0.81      0.79      0.79      2000

[[490  17  12  34  19   9]
 [ 15 548  75  14  19  24]
 [  8  20 114   9   5   3]
 [ 21  10   8 208  20   8]
 [ 15   4   0  13 167  25]
 [  4   9   0   3   5  45]]


In [46]:
y_pred = xgb_classifier.predict(x_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


0.781
              precision    recall  f1-score   support

           0       0.81      0.88      0.84       581
           1       0.80      0.86      0.83       695
           2       0.68      0.58      0.63       159
           3       0.78      0.64      0.70       275
           4       0.73      0.68      0.71       224
           5       0.58      0.45      0.51        66

    accuracy                           0.78      2000
   macro avg       0.73      0.68      0.70      2000
weighted avg       0.78      0.78      0.78      2000

[[509  35   3  18  13   3]
 [ 35 600  34   9  11   6]
 [ 10  44  93   6   4   2]
 [ 38  36   4 177  17   3]
 [ 28  19   3  13 153   8]
 [  5  15   0   5  11  30]]


In [47]:
y_pred = lgb_classifier.predict(x_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

0.777
              precision    recall  f1-score   support

           0       0.80      0.88      0.84       581
           1       0.80      0.86      0.83       695
           2       0.68      0.56      0.62       159
           3       0.75      0.63      0.69       275
           4       0.75      0.69      0.72       224
           5       0.59      0.44      0.50        66

    accuracy                           0.78      2000
   macro avg       0.73      0.68      0.70      2000
weighted avg       0.77      0.78      0.77      2000

[[509  33   5  18  13   3]
 [ 31 598  31  14  14   7]
 [ 13  47  89   8   1   1]
 [ 43  41   2 174  13   2]
 [ 27  18   3  14 155   7]
 [ 10  14   0   3  10  29]]


In [48]:
y_pred = adaboost_classifier.predict(x_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

0.661
              precision    recall  f1-score   support

           0       0.76      0.70      0.73       581
           1       0.77      0.72      0.74       695
           2       0.49      0.55      0.51       159
           3       0.55      0.55      0.55       275
           4       0.55      0.63      0.59       224
           5       0.32      0.47      0.38        66

    accuracy                           0.66      2000
   macro avg       0.57      0.60      0.59      2000
weighted avg       0.67      0.66      0.67      2000

[[409  48  15  52  50   7]
 [ 46 501  59  42  17  30]
 [ 12  36  87   9  11   4]
 [ 39  35  12 152  27  10]
 [ 23  21   5  18 142  15]
 [ 10   9   1   5  10  31]]


In [49]:
y_pred = knn_classifier.predict(x_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

0.6805
              precision    recall  f1-score   support

           0       0.80      0.74      0.77       581
           1       0.85      0.66      0.75       695
           2       0.44      0.67      0.53       159
           3       0.59      0.63      0.61       275
           4       0.55      0.64      0.60       224
           5       0.35      0.65      0.46        66

    accuracy                           0.68      2000
   macro avg       0.60      0.67      0.62      2000
weighted avg       0.72      0.68      0.69      2000

[[431  27  19  49  38  17]
 [ 38 462  88  42  37  28]
 [  8  21 107   7  10   6]
 [ 33  17  16 174  22  13]
 [ 24  11   9  20 144  16]
 [  3   6   3   2   9  43]]


# 6. Cause of errors


In [None]:
emotions_to_int = {
    'sadness': 0,
    'joy': 1,
    'love': 2,
    'anger': 3,
    'fear': 4,
    'surprise': 5
}

int_to_emotions = {v: k for k, v in emotions_to_int.items()}


wrong_predictions = y_test[y_test != y_pred_logreg].index
for i, index in enumerate(wrong_predictions):
    if i == 10:
        break
    print(test_data['text'][index])
    print('Real: ', int_to_emotions[test_data['label'][index]])
    print('Pred:', int_to_emotions[y_pred_logreg[index]])
    print('\n')



In [None]:
print(confusion_matrix(y_test, y_pred_logreg))
print(classification_report(y_test, y_pred_logreg))

In [None]:
# check which emotions are being confused
conf_matrix = confusion_matrix(y_test, y_pred_logreg)
conf_matrix_copy = conf_matrix.copy()
for i in range(conf_matrix.shape[0]):
    conf_matrix[i, i] = 0 # make the diagonal null, to not eclipse the other values
    
plt.imshow(conf_matrix, cmap='viridis', interpolation='nearest')
# annotate the axes with the emotion names
plt.xticks(range(6), int_to_emotions.values(), rotation=45)
plt.yticks(range(6), int_to_emotions.values())
# add colorbar more to the right
# plt.colorbar()
# legend the axes with predicted and true values
plt.xlabel('Predicted')
plt.ylabel('True')

# add counts in the plot
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        if i == j:
            plt.text(j, i, conf_matrix_copy[i, j], ha='center', va='center', color='white')
        else:
            plt.text(j, i, conf_matrix[i, j], ha='center', va='center', color='black')

for i in range(conf_matrix.shape[0]): # this does not count correct predictions
    plt.text(6, i, conf_matrix[i, :].sum(), ha='center', va='center', color='black')
for i in range(conf_matrix.shape[1]):
    plt.text(i, 7, conf_matrix[:, i].sum(), ha='center', va='center', color='black')
plt.show()

The test set appears to be unbalance.
- 'joy' is a lot more mixed with 'love' than the contrary. 
- 'surprise' has a low Precision

In [None]:
conf_matrix = confusion_matrix(y_test, y_pred_logreg)
conf_matrix = conf_matrix / conf_matrix.sum(axis=1)[:, None] # normalize the confusion matrix
conf_matrix_copy = conf_matrix.copy()
for i in range(conf_matrix.shape[0]):
    conf_matrix[i, i] = 0 # make the diagonal null, to not eclipse the other values
plt.imshow(conf_matrix, cmap='viridis', interpolation='nearest')

plt.xticks(range(6), int_to_emotions.values(), rotation=45)
plt.yticks(range(6), int_to_emotions.values())
plt.colorbar()
plt.xlabel('Predicted')
plt.ylabel('True')

# add counts in the plot
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        if i == j:
            plt.text(j, i, f'{conf_matrix_copy[i, j]:.2f}', ha='center', va='center', color='white')
        else:
            plt.text(j, i, f'{conf_matrix[i, j]:.2f}', ha='center', va='center', color='black')

        
plt.title('Percentage of predictions (row sum=1)')
# Rows add to 1
plt.show()



- sadness mixed with joy and anger
- joy mixed with love
- love mixed with joy
- anger mixed with sadness and joy??
- fear mixed with sadness
- surprise mixed with almost everything

# 7. Experiment with user-inputed setences

In [80]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction import text

def preprocess_text(phrase):
    # Tokenize
    tokens = nltk.word_tokenize(phrase.lower())

    # Remove stop words
    my_stop_words = set(stopwords.words('english'))
    words_to_keep = frozenset(['no', 'couldnt', 'cry', 'not', 'cant', 'cannot', 'nor', 'except', 'nobody', 'off', 'but', 'serious', 'enough', 'nothing', 'alone', 'down', 'only', 'without','hereby'])
    my_stop_words -= words_to_keep
    tokens = [word for word in tokens if word not in my_stop_words]

    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    return tokens

def doc2vec_text(tokens):
    model = Doc2Vec.load("d2v_best_stopwords.model")
    return model.infer_vector(tokens)

def classify_emotion(number):
    if number==0:
        return "0. sadness"
    if number==1:
        return "1. joy"
    if number==2:
        return "2. love"
    if number==3:
        return "3. anger"
    if number==4:
        return "4. fear"
    if number==5:
        return "5. surprise"

    return "Not possible to identify"


def analyze_sentiment(phrase):
    tokens = preprocess_text(phrase)
    tokens_embeddings = doc2vec_text(tokens)

    # Reshape to have proper structure
    tokens_embeddings = np.array(tokens_embeddings).reshape(1, -1)

    # Predict the class using the SVM classifier
    predicted_class = svm_classifier.predict(tokens_embeddings)
    print(classify_emotion(predicted_class[0]))


In [81]:
phrase = "she touched me"
analyze_sentiment(phrase)

3. anger
