# 1. Pre-processing

#### 1.1. Text Pre-processing

The dataset is already lowercased and lacks punctuation. We will tokenize the text and remove stopwords, as well as apply lemmatization to the tokens.

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources this notebook relies on, in the same order as before.
for resource in ('wordnet', 'stopwords', 'punkt'):
    nltk.download(resource)

# Load the full corpus; the file holds one JSON record per line.
data = pd.read_json('./data/data.jsonl', lines=True)
def pre_process_data(dataset):
    """Tokenize, remove stop words from, and lemmatize the ``text`` column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a ``text`` column containing raw text that
        ``nltk.word_tokenize`` can process.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns, where each ``text`` entry is the
        list of lemmatized tokens that are not English stop words. The frame
        passed in is not modified, so calling this function never changes a
        frame that another cell already displayed or still references.
    """
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def clean(text):
        # tokenize -> drop stop words -> lemmatize, in one pass per row
        return [
            lemmatizer.lemmatize(token)
            for token in nltk.word_tokenize(text)
            if token not in stop_words
        ]

    # assign() builds a new DataFrame instead of writing into the caller's.
    return dataset.assign(text=dataset['text'].apply(clean))

# Run the shared cleaning routine over the full corpus and preview the result.
data = data.pipe(pre_process_data)
data.head()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\inesc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\inesc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\inesc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,text,label
0,"[feel, awful, job, get, position, succeed, hap...",0
1,"[im, alone, feel, awful]",0
2,"[ive, probably, mentioned, really, feel, proud...",1
3,"[feeling, little, low, day, back]",0
4,"[beleive, much, sensitive, people, feeling, te...",2


In [5]:
# Read each pre-defined split and pass it through the same cleaning routine
# used for the full corpus.
test_data, train_data, validation_data = (
    pre_process_data(pd.read_json(split_path, lines=True))
    for split_path in (
        './data/test.jsonl',
        './data/train.jsonl',
        './data/validation.jsonl',
    )
)

# 2. Vectorization
(This section may be temporary, but I wanted to try TF-IDF after the pre-processing.) -- matos
I don't think it should be temporary: it really does improve the results, and from what we have been reading it is a common and recommended practice. -- ines


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidfVectorizer(data):
    """Build TF-IDF features for the train, validation and test splits.

    A ``TfidfVectorizer`` (unigrams and bigrams, at most 40000 features,
    English stop words removed) is fitted on ``data['text']`` and then applied
    to the notebook-level ``train_data``, ``validation_data`` and
    ``test_data`` frames.

    Parameters
    ----------
    data : pandas.DataFrame
        Pre-processed frame whose ``text`` column holds lists of tokens. It is
        used only to fit the vectorizer.

    Returns
    -------
    tuple
        ``(x_train, x_val, x_test, y_train, y_val, y_test)``: the transformed
        feature matrix of each split followed by each split's ``label``
        column.
    """
    def as_documents(frame):
        # The vectorizer is fed strings, so each token list is joined back
        # into a single space-separated document.
        return frame['text'].apply(' '.join)

    # TODO: check whether including bigrams actually helps.
    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=40000)

    # Only the fitted state is needed here. The original called
    # fit_transform() and discarded the resulting matrix.
    # NOTE(review): if `data` also contains the validation/test rows, fitting
    # on it lets those rows influence the vocabulary and IDF weights --
    # confirm, and consider fitting on the training split only.
    vectorizer.fit(as_documents(data))

    x_train = vectorizer.transform(as_documents(train_data))
    x_val = vectorizer.transform(as_documents(validation_data))
    x_test = vectorizer.transform(as_documents(test_data))

    y_train = train_data['label']
    y_val = validation_data['label']
    y_test = test_data['label']

    return x_train, x_val, x_test, y_train, y_val, y_test



In [7]:
from sklearn.feature_extraction.text import CountVectorizer

def countVectorizer(data):
    """Build bag-of-words count features for the train, validation and test splits.

    A ``CountVectorizer`` (unigrams and bigrams, at most 40000 features,
    English stop words removed) is fitted on ``data['text']`` and then applied
    to the notebook-level ``train_data``, ``validation_data`` and
    ``test_data`` frames.

    Parameters
    ----------
    data : pandas.DataFrame
        Pre-processed frame whose ``text`` column holds lists of tokens. It is
        used only to fit the vectorizer.

    Returns
    -------
    tuple
        ``(x_train, x_val, x_test, y_train, y_val, y_test)``: the transformed
        feature matrix of each split followed by each split's ``label``
        column.
    """
    def as_documents(frame):
        # The vectorizer is fed strings, so each token list is joined back
        # into a single space-separated document.
        return frame['text'].apply(' '.join)

    vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 2), max_features=40000)

    # Only the fitted vocabulary is needed here. The original called
    # fit_transform() and discarded the resulting matrix.
    # NOTE(review): if `data` also contains the validation/test rows, fitting
    # on it lets those rows influence the selected vocabulary -- confirm, and
    # consider fitting on the training split only.
    vectorizer.fit(as_documents(data))

    x_train = vectorizer.transform(as_documents(train_data))
    x_val = vectorizer.transform(as_documents(validation_data))
    x_test = vectorizer.transform(as_documents(test_data))

    y_train = train_data['label']
    y_val = validation_data['label']
    y_test = test_data['label']

    return x_train, x_val, x_test, y_train, y_val, y_test



In [8]:
from gensim.models import Word2Vec
import numpy as np

# def wordEmbeddingsVectorizer(data):
#     return x_train, x_val, x_test, y_train, y_val, y_test



In [16]:
# Choose vectorizer (featurizer): keep exactly one of the three lines below
# active. The word-embeddings option is only a commented-out stub so far.
#x_train, x_val, x_test, y_train, y_val, y_test = tfidfVectorizer(data)
x_train, x_val, x_test, y_train, y_val, y_test = countVectorizer(data)
#x_train, x_val, x_test, y_train, y_val, y_test = wordEmbeddingsVectorizer(data)

## Apply SMOTE
# Only the training split is resampled; the validation and test features and
# labels stay exactly as the vectorizer returned them.
smote = SMOTE(random_state=42, sampling_strategy='auto', k_neighbors=10)
x_train, y_train = smote.fit_resample(x_train, y_train)

# Show the per-class sample counts after resampling.
print(y_train.value_counts())

label
0    5362
3    5362
2    5362
5    5362
4    5362
1    5362
Name: count, dtype: int64



# 3. Model Training

#### 3.1. Model Selection
Here too, I just wanted to plug in a few models to start seeing what they give, since I don't yet know which features will be used:
TF-IDF, word embeddings, ???, more hand-crafted features?


Apparently this is a recognised practice, which I have not implemented (yet):

"The validation set uses a subset of the training data to provide an unbiased evaluation of a model. The validation data set contrasts with training and test sets in that it is an intermediate phase used for choosing the best model and optimizing it. It is in this phase that hyperparameter tuning occurs."

Wikipedia:
The basic process of using a validation data set for model selection (as part of training data set, validation data set, and test data set) is:

Since our goal is to find the network having the best performance on new data, the simplest approach to the comparison of different networks is to evaluate the error function using data which is independent of that used for training. Various networks are trained by minimization of an appropriate error function defined with respect to a training data set. The performance of the networks is then compared by evaluating the error function using an independent validation set, and the network having the smallest error with respect to the validation set is selected. This approach is called the hold out method. Since this procedure can itself lead to some overfitting to the validation set, the performance of the selected network should be confirmed by measuring its performance on a third independent set of data called a test set.

An application of this process is in early stopping, where the candidate models are successive iterations of the same network, and training stops when the error on the validation set grows, choosing the previous model (the one with minimum error).

##### 3.1.1. Logistic Regression

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Report the size of the (resampled) training set, then fit the model.
print(x_train.shape, y_train.shape)
logreg_classifier = LogisticRegression(max_iter=1000).fit(x_train, y_train)

# Validation metrics: accuracy, per-class report, confusion matrix.
y_pred = logreg_classifier.predict(x_val)
print(
    accuracy_score(y_val, y_pred),
    classification_report(y_val, y_pred),
    confusion_matrix(y_val, y_pred),
    sep='\n',
)

(32172, 40000) (32172,)
0.8875
              precision    recall  f1-score   support

           0       0.93      0.94      0.94       550
           1       0.93      0.90      0.91       704
           2       0.76      0.83      0.79       178
           3       0.88      0.88      0.88       275
           4       0.83      0.81      0.82       212
           5       0.70      0.79      0.74        81

    accuracy                           0.89      2000
   macro avg       0.84      0.86      0.85      2000
weighted avg       0.89      0.89      0.89      2000

[[517   4   4   7  12   6]
 [  7 631  39   9   9   9]
 [  4  23 148   2   1   0]
 [ 15   6   2 243   8   1]
 [  6   8   2  13 172  11]
 [  4   8   0   1   4  64]]


##### 3.1.2. Multinomial Naive Bayes

In [18]:

# Fit multinomial naive Bayes on the same training features.
mnb_classifier = MultinomialNB().fit(x_train, y_train)

# Validation metrics: accuracy, per-class report, confusion matrix.
y_pred = mnb_classifier.predict(x_val)
print(
    accuracy_score(y_val, y_pred),
    classification_report(y_val, y_pred),
    confusion_matrix(y_val, y_pred),
    sep='\n',
)



0.8525
              precision    recall  f1-score   support

           0       0.85      0.93      0.89       550
           1       0.86      0.91      0.88       704
           2       0.81      0.68      0.74       178
           3       0.89      0.82      0.86       275
           4       0.84      0.73      0.78       212
           5       0.76      0.64      0.70        81

    accuracy                           0.85      2000
   macro avg       0.84      0.79      0.81      2000
weighted avg       0.85      0.85      0.85      2000

[[510  19   2   7  10   2]
 [ 16 641  24   9  10   4]
 [ 13  42 121   1   1   0]
 [ 27  16   2 226   4   0]
 [ 22  15   1   9 155  10]
 [  9  15   0   1   4  52]]


# 4. Model Evaluation

In [None]:
# Logistic regression on the test split: accuracy, per-class report,
# confusion matrix.
y_pred = logreg_classifier.predict(x_test)
print(
    accuracy_score(y_test, y_pred),
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    sep='\n',
)


0.8975
              precision    recall  f1-score   support

           0       0.93      0.94      0.93       581
           1       0.91      0.94      0.92       695
           2       0.78      0.76      0.77       159
           3       0.89      0.88      0.88       275
           4       0.90      0.86      0.88       224
           5       0.82      0.64      0.72        66

    accuracy                           0.90      2000
   macro avg       0.87      0.84      0.85      2000
weighted avg       0.90      0.90      0.90      2000

[[548  13   2  14   4   0]
 [  4 651  30   4   2   4]
 [  4  32 121   2   0   0]
 [ 19  10   1 241   4   0]
 [ 15   2   1   9 192   5]
 [  2  10   0   0  12  42]]


In [None]:
# Multinomial naive Bayes on the test split: accuracy, per-class report,
# confusion matrix.
y_pred = mnb_classifier.predict(x_test)
print(
    accuracy_score(y_test, y_pred),
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    sep='\n',
)


0.823
              precision    recall  f1-score   support

           0       0.82      0.95      0.88       581
           1       0.78      0.98      0.87       695
           2       0.95      0.34      0.50       159
           3       0.93      0.69      0.79       275
           4       0.87      0.70      0.78       224
           5       1.00      0.14      0.24        66

    accuracy                           0.82      2000
   macro avg       0.89      0.63      0.68      2000
weighted avg       0.84      0.82      0.80      2000

[[554  19   0   5   3   0]
 [  9 683   3   0   0   0]
 [ 23  80  54   2   0   0]
 [ 43  39   0 189   4   0]
 [ 39  21   0   7 157   0]
 [ 11  30   0   0  16   9]]
