# 1. Pre-processing

#### 1.1. Text Pre-processing

The dataset is already lowercased and lacks punctuation. We will tokenize the text and remove stopwords, as well as apply lemmatization to the tokens.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import gensim
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

data = pd.read_json('./data/data.jsonl', lines=True)
def pre_process_data(dataset):
    #tokenize
    dataset['text'] = dataset['text'].apply(nltk.word_tokenize)

    #remove stop words
    stop_words = set(stopwords.words('english'))
    dataset['text'] = dataset['text'].apply(lambda x: [word for word in x if word not in stop_words])

    #lemmatize
    lemmatizer = WordNetLemmatizer()
    dataset['text'] = dataset['text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

    return dataset

data = pre_process_data(data)
data.head()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\inesc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\inesc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\inesc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,text,label
0,"[feel, awful, job, get, position, succeed, hap...",0
1,"[im, alone, feel, awful]",0
2,"[ive, probably, mentioned, really, feel, proud...",1
3,"[feeling, little, low, day, back]",0
4,"[beleive, much, sensitive, people, feeling, te...",2


In [2]:
test_data = pd.read_json('./data/test.jsonl', lines=True)
train_data = pd.read_json('./data/train.jsonl', lines=True)
validation_data = pd.read_json('./data/validation.jsonl', lines=True)

test_data = pre_process_data(test_data)
train_data = pre_process_data(train_data)
validation_data = pre_process_data(validation_data)

print(test_data["text"])

0       [im, feeling, rather, rotten, im, ambitious, r...
1                      [im, updating, blog, feel, shitty]
2       [never, make, separate, ever, want, feel, like...
3       [left, bouquet, red, yellow, tulip, arm, feeli...
4                            [feeling, little, vain, one]
                              ...                        
1995    [keep, feeling, like, someone, unkind, wrong, ...
1996    [im, feeling, little, cranky, negative, doctor...
1997    [feel, useful, people, give, great, feeling, a...
1998    [im, feeling, comfortable, derby, feel, though...
1999    [feel, weird, meet, w, people, text, like, don...
Name: text, Length: 2000, dtype: object


# 2. Vectorization
(secção possívelmente temporária, mas queria experimentar as cenas de tf_idf depois do pré-processamento) -- matos
não acho que convenha ser temporaria, dado que efetivamente melhora os resultados ihihihi, e é uma prática comum e recomendada pelo que estivemos a ver -- ines


### TF-IDF

#### Hypertunning for tf-idf

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidfVectorizer(data, train_data, validation_data, test_data):
    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2), max_features=40000)
    # good idea to use two-grams??
    # print(X.shape)

    x_train = vectorizer.fit_transform(train_data['text'].apply(lambda x: ' '.join(x)))

    x_val = vectorizer.transform(validation_data['text'].apply(lambda x: ' '.join(x)))
    x_test = vectorizer.transform(test_data['text'].apply(lambda x: ' '.join(x)))

    y_train = train_data['label']
    y_val = validation_data['label']
    y_test = test_data['label']
    
    # print(vectorizer.get_feature_names_out())

    return x_train, x_val, x_test, y_train, y_val, y_test



### BOW model
Count vectorizer which is an implementation of the BOW model.

The disadvantage of the BOW model is it does not consider the sequence of words, and as language does involve sequence and context, sometimes the BOW model might not be a good fit for the best-case scenario.

In [54]:
from sklearn.feature_extraction.text import CountVectorizer

def countVectorizer(data, train_data, validation_data, test_data):
    # Initialize CountVectorizer
    vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,2), max_features=40000)

    x_train= vectorizer.fit_transform(train_data['text'].apply(lambda x: ' '.join(x)))

    x_val = vectorizer.transform(validation_data['text'].apply(lambda x: ' '.join(x)))
    x_test = vectorizer.transform(test_data['text'].apply(lambda x: ' '.join(x)))

    y_train = train_data['label']
    y_val = validation_data['label']
    y_test = test_data['label']

    return x_train, x_val, x_test, y_train, y_val, y_test



### Doc 2 Vec

#### Document 2 Vector training

In [99]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn import utils

def train_doc2vec(train_data):
    max_epochs = 10
    vec_size = 100
    alpha = 0.025

    tagged_data = [TaggedDocument(words=doc, tags=[str(label)]) for doc, label in zip(train_data['text'], train_data['label'])]

    model = Doc2Vec(vector_size=100, window=5, min_count=1, workers=8, epochs=50)
    
    model.build_vocab(tagged_data)


    model.train(tagged_data, total_examples=model.corpus_count, epochs=50)


    model.save("d2v_pca.model")
    print("Model Saved")

    return model

train_doc2vec(train_data)

Model Saved


<gensim.models.doc2vec.Doc2Vec at 0x28617851ea0>

In [100]:
# from gensim.models.doc2vec import Doc2Vec, TaggedDocument
# from sklearn import utils
# from gensim.models.doc2vec import Doc2Vec
# from sklearn.decomposition import PCA

# def doc2vec(data, train_data, validation_data, test_data):

#     # training the doc2vec model
#     #model = train_doc2vec(train_data)
#     model = Doc2Vec.load("d2v_pca.model")
    
#     x_train = [model.infer_vector(doc) for doc in train_data['text']]
#     x_val = [model.infer_vector(doc) for doc in validation_data['text']]
#     x_test = [model.infer_vector(doc) for doc in test_data['text']]

#     # pca = PCA(n_components=3)
#     # x_train = pca.fit_transform(x_train)
#     # x_val = pca.transform(x_val)
#     # x_test = pca.transform(x_test)

#     y_train = train_data['label']
#     y_val = validation_data['label']
#     y_test = test_data['label']

#     return x_train, x_val, x_test, y_train, y_val, y_test

### Word 2 Vec

In [57]:
from gensim.models import Word2Vec
import numpy as np

def lala(model, data):
    vectors = []
    for doc in data:
        document = []
        for word in doc:
            document.append(model.wv[word])
        vectors.append(document)
    
    vectors = vectors.apply(lambda x: ' '.join(x))
    return print(vectors)

# [ [], [], [] ]
def wordEmbeddingsVectorizer(train_data, validation_data, test_data):
    model = Word2Vec(train_data, vector_size=150, window=10, min_count=2, workers=10, sg=1)

    x_train = lala(model, train_data)
    x_val = lala(model, validation_data)
    x_test = lala(model, test_data)

    y_train = train_data['label']
    y_val = validation_data['label']
    y_test = test_data['label']

    return x_train, x_val, x_test, y_train, y_val, y_test


In [101]:
# Choose vectorizer (featurizer)
#x_train, x_val, x_test, y_train, y_val, y_test = tfidfVectorizer(data, train_data, validation_data, test_data)
#x_train, x_val, x_test, y_train, y_val, y_test = countVectorizer(data, train_data, validation_data, test_data)
#x_train, x_val, x_test, y_train, y_val, y_test = wordEmbeddingsVectorizer(train_data, validation_data, test_data)
x_train, x_val, x_test, y_train, y_val, y_test = doc2vec(data, train_data, validation_data, test_data)

#print(x_train)

## Apply SMOTE
smote = SMOTE(random_state=42, sampling_strategy='auto', k_neighbors=10)
x_train, y_train = smote.fit_resample(x_train, y_train)


##  Hypertunning

In [8]:
import optuna
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import numpy as np

# Load your data
# Assuming X_train, X_test, y_train, y_test are already defined

# Define a function to train and evaluate Doc2Vec + Logistic Regression model
y_train = train_data['label']
y_val = validation_data['label']

def objective(trial):
    vector_size = trial.suggest_int("vector_size", 50, 300)
    window = trial.suggest_int("window", 3, 15)
    min_count = trial.suggest_int("min_count", 1, 10)
    epochs = trial.suggest_int("epochs", 10, 50)
    
    # Train Doc2Vec model
    tagged_data = [TaggedDocument(words=doc, tags=[str(label)]) for doc, label in zip(train_data['text'], train_data['label'])]
    doc2vec_model = Doc2Vec(vector_size=vector_size, window=window, min_count=min_count, epochs=epochs)
    doc2vec_model.build_vocab(tagged_data)
    doc2vec_model.train(tagged_data, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)
    
    # Prepare document vectors for training and test sets
    X_train_vecs  = [doc2vec_model.infer_vector(doc) for doc in train_data['text']]
    X_val_vecs = [doc2vec_model.infer_vector(doc) for doc in validation_data['text']]
    
    # Train Logistic Regression classifier
    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(X_train_vecs, y_train)
    
    # Evaluate on test set
    y_pred = classifier.predict(X_val_vecs)
    accuracy = accuracy_score(y_val, y_pred)
    
    return accuracy

# Define the study object and optimize the objective function
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

# Print the best hyperparameters found
best_params = study.best_params
print("Best hyperparameters:", best_params)

# Train the final model with the best hyperparameters
best_vector_size = best_params["vector_size"]
best_window = best_params["window"]
best_min_count = best_params["min_count"]
best_epochs = best_params["epochs"]

# Train Doc2Vec model with best hyperparameters
tagged_data = [TaggedDocument(words=doc.split(), tags=[i]) for i, doc in enumerate(train_data["text"])]
best_doc2vec_model = Doc2Vec(vector_size=best_vector_size, window=best_window, min_count=best_min_count, epochs=best_epochs)
best_doc2vec_model.build_vocab(tagged_data)
best_doc2vec_model.train(tagged_data, total_examples=best_doc2vec_model.corpus_count, epochs=best_doc2vec_model.epochs)

# Prepare document vectors for training and test sets with the best Doc2Vec model
#X_train_vecs = np.array([best_doc2vec_model.infer_vector(doc.split()) for doc in train_data["text"]])
#X_test_vecs = np.array([best_doc2vec_model.infer_vector(doc.split()) for doc in test_data["text"]])
X_train_vecs  = [best_doc2vec_model.infer_vector(doc) for doc in train_data['text']]
X_val_vecs = [best_doc2vec_model.infer_vector(doc) for doc in validation_data['text']]

# Train Logistic Regression classifier on the final Doc2Vec vectors
final_classifier = LogisticRegression(max_iter=1000)
final_classifier.fit(X_train_vecs, y_train)

# Evaluate on test set
final_accuracy = final_classifier.score(X_val_vecs, y_val)
print("Final accuracy on test set with the best model:", final_accuracy)


[I 2024-03-31 09:03:32,731] A new study created in memory with name: no-name-8ca9bc2d-94a7-4162-b78e-9b927adfe84f


[I 2024-03-31 09:05:14,266] Trial 0 finished with value: 0.837 and parameters: {'vector_size': 288, 'window': 8, 'min_count': 7, 'epochs': 31}. Best is trial 0 with value: 0.837.
[I 2024-03-31 09:06:18,242] Trial 1 finished with value: 0.8515 and parameters: {'vector_size': 182, 'window': 9, 'min_count': 8, 'epochs': 20}. Best is trial 1 with value: 0.8515.
[I 2024-03-31 09:08:14,106] Trial 2 finished with value: 0.84 and parameters: {'vector_size': 167, 'window': 5, 'min_count': 7, 'epochs': 40}. Best is trial 1 with value: 0.8515.
[I 2024-03-31 09:09:58,190] Trial 3 finished with value: 0.8025 and parameters: {'vector_size': 68, 'window': 7, 'min_count': 4, 'epochs': 41}. Best is trial 1 with value: 0.8515.
[I 2024-03-31 09:10:41,288] Trial 4 finished with value: 0.8705 and parameters: {'vector_size': 290, 'window': 10, 'min_count': 8, 'epochs': 13}. Best is trial 4 with value: 0.8705.
[I 2024-03-31 09:11:56,821] Trial 5 finished with value: 0.806 and parameters: {'vector_size': 161,

KeyboardInterrupt: 

In [113]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn import utils

def train_doc2vec(train_data):

    tagged_data = [TaggedDocument(words=doc, tags=[str(label)]) for doc, label in zip(train_data['text'], train_data['label'])]

    model = Doc2Vec(vector_size=100, window=5, min_count=1, workers=8, epochs=10)
    
    model.build_vocab(tagged_data)


    model.train(tagged_data, total_examples=model.corpus_count, epochs=50)


    model.save("d2v_pca.model")
    print("Model Saved")

    return model

train_doc2vec(train_data)

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn import utils
from gensim.models.doc2vec import Doc2Vec
from sklearn.decomposition import PCA

def doc2vec(data, train_data, validation_data, test_data):

    # training the doc2vec model
    #model = train_doc2vec(train_data)
    model = Doc2Vec.load("d2v_pca.model")
    
    x_train = [model.infer_vector(doc) for doc in train_data['text']]
    x_val = [model.infer_vector(doc) for doc in validation_data['text']]
    x_test = [model.infer_vector(doc) for doc in test_data['text']]

    y_train = train_data['label']
    y_val = validation_data['label']
    y_test = test_data['label']

    return x_train, x_val, x_test, y_train, y_val, y_test

x_train, x_val, x_test, y_train, y_val, y_test = doc2vec(data, train_data, validation_data, test_data)



## Apply SMOTE
smote = SMOTE(random_state=42, sampling_strategy='auto', k_neighbors=10)
x_train, y_train = smote.fit_resample(x_train, y_train)

# Data training
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

logreg_classifier = LogisticRegression(max_iter=1000)
logreg_classifier.fit(x_train, y_train)
y_pred = logreg_classifier.predict(x_val)
print(accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))

# evaluate with test set
y_pred = logreg_classifier.predict(x_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Model Saved
0.664
              precision    recall  f1-score   support

           0       0.77      0.72      0.74       550
           1       0.84      0.65      0.73       704
           2       0.47      0.58      0.52       178
           3       0.63      0.65      0.64       275
           4       0.50      0.64      0.56       212
           5       0.34      0.68      0.45        81

    accuracy                           0.66      2000
   macro avg       0.59      0.65      0.61      2000
weighted avg       0.70      0.66      0.67      2000

[[397  23  19  39  43  29]
 [ 53 459  68  38  52  34]
 [ 16  27 104   7  15   9]
 [ 24  20  14 178  22  17]
 [ 20   9  11  17 135  20]
 [  7   7   3   5   4  55]]
0.6845
              precision    recall  f1-score   support

           0       0.80      0.72      0.76       581
           1       0.83      0.69      0.75       695
           2       0.43      0.60      0.50       159
           3       0.65      0.68      0.67       27


# 3. Model Training

#### 3.1. Model Selection
Aqui também só estava a querer espetar modelos para começar a ver o que dá que ainda não sei que features vão ser usadas:
tf-idf, word embeddings, ???, features mais feitas à mão?


Isto pelos vistos é uma cena, que não implementei (ainda..)

"The validation set uses a subset of the training data to provide an unbiased evaluation of a model. The validation data set contrasts with training and test sets in that it is an intermediate phase used for choosing the best model and optimizing it. It is in this phase that hyperparameter tuning occurs."

Wikipedia:
The basic process of using a validation data set for model selection (as part of training data set, validation data set, and test data set) is:

Since our goal is to find the network having the best performance on new data, the simplest approach to the comparison of different networks is to evaluate the error function using data which is independent of that used for training. Various networks are trained by minimization of an appropriate error function defined with respect to a training data set. The performance of the networks is then compared by evaluating the error function using an independent validation set, and the network having the smallest error with respect to the validation set is selected. This approach is called the hold out method. Since this procedure can itself lead to some overfitting to the validation set, the performance of the selected network should be confirmed by measuring its performance on a third independent set of data called a test set.

An application of this process is in early stopping, where the candidate models are successive iterations of the same network, and training stops when the error on the validation set grows, choosing the previous model (the one with minimum error).

##### 3.1.1. Logistic Regression

In [102]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

logreg_classifier = LogisticRegression(max_iter=1000)
logreg_classifier.fit(x_train, y_train)
y_pred = logreg_classifier.predict(x_val)
print(accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))

0.6555
              precision    recall  f1-score   support

           0       0.76      0.71      0.74       550
           1       0.84      0.64      0.73       704
           2       0.46      0.58      0.51       178
           3       0.63      0.67      0.65       275
           4       0.48      0.63      0.54       212
           5       0.32      0.60      0.42        81

    accuracy                           0.66      2000
   macro avg       0.58      0.64      0.60      2000
weighted avg       0.70      0.66      0.67      2000

[[392  20  22  41  45  30]
 [ 50 450  77  35  63  29]
 [ 17  26 103  11  11  10]
 [ 21  16  13 183  23  19]
 [ 25  12   9  14 134  18]
 [  9   9   2   6   6  49]]


##### 3.1.2. Multinomial Naive Bayes

In [60]:

mnb_classifier = MultinomialNB()
mnb_classifier.fit(x_train, y_train)
y_pred = mnb_classifier.predict(x_val)
print(accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))



ValueError: Negative values in data passed to MultinomialNB (input X)

## SVM


In [103]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

svm_classifier = SVC(kernel='linear')
svm_classifier.fit(x_train, y_train)
y_pred = svm_classifier.predict(x_val)

# Evaluate the performance
accuracy = accuracy_score(y_val, y_pred)
report = classification_report(y_val, y_pred)
conf_matrix = confusion_matrix(y_val, y_pred)

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Classification Report:\n", report)
print("Confusion Matrix:\n", conf_matrix)


Accuracy: 0.663
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.73      0.74       550
           1       0.84      0.66      0.74       704
           2       0.45      0.57      0.50       178
           3       0.62      0.63      0.62       275
           4       0.49      0.64      0.56       212
           5       0.39      0.60      0.47        81

    accuracy                           0.66      2000
   macro avg       0.59      0.64      0.61      2000
weighted avg       0.69      0.66      0.67      2000

Confusion Matrix:
 [[402  23  22  45  41  17]
 [ 57 466  71  32  55  23]
 [ 18  30 101   9  11   9]
 [ 29  17  17 172  28  12]
 [ 24  10  11  14 136  17]
 [ 11   9   1   5   6  49]]


### Boosting Algorithms

Testing with some boosting algorithms

##### XGBOOST

In [None]:
import xgboost as xgb
# XGBoost Classifier
xgb_classifier = xgb.XGBClassifier()
xgb_classifier.fit(x_train, y_train)
y_pred_xgb = xgb_classifier.predict(x_val)

# Evaluate the performance for XGBoost
accuracy_xgb = accuracy_score(y_val, y_pred_xgb)
report_xgb = classification_report(y_val, y_pred_xgb)
conf_matrix_xgb = confusion_matrix(y_val, y_pred_xgb)

# Print the evaluation metrics for XGBoost
print("XGBoost Classifier Evaluation:")
print("Accuracy:", accuracy_xgb)
print("Classification Report:\n", report_xgb)
print("Confusion Matrix:\n", conf_matrix_xgb)

XGBoost Classifier Evaluation:
Accuracy: 0.592
Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.63      0.63       550
           1       0.64      0.76      0.69       704
           2       0.45      0.36      0.40       178
           3       0.56      0.45      0.50       275
           4       0.47      0.42      0.44       212
           5       0.43      0.35      0.38        81

    accuracy                           0.59      2000
   macro avg       0.53      0.49      0.51      2000
weighted avg       0.58      0.59      0.58      2000

Confusion Matrix:
 [[344 104  18  43  36   5]
 [ 79 535  39  15  26  10]
 [ 25  59  64  14  10   6]
 [ 54  60   7 124  20  10]
 [ 32  55   9  21  89   6]
 [ 11  23   5   4  10  28]]


##### Lightgbm

In [None]:
import lightgbm as lgb

lgb_classifier = lgb.LGBMClassifier()
lgb_classifier.fit(x_train, y_train)
y_pred_lgb = lgb_classifier.predict(x_val)

# Evaluate the performance for LightGBM
accuracy_lgb = accuracy_score(y_val, y_pred_lgb)
report_lgb = classification_report(y_val, y_pred_lgb)
conf_matrix_lgb = confusion_matrix(y_val, y_pred_lgb)

# Print the evaluation metrics for LightGBM
print("\nLightGBM Classifier Evaluation:")
print("Accuracy:", accuracy_lgb)
print("Classification Report:\n", report_lgb)
print("Confusion Matrix:\n", conf_matrix_lgb)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.024793 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5100
[LightGBM] [Info] Number of data points in the train set: 32172, number of used features: 20
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759

LightGBM Classifier Evaluation:
Accuracy: 0.587
Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.62      0.63       550
           1       0.67      0.72      0.69       704
           2       0.44      0.43      0.44       178
           3       0.50      0.42      0.45       275
           4       0.46      0.46      0.46

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost Classifier
adaboost_classifier = AdaBoostClassifier()
adaboost_classifier.fit(x_train, y_train)
y_pred_adaboost = adaboost_classifier.predict(x_val)

# Evaluate the performance for AdaBoost
accuracy_adaboost = accuracy_score(y_val, y_pred_adaboost)
report_adaboost = classification_report(y_val, y_pred_adaboost)
conf_matrix_adaboost = confusion_matrix(y_val, y_pred_adaboost)

# Print the evaluation metrics for AdaBoost
print("AdaBoost Classifier Evaluation:")
print("Accuracy:", accuracy_adaboost)
print("Classification Report:\n", report_adaboost)
print("Confusion Matrix:\n", conf_matrix_adaboost)


AdaBoost Classifier Evaluation:
Accuracy: 0.5015
Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.47      0.53       550
           1       0.66      0.61      0.63       704
           2       0.32      0.47      0.38       178
           3       0.45      0.39      0.42       275
           4       0.33      0.44      0.38       212
           5       0.22      0.40      0.28        81

    accuracy                           0.50      2000
   macro avg       0.43      0.46      0.44      2000
weighted avg       0.54      0.50      0.51      2000

Confusion Matrix:
 [[259  84  43  60  70  34]
 [ 73 427  82  36  50  36]
 [ 17  42  83  12  17   7]
 [ 34  40  30 108  42  21]
 [ 29  35  15  21  94  18]
 [  9  18  10   3   9  32]]


###  Bagging Algorithm

# 5. Model Evaluation

In [None]:
# evaluate with test set
y_pred = logreg_classifier.predict(x_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


0.5585
              precision    recall  f1-score   support

           0       0.75      0.60      0.67       581
           1       0.76      0.56      0.64       695
           2       0.30      0.53      0.39       159
           3       0.48      0.48      0.48       275
           4       0.44      0.54      0.48       224
           5       0.22      0.65      0.33        66

    accuracy                           0.56      2000
   macro avg       0.49      0.56      0.50      2000
weighted avg       0.63      0.56      0.58      2000

[[351  50  43  44  55  38]
 [ 55 386  98  55  53  48]
 [  9  25  84  14  17  10]
 [ 36  26  29 132  24  28]
 [ 17  17  17  26 121  26]
 [  3   4   6   5   5  43]]


In [None]:
y_pred = mnb_classifier.predict(x_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


0.853
              precision    recall  f1-score   support

           0       0.86      0.93      0.90       581
           1       0.86      0.91      0.88       695
           2       0.74      0.65      0.69       159
           3       0.88      0.77      0.82       275
           4       0.89      0.79      0.83       224
           5       0.76      0.59      0.67        66

    accuracy                           0.85      2000
   macro avg       0.83      0.77      0.80      2000
weighted avg       0.85      0.85      0.85      2000

[[542  15   1  13   9   1]
 [ 15 633  35   3   4   5]
 [ 11  40 104   4   0   0]
 [ 29  29   1 211   5   0]
 [ 23  11   0   7 177   6]
 [  9  12   0   1   5  39]]


In [None]:
y_pred = svm_classifier.predict(x_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


0.571
              precision    recall  f1-score   support

           0       0.75      0.62      0.68       581
           1       0.76      0.57      0.65       695
           2       0.32      0.54      0.40       159
           3       0.50      0.49      0.49       275
           4       0.45      0.55      0.49       224
           5       0.23      0.62      0.33        66

    accuracy                           0.57      2000
   macro avg       0.50      0.57      0.51      2000
weighted avg       0.63      0.57      0.59      2000

[[361  52  43  41  53  31]
 [ 57 397  94  48  52  47]
 [  8  26  86  13  16  10]
 [ 36  25  27 134  27  26]
 [ 16  19  15  26 123  25]
 [  3   5   7   5   5  41]]


In [None]:
y_pred = xgb_classifier.predict(x_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


0.585
              precision    recall  f1-score   support

           0       0.65      0.67      0.66       581
           1       0.63      0.70      0.67       695
           2       0.39      0.33      0.36       159
           3       0.54      0.41      0.47       275
           4       0.47      0.43      0.45       224
           5       0.41      0.44      0.43        66

    accuracy                           0.58      2000
   macro avg       0.51      0.50      0.50      2000
weighted avg       0.58      0.58      0.58      2000

[[389 107  14  26  39   6]
 [ 88 489  39  37  29  13]
 [ 22  57  52  13  10   5]
 [ 58  53  15 114  27   8]
 [ 39  55   7  17  97   9]
 [  7  14   6   4   6  29]]


In [None]:
y_pred = lgb_classifier.predict(x_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

0.596
              precision    recall  f1-score   support

           0       0.68      0.66      0.67       581
           1       0.65      0.71      0.68       695
           2       0.41      0.40      0.41       159
           3       0.51      0.42      0.46       275
           4       0.48      0.48      0.48       224
           5       0.42      0.45      0.43        66

    accuracy                           0.60      2000
   macro avg       0.52      0.52      0.52      2000
weighted avg       0.59      0.60      0.59      2000

[[383  93  21  37  41   6]
 [ 78 492  40  37  35  13]
 [ 19  45  64  15  12   4]
 [ 48  61  16 116  25   9]
 [ 27  50  10  20 107  10]
 [ 11  13   5   4   3  30]]


In [None]:
y_pred = adaboost_classifier.predict(x_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

0.4825
              precision    recall  f1-score   support

           0       0.66      0.48      0.55       581
           1       0.63      0.55      0.59       695
           2       0.25      0.47      0.33       159
           3       0.46      0.37      0.41       275
           4       0.33      0.44      0.38       224
           5       0.20      0.47      0.28        66

    accuracy                           0.48      2000
   macro avg       0.42      0.46      0.42      2000
weighted avg       0.54      0.48      0.50      2000

[[278  98  57  44  70  34]
 [ 68 380  94  44  66  43]
 [  9  35  75  14  18   8]
 [ 30  40  38 103  38  26]
 [ 30  38  23  19  98  16]
 [  9   9   8   2   7  31]]
