# TF-IDF Evaluation

This notebook performs a preliminary evaluation of bagging, random forest, and AdaBoost classifiers using TF-IDF feature extraction. The bag-of-words approach was ultimately chosen for further study, so the models below were not fine-tuned.

In [4]:
import pandas as pd

# Read the three dataset splits. Every file is parsed as ';'-delimited text
# with the two column names supplied explicitly.
read_opts = dict(delimiter=';', names=['text', 'target'])
train = pd.read_csv('../data/train.txt', **read_opts)
val = pd.read_csv('../data/val.txt', **read_opts)
test = pd.read_csv('../data/test.txt', **read_opts)

# Combined frames; concat keeps each source frame's original row labels.
trainval = pd.concat([train, val])
testval = pd.concat([test, val])

In [5]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vocabulary on train+val only; the test split is transformed
# with that fitted vocabulary and never used for fitting.
vectorizer = TfidfVectorizer(
    max_df=0.5,            # ignore terms found in more than half of the documents
    min_df=5,              # ignore terms found in fewer than 5 documents
    stop_words='english',
)
# NOTE: the name `BoW` is kept as-is, but the matrix holds TF-IDF weights.
BoW = vectorizer.fit_transform(trainval.text)
n_features = len(vectorizer.get_feature_names_out())
print('Number of Features: ', n_features)

# The training matrix is densified here; the test matrix is left as returned
# by `transform` (sparse).
X_train, Y_train = BoW.toarray(), trainval.target
X_test, Y_test = vectorizer.transform(test.text), test.target


Number of Features:  3397


## Bagging Classification with DT

In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
import sklearn.metrics as skm

# Bagged ensemble of 10 base learners with a fixed seed. No estimator is
# passed, so BaggingClassifier uses its default base learner (a decision tree
# per the scikit-learn docs).
bag = BaggingClassifier(n_estimators=10, random_state=0).fit(X_train, Y_train)

# Accuracy on the data the ensemble was fitted on vs. the held-out test split.
train_acc, test_acc = bag.score(X_train, Y_train), bag.score(X_test, Y_test)

# Test-set predictions, consumed by the metrics cell that follows.
Y_test_pred = bag.predict(X_test)


In [7]:
# Report metrics for the bagging model.
# This cell reads `train_acc`, `test_acc` and `Y_test_pred` as last assigned,
# so it must be run directly after the bagging fit cell.

# Class names for the confusion-matrix rows and columns. Passing them through
# `labels=` pins the matrix layout to these names instead of relying on
# scikit-learn's default (sorted) label order happening to match the headers.
emotions = ['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']

print('---------------- Bagging Statistics ----------------')
print('Train Accuracy: {:.4}'.format(train_acc))
print('Test Accuracy: {:.4}'.format(test_acc))
print(skm.classification_report(Y_test, Y_test_pred))

# Rows are the true classes, columns are the predicted classes.
pd.DataFrame(skm.confusion_matrix(Y_test, Y_test_pred, labels=emotions),
             index=emotions, columns=emotions)


---------------- Bagging Statistics ----------------
Train Accuracy: 0.9852
Test Accuracy: 0.873
              precision    recall  f1-score   support

       anger       0.89      0.92      0.90       275
        fear       0.82      0.88      0.85       224
         joy       0.90      0.89      0.89       695
        love       0.73      0.74      0.73       159
     sadness       0.91      0.90      0.91       581
    surprise       0.64      0.59      0.61        66

    accuracy                           0.87      2000
   macro avg       0.82      0.82      0.82      2000
weighted avg       0.87      0.87      0.87      2000



Unnamed: 0,anger,fear,joy,love,sadness,surprise
anger,252,7,5,0,10,1
fear,8,197,2,2,10,5
joy,5,7,616,33,26,8
love,2,0,36,117,2,2
sadness,14,12,16,8,525,6
surprise,1,16,8,0,2,39


## Random Forest with DT

In [8]:
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as skm

# Random forest of 10 trees with a fixed seed for repeatable results.
rf = RandomForestClassifier(n_estimators=10, random_state=0).fit(X_train, Y_train)

# Accuracy on the fitted data vs. the held-out test split.
train_acc, test_acc = rf.score(X_train, Y_train), rf.score(X_test, Y_test)

# Test-set predictions, consumed by the metrics cell that follows.
Y_test_pred = rf.predict(X_test)


In [9]:
# Report metrics for the random forest model.
# This cell reads `train_acc`, `test_acc` and `Y_test_pred` as last assigned,
# so it must be run directly after the random forest fit cell.

# Class names for the confusion-matrix rows and columns. Passing them through
# `labels=` pins the matrix layout to these names instead of relying on
# scikit-learn's default (sorted) label order happening to match the headers.
emotions = ['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']

print('------------- Random Forest Statistics -------------')
print('Train Accuracy: {:.4}'.format(train_acc))
print('Test Accuracy: {:.4}'.format(test_acc))
print(skm.classification_report(Y_test, Y_test_pred))

# Rows are the true classes, columns are the predicted classes.
pd.DataFrame(skm.confusion_matrix(Y_test, Y_test_pred, labels=emotions),
             index=emotions, columns=emotions)

------------- Random Forest Statistics -------------
Train Accuracy: 0.9902
Test Accuracy: 0.8805
              precision    recall  f1-score   support

       anger       0.90      0.89      0.90       275
        fear       0.83      0.87      0.85       224
         joy       0.89      0.92      0.91       695
        love       0.81      0.71      0.76       159
     sadness       0.93      0.90      0.92       581
    surprise       0.62      0.65      0.64        66

    accuracy                           0.88      2000
   macro avg       0.83      0.82      0.83      2000
weighted avg       0.88      0.88      0.88      2000



Unnamed: 0,anger,fear,joy,love,sadness,surprise
anger,245,9,7,1,12,1
fear,7,194,1,0,11,11
joy,4,4,641,22,15,9
love,2,0,43,113,0,1
sadness,14,14,20,4,525,4
surprise,0,13,8,0,2,43


## AdaBoost Classification with DT

In [10]:
from sklearn.ensemble import AdaBoostClassifier
import sklearn.metrics as skm

# AdaBoost with 100 boosting rounds and a fixed seed. No estimator is passed,
# so the library default weak learner is used (a depth-1 decision tree per the
# scikit-learn docs).
# NOTE(review): the recorded run scores only ~0.37-0.38 accuracy and predicts
# 'joy' for most test rows; the model is untuned by design, but a deeper base
# tree may be worth trying if this approach is revisited.
ada = AdaBoostClassifier(n_estimators=100, random_state=0).fit(X_train, Y_train)

# Accuracy on the fitted data vs. the held-out test split.
train_acc, test_acc = ada.score(X_train, Y_train), ada.score(X_test, Y_test)

# Test-set predictions, consumed by the metrics cell that follows.
Y_test_pred = ada.predict(X_test)


In [11]:

# Report metrics for the AdaBoost model.
# This cell reads `train_acc`, `test_acc` and `Y_test_pred` as last assigned,
# so it must be run directly after the AdaBoost fit cell.

# Class names for the confusion-matrix rows and columns. Passing them through
# `labels=` pins the matrix layout to these names instead of relying on
# scikit-learn's default (sorted) label order happening to match the headers.
emotions = ['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']

print('---------------- AdaBoost Statistics ----------------')
print('Train Accuracy: {:.4}'.format(train_acc))
print('Test Accuracy: {:.4}'.format(test_acc))
print(skm.classification_report(Y_test, Y_test_pred))

# Rows are the true classes, columns are the predicted classes.
pd.DataFrame(skm.confusion_matrix(Y_test, Y_test_pred, labels=emotions),
             index=emotions, columns=emotions)


---------------- AdaBoost Statistics ----------------
Train Accuracy: 0.3684
Test Accuracy: 0.3795
              precision    recall  f1-score   support

       anger       0.55      0.02      0.04       275
        fear       0.63      0.20      0.30       224
         joy       0.36      0.95      0.53       695
        love       0.46      0.12      0.19       159
     sadness       0.49      0.03      0.06       581
    surprise       0.73      0.12      0.21        66

    accuracy                           0.38      2000
   macro avg       0.54      0.24      0.22      2000
weighted avg       0.47      0.38      0.26      2000



Unnamed: 0,anger,fear,joy,love,sadness,surprise
anger,6,1,266,0,2,0
fear,0,44,179,0,1,0
joy,1,1,663,20,7,3
love,1,1,138,19,0,0
sadness,0,4,556,2,19,0
surprise,3,19,26,0,10,8
