In [1]:
import os
from pathlib import Path
import logging
import itertools

import pandas as pd
import numpy as np

from sentiment_analysis.config import SentimentAnalysisConfig
from sentiment_analysis.data_access import DataClass
from sentiment_analysis.features import Features
from sentiment_analysis.evaluation import CustomEvaluation
from sentiment_analysis.utils.constants import (
    TEXT,
    TARGET,
    ORIGINAL_TEXT,
    SPLIT,
    TRAIN,
    VALID,
    TEST,
    SAVED_MODELS,
    PREDICTION
)

# Absolute path of the directory one level above the kernel's working directory.
PARENT_PATH = Path.cwd().parent.absolute()

# Root logger configuration: timestamp, logger name, level, then the message,
# with INFO and above emitted.
FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
logging.basicConfig(level=logging.INFO, format=FORMAT)

In [2]:
# Create the project configuration and point its CURRENT_PATH at the directory
# above the notebook's working directory (PARENT_PATH).
config = SentimentAnalysisConfig()
config.CURRENT_PATH = PARENT_PATH

# Build the dataset through the project's data-access class.
# NOTE(review): the displayed output suggests build() returns a pandas
# DataFrame with review / sentiment / Split columns — confirm in DataClass.
data = DataClass(config)
df = data.build()
# Preview the first 10 rows as the cell output.
df.head(10)

2022-12-05 02:05:58,558 - sentiment_analysis.utils.utils - INFO - func:build took: 13.28 sec


Unnamed: 0,review,sentiment,Split
0,Working with one of the best Shakespeare sourc...,neg,development
1,"Well...tremors I, the original started off in ...",neg,development
2,Ouch! This one was a bit painful to sit throug...,neg,development
3,"I've seen some crappy movies in my life, but t...",neg,development
4,"""Carriers"" follows the exploits of two guys an...",neg,development
5,I had been looking forward to seeing this film...,neg,development
6,Effect(s) without cause is generally not possi...,neg,development
7,"This picture started out with good intentions,...",neg,development
8,I chose to see this movie because it got a goo...,neg,development
9,This film has to be the worst I have ever seen...,neg,development


In [3]:
# Run the project's feature pipeline over the whole frame built above.
# The recorded run of build() took roughly three minutes (see the log output
# below the cell), so expect this cell to be slow on a fresh kernel.
# NOTE(review): the logs show clean / fit / transform steps inside build();
# which rows are used for fitting is not visible from here — confirm in Features.
features = Features()
df_features = features.build(df)

2022-12-05 02:07:27,382 - sentiment_analysis.utils.utils - INFO - func:clean took: 1 min and                 28.77 sec
2022-12-05 02:07:54,097 - sentiment_analysis.utils.utils - INFO - func:fit took: 26.51 sec
2022-12-05 02:08:20,028 - sentiment_analysis.utils.utils - INFO - func:transform took: 25.91 sec
2022-12-05 02:08:20,039 - sentiment_analysis.utils.utils - INFO - func:fit_transform took: 52.45 sec
2022-12-05 02:08:22,780 - sentiment_analysis.utils.utils - INFO - func:transform took: 2.71 sec
2022-12-05 02:08:46,882 - sentiment_analysis.utils.utils - INFO - func:transform took: 24.10 sec
2022-12-05 02:08:58,427 - sentiment_analysis.utils.utils - INFO - func:build took: 2 min and                 59.82 sec


In [4]:
# Keep only the rows whose split label is TEST and renumber them from 0.
# reset_index is chained instead of called with inplace=True so the cell
# produces a fresh frame and never mutates a filtered view of df_features.
test = df_features[df_features[SPLIT].isin([TEST])].reset_index(drop=True)

# Fail loudly if the split label matched nothing; otherwise every later cell
# would run (and report metrics) on zero rows.
assert not test.empty, "no rows found for the test split"

# Ground-truth labels for the test rows, in the same 0..n-1 order as `test`.
Y_test = test[TARGET]

Load predictions

In [5]:
def _load_prediction(model_name):
    """Load one model's saved predictions from the reports directory.

    Reads ``<data.reports_path>/<model_name>-prediction.pkl`` and returns the
    unpickled object with its index renumbered from 0 (old index dropped).

    NOTE(security): ``pd.read_pickle`` unpickles arbitrary Python objects and
    can execute code while doing so; only load files this project produced.
    NOTE(review): the pickles presumably hold a DataFrame with a PREDICTION
    column in test-set row order — confirm against the code that writes them.
    """
    path = os.path.join(data.reports_path, f"{model_name}-prediction.pkl")
    # Chained reset_index returns a new object instead of mutating in place.
    return pd.read_pickle(path).reset_index(drop=True)


xgboost = _load_prediction("xgboost")
bert = _load_prediction("bert")
roberta = _load_prediction("roberta")

Ensemble

In [9]:
# NOTE: `eval` shadows the Python builtin of the same name; the name is kept
# because the evaluation cell below calls `eval.evaluate(...)`.
eval = CustomEvaluation()

# Per-model prediction columns that take part in the ensemble.
model_predictions = [xgboost[PREDICTION], bert[PREDICTION], roberta[PREDICTION]]

# Series addition aligns on index: a row missing from any model would become
# NaN, and `NaN >= threshold` is False, i.e. it would silently be scored as
# class 0. Require every model to cover exactly the test rows before summing.
for model_prediction in model_predictions:
    assert len(model_prediction) == len(Y_test), (
        "prediction length does not match the number of test rows"
    )

# A row is labelled 1 when the summed predictions reach this threshold.
# NOTE(review): presumably each PREDICTION is a hard 0/1 label, making this a
# 2-of-3 majority vote — confirm against the code that writes the pickles.
VOTE_THRESHOLD = 2

combined = xgboost[PREDICTION] + bert[PREDICTION] + roberta[PREDICTION]
pred = np.where(combined >= VOTE_THRESHOLD, 1, 0)

In [10]:
# Score the ensemble predictions against the test-split labels; the value
# returned by evaluate() is displayed as the cell output.
eval.evaluate(y_true=Y_test.to_numpy(), y_pred=pred)

Accuracy     0.951640
Precision    0.947097
Recall       0.956720
F1 Score     0.951884
dtype: float64