In [1]:
%load_ext autoreload
%autoreload 2

In [19]:
import os
from pathlib import Path
import logging
import itertools

import numpy as np
import pandas as pd
import xgboost as xgb

from sentiment_analysis.config import SentimentAnalysisConfig
from sentiment_analysis.data_access import DataClass
from sentiment_analysis.features import Features
from sentiment_analysis.evaluation import CustomEvaluation
from sentiment_analysis.utils.constants import (
    TEXT,
    TARGET,
    ORIGINAL_TEXT,
    SPLIT,
    TRAIN,
    VALID,
    TEST,
    PREDICTION
)

PARENT_PATH = Path(os.getcwd()).parent.absolute()
FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
logging.basicConfig(format=FORMAT, level=logging.INFO)

In [3]:
config = SentimentAnalysisConfig()
config.CURRENT_PATH = PARENT_PATH

data = DataClass(config)
df = data.build()
df.head(10)

2022-12-04 21:02:55,139 - sentiment_analysis.utils.utils - INFO - func:build took: 10.24 sec


Unnamed: 0,review,sentiment,Split
0,Working with one of the best Shakespeare sourc...,neg,development
1,"Well...tremors I, the original started off in ...",neg,development
2,Ouch! This one was a bit painful to sit throug...,neg,development
3,"I've seen some crappy movies in my life, but t...",neg,development
4,"""Carriers"" follows the exploits of two guys an...",neg,development
5,I had been looking forward to seeing this film...,neg,development
6,Effect(s) without cause is generally not possi...,neg,development
7,"This picture started out with good intentions,...",neg,development
8,I chose to see this movie because it got a goo...,neg,development
9,This film has to be the worst I have ever seen...,neg,development


In [4]:
features = Features()
df_features = features.build(df)

2022-12-04 21:04:18,296 - sentiment_analysis.utils.utils - INFO - func:clean took: 1 min and                 23.04 sec
2022-12-04 21:04:41,708 - sentiment_analysis.utils.utils - INFO - func:fit took: 23.29 sec
2022-12-04 21:05:07,018 - sentiment_analysis.utils.utils - INFO - func:transform took: 25.31 sec
2022-12-04 21:05:07,038 - sentiment_analysis.utils.utils - INFO - func:fit_transform took: 48.62 sec
2022-12-04 21:05:09,861 - sentiment_analysis.utils.utils - INFO - func:transform took: 2.81 sec
2022-12-04 21:05:37,174 - sentiment_analysis.utils.utils - INFO - func:transform took: 27.31 sec
2022-12-04 21:05:50,616 - sentiment_analysis.utils.utils - INFO - func:build took: 2 min and                 55.37 sec


In [5]:
df_features.head(3)

Unnamed: 0,review,sentiment,Split,Original Text,char__ 00,char__ 1,char__ 10,char__ 11,char__ 12,char__ 13,...,word__york city,word__youll see,word__young boy,word__young girl,word__young man,word__young woman,word__youre going,word__youre looking,word__youve got,word__youve seen
0,tolerant really bad sci fi horror movie ive wa...,0,train,I am very tolerant of really bad sci/fi and ho...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,wow good movie acting wasnt good look moment f...,1,train,wow! this is a good movie! The acting wasn't g...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,whatever rating give boom superb location phot...,0,train,Whatever rating I give BOOM is only because of...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Prepare data for training and validation

In [6]:
train = df_features[df_features[SPLIT].isin([TRAIN])].copy()
valid = df_features[df_features[SPLIT].isin([VALID])].copy()
train.reset_index(drop=True, inplace=True)
valid.reset_index(drop=True, inplace=True)

X_train = train[features.vectorizer.get_feature_names_out()].copy()
X_valid = valid[features.vectorizer.get_feature_names_out()].copy()
Y_train = train[TARGET]
Y_valid = valid[TARGET]

pos_prob_train = sum(Y_train) / len(Y_train)
pos_prob_valid = sum(Y_train) / len(Y_train)
print(f"Number of features: {X_train.shape[1]:,}")
print(f"Number of training samples: {X_train.shape[0]:,}")
print(f"Training set label distribution: pos:{pos_prob_train:0.2f}, neg:{1-pos_prob_train:0.2f}")
print(f"Number of validation samples: {X_valid.shape[0]:,}")
print(f"Validation set label distribution: pos:{pos_prob_valid:0.2f}, neg:{1-pos_prob_valid:0.2f}")

Number of features: 5,000
Number of training samples: 22,500
Training set label distribution: pos:0.50, neg:0.50
Number of validation samples: 2,500
Validation set label distribution: pos:0.50, neg:0.50


Train with early stopping on validation step

This will help us discover the optimal boosting rounds. Once complete, we can train on the whole train + validation set using the discovered boosting rounds.

In [7]:
# Prepare dataset for training and validation
d_train = xgb.DMatrix(X_train, Y_train)
d_val = xgb.DMatrix(X_valid, Y_valid)
evals = [(d_val, "validation")]

# Evaluation class to access custom objective and evaluation metric
eval = CustomEvaluation()

model = xgb.train(
    params=config.XGB_PARAMETERS,
    dtrain=d_train,
    num_boost_round=config.XGB_NUM_BOOST_ROUND,
    evals=evals,
    obj=eval.binary_logistic,
    custom_metric=eval.accuracy_eval,
    maximize=True,
    early_stopping_rounds=config.XGB_EARLY_STOPPING_ROUNDS,
    verbose_eval=10
)

[0]	validation-accuracy:0.70080
[10]	validation-accuracy:0.71160
[20]	validation-accuracy:0.71120
[30]	validation-accuracy:0.71880
[40]	validation-accuracy:0.73240
[50]	validation-accuracy:0.73360
[60]	validation-accuracy:0.74200
[70]	validation-accuracy:0.73840
[80]	validation-accuracy:0.74480
[90]	validation-accuracy:0.74600
[100]	validation-accuracy:0.74840
[110]	validation-accuracy:0.75240
[120]	validation-accuracy:0.75440
[130]	validation-accuracy:0.75800
[140]	validation-accuracy:0.75920
[150]	validation-accuracy:0.76120
[160]	validation-accuracy:0.76480
[170]	validation-accuracy:0.76800
[180]	validation-accuracy:0.77000
[190]	validation-accuracy:0.77240
[200]	validation-accuracy:0.77440
[210]	validation-accuracy:0.77520
[220]	validation-accuracy:0.77680
[230]	validation-accuracy:0.78160
[240]	validation-accuracy:0.78120
[250]	validation-accuracy:0.78360
[260]	validation-accuracy:0.78480
[270]	validation-accuracy:0.78480
[280]	validation-accuracy:0.78640
[290]	validation-accuracy

In [8]:
num_boost_rounds = model.best_iteration
print(f"Learnt boosting rounds after validation: {num_boost_rounds:,}")

Learnt boosting rounds after validation: 2,080


Evaluation

Set the threshold which maximizes the accuracy on validation set

In [16]:
Y_valid_pred_probab = model.predict(xgb.DMatrix(X_valid))

threshold = eval.threshold_discovery(y_true=Y_valid.to_numpy(), y_pred_probab=Y_valid_pred_probab)
print(f"Setting threshold to: {threshold}")

Setting threshold to: 0.01


Score on validation set

In [17]:
Y_valid_pred = np.where(Y_valid_pred_probab > threshold, 1, 0)

eval.evaluate(y_true=Y_valid.to_numpy(), y_pred=Y_valid_pred)

Accuracy     0.860000
Precision    0.847104
Recall       0.878303
F1 Score     0.862421
dtype: float64

Score on test set

In [18]:
test = df_features[df_features[SPLIT].isin([TEST])].copy()
test.reset_index(drop=True, inplace=True)

X_test = test[features.vectorizer.get_feature_names_out()].copy()
Y_test = test[TARGET]

Y_test_pred_probab = model.predict(xgb.DMatrix(X_test))
Y_test_pred = np.where(Y_test_pred_probab > threshold, 1, 0)

eval.evaluate(y_true=Y_test.to_numpy(), y_pred=Y_test_pred)

Accuracy     0.855440
Precision    0.848361
Recall       0.865600
F1 Score     0.856894
dtype: float64

In [21]:
pred_df = test[[ORIGINAL_TEXT, TARGET]].copy()
pred_df[PREDICTION] = Y_test_pred.reshape(-1,)
save_to = os.path.join(data.reports_path, "xgboost-prediction.pkl")
pred_df.to_pickle(save_to)