In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
from pathlib import Path
import logging
import itertools

import numpy as np
import pandas as pd
import xgboost as xgb

from sentiment_analysis.config import SentimentAnalysisConfig
from sentiment_analysis.data_access import DataClass
from sentiment_analysis.features import Features
from sentiment_analysis.evaluation import CustomEvaluation
from sentiment_analysis.utils.constants import (
    TEXT,
    TARGET,
    ORIGINAL_TEXT,
    SPLIT,
    TRAIN,
    VALID,
    TEST
)

PARENT_PATH = Path(os.getcwd()).parent.absolute()
FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
logging.basicConfig(format=FORMAT, level=logging.INFO)

In [3]:
config = SentimentAnalysisConfig()
config.CURRENT_PATH = PARENT_PATH

data = DataClass(config)
df = data.build()
df.head(10)

2022-12-04 22:50:29,899 - sentiment_analysis.utils.utils - INFO - func:build took: 1.25 sec


Unnamed: 0,review,sentiment,Split
0,how can a director that makes such great films...,neg,development
1,This is one of the worst film adaptations of a...,neg,development
2,This is an art film that was either made in 19...,neg,development
3,"For two of the funniest comedians, the movie w...",neg,development
4,Doris Day never lets a bad script get her down...,neg,development
5,Unwatchable. You can't even make it past the f...,neg,development
6,"Quite average even by Monogram standards, this...",neg,development
7,This is not a good movie. Too preachy in parts...,neg,development
8,I am really shocked that a great director like...,neg,development
9,This is one of the weakest soft porn film arou...,neg,development


In [4]:
features = Features()
df_features = features.build(df)

2022-12-04 22:51:46,253 - sentiment_analysis.utils.utils - INFO - func:clean took: 1 min and                 16.21 sec
2022-12-04 22:52:04,997 - sentiment_analysis.utils.utils - INFO - func:fit took: 18.72 sec
2022-12-04 22:52:21,422 - sentiment_analysis.utils.utils - INFO - func:transform took: 16.42 sec
2022-12-04 22:52:21,424 - sentiment_analysis.utils.utils - INFO - func:fit_transform took: 35.15 sec
2022-12-04 22:52:23,104 - sentiment_analysis.utils.utils - INFO - func:transform took: 1.68 sec
2022-12-04 22:52:40,818 - sentiment_analysis.utils.utils - INFO - func:transform took: 17.71 sec
2022-12-04 22:52:42,552 - sentiment_analysis.utils.utils - INFO - func:build took: 2 min and                 12.52 sec


In [5]:
df_features.head(3)

Unnamed: 0,review,sentiment,Split,Original Text,char__ 00,char__ 1,char__ 10,char__ 11,char__ 12,char__ 13,...,word__york city,word__youll see,word__young boy,word__young girl,word__young man,word__young woman,word__youre going,word__youre looking,word__youve got,word__youve seen
0,saw movie tv afternoon cant see anyone sit pie...,0,train,I saw this movie on t.v. this afternoon and I ...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,film true historical film useful researching l...,1,train,This film is a true and historical film. It is...,0.0,0.0,0.05085,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,film something like sequel white zombie since ...,0,train,"This film is something like a sequel of ""White...",0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Prepare data for training and validation

In [6]:
train = df_features[df_features[SPLIT].isin([TRAIN])].copy()
valid = df_features[df_features[SPLIT].isin([VALID])].copy()
train.reset_index(drop=True, inplace=True)
valid.reset_index(drop=True, inplace=True)

X_train = train[features.vectorizer.get_feature_names_out()].copy()
X_valid = valid[features.vectorizer.get_feature_names_out()].copy()
Y_train = train[TARGET]
Y_valid = valid[TARGET]

pos_prob_train = sum(Y_train) / len(Y_train)
pos_prob_valid = sum(Y_train) / len(Y_train)
print(f"Number of features: {X_train.shape[1]:,}")
print(f"Number of training samples: {X_train.shape[0]:,}")
print(f"Training set label distribution: pos:{pos_prob_train:0.2f}, neg:{1-pos_prob_train:0.2f}")
print(f"Number of validation samples: {X_valid.shape[0]:,}")
print(f"Validation set label distribution: pos:{pos_prob_valid:0.2f}, neg:{1-pos_prob_valid:0.2f}")

Number of features: 5,000
Number of training samples: 22,500
Training set label distribution: pos:0.50, neg:0.50
Number of validation samples: 2,500
Validation set label distribution: pos:0.50, neg:0.50


Train with early stopping on validation step

This will help us discover the optimal boosting rounds. Once complete, we can train on the whole train + validation set using the discovered boosting rounds.

In [7]:
# Prepare dataset for training and validation
d_train = xgb.DMatrix(X_train, Y_train)
d_val = xgb.DMatrix(X_valid, Y_valid)
evals = [(d_val, "validation")]

# Evaluation class to access custom objective and evaluation metric
eval = CustomEvaluation()

model = xgb.train(
    params=config.XGB_PARAMETERS,
    dtrain=d_train,
    num_boost_round=config.XGB_NUM_BOOST_ROUND,
    evals=evals,
    obj=eval.binary_logistic,
    custom_metric=eval.accuracy_eval,
    maximize=True,
    early_stopping_rounds=config.XGB_EARLY_STOPPING_ROUNDS,
    verbose_eval=10
)

[0]	validation-accuracy:0.69000
[10]	validation-accuracy:0.70360
[20]	validation-accuracy:0.71480
[30]	validation-accuracy:0.72880
[40]	validation-accuracy:0.73040
[50]	validation-accuracy:0.73640
[60]	validation-accuracy:0.73640
[70]	validation-accuracy:0.74000
[80]	validation-accuracy:0.74760
[90]	validation-accuracy:0.75040
[100]	validation-accuracy:0.75320
[110]	validation-accuracy:0.75440
[120]	validation-accuracy:0.75600
[130]	validation-accuracy:0.76000
[140]	validation-accuracy:0.76400
[150]	validation-accuracy:0.76520
[160]	validation-accuracy:0.76880
[170]	validation-accuracy:0.77080
[180]	validation-accuracy:0.77080
[190]	validation-accuracy:0.77360
[200]	validation-accuracy:0.77680
[210]	validation-accuracy:0.77960
[220]	validation-accuracy:0.78080
[230]	validation-accuracy:0.78120
[240]	validation-accuracy:0.78320
[250]	validation-accuracy:0.78480
[260]	validation-accuracy:0.78560
[270]	validation-accuracy:0.78760
[280]	validation-accuracy:0.78840
[290]	validation-accuracy

In [8]:
num_boost_rounds = model.best_iteration
print(f"Learnt boosting rounds after validation: {num_boost_rounds:,}")

Learnt boosting rounds after validation: 985


Evaluation

Set the threshold which maximizes the accuracy on validation set

In [9]:
Y_valid_pred_probab = model.predict(xgb.DMatrix(X_valid))

threshold = eval.threshold_discovery(y_true=Y_valid.to_numpy(), y_pred_probab=Y_valid_pred_probab)

Score on validation set

In [10]:
Y_valid_pred = np.where(Y_valid_pred_probab > threshold, 1, 0)

eval.evaluate(y_true=Y_valid.to_numpy(), y_pred=Y_valid_pred)

Accuracy     0.829600
Precision    0.816295
Recall       0.850280
F1 Score     0.832941
dtype: float64

Score on test set

In [11]:
test = df_features[df_features[SPLIT].isin([TEST])].copy()
test.reset_index(drop=True, inplace=True)

X_test = test[features.vectorizer.get_feature_names_out()].copy()
Y_test = test[TARGET]

Y_test_pred_probab = model.predict(xgb.DMatrix(X_test))
Y_test_pred = np.where(Y_test_pred_probab > threshold, 1, 0)

eval.evaluate(y_true=Y_test.to_numpy(), y_pred=Y_test_pred)

Accuracy     0.833880
Precision    0.821955
Recall       0.852400
F1 Score     0.836901
dtype: float64