In [1]:
from sys import platform
import pandas as pd
import os
import torch

from src.models.sentiment_analysis.training_pipeline import train_models, find_best_model

from src.data.generate_oversample import generate_oversample

from src.models.sentiment_analysis.log_reg import LogReg
from src.models.sentiment_analysis.lstm import BasicLSTM
from src.models.sentiment_analysis.naive_bayes import Naivebayes
from src.models.sentiment_analysis.pre_trained.bert_fine_tuned import BertFineTuned
from src.models.sentiment_analysis.pre_trained.siebert import Siebert
from src.models.sentiment_analysis.svm import SVM
from src.models.sentiment_analysis.xg_boost import XgBoost
from src.models.sentiment_analysis.xg_boost_svd import XgBoostSvd

# Sentiment Analysis

In [2]:
# Load the raw reviews CSV (the path is relative to this notebook's directory)
# and end the cell with the frame so the notebook renders it.
data = pd.read_csv("../../data/raw/reviews.csv")
data

Unnamed: 0,Sentiment,Time,Text
0,positive,18/6/21,This is a very healthy dog food. Good for thei...
1,positive,7/7/21,I've been very pleased with the Natural Balanc...
2,positive,18/6/21,"Before I was educated about feline nutrition, ..."
3,positive,7/7/21,"My holistic vet recommended this, along with a..."
4,positive,1/7/21,I bought this coffee because its much cheaper ...
...,...,...,...
5439,negative,26/2/21,"This is an okay gift box, only if you like med..."
5440,negative,18/12/19,It looks llike I just walked into a raw deal. ...
5441,negative,19/1/20,Thank god that i tasted the metal before i swa...
5442,negative,13/9/20,This product was very good when I began buying...


In [2]:
from src.data.make_dataset import main as make_dataset

# Build the processed, feature-engineered train/test split from the raw reviews.
# make_dataset is also handed the CSV paths for the train and test splits.
processed_split_paths = {
    "train_split_output_filepath": "../../data/processed/train_final_processed_reviews.csv",
    "test_split_output_filepath": "../../data/processed/test_final_processed_reviews.csv",
}
data = pd.read_csv("../../data/raw/reviews.csv")
X_train, X_test, y_train, y_test = make_dataset(data, **processed_split_paths)

[PP] Preprocessing complete
[FE] finished lowercase count...
[FE] finished uppercase count...
[FE] finished uppercase ratio...
[FE] finished punc count...
[FE] finished pos tags...
[FE] finished pos tag count...
[FE] finished tokenized_untokenized count...
[FE] finished num words misspelled...
[FE] finished polarity...
[FE] finished subjectivity...
[FE] finished pos neg count...
[FE] finished adding features...


## Data Preprocessing and Feature Engineering
The make_dataset function takes in the raw data frame and two output file paths.

make_dataset makes use of our Preprocessor and FeatureEngineer classes to process the data, performs a train-test split, and returns the respective X/y train and test sets.
The train and test sets are also written as csv files so that we can run make_dataset once and load the processed data should we need it again.

### Preprocessing

- Resolve contractions
- Remove numbers and special characters
- Lowercase all words and remove punctuation
- Remove our custom list of stop words
- Find the part-of-speech (POS) tag for each token using nltk
- Use nltk's WordNetLemmatizer to lemmatize each token based on POS tag

### Feature Engineering

Pandarallel was used to parallelize the feature engineering steps.
The dataframe is split and processed in parallel, allowing us to speed up an otherwise single-threaded operation.

<b>lowercase_count</b>: number of lower case words

<b>uppercase_count</b>: number of upper case words

<b>uppercase_ratio</b>: ratio of uppercase words to total tokens

<b>punc_count</b>: number of punctuation marks

<b>num_digits</b>: number of cardinal digits

<b>num_verbs</b>: number of verbs based on POS

<b>num_nouns</b>: number of nouns based on POS

<b>num_tokens_cleaned</b>: number of words after preprocessing

<b>num_tokens_raw</b>: number of words before preprocessing

<b>num_words_misspelled</b>: number of misspelled words using spellchecker

<b>polarity</b>: compound polarity score of a sentence using VaderSentiment

<b>subjectivity</b>: subjectivity score of a sentence using TextBlob

<b>num_pos_words</b>: number of positive words in a sentence using VaderSentiment

<b>num_neg_words</b>: number of negative words in a sentence using VaderSentiment

### Oversampling

<b>BERT Augmentation for Text Class Imbalance</b>:

BERT stands for Bidirectional Encoder Representations from Transformers and is a language representation model.
It uses masked word prediction by hiding keywords in sentences and letting BERT guess what they are.
Next sentence prediction is also used, teaching BERT to recognize longer-term dependencies across sentences.
Oversampled training data was only used for Naive Bayes, as that was the only model on which oversampling had a significant impact.

In [3]:
# Preview the first rows of the training features returned by make_dataset.
X_train.head()

Unnamed: 0,text,cleaned_text,lowercase_count,uppercase_count,uppercase_ratio,punc_count,num_digits,num_verbs,num_nouns,num_tokens_cleaned,num_tokens_raw,num_words_misspelled,polarity,subjectivity,num_pos_words,num_neg_words
0,Disappointed. The big boxes had a very differ...,disappointed big box very different flavor tha...,23,0,0.0,3,0,1,3,16,24,0,-0.6848,0.603148,0,0
1,This is a wonderfully refreshing citrus drink....,wonderfully refresh citrus drink not too sweet...,87,0,0.0,7,0,7,12,45,88,2,0.9502,0.685354,2,0
2,I just brewed my first cup and the aroma and f...,brew my first cup aroma flavor both speak autu...,27,1,0.032258,3,0,1,8,16,31,0,0.8283,0.491111,1,0
3,"It taste okay, but they are not soft and the ...",taste okay but not soft taste not stay very lo...,24,0,0.0,3,0,2,3,13,25,2,0.1154,0.456667,0,0
4,These are good chips. I like the texture and ...,good chip like texture easy chew than regular ...,26,0,0.0,3,0,0,6,13,27,0,0.8858,0.558974,0,0


## Training Pipeline
- Initialise all models and store in dict
- Train all models on the same X_train, except Naive Bayes, which is trained on an oversampled version of X_train
- Save (pickle) each of the trained models
- Evaluation of models
    - Load saved models
    - Predict on same X_test
    - Calculate metrics
- Select best model

### Models Used

- XgBoost
- XgBoost with singular value decomposition
- Logistic Regression
- Support Vector Machine
- Naivebayes
- BertFineTuned (our own fine-tuned checkpoint of bert base cased)
- Siebert (pre-trained checkpoint of RoBERTa-large)
- BasicLSTM




In [4]:
# Absolute path two levels above the current working directory.
BASE_DIR = os.path.abspath(os.path.join(os.pardir, os.pardir))

# Load the data
# Processed split files (train, oversampled train, test), all rooted at BASE_DIR.
train_filepath, train_oversample_filepath, test_filepath = (
    os.path.join(BASE_DIR, relative_csv)
    for relative_csv in (
        "data/processed/train_final_processed_reviews.csv",
        "data/processed/train_oversample_final_processed_reviews.csv",
        "data/processed/test_final_processed_reviews.csv",
    )
)

# Fast path: reuse the saved splits when all three processed CSVs are present.
if all(
    os.path.exists(cached_csv)
    for cached_csv in (train_filepath, test_filepath, train_oversample_filepath)
):
    train = pd.read_csv(train_filepath, index_col="Unnamed: 0")
    train_os = pd.read_csv(train_oversample_filepath, index_col="Unnamed: 0")
    test = pd.read_csv(test_filepath, index_col="Unnamed: 0")
    # Features are every column except the label; labels become plain lists.
    X_train, y_train = train.drop(columns="sentiment"), train["sentiment"].tolist()
    X_train_os, y_train_os = train_os.drop(columns="sentiment"), train_os["sentiment"].tolist()
    X_test, y_test = test.drop(columns="sentiment"), test["sentiment"].tolist()
else:
    # Slow path: rebuild the processed splits from the raw reviews.
    data = pd.read_csv(os.path.join(BASE_DIR, "data/raw/reviews.csv"))
    oversample_filepath = os.path.join(BASE_DIR, "data/raw/reviews_oversample.csv")
    # Generate the oversampled raw file only when it is missing
    # (note: generating the oversample takes ~2hrs).
    if not os.path.exists(oversample_filepath):
        generate_oversample(raw_df=data, oversample_filepath=oversample_filepath)
    data_os = pd.read_csv(oversample_filepath)

    # Split built from the original (non-oversampled) reviews.
    X_train, X_test, y_train, y_test = make_dataset(
        data,
        train_split_output_filepath=train_filepath,
        test_split_output_filepath=test_filepath,
    )

    # Split built from the oversampled reviews.
    print("Proceeding with FE on oversampled data...")
    # NOTE(review): this call rebinds X_test / y_test and is handed the same
    # test_filepath as the call above -- confirm the oversampled run is meant
    # to supply the evaluation split.
    X_train_os, X_test, y_train_os, y_test = make_dataset(
        data_os,
        train_split_output_filepath=train_oversample_filepath,
        test_split_output_filepath=test_filepath,
        oversample=True,
    )

# Directory handed to every model wrapper and to the training/evaluation helpers.
# os.path.join inserts the platform's own separator between components, so one
# call yields the same path that separate win32 / non-win32 branches would
# spell out by hand.
models_path = os.path.join(BASE_DIR, "models", "sentiment_analysis")

# Registry of model name -> model wrapper, each constructed with models_path.
# It is built once so that adding or removing a model is a single edit.
models = {
    "xg_boost": XgBoost(models_path),
    "xg_boost_svd": XgBoostSvd(models_path),
    "log_reg": LogReg(models_path),
    "svm": SVM(models_path),
    "naive_bayes": Naivebayes(models_path),
}
# The two pre-trained transformer models are only registered when CUDA is available.
if torch.cuda.is_available():
    models["bert_fine_tuned"] = BertFineTuned(models_path)
    models["siebert"] = Siebert(models_path)
# Registered last so it follows the transformer models when they are present.
models["lstm"] = BasicLSTM(models_path)

# Train the models and save them
# Both the regular training split and the oversampled one (X_train_os / y_train_os)
# are passed in, together with the models directory.
train_models(models, X_train, y_train, X_train_os, y_train_os, models_path)
# find_best_model is given the test split; its result is unpacked as
# (model, model name, accuracy) and the last two are reported below.
best_model, best_model_name, best_accuracy = find_best_model(models, models_path, X_test, y_test)
print(f"Best model: {best_model_name}")
print(f"Accuracy: {best_accuracy}")


Training models:   0%|          | 0/8 [00:00<?, ?it/s]

________________________________________________________________________________
Loaded model: xg_boost
Accuracy: 0.8778696051423324
Precision: 0.9068923821039904
Recall: 0.9305210918114144
F1 Score: 0.9185548071034905
ROC AUC: 0.8292181430788521
________________________________________________________________________________
________________________________________________________________________________
Loaded model: xg_boost_svd
Accuracy: 0.8888888888888888
Precision: 0.9043683589138135
Recall: 0.9503722084367245
F1 Score: 0.9267997580157289
ROC AUC: 0.8320765635823199
________________________________________________________________________________
________________________________________________________________________________
Loaded model: log_reg
Accuracy: 0.8659320477502296
Precision: 0.8909952606635071
Recall: 0.9330024813895782
F1 Score: 0.9115151515151515
ROC AUC: 0.8039570710834816
________________________________________________________________________________
_____________

# Scoring Pipeline

- Reads in the test file
- Performs same preprocessing and feature engineering as above
- Loads best model as determined by training pipeline
- Runs prediction on test data
- Returns csv with Text, Time, predicted_sentiment_probability, predicted_sentiment
- Writes to output directory

In [None]:
# Move two levels up so the relative "data/..." paths used for scoring resolve.
# The guard keeps this cell idempotent: an unconditional relative chdir climbs
# two more directories every time the cell is re-run.
# NOTE(review): assumes a data/raw directory exists only at the target
# directory, not next to this notebook -- confirm against the repository layout.
if not os.path.isdir(os.path.join("data", "raw")):
    os.chdir("../../")

In [None]:
from src.models.sentiment_analysis.scoring_pipeline import run_scoring_pipeline

# Reviews to score; this CSV only has the Time and Text columns.
test_data = pd.read_csv("data/raw/reviews_test.csv")
pred = run_scoring_pipeline(test_data)

# Make sure the output folder exists, then write the predictions without the index.
predictions_csv = "data/predictions/reviews_test_predictions_data-dialogue.csv"
os.makedirs(os.path.dirname(predictions_csv), exist_ok=True)
pred.to_csv(predictions_csv, index=False)
