# Summary
Compare the results of Naive Bayes (NB) models trained with and without stopword removal.

# 1. Importing necessary libraries and reading the data from a CSV file.


Importing all relevant libraries

In [None]:
# Installations
# !pip install contractions

In [None]:
#general libraries
import pandas as pd
import numpy as np
import gensim
from sklearn.model_selection import train_test_split

#for Vader
import nltk
# nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sent_analyzer = SentimentIntensityAnalyzer()

#for decontract
import re
import contractions

#confusion matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Turn off pandas' chained-assignment warning for the rest of the notebook.
pd.set_option('mode.chained_assignment', None)

# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

# Combined stop list: the English entries followed by the Indonesian ones.
english_stop_list = nltk.corpus.stopwords.words('english')
indonesian_stop_list = nltk.corpus.stopwords.words('indonesian')
stop_list = english_stop_list + indonesian_stop_list

[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 2. Data preprocessing, including expanding contractions and assigning gold truth labels based on review scores.

In [None]:
# Load the cleaned Play Store reviews; the 'Date' column is discarded straight away.
df = pd.read_csv('cleaned_grab_playstore_reviews.csv').drop(columns='Date')

In [None]:
# Gold-truth polarity derived from the star rating, one entry per row of df:
# exactly 3 -> 'neutral', below 3 -> 'negative', everything else -> 'positive'.
# Computed in one vectorised pass instead of label-indexing df['Score'][i] row by
# row, so it no longer depends on df having a default 0..n-1 index.
score_polarity = np.select(
    [(df['Score'] == 3).to_numpy(), (df['Score'] < 3).to_numpy()],
    ['neutral', 'negative'],
    default='positive',
).tolist()

# Expand contractions within the reviews (for example: cant, dont).
def decontract_reviews(text):
    """Return `text` after passing it through `contractions.fix`."""
    expanded_text = contractions.fix(text)
    return expanded_text


# New column holding the contraction-expanded version of every review.
df['Review_SA_Processed'] = df['Review'].map(decontract_reviews)

In [None]:
# Attach the gold-truth labels and keep only the three columns used downstream.
df = (
    df.assign(gold_truth=score_polarity)
      .reindex(columns=['Review_SA_Processed', 'Score', 'gold_truth'])
)

# Ad-hoc inspection queries, left disabled:
# df.loc[df['gold_truth'] == 'neutral']
# df[(df['Review_SA_Processed'].str.contains('good')) & (df['Score'].isin([1, 2]))]
df


Unnamed: 0,Review_SA_Processed,Score,gold_truth
0,update poor performance taking minutes search ...,1,negative
1,bad app,1,negative
2,friendly helpful nice well maintained vehicle ...,5,positive
3,remove grab app looking food knew car number w...,2,negative
4,embarassing singapore waiting time reduced min...,2,negative
...,...,...,...
9802,good heavens future taxi trip arrangements tha...,5,positive
9803,apa punya babi customer service baru nak typin...,1,negative
9804,grab app helpful service fast amazing delivere...,5,positive
9805,trying book ride broad daylight specific spent...,1,negative


# 3.1 Training a Naive Bayes Classifier (Probabilistic) with Stopwords

In [None]:
# Build the inputs for train_test_split without touching df itself:
# df_copy / X hold every column except the label, y holds the label column.
df_copy = df.drop(columns='gold_truth')
y = df['gold_truth'].copy()
X = df_copy

In [None]:
# Hold out 30% of the rows as the test set (corpus = X, labels = y); the seed is fixed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Tokenised training corpus: one list of word tokens per review.
X_train_corpus = [
    nltk.word_tokenize(review) for review in X_train['Review_SA_Processed']
]

# Token dictionary built from the training corpus only.
dictionary = gensim.corpora.Dictionary(X_train_corpus)

# NLTK's classifier is trained on (feature_dict, label) pairs. Each review is
# encoded as a presence map {token_id: 1}; the term frequency returned by
# doc2bow is deliberately ignored.
labeled_training_data = [
    ({token_id: 1 for (token_id, _) in dictionary.doc2bow(tokens)}, label)
    for (label, tokens) in zip(y_train, X_train_corpus)
]

print('Finished preparing the training data.')

# Training Naive Bayes classifier.
classifierWithStopwords = nltk.NaiveBayesClassifier.train(labeled_training_data)

print('Finished training the classifier.')

# Tokenised test corpus, encoded with the dictionary built from the training data.
X_test_corpus = [
    nltk.word_tokenize(review) for review in X_test['Review_SA_Processed']
]

labeled_test_data = [
    ({token_id: 1 for (token_id, _) in dictionary.doc2bow(tokens)}, label)
    for (label, tokens) in zip(y_test, X_test_corpus)
]

print('Finished preparing the test data.')

Finished preparing the training data.
Finished training the classifier.
Finished preparing the test data.


In [None]:
# Evaluate the with-stopwords classifier on the held-out test set.
# confusion_matrix / classification_report come from the notebook's import cell.

# (features, label) pairs of the test set, kept under the name test_set.
test_set = list(labeled_test_data)

# Predicted labels and gold-truth labels, in the same order as the test set.
y_pred = [classifierWithStopwords.classify(features) for (features, _) in test_set]
y_true = [label for (_, label) in test_set]

# Confusion Matrix and Classification Report with Stopwords.
# Each report flavour (text / dict) is computed once; nb_class_report is kept
# as an alias of the dict report rather than being recomputed.
nb_conf_mat = confusion_matrix(y_true, y_pred)
cm_with_stopwords = nb_conf_mat
cr_with_stopwords = classification_report(y_true, y_pred)
cr_with_stopwords_dict = classification_report(y_true, y_pred, output_dict=True)
nb_class_report = cr_with_stopwords_dict

# Print results
print("Naive Bayes with Stopwords")
print("Confusion Matrix:\n", cm_with_stopwords)
print("\nClassification Report:\n", cr_with_stopwords)




Naive Bayes with Stopwords
Confusion Matrix:
 [[ 902  226   26]
 [  99   34   16]
 [ 317  165 1158]]

Classification Report:
               precision    recall  f1-score   support

    negative       0.68      0.78      0.73      1154
     neutral       0.08      0.23      0.12       149
    positive       0.96      0.71      0.82      1640

    accuracy                           0.71      2943
   macro avg       0.58      0.57      0.55      2943
weighted avg       0.81      0.71      0.75      2943



In [None]:
# Define the function to preprocess the sentences
def preprocess_sentence(sentence):
    """Normalise a raw sentence and split it into word tokens.

    Steps, in order: lowercase, expand contractions, delete every character
    that is neither a word character nor whitespace, tokenise with NLTK.

    Args:
        sentence: Raw input text.

    Returns:
        The token list produced by ``nltk.word_tokenize``.
    """
    # Convert to lowercase
    sentence = sentence.lower()

    # Expand contractions
    sentence = decontract_reviews(sentence)

    # Remove punctuation. The pattern is a raw string: in a plain literal,
    # '\w' and '\s' are invalid escape sequences and raise a warning on
    # recent Python versions.
    sentence = re.sub(r'[^\w\s]', '', sentence)

    # Tokenize the sentence
    return nltk.word_tokenize(sentence)

# Hand-written examples used as a quick sanity check of the classifier.
sentences = ["Good driver service!", "Bad driver service..", "this is an app for driver and rider ", "i like this app!!"]

print("Model: Naive Bayes")
print("Custom Input", "| ", "Predicted Sentiment")

# Encode each example the same way as the training data
# (tokens -> bag-of-words ids -> {id: 1} presence map), then classify it.
for sentence in sentences:
    tokens = preprocess_sentence(sentence)
    bow = dictionary.doc2bow(tokens)
    features = dict.fromkeys((token_id for (token_id, _) in bow), 1)
    predicted_sentiment = classifierWithStopwords.classify(features)
    print(sentence, "|", predicted_sentiment)


Model: Naive Bayes
Custom Input |  Predicted Sentiment
Good driver service! | positive
Bad driver service.. | negative
this is an app for driver and rider  | neutral
i like this app!! | neutral


In [None]:
# Test-set features and gold labels side by side, for other models to reuse.
classifying_df_with_stopwords = X_test.join(y_test)

# Attach the Naive Bayes predictions; y_pred was produced in X_test row order.
classifying_df_with_stopwords['Naive_Bayes'] = y_pred

# Display 5 random rows. The seed is fixed so the displayed sample is the same
# on every re-run of the notebook.
classifying_df_with_stopwords[['Review_SA_Processed', 'gold_truth', 'Naive_Bayes']].sample(5, random_state=42)


Unnamed: 0,Review_SA_Processed,gold_truth,Naive_Bayes
673,good driver nice clean car,positive,positive
3101,great service malaysia,positive,positive
9508,ovo tidak terkoneksi,negative,negative
6489,give option rider wants accept nearer driver t...,neutral,negative
4001,good,positive,positive


# 3.2 Training a Naive Bayes Classifier (Probabilistic) without Stopwords

In [None]:
# Reload the raw reviews so this experiment starts from an unmodified frame.
df = pd.read_csv('cleaned_grab_playstore_reviews.csv')

df = df.drop('Date', axis=1)

# Gold-truth polarity derived from the star rating, one entry per row of df:
# exactly 3 -> 'neutral', below 3 -> 'negative', everything else -> 'positive'.
# Computed in one vectorised pass instead of label-indexing df['Score'][i] row by
# row, so it no longer depends on df having a default 0..n-1 index.
score_polarity = np.select(
    [(df['Score'] == 3).to_numpy(), (df['Score'] < 3).to_numpy()],
    ['neutral', 'negative'],
    default='positive',
).tolist()

# Expand contractions within the reviews (for example: cant, dont).
def decontract_reviews(text):
    """Return `text` after passing it through `contractions.fix`."""
    expanded_text = contractions.fix(text)
    return expanded_text

# Hash-based view of stop_list, built once, so each membership test below is a
# set lookup instead of a scan over the whole list.
stop_words_set = frozenset(stop_list)


def preprocess_sentence(sentence):
    """Clean a raw review and return it as one string without stop words.

    Steps, in order: lowercase, expand contractions, delete every character
    that is neither a word character nor whitespace, tokenise with NLTK,
    drop tokens found in the stop list, re-join the rest with single spaces.

    Args:
        sentence: Raw review text.

    Returns:
        The surviving tokens joined by a space (an empty string when every
        token is a stop word).
    """
    # Convert to lowercase
    sentence = sentence.lower()

    # Expand contractions
    sentence = decontract_reviews(sentence)

    # Remove punctuation. The pattern is a raw string: in a plain literal,
    # '\w' and '\s' are invalid escape sequences and raise a warning on
    # recent Python versions.
    sentence = re.sub(r'[^\w\s]', '', sentence)

    # Tokenize the sentence
    tokens = nltk.word_tokenize(sentence)

    # Remove stopwords and join what is left using a space character
    return ' '.join(token for token in tokens if token not in stop_words_set)

# Add the stopword-free review text and the gold labels, then keep only the
# three columns used downstream.
df = (
    df.assign(
        Review_SA_Processed_Removed_Stopwords=df['Review'].map(preprocess_sentence),
        gold_truth=score_polarity,
    )
    .reindex(columns=['Review_SA_Processed_Removed_Stopwords', 'Score', 'gold_truth'])
)

# Build the inputs for train_test_split without touching df itself:
# df_copy / X hold every column except the label, y holds the label column.
df_copy = df.drop(columns='gold_truth')
y = df['gold_truth'].copy()
X = df_copy

df.tail()


Unnamed: 0,Review_SA_Processed_Removed_Stopwords,Score,gold_truth
9802,good heavens future taxi trip arrangements tha...,5,positive
9803,babi customer service nak typing dah ended ses...,1,negative
9804,grab app helpful service fast amazing delivere...,5,positive
9805,trying book ride broad daylight specific spent...,1,negative
9806,cashing easy use credit pay cashless try withd...,1,negative


In [None]:
# Hold out 30% of the rows as the test set (corpus = X, labels = y); the seed is fixed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Tokenised training corpus: one list of word tokens per review.
X_train_corpus = [
    nltk.word_tokenize(review)
    for review in X_train['Review_SA_Processed_Removed_Stopwords']
]

# Token dictionary built from the training corpus only.
dictionary = gensim.corpora.Dictionary(X_train_corpus)

# NLTK's classifier is trained on (feature_dict, label) pairs. Each review is
# encoded as a presence map {token_id: 1}; the term frequency returned by
# doc2bow is deliberately ignored.
labeled_training_data = [
    ({token_id: 1 for (token_id, _) in dictionary.doc2bow(tokens)}, label)
    for (label, tokens) in zip(y_train, X_train_corpus)
]

print('Finished preparing the training data.')

# Training Naive Bayes classifier.
classifierWithoutStopwords = nltk.NaiveBayesClassifier.train(labeled_training_data)

print('Finished training the classifier.')

# Tokenised test corpus, encoded with the dictionary built from the training data.
X_test_corpus = [
    nltk.word_tokenize(review)
    for review in X_test['Review_SA_Processed_Removed_Stopwords']
]

labeled_test_data = [
    ({token_id: 1 for (token_id, _) in dictionary.doc2bow(tokens)}, label)
    for (label, tokens) in zip(y_test, X_test_corpus)
]

print('Finished preparing the test data.')

Finished preparing the training data.
Finished training the classifier.
Finished preparing the test data.


In [None]:
# Evaluate the stopwords-removed classifier on the held-out test set.
# confusion_matrix / classification_report come from the notebook's import cell.

# (features, label) pairs of the test set, kept under the name test_set.
test_set = list(labeled_test_data)

# Predicted labels and gold-truth labels, in the same order as the test set.
y_pred = [classifierWithoutStopwords.classify(features) for (features, _) in test_set]
y_true = [label for (_, label) in test_set]

# Confusion Matrix and Classification Report without Stopwords.
# Each report flavour (text / dict) is computed once; nb_class_report is kept
# as an alias of the dict report rather than being recomputed.
nb_conf_mat = confusion_matrix(y_true, y_pred)
cm_without_stopwords = nb_conf_mat
cr_without_stopwords = classification_report(y_true, y_pred)
cr_without_stopwords_dict = classification_report(y_true, y_pred, output_dict=True)
nb_class_report = cr_without_stopwords_dict

# Print results
print("Naive Bayes Removed Stopwords")
print("Confusion Matrix:\n", cm_without_stopwords)
print("\nClassification Report:\n", cr_without_stopwords)




Naive Bayes Removed Stopwords
Confusion Matrix:
 [[ 897  230   27]
 [  91   42   16]
 [ 292  171 1177]]

Classification Report:
               precision    recall  f1-score   support

    negative       0.70      0.78      0.74      1154
     neutral       0.09      0.28      0.14       149
    positive       0.96      0.72      0.82      1640

    accuracy                           0.72      2943
   macro avg       0.59      0.59      0.57      2943
weighted avg       0.82      0.72      0.75      2943



In [None]:
# Define the function to preprocess the sentences
def preprocess_sentence(sentence):
    """Turn a raw sentence into tokens for the stopwords-removed classifier.

    Steps, in order: lowercase, expand contractions, delete every character
    that is neither a word character nor whitespace, tokenise with NLTK,
    drop tokens found in ``stop_list``.

    NOTE: this definition replaces the earlier ``preprocess_sentence`` of this
    section, which returns a joined string. Re-running the cell that builds
    'Review_SA_Processed_Removed_Stopwords' after this one would therefore
    pick up this token-returning version.

    Args:
        sentence: Raw input text.

    Returns:
        The list of remaining tokens.
    """
    # Convert to lowercase
    sentence = sentence.lower()

    # Expand contractions
    sentence = decontract_reviews(sentence)

    # Remove punctuation. The pattern is a raw string: in a plain literal,
    # '\w' and '\s' are invalid escape sequences and raise a warning on
    # recent Python versions.
    sentence = re.sub(r'[^\w\s]', '', sentence)

    # Tokenize the sentence
    tokens = nltk.word_tokenize(sentence)

    # Apply the same stop-word filter that was applied to this section's
    # training text, so inference input is prepared the same way.
    return [token for token in tokens if token not in stop_list]

# Hand-written examples used as a quick sanity check of the classifier.
sentences = ["Good driver service!", "Bad driver service..", "this is an app for driver and rider ", "i like this app!!"]

print("Model: Naive Bayes")
print("Custom Input", "| ", "Predicted Sentiment")

# Encode each example the same way as the training data
# (tokens -> bag-of-words ids -> {id: 1} presence map), then classify it.
for sentence in sentences:
    tokens = preprocess_sentence(sentence)
    bow = dictionary.doc2bow(tokens)
    features = dict.fromkeys((token_id for (token_id, _) in bow), 1)
    predicted_sentiment = classifierWithoutStopwords.classify(features)
    print(sentence, "|", predicted_sentiment)


Model: Naive Bayes
Custom Input |  Predicted Sentiment
Good driver service! | positive
Bad driver service.. | negative
this is an app for driver and rider  | negative
i like this app!! | negative


In [None]:
# Test-set features and gold labels side by side, for other models to reuse.
classifying_df_without_stopwords = X_test.join(y_test)

# Attach the Naive Bayes predictions; y_pred was produced in X_test row order.
classifying_df_without_stopwords['Naive_Bayes_Stopwords_Removed'] = y_pred

# Display 5 random rows. The seed is fixed so the displayed sample is the same
# on every re-run of the notebook.
classifying_df_without_stopwords[['Review_SA_Processed_Removed_Stopwords', 'gold_truth','Naive_Bayes_Stopwords_Removed']].sample(5, random_state=42)

Unnamed: 0,Review_SA_Processed_Removed_Stopwords,gold_truth,Naive_Bayes_Stopwords_Removed
1285,nice,positive,positive
4684,mute notification thangkew,neutral,negative
4510,hard book,negative,negative
3438,leave unsafe ride app button cancel booking pr...,negative,negative
8406,review still appears restaurants name leaving ...,positive,negative


# 4 Comparing NB with/without stopwords removed

## Compare classification results between the 2 models.

In [None]:
# Classification-report dict of each model, keyed by the label shown in the table.
reports_by_model = {
    'Naive Bayes with stopwords': cr_with_stopwords_dict,
    'Naive Bayes removed stopwords': cr_without_stopwords_dict,
}

# One list per metric, in the same model order as the 'Model' column:
# overall accuracy plus the 'weighted avg' precision / recall / F1 entries.
results = {
    'Model': list(reports_by_model),
    'Accuracy_Score': [report['accuracy'] for report in reports_by_model.values()],
    'Weighted_Precision_Score': [report['weighted avg']['precision'] for report in reports_by_model.values()],
    'Weighted_Recall_Score': [report['weighted avg']['recall'] for report in reports_by_model.values()],
    'Weighted_F1_Score': [report['weighted avg']['f1-score'] for report in reports_by_model.values()],
}

results_df = pd.DataFrame(data=results)
results_df

Unnamed: 0,Model,Accuracy_Score,Weighted_Precision_Score,Weighted_Recall_Score,Weighted_F1_Score
0,Naive Bayes with stopwords,0.711519,0.810154,0.711519,0.746591
1,Naive Bayes removed stopwords,0.718994,0.817202,0.718994,0.75486


In [None]:
# Row label of the best-scoring model for each metric.
# idxmax() already returns an index *label*, so it is used directly with .loc;
# feeding it back into results_df.index[...] would treat the label as a
# position, which is only correct for a default 0..n-1 index.
highest_acc_index = results_df['Accuracy_Score'].idxmax()
highest_precision_index = results_df['Weighted_Precision_Score'].idxmax()
highest_recall_index = results_df['Weighted_Recall_Score'].idxmax()
highest_f1_index = results_df['Weighted_F1_Score'].idxmax()

# (metric name used in the message, score column, label of the winning row)
best_rows = [
    ('Accuracy', 'Accuracy_Score', highest_acc_index),
    ('Precision', 'Weighted_Precision_Score', highest_precision_index),
    ('Recall', 'Weighted_Recall_Score', highest_recall_index),
    ('F1', 'Weighted_F1_Score', highest_f1_index),
]

for metric_name, score_column, row_label in best_rows:
    print('Highest ' + metric_name + ' Score: ' + str(results_df.loc[row_label, 'Model']) + ' (' + str(results_df.loc[row_label, score_column]) + ') ')

Highest Accuracy Score: Naive Bayes removed stopwords (0.7189942235813795) 
Highest Precision Score: Naive Bayes removed stopwords (0.8172017311065883) 
Highest Recall Score: Naive Bayes removed stopwords (0.7189942235813795) 
Highest F1 Score: Naive Bayes removed stopwords (0.7548601326220378) 


## Combine the Dataframes

In [None]:
# Both result frames come from train_test_split calls with the same test_size and
# random_state, so their row indexes are expected to match. Check it explicitly:
# the column-wise concat below aligns on the index and would otherwise silently
# introduce NaN-padded rows.
assert classifying_df_with_stopwords.index.equals(classifying_df_without_stopwords.index), \
    'Test-set indexes of the two experiments differ; rows cannot be compared side by side.'

# Put the two result sets side by side (axis=1 concatenates columns, not rows).
# Only the text and prediction columns of the stopwords-removed frame are taken,
# and that frame itself is left unmodified.
comparison_df = pd.concat(
    [
        classifying_df_with_stopwords,
        classifying_df_without_stopwords[['Review_SA_Processed_Removed_Stopwords', 'Naive_Bayes_Stopwords_Removed']],
    ],
    axis=1,
)

# Keep only the desired columns
comparison_df = comparison_df[["Review_SA_Processed", "Review_SA_Processed_Removed_Stopwords", "gold_truth","Naive_Bayes", "Naive_Bayes_Stopwords_Removed"]]

# Reset the index of the resulting dataframe
comparison_df = comparison_df.reset_index(drop=True)

comparison_df

Unnamed: 0,Review_SA_Processed,Review_SA_Processed_Removed_Stopwords,gold_truth,Naive_Bayes,Naive_Bayes_Stopwords_Removed
0,put address food delivery grab decided another...,put address food delivery grab decided another...,negative,negative,negative
1,times cancel riders specific reasons,times cancel riders specific reasons,negative,negative,negative
2,never save card details app food orders proces...,never save card details app food orders proces...,negative,negative,negative
3,little application support trouble adding paym...,little application support trouble adding paym...,negative,negative,negative
4,driver lo ga punya etika males ngubungin customer,driver lo ga etika males ngubungin customer,negative,negative,negative
...,...,...,...,...,...
2938,grab system message cancel booking rebook due ...,grab system message cancel booking rebook due ...,positive,negative,negative
2939,getting booked need worst experience today,getting booked need worst experience today,negative,negative,negative
2940,great service clean drivers friendly courteous,great service clean drivers friendly courteous,positive,positive,positive
2941,terrible sign experience trying sign using qat...,terrible sign experience trying sign using qat...,negative,neutral,neutral


## Compare wrong outputs with the gold truth

In [None]:
# Number of models (0, 1 or 2) whose prediction disagrees with the gold truth.
def count_incorrect_predictions(row):
    """Count how many of the two Naive Bayes predictions in `row` are wrong.

    `row` is indexed by 'gold_truth', 'Naive_Bayes' and
    'Naive_Bayes_Stopwords_Removed'; the result is an int between 0 and 2.
    """
    gold_label = row['gold_truth']
    prediction_columns = ('Naive_Bayes', 'Naive_Bayes_Stopwords_Removed')
    return sum(1 for column in prediction_columns if gold_label != row[column])

# Per-row number of wrong predictions across the two models.
comparison_df['incorrect_count'] = comparison_df.apply(count_incorrect_predictions, axis=1)

# Per-model boolean flags: True where the prediction differs from the gold truth.
gold_labels = comparison_df['gold_truth']
comparison_df['Naive_Bayes_Incorrect'] = gold_labels.ne(comparison_df['Naive_Bayes'])
comparison_df['Naive_Bayes_Stopwords_Removed_Incorrect'] = gold_labels.ne(comparison_df['Naive_Bayes_Stopwords_Removed'])

# Total number of misclassified test rows for each model.
error_totals = [
    comparison_df[flag_column].sum()
    for flag_column in ('Naive_Bayes_Incorrect', 'Naive_Bayes_Stopwords_Removed_Incorrect')
]

incorrect_predictions_df = pd.DataFrame({
    'Model': ['Naive Bayes with stopwords', 'Naive Bayes removed stopwords'],
    'Incorrect_Predictions': error_totals,
})

incorrect_predictions_df


Unnamed: 0,Model,Incorrect_Predictions
0,Naive Bayes with stopwords,849
1,Naive Bayes removed stopwords,827


In [None]:
# One table per model: the score columns plus the misclassification count,
# matched on the shared 'Model' column.
combined_df = pd.merge(results_df, incorrect_predictions_df, on='Model', how='inner')
combined_df

Unnamed: 0,Model,Accuracy_Score,Weighted_Precision_Score,Weighted_Recall_Score,Weighted_F1_Score,Incorrect_Predictions
0,Naive Bayes with stopwords,0.711519,0.810154,0.711519,0.746591,849
1,Naive Bayes removed stopwords,0.718994,0.817202,0.718994,0.75486,827
