# NB

## Load and import data

In [None]:
from src.data.make_dataset import main as get_data
from src.data.make_dataset_oversample import main as get_data_os
from src.data.preprocess import Preprocessor

import pandas as pd
import numpy as np
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import accuracy_score,precision_score, recall_score, confusion_matrix, classification_report, roc_curve, roc_auc_score, f1_score
from src.data.make_dataset import main as get_data
from sklearn import model_selection, naive_bayes
import scipy
import matplotlib.pyplot as plt
import pickle
import joblib

In [None]:
# Location of the raw reviews CSV (relative path, resolved from the working directory).
raw_reviews_csv = "../data-dialogue/data/raw/reviews.csv"

# Unpack the four-way split returned by the project loader.
X_train, X_test, y_train, y_test = get_data_os(raw_reviews_csv)

## Prepare data for NB
Normalise the FE features to the range [0, 1] to remove negative values (the NB model does not accept negative inputs).

In [None]:
# Build the NB design matrices: TF-IDF of the cleaned text plus the remaining
# (non-text) columns scaled into [0, 1].
vectorizer = TfidfVectorizer()

# Learn the vocabulary/IDF weights on the training text only, then reuse them
# for the test text.
X_train_tfidf = vectorizer.fit_transform(X_train['cleaned_text'])
X_test_tfidf = vectorizer.transform(X_test['cleaned_text'])

# Densify into DataFrames so they can be concatenated column-wise below.
tfidf_columns = vectorizer.get_feature_names_out()
X_train_tfidf = pd.DataFrame(X_train_tfidf.toarray(), columns=tfidf_columns)
X_test_tfidf = pd.DataFrame(X_test_tfidf.toarray(), columns=tfidf_columns)

# Everything except the two text columns.
X_train_clean = X_train.drop(['cleaned_text', 'text'], axis=1)
X_test_clean = X_test.drop(['cleaned_text', 'text'], axis=1)

# Fit the scaler on the training data ONLY and apply the same mapping to the
# test data (re-fitting on test leaks test statistics and scales the two sets
# inconsistently). clip=True keeps test values inside [0, 1], so no negative
# value can reach MultinomialNB.
scaler = MinMaxScaler(feature_range=(0, 1), clip=True)
X_train_clean_scaled = pd.DataFrame(
    scaler.fit_transform(X_train_clean), columns=X_train_clean.columns
)
X_test_clean_scaled = pd.DataFrame(
    scaler.transform(X_test_clean), columns=X_train_clean.columns
)

X_train_concat = pd.concat([X_train_clean_scaled, X_train_tfidf], axis=1)
X_test_concat = pd.concat([X_test_clean_scaled, X_test_tfidf], axis=1)

# If a non-text column shares its name with a TF-IDF token, keep the first
# occurrence only.
X_train_concat = X_train_concat.loc[:, ~X_train_concat.columns.duplicated()].copy()
X_test_concat = X_test_concat.loc[:, ~X_test_concat.columns.duplicated()].copy()

In [None]:
# Display (rows, columns) of the combined training matrix.
# The author recorded 9213 feature columns for an earlier run.
X_train_concat.shape #9213 features

In [None]:
# Display the training labels for a quick visual check.
y_train

# Vanilla NB

In [None]:
# Train a multinomial Naive Bayes classifier with default hyper-parameters
# on the combined (scaled features + TF-IDF) training matrix.
nb = naive_bayes.MultinomialNB()
nb.fit(X=X_train_concat, y=y_train)

### Evaluation

In [None]:
# Predict labels for the test matrix.
y_pred = nb.predict(X_test_concat)

# Scalar metrics.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Tabular summaries.
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Report everything in a fixed order.
for label, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("f1-score:", f1),
    ("Confusion Matrix: \n", conf_matrix),
    ("Classification Report: \n", class_report),
):
    print(label, value)

In [None]:
# plot roc curve
# ROC/AUC need a continuous score, not hard 0/1 predictions: with hard labels
# the curve collapses to a single operating point and the AUC is understated.
# Column 1 of predict_proba is the probability of the second entry of
# nb.classes_ (assumed to be the positive class of a binary problem).
y_score = nb.predict_proba(X_test_concat)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score)
plt.plot(fpr, tpr)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
auc = roc_auc_score(y_test, y_score)
print(f"For this NB model, the AUC score is: {auc}")
# AUC recorded by the author from hard-label predictions: 0.5294149882945051

## Set max_features=5000 for TfidfVectorizer & refit into Vanilla NB

In [None]:
# Rebuild the design matrices with the TF-IDF vocabulary capped at 5000 terms.
vectorizer = TfidfVectorizer(max_features=5000)

# Vocabulary/IDF are learned on the training text only.
X_train_tfidf = vectorizer.fit_transform(X_train['cleaned_text'])
X_test_tfidf = vectorizer.transform(X_test['cleaned_text'])

tfidf_columns = vectorizer.get_feature_names_out()
X_train_tfidf = pd.DataFrame(X_train_tfidf.toarray(), columns=tfidf_columns)
X_test_tfidf = pd.DataFrame(X_test_tfidf.toarray(), columns=tfidf_columns)

# Non-text columns.
X_train_clean = X_train.drop(['cleaned_text', 'text'], axis=1)
X_test_clean = X_test.drop(['cleaned_text', 'text'], axis=1)

# Fit the scaler on train only; transform (not re-fit) the test set so both
# sets share one mapping. clip=True guarantees test values stay in [0, 1],
# which MultinomialNB requires (no negative inputs).
scaler = MinMaxScaler(feature_range=(0, 1), clip=True)
X_train_clean_scaled = pd.DataFrame(
    scaler.fit_transform(X_train_clean), columns=X_train_clean.columns
)
X_test_clean_scaled = pd.DataFrame(
    scaler.transform(X_test_clean), columns=X_train_clean.columns
)

X_train_concat = pd.concat([X_train_clean_scaled, X_train_tfidf], axis=1)
X_test_concat = pd.concat([X_test_clean_scaled, X_test_tfidf], axis=1)

# Drop later duplicates when a non-text column name collides with a token.
X_train_concat = X_train_concat.loc[:, ~X_train_concat.columns.duplicated()].copy()
X_test_concat = X_test_concat.loc[:, ~X_test_concat.columns.duplicated()].copy()

# NB with max_features (5000)

In [None]:
# Re-train the default multinomial NB on the 5000-term feature matrix.
nb = naive_bayes.MultinomialNB()
nb.fit(X=X_train_concat, y=y_train)

### Evaluation

In [None]:
# Make predictions on the testing set
y_pred = nb.predict(X_test_concat)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
# Store the result under `f1`: assigning it to `f1_score` would rebind the
# imported sklearn function to a float and break every later f1_score(...) call.
f1 = f1_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("f1-score:", f1)
print("Confusion Matrix: \n", conf_matrix)
print("Classification Report: \n", class_report) #better

In [None]:
# plot roc curve
# Score with the positive-class probability rather than hard predictions so
# that the ROC curve has more than one operating point and the AUC is meaningful.
# Column 1 corresponds to the second entry of nb.classes_ (assumed positive).
y_score = nb.predict_proba(X_test_concat)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score)
plt.plot(fpr, tpr)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
auc = roc_auc_score(y_test, y_score)
print(f"For this NB model, the AUC score is: {auc}")
# AUC recorded by the author from hard-label predictions: 0.6159874264570492

## Set max_features=1000 for TfidfVectorizer & refit into Vanilla NB

In [None]:
# Rebuild the design matrices with the TF-IDF vocabulary capped at 1000 terms.
vectorizer = TfidfVectorizer(max_features=1000)

# Vocabulary/IDF are learned on the training text only.
X_train_tfidf = vectorizer.fit_transform(X_train['cleaned_text'])
X_test_tfidf = vectorizer.transform(X_test['cleaned_text'])

tfidf_columns = vectorizer.get_feature_names_out()
X_train_tfidf = pd.DataFrame(X_train_tfidf.toarray(), columns=tfidf_columns)
X_test_tfidf = pd.DataFrame(X_test_tfidf.toarray(), columns=tfidf_columns)

# Non-text columns.
X_train_clean = X_train.drop(['cleaned_text', 'text'], axis=1)
X_test_clean = X_test.drop(['cleaned_text', 'text'], axis=1)

# The scaler is fitted on the training set and only applied to the test set;
# re-fitting on test would scale the two sets differently and leak test
# statistics. clip=True keeps transformed test values within [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1), clip=True)
X_train_clean_scaled = pd.DataFrame(
    scaler.fit_transform(X_train_clean), columns=X_train_clean.columns
)
X_test_clean_scaled = pd.DataFrame(
    scaler.transform(X_test_clean), columns=X_train_clean.columns
)

X_train_concat = pd.concat([X_train_clean_scaled, X_train_tfidf], axis=1)
X_test_concat = pd.concat([X_test_clean_scaled, X_test_tfidf], axis=1)

# Drop later duplicates when a non-text column name collides with a token.
X_train_concat = X_train_concat.loc[:, ~X_train_concat.columns.duplicated()].copy()
X_test_concat = X_test_concat.loc[:, ~X_test_concat.columns.duplicated()].copy()

# NB with max_features (1000)

In [None]:
# Re-train the default multinomial NB on the 1000-term feature matrix.
nb = naive_bayes.MultinomialNB()
nb.fit(X=X_train_concat, y=y_train)

### Evaluation

In [None]:
# Predict labels for the test matrix.
y_pred = nb.predict(X_test_concat)

# Scalar metrics.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Tabular summaries.
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Report everything in a fixed order.
for label, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("f1-score:", f1),
    ("Confusion Matrix: \n", conf_matrix),
    ("Classification Report: \n", class_report),  # better
):
    print(label, value)

In [None]:
# plot roc curve
# Score with the positive-class probability rather than hard predictions so
# that the ROC curve has more than one operating point and the AUC is meaningful.
# Column 1 corresponds to the second entry of nb.classes_ (assumed positive).
y_score = nb.predict_proba(X_test_concat)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score)
plt.plot(fpr, tpr)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
auc = roc_auc_score(y_test, y_score)
print(f"For this NB model, the AUC score is: {auc}")

# NB with dim reduction (SVD: 100 components)

In [None]:
# Reduce the TF-IDF matrix to 100 latent components.
n_components = 100
# TruncatedSVD's default solver is randomized; seed it so the components (and
# every metric computed downstream) are reproducible between runs.
svd = TruncatedSVD(n_components=n_components, random_state=999)
# Fit the projection on train only, then apply it unchanged to test.
X_train_svd = svd.fit_transform(X_train_tfidf)
X_test_svd = svd.transform(X_test_tfidf)

In [None]:
# Wrap the SVD output in DataFrames with named component columns.
svd_columns = [f'svd_{i}' for i in range(n_components)]
X_train_svd_df = pd.DataFrame(X_train_svd, columns=svd_columns)
X_test_svd_df = pd.DataFrame(X_test_svd, columns=svd_columns)

# pd.concat(axis=1) aligns on the index. The SVD frames carry a fresh
# 0..n-1 index, so reset the index of the non-text features as well;
# otherwise rows may be misaligned and padded with NaN if the split kept the
# original row labels (NOTE(review): depends on what get_data_os returns).
X_train_concat = pd.concat(
    [X_train_clean.reset_index(drop=True), X_train_svd_df], axis=1
)
X_test_concat = pd.concat(
    [X_test_clean.reset_index(drop=True), X_test_svd_df], axis=1
)

# SVD components can be negative, so scale everything into [0, 1] for
# MultinomialNB. Fit on train only and reuse the mapping for test; clip=True
# keeps out-of-range test values inside [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1), clip=True)
X_train_concat_svd = pd.DataFrame(
    scaler.fit_transform(X_train_concat), columns=X_train_concat.columns
)
X_test_concat_svd = pd.DataFrame(
    scaler.transform(X_test_concat), columns=X_test_concat.columns
)

In [None]:
# Train the default multinomial NB on the scaled (features + SVD) matrix.
nb = naive_bayes.MultinomialNB()
nb.fit(X=X_train_concat_svd, y=y_train)

In [None]:
# Make predictions on the testing set
y_pred = nb.predict(X_test_concat_svd)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
# Keep the result in `f1` so the imported f1_score function is not overwritten
# by a float (which would make later f1_score(...) calls raise TypeError).
f1 = f1_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("f1-score:", f1)
print("Confusion Matrix: \n", conf_matrix)
print("Classification Report: \n", class_report)

# NB with oversampling of minority class

In [None]:
# Reload the split from the same raw CSV, this time with oversample=True.
reviews_csv_path = "../data-dialogue/data/raw/reviews.csv"
X_train_os, X_test_os, y_train_os, y_test_os = get_data_os(
    reviews_csv_path,
    oversample=True,
)

In [None]:
#max features = 1000 was the best model thus far
vectorizer = TfidfVectorizer(max_features=1000)

# Vocabulary/IDF are learned on the (oversampled) training text only.
X_train_tfidf = vectorizer.fit_transform(X_train_os['cleaned_text'])
X_test_tfidf = vectorizer.transform(X_test_os['cleaned_text'])

tfidf_columns = vectorizer.get_feature_names_out()
X_train_tfidf = pd.DataFrame(X_train_tfidf.toarray(), columns=tfidf_columns)
X_test_tfidf = pd.DataFrame(X_test_tfidf.toarray(), columns=tfidf_columns)

# Non-text columns.
X_train_clean = X_train_os.drop(['cleaned_text', 'text'], axis=1)
X_test_clean = X_test_os.drop(['cleaned_text', 'text'], axis=1)

# Fit the scaler on train only and reuse it for test (re-fitting on test
# leaks test statistics and gives the two sets different scalings).
# clip=True keeps transformed test values within [0, 1] for MultinomialNB.
scaler = MinMaxScaler(feature_range=(0, 1), clip=True)
X_train_clean_scaled = pd.DataFrame(
    scaler.fit_transform(X_train_clean), columns=X_train_clean.columns
)
X_test_clean_scaled = pd.DataFrame(
    scaler.transform(X_test_clean), columns=X_train_clean.columns
)

X_train_concat = pd.concat([X_train_clean_scaled, X_train_tfidf], axis=1)
X_test_concat = pd.concat([X_test_clean_scaled, X_test_tfidf], axis=1)

# Drop later duplicates when a non-text column name collides with a token.
X_train_concat = X_train_concat.loc[:, ~X_train_concat.columns.duplicated()].copy()
X_test_concat = X_test_concat.loc[:, ~X_test_concat.columns.duplicated()].copy()

In [None]:
# Train the default multinomial NB on the oversampled training data.
nb = naive_bayes.MultinomialNB()
nb.fit(X=X_train_concat, y=y_train_os)

In [None]:
# Make predictions on the testing set
y_pred = nb.predict(X_test_concat)

# Evaluate the model against the labels of the same split the features came
# from (X_test_os -> y_test_os).
accuracy = accuracy_score(y_test_os, y_pred)
precision = precision_score(y_test_os, y_pred)
recall = recall_score(y_test_os, y_pred)
conf_matrix = confusion_matrix(y_test_os, y_pred)
class_report = classification_report(y_test_os, y_pred)
# Was scored against y_test (the labels of the earlier, non-oversampled load),
# which is inconsistent with every other metric in this cell.
f1 = f1_score(y_test_os, y_pred)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("f1-score:", f1)
print("Confusion Matrix: \n", conf_matrix)
print("Classification Report: \n", class_report)

# Evaluation:
best model = tfidf with max_features=1000 + oversampling of minority class

In [None]:
#max features = 1000 was the best model thus far
vectorizer = TfidfVectorizer(max_features=1000)

# Vocabulary/IDF are learned on the (oversampled) training text only.
X_train_tfidf = vectorizer.fit_transform(X_train_os['cleaned_text'])
X_test_tfidf = vectorizer.transform(X_test_os['cleaned_text'])

tfidf_columns = vectorizer.get_feature_names_out()
X_train_tfidf = pd.DataFrame(X_train_tfidf.toarray(), columns=tfidf_columns)
X_test_tfidf = pd.DataFrame(X_test_tfidf.toarray(), columns=tfidf_columns)

# Non-text columns.
X_train_clean = X_train_os.drop(['cleaned_text', 'text'], axis=1)
X_test_clean = X_test_os.drop(['cleaned_text', 'text'], axis=1)

# Fit the scaler on train only; apply (not re-fit) it to test so both sets
# share one mapping. clip=True keeps test values inside [0, 1], which keeps
# the input valid for MultinomialNB.
scaler = MinMaxScaler(feature_range=(0, 1), clip=True)
X_train_clean_scaled = pd.DataFrame(
    scaler.fit_transform(X_train_clean), columns=X_train_clean.columns
)
X_test_clean_scaled = pd.DataFrame(
    scaler.transform(X_test_clean), columns=X_test_clean.columns
)

X_train_concat = pd.concat([X_train_clean_scaled, X_train_tfidf], axis=1)
X_test_concat = pd.concat([X_test_clean_scaled, X_test_tfidf], axis=1)

# Drop later duplicates when a non-text column name collides with a token.
X_train_concat = X_train_concat.loc[:, ~X_train_concat.columns.duplicated()].copy()
X_test_concat = X_test_concat.loc[:, ~X_test_concat.columns.duplicated()].copy()

# Parameter Tuning

In [None]:
# Grid search over MultinomialNB hyper-parameters with stratified 3-fold CV.
# This cell must run: the cells below read gs_NB (best_params_, predict,
# best_estimator_) and fail with NameError if the search is left commented out.
cv_method = RepeatedStratifiedKFold(n_splits=3,
                                    n_repeats=1,
                                    random_state=999)

# Candidate smoothing strengths and class-prior settings.
alphas = [0.1, 0.3, 0.5, 0.7, 1.0]
params_NB = {
    'alpha': alphas,
    'fit_prior': [True, False],
    'class_prior': [None, [.1, .9], [.2, .8]],
}

# GridSearchCV clones the estimator, so a fresh unfitted instance is enough.
gs_NB = GridSearchCV(estimator=naive_bayes.MultinomialNB(),
                     param_grid=params_NB,
                     cv=cv_method,
                     verbose=1,
                     scoring='accuracy')

gs_NB.fit(X_train_concat, y_train_os)

In [None]:
# Display the hyper-parameter combination selected by the grid search.
# The trailing comment is the value the author recorded from an earlier run.
gs_NB.best_params_ #{'alpha': 0.5, 'class_prior': None, 'fit_prior': True}

In [None]:
# Predict with the tuned grid-search model on the test matrix.
grid_predictions = gs_NB.predict(X_test_concat)

# Evaluate the model. X_test_concat was built from X_test_os, so score against
# y_test_os (not y_test from the earlier, non-oversampled load) to keep
# features and labels from the same split.
accuracy = accuracy_score(y_test_os, grid_predictions)
precision = precision_score(y_test_os, grid_predictions)
recall = recall_score(y_test_os, grid_predictions)
conf_matrix = confusion_matrix(y_test_os, grid_predictions)
class_report = classification_report(y_test_os, grid_predictions)
f1 = f1_score(y_test_os, grid_predictions)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("f1-score:", f1)
print("Confusion Matrix: \n", conf_matrix)
print("Classification Report: \n", class_report)

In [None]:
# Display the estimator selected by the grid search.
gs_NB.best_estimator_ 
# Value recorded by the author from an earlier run: MultinomialNB(alpha=0.5)

In [None]:
# Persist the best estimator found by the grid search (the model object
# itself, not just its parameters) to a file in the working directory.
# NOTE(review): the fitted TfidfVectorizer and MinMaxScaler are not saved here;
# they are presumably needed to featurise new text before predicting - confirm.
joblib.dump(gs_NB.best_estimator_, 'naive_bayes_best_model.pkl')

# Load Saved Model

In [None]:
# Reload the persisted model from disk.
# joblib files are pickle-based: only load files from a trusted source.
model_path = 'naive_bayes_best_model.pkl'
saved_model = joblib.load(model_path)