In [None]:
import pandas as pd
import numpy as np
import json
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from xgboost import XGBClassifier
from xgboost import plot_importance
from sklearn.metrics import accuracy_score,precision_score, recall_score, confusion_matrix, classification_report, roc_curve, roc_auc_score, f1_score
from src.data.feature_engineering import FeatureEngineer
import scipy
import matplotlib.pyplot as plt

In [None]:
# Load the raw reviews; the 'Time' column is parsed into datetimes while reading.
raw_reviews_path = "../../data/raw/reviews.csv"
df = pd.read_csv(raw_reviews_path, parse_dates=['Time'])
df.head()

In [None]:
# Hand-picked stopword list consumed by the text-cleaning step.
# NOTE: binding this to `stopwords` shadows the `stopwords` object imported from
# nltk.corpus in the import cell; from here on the name refers to this plain list.
stopwords = [
    # articles, conjunctions, auxiliaries, pronouns and other function words
    "a", "an", "and", "are", "as", "at", "be", "by", "can", "did",
    "do", "for", "from", "had", "has", "have", "he", "her", "hers", "him",
    "his", "if", "in", "is", "it", "its", "may", "of", "on", "or",
    "shall", "should", "since", "so", "some", "such", "that", "the", "their", "them",
    "then", "there", "these", "they", "this", "those", "to", "was", "we", "were",
    "when", "where", "which", "while", "who", "whom", "whose", "will", "with", "would",
    "you", "your", "yours",
    # prepositions
    "about", "above", "across", "after", "against", "along", "among", "around", "before", "behind",
    "below", "beneath", "beside", "between", "beyond", "during", "inside", "into", "near", "outside",
    "over", "through", "under", "upon", "within", "without",
    # remaining auxiliaries and determiners
    "been", "having", "once", "other", "until", "own", "each", "every", "any", "all",
    # number words
    "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten",
    # quantifiers and fillers
    "many", "several", "few", "how", "anyway", "however", "just", "my",
]

In [None]:
def preprocess_text(text, stopwords):
    """Normalize a raw review into a space-separated string of lemmas.

    The text is lower-cased, URLs are removed, every non-word character and
    every run of digits is replaced by a space, the result is split on
    whitespace, stopwords are dropped, and each remaining token is lemmatized
    with ``WordNetLemmatizer`` before being re-joined with single spaces.

    Args:
        text: Raw review text. Any non-string value (for example ``None`` or
            the float NaN that pandas uses for a missing cell) is treated as
            empty text instead of raising.
        stopwords: Iterable of lower-case words to discard.

    Returns:
        The cleaned text; an empty string if nothing survives cleaning.
    """
    # Missing reviews show up as NaN/None, on which `.lower()` would raise.
    if not isinstance(text, str):
        return ''

    lowered = text.lower()

    # Strip URLs before punctuation removal would break them into fragments.
    without_urls = re.sub(r'http\S+|www\S+|https\S+', '', lowered)

    # `\W` treats `_` as a word character, so underscores are kept.
    normalized = re.sub(r'\W|\d+', ' ', without_urls)

    # Set membership is O(1) per token; callers usually pass a list.
    if isinstance(stopwords, (set, frozenset)):
        stopword_set = stopwords
    else:
        stopword_set = set(stopwords)

    lemmatizer = WordNetLemmatizer()
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in normalized.split()
        if token not in stopword_set
    ]

    return ' '.join(lemmas)

In [None]:
# Preprocess the reviews and feature engineer
pre_processed_df = df.copy()
pre_processed_df['cleaned_text'] = pre_processed_df['Text'].apply(lambda x: preprocess_text(x, stopwords))
pre_processed_df['Sentiment'] = pre_processed_df['Sentiment'].apply(lambda x: 1 if x == "positive" else 0)
pre_processed_df = pre_processed_df.rename(columns=str.lower)

In [None]:
# Preview the first rows of the cleaned frame (column names are lower-cased by now).
pre_processed_df.head()

In [None]:
# Derive extra model features from the cleaned reviews.
# NOTE(review): FeatureEngineer comes from src.data.feature_engineering and is not
# shown in this notebook; presumably add_features() populates the
# `feature_engineered_df` attribute read below - confirm against that module.
feature_engineer = FeatureEngineer(pre_processed_df)
feature_engineer.add_features()
feature_engineered_df = feature_engineer.feature_engineered_df

In [None]:
# Preview the first rows of the frame produced by the feature-engineering step.
feature_engineered_df.head()

In [None]:
# --- Train/test split --------------------------------------------------------
# `sentiment` is the target; `time` and `polarity` are excluded from the predictors.
X = feature_engineered_df.drop(['sentiment', 'time', 'polarity'], axis=1)
y = feature_engineered_df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=4263, stratify=y
)

# Re-index from 0 so the positionally-indexed TF-IDF frames built below align
# row-for-row in pd.concat. Reassignment (not inplace=True) keeps the cell re-runnable.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# --- TF-IDF on the cleaned text ----------------------------------------------
# The vocabulary is fitted on the training split only and reused for the test split.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train['cleaned_text'])
X_test_tfidf = vectorizer.transform(X_test['cleaned_text'])

# Prefix token columns so a token that happens to share its name with an
# engineered feature is not silently discarded by the duplicate-column filter below.
tfidf_columns = [f'tfidf_{token}' for token in vectorizer.get_feature_names_out()]

# NOTE: .toarray() densifies the matrix; memory grows with rows x vocabulary size.
X_train_tfidf = pd.DataFrame(X_train_tfidf.toarray(), columns=tfidf_columns)
X_test_tfidf = pd.DataFrame(X_test_tfidf.toarray(), columns=tfidf_columns)

# --- Combine engineered features with TF-IDF features ------------------------
X_train_clean = X_train.drop(['cleaned_text', 'text', 'uppercase_words'], axis=1)
X_test_clean = X_test.drop(['cleaned_text', 'text', 'uppercase_words'], axis=1)

X_train_concat = pd.concat([X_train_clean, X_train_tfidf], axis=1)
X_test_concat = pd.concat([X_test_clean, X_test_tfidf], axis=1)

# Keep only the first occurrence of any column name that is still duplicated.
X_train_concat = X_train_concat.loc[:, ~X_train_concat.columns.duplicated()].copy()
X_test_concat = X_test_concat.loc[:, ~X_test_concat.columns.duplicated()].copy()

# --- Fit the baseline classifier ---------------------------------------------
# The target is binary (0/1), so the matching metric is 'logloss'; 'mlogloss' is
# the multiclass variant. The seed keeps the fit reproducible if sampling
# parameters are enabled later.
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=4263)
xgb_clf.fit(X_train_concat, y_train)

In [None]:
# Make predictions on the testing set
y_pred = xgb_clf.predict(X_test_concat)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
# Stored as `f1`, not `f1_score`: rebinding the imported function name to a float
# makes this cell raise "object is not callable" the second time it is run.
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("f1-score:", f1)
print("Confusion Matrix: \n", conf_matrix)
print("Classification Report: \n", class_report)

In [None]:
# Plot the ROC curve and report AUC.
# Both need a continuous score: with hard 0/1 predictions there is only one
# usable threshold, so the curve collapses to three points and the AUC is understated.
# Column 1 of predict_proba is the probability of the second class (label 1).
y_score = xgb_clf.predict_proba(X_test_concat)[:, 1]

fpr, tpr, _ = roc_curve(y_test, y_score)
plt.plot(fpr, tpr)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

auc = roc_auc_score(y_test, y_score)
print(f"For this XGBoost model, the AUC score is: {auc}")

In [None]:
# Feature importance: plot the importance scores of the fitted model, limited to
# the 20 highest-ranked features. This is an inspection plot only - no features
# are selected or dropped here.
plot_importance(xgb_clf, max_num_features=20)
plt.show()

In [None]:
# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=4263)

# param_grid = {
#     'learning_rate': [0.01, 0.1, 0.2],
#     'n_estimators': [50, 100, 200],
#     'max_depth': [3, 6, 9],
#     'subsample': [0.5, 0.8, 1],
#     'colsample_bytree': [0.5, 0.8, 1],
# }

# # Train the XGBoost classifier
# xgb_clf_tuned = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# grid_search = GridSearchCV(
#     estimator=xgb_clf_tuned,
#     param_grid=param_grid,
#     scoring='accuracy',
#     cv=skf,
#     verbose=2,
#     n_jobs=-1,
# )

# grid_search.fit(X_train_concat, y_train)

# print("Best parameters found: ", grid_search.best_params_)
# print("Best accuracy score found: ", grid_search.best_score_)

In [None]:
# y_pred_tuned = grid_search.predict(X_test_concat)
# accuracy_tuned = accuracy_score(y_test, y_pred_tuned)
# precision_tuned = precision_score(y_test, y_pred_tuned)
# recall_tuned = recall_score(y_test, y_pred_tuned)
# conf_matrix_tuned = confusion_matrix(y_test, y_pred_tuned)
# class_report_tuned = classification_report(y_test, y_pred_tuned)


# print("Accuracy:", accuracy_tuned)
# print("Precision:", precision_tuned)
# print("Recall:", recall_tuned)
# print("Confusion Matrix:", conf_matrix_tuned)
# print("Classification Report:", class_report_tuned)