### Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, roc_auc_score

### Load Data

In [None]:
# Load ../data/review_2022_clean.csv into `df` and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="../data/review_2022_clean.csv")
df.head(n=5)

In [None]:
# Load ../data/review_2022_clean_sw.csv into `df_sw` and preview the first five rows.
df_sw = pd.read_csv(filepath_or_buffer="../data/review_2022_clean_sw.csv")
df_sw.head(n=5)

### Train-Test Split

Define a function that performs the train-test split (80% train / 20% test).

In [None]:
def split(stop_words=False):
  """Split one of the notebook-level review frames into train and test parts.

  Args:
    stop_words: if truthy, the "text"/"label" columns are taken from the
      notebook-level `df_sw` frame; otherwise from `df`.

  Returns:
    The `[X_train, X_test, y_train, y_test]` sequence produced by
    `train_test_split`, with 20% of the rows held out for testing and
    `random_state=42` so the split is repeatable.
  """
  # NOTE(review): no `stratify` argument is passed, so the label
  # proportions of the train and test parts are not forced to match.
  frame = df_sw if stop_words else df
  return train_test_split(frame["text"], frame["label"], test_size=0.2, random_state=42)

In [None]:
# Train/test split drawn from `df` (stop_words left at its default, False).
X_train, X_test, y_train, y_test = split(stop_words=False)

In [None]:
# Train/test split drawn from `df_sw`.
(X_train_sw, X_test_sw, y_train_sw, y_test_sw) = split(stop_words=True)

### Modeling

Define parameter settings to try.

In [None]:
# Five log-spaced candidates from 10**-2 to 10**2 for the AdaBoost learning rate.
learning_rate = np.logspace(start=-2, stop=2, num=5)
learning_rate

In [None]:
# Five log-spaced candidates from 10**-2 to 10**2 for LogisticRegression's C.
C_range = np.logspace(start=-2, stop=2, num=5)
C_range

In [None]:
# Hyper-parameter grid for the pipeline. Keys follow scikit-learn's
# "<step>__<param>" convention: the "ada" prefix addresses the AdaBoost step,
# and "base_estimator__C" reaches into its wrapped weak learner.
param_grid = dict(
  ada__n_estimators=[50, 100, 200, 500, 1000],
  ada__learning_rate=learning_rate,
  ada__base_estimator__C=C_range,
)

Define a function that builds a `GridSearchCV` instance around a TF-IDF + AdaBoost pipeline.

In [None]:
def create_grid_search(ngram_range):
  """Build an unfitted grid search over a TF-IDF -> AdaBoost pipeline.

  The pipeline has two steps: "tfidf" (a `TfidfVectorizer` using the given
  n-gram range) and "ada" (an `AdaBoostClassifier` whose weak learner is a
  class-balanced `LogisticRegression`). The search uses the notebook-level
  `param_grid`, 10-fold cross-validation, balanced accuracy as the score,
  and all available cores.

  Args:
    ngram_range: `(min_n, max_n)` tuple forwarded to `TfidfVectorizer`.

  Returns:
    An unfitted `GridSearchCV` instance.
  """
  tfidf = TfidfVectorizer(ngram_range=ngram_range)
  lr = LogisticRegression(class_weight="balanced", random_state=42, max_iter=10000)

  # scikit-learn 1.2 renamed AdaBoostClassifier's `base_estimator` argument to
  # `estimator`, and 1.4 removed the old name. Use whichever name the
  # installed version exposes so the notebook runs on both.
  if "estimator" in AdaBoostClassifier().get_params():
    est_name = "estimator"
  else:
    est_name = "base_estimator"
  ada = AdaBoostClassifier(random_state=42, **{est_name: lr})
  pipe = Pipeline(steps=[("tfidf", tfidf), ("ada", ada)])

  # Re-key the weak-learner entries of the shared grid so they address the
  # argument name chosen above; every other entry is passed through as is.
  grid = {}
  for key, values in param_grid.items():
    for prefix in ("ada__base_estimator__", "ada__estimator__"):
      if key.startswith(prefix):
        key = f"ada__{est_name}__" + key[len(prefix):]
        break
    grid[key] = values

  search = GridSearchCV(pipe, grid, cv=10, scoring="balanced_accuracy", n_jobs=-1)
  return search

Create a function to perform training and evaluation.

In [None]:
def train_evaluate(ngram_range, X_train, X_test, y_train, y_test):
  """Fit the grid search on the training data and report test-set metrics.

  Prints the best parameters, the classification report and the weighted
  one-vs-rest ROC AUC, then shows the confusion matrix plot.

  Args:
    ngram_range: `(min_n, max_n)` tuple passed to `create_grid_search`.
    X_train: training texts.
    X_test: held-out texts.
    y_train: training labels.
    y_test: held-out labels.

  Returns:
    The fitted `GridSearchCV` object.
  """
  search = create_grid_search(ngram_range)
  search.fit(X_train, y_train)

  print("Best Parameters:")
  print(search.best_params_)

  y_pred = search.predict(X_test)

  print("Classification Report:")
  print(classification_report(y_test, y_pred))

  y_pred_proba = search.predict_proba(X_test)

  # For binary targets roc_auc_score expects a 1-D score for the class with
  # the greater label; handing it the (n, 2) matrix raises. With three or
  # more classes the full probability matrix is what it needs.
  if y_pred_proba.shape[1] == 2:
    y_score = y_pred_proba[:, 1]
  else:
    y_score = y_pred_proba

  print("ROC AUC:")
  print(roc_auc_score(y_test, y_score, average="weighted", multi_class="ovr"))

  print("Confusion Matrix:")
  ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
  plt.show()

  return search

#### Unigrams

In [None]:
# Unigram features, data split taken from `df`.
model_uni = train_evaluate(
  ngram_range=(1, 1),
  X_train=X_train,
  X_test=X_test,
  y_train=y_train,
  y_test=y_test,
)

#### Unigrams + Stop Words

In [None]:
# Unigram features, data split taken from `df_sw`.
model_uni_sw = train_evaluate(
  ngram_range=(1, 1),
  X_train=X_train_sw,
  X_test=X_test_sw,
  y_train=y_train_sw,
  y_test=y_test_sw,
)

#### Bigrams

In [None]:
# Bigram features, data split taken from `df`.
model_bi = train_evaluate(
  ngram_range=(2, 2),
  X_train=X_train,
  X_test=X_test,
  y_train=y_train,
  y_test=y_test,
)

#### Bigrams + Stop Words

In [None]:
# Bigram features, data split taken from `df_sw`.
model_bi_sw = train_evaluate(
  ngram_range=(2, 2),
  X_train=X_train_sw,
  X_test=X_test_sw,
  y_train=y_train_sw,
  y_test=y_test_sw,
)

#### Unigrams + Bigrams

In [None]:
# Unigram + bigram features, data split taken from `df`.
model_uni_bi = train_evaluate(
  ngram_range=(1, 2),
  X_train=X_train,
  X_test=X_test,
  y_train=y_train,
  y_test=y_test,
)

#### Unigrams + Bigrams + Stop Words

In [None]:
# Unigram + bigram features, data split taken from `df_sw`.
model_uni_bi_sw = train_evaluate(
  ngram_range=(1, 2),
  X_train=X_train_sw,
  X_test=X_test_sw,
  y_train=y_train_sw,
  y_test=y_test_sw,
)

#### Sanity Check

Positive review?

In [None]:
# Probe the fitted unigram+bigram `df_sw` model with a clearly positive review.
model_uni_bi_sw.predict(X=["The food is great! Especially the unagi!"])

Negative review?

In [None]:
# Probe the fitted unigram+bigram `df_sw` model with a clearly negative review.
model_uni_bi_sw.predict(X=["The service sucks! I will never come back again!"])

Neutral review?

In [None]:
# Probe the fitted unigram+bigram `df_sw` model with a mixed/neutral review.
model_uni_bi_sw.predict(X=["The food is decent but the price is a bit high."])

Sarcastic review?

In [None]:
# Probe the fitted unigram+bigram `df_sw` model with a sarcastic review.
model_uni_bi_sw.predict(
  X=["The food is so good that I think you need next level of taste buds to appreciate it."]
)

Spam review?

In [None]:
# Probe the fitted unigram+bigram `df_sw` model with a spam-like message.
model_uni_bi_sw.predict(X=["Personal loan with low interest - call 0123456789."])

In [None]:
# Same spam-like probe with different wording.
model_uni_bi_sw.predict(X=["Personal loan with quick approval - call 0123456789."])

Random text?

In [None]:
# Probe the fitted unigram+bigram `df_sw` model with short text unrelated to reviews.
model_uni_bi_sw.predict(X=["Market is bullish."])

In [None]:
# Longer variant of the unrelated-text probe.
model_uni_bi_sw.predict(
  X=["Market is bullish. Forgot what's the next part of the sentence already."]
)

Emoji?

In [None]:
# Probe the fitted unigram+bigram `df_sw` model with emoji-only input (smiling faces).
model_uni_bi_sw.predict(X=["🙂🙂🙂"])

In [None]:
# Probe the fitted unigram+bigram `df_sw` model with emoji-only input (angry faces).
model_uni_bi_sw.predict(X=["🤬🤬🤬"])