In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

from mlxtend.preprocessing import DenseTransformer

import optuna
from optuna.samplers import TPESampler

In [3]:
# Load the pre-processed review sample and separate it by its `partition` column.
reviews = pd.read_parquet('/kaggle/input/restaurant-reviews/reviews_sample_proc.parquet')
train = reviews[reviews['partition'] == 'train']
test = reviews[reviews['partition'] == 'test']

# `scope` names the text column used as model input; the labels are the
# ratings shifted down by one so that class ids start at 0.
scope = 'original'
X_train, y_train = train[scope].to_numpy(), train['rating'].to_numpy() - 1
X_test, y_test = test[scope].to_numpy(), test['rating'].to_numpy() - 1

In [8]:
# Sanity check of the feature count: with min_df=200 a term is kept only if it
# occurs in at least 200 documents, here measured on the first 10,000 training texts.
vectorizer = TfidfVectorizer(min_df=200)
preview_texts = X_train[:10000]
X_train_vec = vectorizer.fit_transform(preview_texts)
X_train_vec.shape

(10000, 369)

#### Example of Bayesian optimization for a single hyperparameter (`alpha` in the cell below; the rest were tuned the same way)

In [None]:
%%time

def objective(trial):
    """Optuna objective: cross-validated macro-F1 for one candidate ``alpha``.

    Only ``alpha`` (XGBoost's L1 regularization term) is searched, on a grid
    from 0.0 to 0.3 in steps of 0.01; every other setting is held fixed.
    For each of 3 stratified folds a TF-IDF vectorizer is fitted on the
    training part only, an ``XGBClassifier`` is trained with early stopping
    monitored on the held-out part, and the held-out predictions are pooled.

    Reads the module-level ``X_train`` (texts) and ``y_train`` (0-based labels).

    Args:
        trial: Optuna trial used to draw ``alpha`` and to store diagnostics.

    Returns:
        Macro-averaged F1 computed once over the pooled out-of-fold predictions.

    Side effects:
        Prints the mean ``best_iteration`` across folds and stores the same
        value on the trial as the user attribute ``'mean_best_iteration'``.
    """
    params = {
        'eta': 0.3,
        'max_depth': 8,
        'alpha': trial.suggest_float('alpha', low=0.0, high=0.3, step=0.01),
        'objective': 'multi:softmax',
        'num_class': 5,
        'eval_metric': 'mlogloss',
        'early_stopping_rounds': 5,
        'n_estimators': 200,
        'tree_method': 'gpu_hist',
    }

    # Fixed random_state: every trial is scored on exactly the same folds.
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=123)

    y_preds = []
    y_true = []
    n_rounds = []
    for train_ix, val_ix in cv.split(X_train, y_train):
        # Vocabulary and IDF weights come from the training fold only, so
        # nothing from the validation fold leaks into the features.
        vectorizer = TfidfVectorizer(min_df=200)
        X_train_vec = vectorizer.fit_transform(X_train[train_ix])
        X_val_vec = vectorizer.transform(X_train[val_ix])

        clf = xgb.XGBClassifier(**params)
        clf.fit(
            X_train_vec,
            y_train[train_ix],
            eval_set=[(X_val_vec, y_train[val_ix])],
            verbose=False,
        )

        # NOTE(review): best_iteration appears to be a 0-based index, so the
        # number of trees actually used would be best_iteration + 1 - confirm.
        n_rounds.append(clf.best_iteration)
        y_preds.extend(clf.predict(X_val_vec))
        y_true.extend(y_train[val_ix])

    score = metrics.f1_score(y_true, y_preds, average='macro')

    # Keep the diagnostic with the trial (not only in stdout) so it can be
    # read back later, e.g. from study.best_trial.user_attrs.
    mean_best_iteration = float(np.mean(n_rounds))
    print(mean_best_iteration)
    trial.set_user_attr('mean_best_iteration', mean_best_iteration)

    return score

# Seed the sampler so the sequence of suggested values, and therefore the
# reported best trial, is the same on every Restart & Run All.
study = optuna.create_study(sampler=TPESampler(seed=123), direction='maximize')
study.optimize(objective, n_trials=20)

# One-row summary of the winning trial: its index, its objective value
# (macro-F1) and the value of every tuned parameter.
best_trial = study.best_trial
cols = ['Iteration number', 'Score', *best_trial.params.keys()]
results = [best_trial.number, best_trial.value, *best_trial.params.values()]
results_df = pd.DataFrame([results], columns=cols)
results_df

#### Finding the optimal number of boosting rounds

In [7]:
%%time

# 3-fold CV with alpha fixed at 0.2, a lower learning rate and a large tree
# budget; early stopping on each validation fold shows how many boosting
# rounds are actually used.
params = dict(
    eta=0.1,
    max_depth=8,
    alpha=0.2,
    objective='multi:softmax',
    num_class=5,
    eval_metric='mlogloss',
    early_stopping_rounds=10,
    n_estimators=1000,
    tree_method='gpu_hist',
)

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=123)

# Accumulators filled fold by fold: pooled predictions and labels, the best
# early-stopping score, and the best iteration index.
y_preds, y_true, scores, n_rounds = [], [], [], []
for train_ix, test_ix in cv.split(X_train, y_train):
    # Vocabulary and IDF weights are learned from the training fold only.
    vectorizer = TfidfVectorizer(min_df=200)
    X_train_vec = vectorizer.fit_transform(X_train[train_ix])
    X_val_vec = vectorizer.transform(X_train[test_ix])

    clf = xgb.XGBClassifier(**params)
    clf.fit(
        X_train_vec,
        y_train[train_ix],
        eval_set=[(X_val_vec, y_train[test_ix])],
        verbose=False,
    )

    scores += [clf.best_score]
    # NOTE(review): best_iteration appears to be a 0-based index, so the tree
    # count would be best_iteration + 1 - confirm before reusing the mean.
    n_rounds += [clf.best_iteration]
    y_preds += list(clf.predict(X_val_vec))
    y_true += list(y_train[test_ix])

# Macro-F1 over the pooled out-of-fold predictions, plus the average stopping point.
score = metrics.f1_score(y_true, y_preds, average='macro')
print(np.mean(n_rounds))
print(score)

605.3333333333334
0.5618651987997275
CPU times: user 5min 6s, sys: 424 ms, total: 5min 6s
Wall time: 4min 55s


#### Polish text, raw (punctuation removed)

In [9]:
%%time

# Final fit on the full training partition, scored on the held-out test
# partition, using the raw Polish text column.
scope = 'original'
X_train, X_test = train[scope].to_numpy(), test[scope].to_numpy()
# Ratings are shifted down by one so that class ids start at 0.
y_train, y_test = train['rating'].to_numpy() - 1, test['rating'].to_numpy() - 1

# n_estimators is hard-coded to 605 (the CV cell printed a mean best iteration
# of about 605.3); there is no early stopping because no validation set is passed.
params = dict(
    eta=0.1,
    max_depth=8,
    alpha=0.2,
    objective='multi:softmax',
    num_class=5,
    eval_metric='mlogloss',
    n_estimators=605,
    tree_method='gpu_hist',
)

# Learn the vocabulary and IDF weights on the training texts, then reuse them for the test texts.
vectorizer = TfidfVectorizer(min_df=200)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

clf = xgb.XGBClassifier(**params).fit(X_train_vec, y_train, verbose=False)
y_pred = clf.predict(X_test_vec)

print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.64      0.71      0.68      2500
           1       0.46      0.42      0.44      2500
           2       0.45      0.45      0.45      2500
           3       0.55      0.52      0.53      2500
           4       0.76      0.77      0.77      2500

    accuracy                           0.58     12500
   macro avg       0.57      0.58      0.57     12500
weighted avg       0.57      0.58      0.57     12500

CPU times: user 2min 20s, sys: 147 ms, total: 2min 21s
Wall time: 2min 18s


#### Polish text, lemmatized

In [11]:
%%time

# Final fit on the full training partition, scored on the held-out test
# partition, using the lemmatized Polish text column.
scope = 'original_proc'
X_train, X_test = train[scope].to_numpy(), test[scope].to_numpy()
# Ratings are shifted down by one so that class ids start at 0.
y_train, y_test = train['rating'].to_numpy() - 1, test['rating'].to_numpy() - 1

# n_estimators is hard-coded to 605 (the CV cell printed a mean best iteration
# of about 605.3); there is no early stopping because no validation set is passed.
params = dict(
    eta=0.1,
    max_depth=8,
    alpha=0.2,
    objective='multi:softmax',
    num_class=5,
    eval_metric='mlogloss',
    n_estimators=605,
    tree_method='gpu_hist',
)

# Learn the vocabulary and IDF weights on the training texts, then reuse them for the test texts.
vectorizer = TfidfVectorizer(min_df=200)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

clf = xgb.XGBClassifier(**params).fit(X_train_vec, y_train, verbose=False)
y_pred = clf.predict(X_test_vec)

print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.65      0.70      0.68      2500
           1       0.45      0.43      0.44      2500
           2       0.45      0.46      0.46      2500
           3       0.55      0.52      0.53      2500
           4       0.76      0.78      0.77      2500

    accuracy                           0.58     12500
   macro avg       0.57      0.58      0.57     12500
weighted avg       0.57      0.58      0.57     12500

CPU times: user 2min 10s, sys: 157 ms, total: 2min 10s
Wall time: 2min 7s


#### Polish text, lemmatized with stopwords removed

In [12]:
%%time

# Final fit on the full training partition, scored on the held-out test
# partition, using the lemmatized Polish text column with stopwords removed.
scope = 'original_proc_no_stop'
X_train, X_test = train[scope].to_numpy(), test[scope].to_numpy()
# Ratings are shifted down by one so that class ids start at 0.
y_train, y_test = train['rating'].to_numpy() - 1, test['rating'].to_numpy() - 1

# n_estimators is hard-coded to 605 (the CV cell printed a mean best iteration
# of about 605.3); there is no early stopping because no validation set is passed.
params = dict(
    eta=0.1,
    max_depth=8,
    alpha=0.2,
    objective='multi:softmax',
    num_class=5,
    eval_metric='mlogloss',
    n_estimators=605,
    tree_method='gpu_hist',
)

# Learn the vocabulary and IDF weights on the training texts, then reuse them for the test texts.
vectorizer = TfidfVectorizer(min_df=200)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

clf = xgb.XGBClassifier(**params).fit(X_train_vec, y_train, verbose=False)
y_pred = clf.predict(X_test_vec)

print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.64      0.70      0.67      2500
           1       0.43      0.40      0.42      2500
           2       0.43      0.42      0.43      2500
           3       0.54      0.52      0.53      2500
           4       0.74      0.77      0.75      2500

    accuracy                           0.56     12500
   macro avg       0.56      0.56      0.56     12500
weighted avg       0.56      0.56      0.56     12500

CPU times: user 1min 47s, sys: 150 ms, total: 1min 47s
Wall time: 1min 44s


#### English translation, raw (with punctuation removed)

In [14]:
%%time

# Final fit on the full training partition, scored on the held-out test
# partition, using the raw English translation column.
scope = 'translated'
X_train, X_test = train[scope].to_numpy(), test[scope].to_numpy()
# Ratings are shifted down by one so that class ids start at 0.
y_train, y_test = train['rating'].to_numpy() - 1, test['rating'].to_numpy() - 1

# n_estimators is hard-coded to 605 (the CV cell printed a mean best iteration
# of about 605.3); there is no early stopping because no validation set is passed.
params = dict(
    eta=0.1,
    max_depth=8,
    alpha=0.2,
    objective='multi:softmax',
    num_class=5,
    eval_metric='mlogloss',
    n_estimators=605,
    tree_method='gpu_hist',
)

# Learn the vocabulary and IDF weights on the training texts, then reuse them for the test texts.
vectorizer = TfidfVectorizer(min_df=200)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

clf = xgb.XGBClassifier(**params).fit(X_train_vec, y_train, verbose=False)
y_pred = clf.predict(X_test_vec)

print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.67      0.72      0.69      2500
           1       0.46      0.43      0.44      2500
           2       0.44      0.44      0.44      2500
           3       0.55      0.54      0.54      2500
           4       0.77      0.78      0.78      2500

    accuracy                           0.58     12500
   macro avg       0.58      0.58      0.58     12500
weighted avg       0.58      0.58      0.58     12500

CPU times: user 2min 21s, sys: 195 ms, total: 2min 21s
Wall time: 2min 18s


#### English text, lemmatized

In [17]:
%%time

# Final fit on the full training partition, scored on the held-out test
# partition, using the lemmatized English text column.
scope = 'translated_proc'
X_train, X_test = train[scope].to_numpy(), test[scope].to_numpy()
# Ratings are shifted down by one so that class ids start at 0.
y_train, y_test = train['rating'].to_numpy() - 1, test['rating'].to_numpy() - 1

# n_estimators is hard-coded to 605 (the CV cell printed a mean best iteration
# of about 605.3); there is no early stopping because no validation set is passed.
params = dict(
    eta=0.1,
    max_depth=8,
    alpha=0.2,
    objective='multi:softmax',
    num_class=5,
    eval_metric='mlogloss',
    n_estimators=605,
    tree_method='gpu_hist',
)

# Learn the vocabulary and IDF weights on the training texts, then reuse them for the test texts.
vectorizer = TfidfVectorizer(min_df=200)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

clf = xgb.XGBClassifier(**params).fit(X_train_vec, y_train, verbose=False)
y_pred = clf.predict(X_test_vec)

print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.66      0.71      0.69      2500
           1       0.44      0.42      0.43      2500
           2       0.44      0.44      0.44      2500
           3       0.55      0.54      0.55      2500
           4       0.77      0.78      0.77      2500

    accuracy                           0.58     12500
   macro avg       0.57      0.58      0.58     12500
weighted avg       0.57      0.58      0.58     12500

CPU times: user 2min 7s, sys: 169 ms, total: 2min 7s
Wall time: 2min 4s


#### English text, lemmatized with stopwords removed

In [16]:
%%time

# Final fit on the full training partition, scored on the held-out test
# partition, using the lemmatized English text column with stopwords removed.
scope = 'translated_proc_no_stop'
X_train, X_test = train[scope].to_numpy(), test[scope].to_numpy()
# Ratings are shifted down by one so that class ids start at 0.
y_train, y_test = train['rating'].to_numpy() - 1, test['rating'].to_numpy() - 1

# n_estimators is hard-coded to 605 (the CV cell printed a mean best iteration
# of about 605.3); there is no early stopping because no validation set is passed.
params = dict(
    eta=0.1,
    max_depth=8,
    alpha=0.2,
    objective='multi:softmax',
    num_class=5,
    eval_metric='mlogloss',
    n_estimators=605,
    tree_method='gpu_hist',
)

# Learn the vocabulary and IDF weights on the training texts, then reuse them for the test texts.
vectorizer = TfidfVectorizer(min_df=200)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

clf = xgb.XGBClassifier(**params).fit(X_train_vec, y_train, verbose=False)
y_pred = clf.predict(X_test_vec)

print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.64      0.68      0.66      2500
           1       0.42      0.40      0.41      2500
           2       0.41      0.38      0.39      2500
           3       0.53      0.52      0.52      2500
           4       0.74      0.77      0.75      2500

    accuracy                           0.55     12500
   macro avg       0.55      0.55      0.55     12500
weighted avg       0.55      0.55      0.55     12500

CPU times: user 1min 35s, sys: 126 ms, total: 1min 35s
Wall time: 1min 32s
