In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

from mlxtend.preprocessing import DenseTransformer

import optuna
from optuna.samplers import TPESampler

In [10]:
# Load the pre-processed review sample, then split it on the stored
# `partition` flag into the train and test frames.
reviews = pd.read_parquet('../data/reviews_sample_proc.parquet')
train = reviews.loc[reviews['partition'] == 'train']
test = reviews.loc[reviews['partition'] == 'test']

In [22]:
# Length (in characters) of the shortest original-language review in the train split.
train['original'].map(len).min()

146

In [3]:
# Raw review text is the model input; the rating is the target.
X_train = train.loc[:, 'original'].to_numpy()
# Shift the labels down by one (presumably ratings are 1-5, giving classes 0-4
# to match `num_class: 5` used below -- confirm against the data).
y_train = np.subtract(train.loc[:, 'rating'].to_numpy(), 1)

In [18]:
# Inspect which text variants and label columns are available in the train split.
train.columns

Index(['original', 'translated', 'rating', 'partition', 'original_proc',
       'original_proc_no_stop', 'translated_proc', 'translated_proc_no_stop'],
      dtype='object')

In [None]:
%%time

# Baseline: 3-fold stratified cross-validation of TF-IDF features + XGBoost.
# Reads `X_train` / `y_train` from the notebook namespace and leaves the pooled
# out-of-fold predictions in `y_preds` / `y_true` for the report cell.

# The hyper-parameters are identical for every fold, so build them once.
params = {
    'eta': 0.3,
    'max_depth': 5,
    'objective': 'multi:softmax',
    'num_class': 5,
    'eval_metric': 'mlogloss',
    'early_stopping_rounds': 5,
    'n_estimators': 10,
}

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=123)
y_preds = []   # out-of-fold predictions, concatenated across folds
y_true = []    # ground-truth labels in the same order as y_preds
scores = []    # best validation mlogloss reached in each fold
n_rounds = []  # best boosting iteration (0-based) in each fold
for (train_ix, test_ix) in cv.split(X_train, y_train):
    # Fit the vocabulary on the training fold only, so the held-out fold
    # does not influence the features.
    vectorizer = TfidfVectorizer(min_df=50)
    X_train_vec = vectorizer.fit_transform(X_train[train_ix])
    X_val_vec = vectorizer.transform(X_train[test_ix])

    clf = xgb.XGBClassifier(**params)
    clf.fit(X_train_vec, y_train[train_ix], eval_set=[(X_val_vec, y_train[test_ix])], verbose=False)

    scores.append(clf.best_score)
    # `get_num_boosting_rounds()` only echoes the configured `n_estimators`;
    # `best_iteration` is the round early stopping actually selected.
    n_rounds.append(clf.best_iteration)
    y_preds.extend(clf.predict(X_val_vec))
    y_true.extend(y_train[test_ix])

In [55]:
# Per-class precision / recall / F1 over the pooled out-of-fold predictions.
print(metrics.classification_report(y_true=y_true, y_pred=y_preds))

              precision    recall  f1-score   support

           0       0.53      0.65      0.58     10000
           1       0.39      0.28      0.33     10000
           2       0.36      0.39      0.38     10000
           3       0.45      0.40      0.42     10000
           4       0.62      0.67      0.64     10000

    accuracy                           0.48     50000
   macro avg       0.47      0.48      0.47     50000
weighted avg       0.47      0.48      0.47     50000



In [16]:
%%time

def objective(trial):
    """Optuna objective: macro-F1 of TF-IDF + XGBoost under 3-fold stratified CV.

    Reads ``X_train`` and ``y_train`` from the notebook's global namespace.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``max_depth`` (integer between 5 and 8 inclusive).

    Returns
    -------
    float
        Macro-averaged F1 computed on the out-of-fold predictions pooled
        across all folds (the study maximises this value).
    """
    params = {
        'eta': 0.3,
        'max_depth': trial.suggest_int('max_depth', low=5, high=8),
        # Further search dimensions, currently disabled:
        # 'scale_pos_weight': trial.suggest_float('scale_pos_weight', low=1, high=3, step=0.25),
        # 'colsample_bytree': trial.suggest_float('colsample_bytree', low=0.75, high=0.9, step=0.05),
        # 'subsample': trial.suggest_float('subsample', low=0.975, high=1, step=0.001),
        # 'min_child_weight': trial.suggest_float('min_child_weight', low=1, high=3, step=0.1),
        # 'alpha': trial.suggest_float('alpha', low=0, high=0.3, step=0.01),
        'objective': 'multi:softmax',
        'num_class': 5,
        'eval_metric': 'mlogloss',
        # NOTE(review): with only 5 boosting rounds a patience of 5 rounds
        # can presumably never elapse, so early stopping looks inactive
        # here -- confirm this is intended (e.g. a quick smoke-test setup).
        'early_stopping_rounds': 5,
        'n_estimators': 5,
    }

    # Fixed seed: every trial is scored on the same folds.
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=123)

    y_preds = []  # out-of-fold predictions, pooled across folds
    y_true = []   # matching ground-truth labels
    for fit_idx, val_idx in cv.split(X_train, y_train):
        # Vocabulary is learned on the fitting fold only.
        vectorizer = TfidfVectorizer(min_df=50)
        X_fit_vec = vectorizer.fit_transform(X_train[fit_idx])
        X_val_vec = vectorizer.transform(X_train[val_idx])

        clf = xgb.XGBClassifier(**params)
        clf.fit(X_fit_vec, y_train[fit_idx], eval_set=[(X_val_vec, y_train[val_idx])], verbose=False)

        y_preds.extend(clf.predict(X_val_vec))
        y_true.extend(y_train[val_idx])

    return metrics.f1_score(y_true, y_preds, average='macro')

# Seed the sampler so that re-running the notebook proposes the same sequence
# of trials (the CV splitter is already seeded with the same value).
study = optuna.create_study(sampler=TPESampler(seed=123), direction='maximize')
study.optimize(objective, n_trials=3)

# One-row summary of the best trial: its index, its score, and the sampled
# hyper-parameters as additional columns.
cols = ['Iteration number', 'Score'] + list(study.best_params.keys())
results = [study.best_trial.number, study.best_trial.value] + list(study.best_params.values())
results_df = pd.DataFrame([results], columns=cols)
results_df

[32m[I 2023-02-15 14:24:33,716][0m A new study created in memory with name: no-name-54464745-6b7f-4d52-a3e0-4ea0ef69fa04[0m
[32m[I 2023-02-15 14:24:43,175][0m Trial 0 finished with value: 0.45410934358116306 and parameters: {'max_depth': 7}. Best is trial 0 with value: 0.45410934358116306.[0m
[32m[I 2023-02-15 14:24:53,376][0m Trial 1 finished with value: 0.46091616686299763 and parameters: {'max_depth': 8}. Best is trial 1 with value: 0.46091616686299763.[0m
[32m[I 2023-02-15 14:25:03,574][0m Trial 2 finished with value: 0.46091616686299763 and parameters: {'max_depth': 8}. Best is trial 1 with value: 0.46091616686299763.[0m


CPU times: user 2min 9s, sys: 1.71 s, total: 2min 11s
Wall time: 29.9 s


Unnamed: 0,Iteration number,Score,max_depth
0,1,0.460916,8
