In [5]:
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import optuna
from optuna.samplers import TPESampler

In [2]:
# Load the review sample, restricted to the partition flag, the rating and
# the six text variants (original / translated, each raw, processed and
# processed-without-stop-words according to the column names).
reviews = pd.read_parquet('../data/reviews_sample_proc.parquet')[
    [
        'partition',
        'rating',
        'original',
        'original_proc',
        'original_proc_no_stop',
        'translated',
        'translated_proc',
        'translated_proc_no_stop',
    ]
]

# Split rows by the value already stored in the `partition` column.
train = reviews.loc[reviews['partition'] == 'train']
test = reviews.loc[reviews['partition'] == 'test']

In [12]:
# Model input: the 'original' text column of the training partition.
X_train = train.loc[:, 'original'].to_numpy()
# Target: rating shifted down by one.
# NOTE(review): presumably ratings run 1-5 so labels become 0-4 — confirm.
y_train = np.subtract(train.loc[:, 'rating'].to_numpy(), 1)

In [15]:
# Fixed booster settings for a quick baseline fit.
# NOTE: early stopping (early_stopping_rounds=20) is currently disabled.
params = dict(
    eta=0.1,
    max_depth=8,
    objective='multi:softmax',
    num_class=5,
    eval_metric='auc',
    n_estimators=50,
)

# TF-IDF features (terms must appear in at least 50 documents) feeding an
# XGBoost classifier.
pipe = make_pipeline(
    TfidfVectorizer(min_df=50),
    xgb.XGBClassifier(**params),
)

# Fit on the first 1000 training rows only; the fitted pipeline is the
# cell's displayed output.
pipe.fit(X_train[:1000], y_train[:1000])

In [None]:
def objective(trial):
    """Optuna objective: cross-validated AUC of an XGBoost rating classifier.

    The review texts in ``X_train`` are converted to TF-IDF features and a
    multi-class booster is evaluated with stratified 5-fold ``xgb.cv``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``colsample_bytree``, ``subsample``,
        ``min_child_weight`` and ``alpha``. ``eta`` and ``max_depth`` are
        held fixed.

    Returns
    -------
    float
        The highest mean test AUC reached over the boosting rounds.
    """
    params = {
        'eta': 0.1,
        'max_depth': 8,
        # `scale_pos_weight` was removed from the search space: it only
        # re-weights the positive class of a binary objective, so it has no
        # meaning for the multi-class target used here.
        'colsample_bytree': trial.suggest_float('colsample_bytree', low=0.75, high=0.9, step=0.01),
        'subsample': trial.suggest_float('subsample', low=0.975, high=1, step=0.001),
        'min_child_weight': trial.suggest_float('min_child_weight', low=1, high=3, step=0.1),
        'alpha': trial.suggest_float('alpha', low=0, high=0.3, step=0.01),
        # y_train is `rating - 1` with more than two classes, which
        # 'binary:logistic' rejects. Multi-class AUC needs probability
        # outputs, hence softprob rather than softmax.
        'objective': 'multi:softprob',
        'num_class': 5,  # same class count as the XGBClassifier baseline cell
        'eval_metric': 'auc',
        # NOTE(review): 'gpu_hist' is deprecated since XGBoost 2.0 in favour
        # of tree_method='hist' with device='cuda'; left unchanged because
        # the installed version is not visible here.
        'tree_method': 'gpu_hist',
    }

    # X_train is a 1-D array of raw strings, so it must be vectorised before
    # XGBoost can use it (indexing it with None only added an axis).
    # min_df=50 matches the baseline pipeline.
    # NOTE(review): the vocabulary/IDF weights are fitted on all training
    # rows before the CV split, so fold statistics are shared — acceptable
    # for tuning, but not a fully nested estimate.
    features = TfidfVectorizer(min_df=50).fit_transform(X_train)
    dtrain = xgb.DMatrix(features, label=y_train)

    result = xgb.cv(
        params,
        dtrain,
        num_boost_round=2000,
        early_stopping_rounds=50,
        nfold=5,
        stratified=True,
        seed=123,
    )

    return result['test-auc-mean'].max()

# Seed the sampler (same value as the xgb.cv seed) so that the sequence of
# sampled hyperparameters is reproducible across kernel restarts.
study = optuna.create_study(sampler=TPESampler(seed=123), direction='maximize')
study.optimize(objective, n_trials=40)

# One-row summary of the best trial: its index, its score, and the value of
# every tuned hyperparameter.
cols = ['Iteration number', 'Score'] + list(study.best_params.keys())
results = [study.best_trial.number, study.best_trial.value] + list(study.best_params.values())
results_df = pd.DataFrame([results], columns=cols)
results_df