In [None]:
import numpy as np
import pandas as pd
import optuna
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_param_importances
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score

In [None]:
from scipy.sparse import load_npz
# Load the training feature matrix from scipy's sparse .npz container.
X_train = load_npz(file='/data/processed_train.npz')

In [None]:
# Read the essay table, then keep the underlying array of its 'label' column;
# later cells pass it to cross_val_score as the target.
train = pd.read_csv(filepath_or_buffer='/data/train_essays.csv')
Y_train = train.loc[:, 'label'].values

In [None]:
# Shared splitter for every study below. With shuffle=True the seed must be
# fixed: otherwise each cross_val_score call draws a fresh shuffle, so trials
# (and model families) would be compared on different folds.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
from xgboost import XGBClassifier

def objective(trial):
    """Optuna objective for XGBoost: score one sampled configuration.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters below.

    Returns
    -------
    float
        Mean of the per-fold ``roc_auc`` scores returned by
        ``cross_val_score`` on ``X_train`` / ``Y_train`` with the shared
        ``kfold`` splitter.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
        # The range covers four orders of magnitude; sampling it on a log
        # scale keeps small learning rates from being almost never tried.
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 1, log=True),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.1, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0),
    }

    xgb = XGBClassifier(use_label_encoder=False, random_state=42, **params)

    fold_scores = cross_val_score(xgb, X_train, Y_train, cv=kfold, scoring='roc_auc', n_jobs=-1)
    return fold_scores.mean()


# Seed the sampler so the sequence of suggested hyper-parameters can be
# repeated across runs (full repeatability also needs a deterministic objective).
xgb_study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
xgb_study.optimize(objective, n_trials=100)

# Plot optimization history
fig1 = plot_optimization_history(xgb_study)
fig1.show(config={"staticPlot": True})

# Plot parameter importances
fig2 = plot_param_importances(xgb_study)
fig2.show(config={"staticPlot": True})

# Keep the winning configuration and its score for later use.
xgb_best_params = xgb_study.best_params
xgb_best_score = xgb_study.best_value

print('Xgboost Best score:', xgb_best_score)
print('Xgboost Best parameters:', xgb_best_params)

In [None]:
from sklearn.linear_model import Ridge

def objective(trial):
    """Optuna objective for Ridge: score one sampled configuration.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample ``max_iter`` and ``alpha``.

    Returns
    -------
    float
        Mean of the per-fold ``roc_auc`` scores returned by
        ``cross_val_score`` on ``X_train`` / ``Y_train`` with the shared
        ``kfold`` splitter.
    """
    max_iter = trial.suggest_int('max_iter', 500, 10000)
    # The regularisation strength spans five orders of magnitude, so it is
    # sampled on a log scale rather than uniformly.
    alpha = trial.suggest_float('alpha', 1e-4, 10, log=True)
    # NOTE(review): Ridge is a regressor, so the 'roc_auc' scorer has to rank
    # its raw predictions - confirm the installed scikit-learn accepts a
    # regressor here (RidgeClassifier would be the classifier counterpart).
    # random_state pins the data shuffling done by the 'sag' solver.
    ridge = Ridge(solver='sag', max_iter=max_iter, tol=1e-4, alpha=alpha, random_state=42)
    fold_scores = cross_val_score(ridge, X_train, Y_train, cv=kfold, scoring='roc_auc', n_jobs=-1)
    return fold_scores.mean()

# Seed the sampler so the sequence of suggested hyper-parameters can be
# repeated across runs (full repeatability also needs a deterministic objective).
ridge_study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
ridge_study.optimize(objective, n_trials=100)

# Plot optimization history
fig1 = plot_optimization_history(ridge_study)
fig1.show(config={"staticPlot": True})

# Plot parameter importances
fig2 = plot_param_importances(ridge_study)
fig2.show(config={"staticPlot": True})

# Keep the winning configuration and its score for later use.
ridge_best_params = ridge_study.best_params
ridge_best_score = ridge_study.best_value

print('Ridge Best score:', ridge_best_score)
print('Ridge Best parameters:', ridge_best_params)

In [None]:
from sklearn.naive_bayes import MultinomialNB

def objective(trial):
    """Optuna objective for MultinomialNB: score one sampled smoothing value.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample ``alpha``.

    Returns
    -------
    float
        Mean of the per-fold ``roc_auc`` scores returned by
        ``cross_val_score`` on ``X_train`` / ``Y_train`` with the shared
        ``kfold`` splitter.
    """
    # The range covers five orders of magnitude; a log scale gives the small
    # smoothing values a fair share of the trials.
    alpha = trial.suggest_float('alpha', 1e-5, 1, log=True)
    mnb = MultinomialNB(alpha=alpha)
    fold_scores = cross_val_score(mnb, X_train, Y_train, cv=kfold, scoring='roc_auc', n_jobs=-1)
    return fold_scores.mean()

# Seed the sampler so the sequence of suggested hyper-parameters can be
# repeated across runs (full repeatability also needs a deterministic objective).
mnb_study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
mnb_study.optimize(objective, n_trials=100)

# Plot optimization history
fig1 = plot_optimization_history(mnb_study)
fig1.show(config={"staticPlot": True})

# Plot parameter importances
fig2 = plot_param_importances(mnb_study)
fig2.show(config={"staticPlot": True})

# Keep the winning configuration and its score for later use.
mnb_best_params = mnb_study.best_params
mnb_best_score = mnb_study.best_value

print('MNB Best score:', mnb_best_score)
print('MNB Best parameters:', mnb_best_params)

In [None]:
from sklearn.svm import SVC

def objective(trial):
    """Optuna objective for a linear-kernel SVC: score one sampled ``C``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample ``C``.

    Returns
    -------
    float
        Mean of the per-fold ``roc_auc`` scores returned by
        ``cross_val_score`` on ``X_train`` / ``Y_train`` with the shared
        ``kfold`` splitter.
    """
    # suggest_float(..., log=True) is the supported spelling of the deprecated
    # suggest_loguniform; the sampled distribution is the same.
    C = trial.suggest_float('C', 1e-6, 1e2, log=True)

    # probability=True is left off: it makes every fit run an extra internal
    # cross-validation, while the 'roc_auc' scorer can rank samples with
    # decision_function. gamma is omitted because the linear kernel ignores it.
    svc = SVC(tol=1e-4,
              C=C,
              kernel='linear')

    fold_scores = cross_val_score(svc, X_train, Y_train, cv=kfold, scoring='roc_auc', n_jobs=-1)

    return fold_scores.mean()

# Seed the sampler so the sequence of suggested hyper-parameters can be
# repeated across runs (full repeatability also needs a deterministic objective).
svc_study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
svc_study.optimize(objective, n_trials=100)

# Plot optimization history
fig1 = plot_optimization_history(svc_study)
fig1.show(config={"staticPlot": True})

# Plot parameter importances
fig2 = plot_param_importances(svc_study)
fig2.show(config={"staticPlot": True})

# Keep the winning configuration and its score for later use.
svc_best_params = svc_study.best_params
svc_best_score = svc_study.best_value

print('SVC Best score:', svc_best_score)
print('SVC Best parameters:', svc_best_params)