In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer

In [None]:
train = pd.read_parquet('../data/train.parquet')

In [27]:
train['values'] = train['values'].apply(lambda x: np.array(json.loads(x)) if isinstance(x, str) else x)

train['mean'] = train['values'].apply(np.mean)
train['std'] = train['values'].apply(np.std)
train['median'] = train['values'].apply(np.median)
train['min'] = train['values'].apply(np.min)
train['max'] = train['values'].apply(np.max)

X_train = train[['mean', 'std', 'median', 'min', 'max']]
y_train = train['label']

(80000, 5)
(80000,)


In [31]:
param_grid_xgb = {
    'n_estimators': [100, 150],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.8],
    'colsample_bytree': [0.8]
}

In [36]:
xgb = XGBClassifier(eval_metric='auc', use_label_encoder=False)

random_search_xgb = RandomizedSearchCV(estimator=xgb,
                                        param_distributions=param_grid_xgb,
                                        n_iter=12,
                                        scoring='roc_auc',
                                        cv=3,
                                        n_jobs=-1,
                                        verbose=2,
                                        random_state=42)

X_val = X_train.sample(frac=0.2, random_state=42)
y_val = y_train[X_val.index]

random_search_xgb.fit(X_train, y_train)

print("Лучшие параметры для XGBoost:", random_search_xgb.best_params_)
print("Лучший ROC AUC на валидационной выборке (XGBoost):", random_search_xgb.best_score_)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


Parameters: { "use_label_encoder" } are not used.



Лучшие параметры для XGBoost: {'subsample': 0.8, 'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.1, 'colsample_bytree': 0.8}
Лучший ROC AUC на валидационной выборке (XGBoost): 0.8274594557697044


In [38]:
best_xgb_model = random_search_xgb.best_estimator_
y_val_pred_proba_xgb = best_xgb_model.predict_proba(X_val)[:, 1]
roc_auc_xgb = roc_auc_score(y_val, y_val_pred_proba_xgb)

print("ROC AUC на валидационной выборке (XGBoost):", roc_auc_xgb)

ROC AUC на валидационной выборке (XGBoost): 0.8356503472642856


In [None]:
joblib.dump(best_model, '../data/best_model.pkl')