In [46]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, roc_auc_score
import random
import sklearn

In [192]:
data = pd.read_csv('../input/spreadprediction/train.csv')
test = pd.read_csv('../input/spreadprediction/test.csv')

Посмотрим на данные, определим есть ли выбросы и насколько они пригодны для обучения.

In [59]:
data.describe()

Unnamed: 0,VASK0,VASK1,VASK2,VASK3,VASK4,VBID0,VBID1,VBID2,VBID3,VBID4,...,VASK1_PREV,VASK2_PREV,VASK3_PREV,VASK4_PREV,VBID0_PREV,VBID1_PREV,VBID2_PREV,VBID3_PREV,VBID4_PREV,Y
count,186590.0,186590.0,186590.0,186590.0,186590.0,186590.0,186590.0,186590.0,186590.0,186590.0,...,186590.0,186590.0,186590.0,186590.0,186590.0,186590.0,186590.0,186590.0,186590.0,186590.0
mean,92.788981,178.840495,204.46264,205.277978,202.079436,92.270245,177.477716,203.058487,202.174195,197.799646,...,178.796431,204.430945,205.273975,202.076333,92.318286,177.438491,203.023479,202.169194,197.795123,0.010258
std,66.491713,62.138222,62.969816,65.644839,67.57185,65.2812,59.409587,58.546985,59.286325,61.265705,...,62.173445,62.981647,65.646947,67.573708,65.37613,59.444964,58.557993,59.289762,61.263966,0.10076
min,1.0,1.0,5.0,1.0,9.0,1.0,1.0,1.0,4.0,3.0,...,1.0,5.0,1.0,9.0,1.0,1.0,1.0,4.0,3.0,0.0
25%,41.0,138.0,162.0,161.0,156.0,40.0,137.0,162.0,161.0,156.0,...,138.0,162.0,161.0,156.0,40.0,136.0,162.0,161.0,156.0,0.0
50%,83.0,170.0,196.0,194.0,189.0,83.0,170.0,197.0,195.0,188.0,...,170.0,196.0,194.0,189.0,83.0,170.0,197.0,195.0,188.0,0.0
75%,131.0,210.0,232.0,234.0,232.0,132.0,211.0,234.0,233.0,228.0,...,210.0,232.0,234.0,232.0,132.0,211.0,234.0,233.0,228.0,0.0
max,647.0,784.0,666.0,813.0,738.0,593.0,657.0,683.0,687.0,718.0,...,784.0,666.0,813.0,738.0,593.0,657.0,683.0,687.0,718.0,1.0


In [376]:
np.shape(data.dropna()) # пропущенных значений нет

(186590, 21)

Очевидно, что наш датасет крайне разбалансирован и если мы будем учить на нём целиком, то велика вероятность получить точность 99% при этом не угадывая ни одного представителя класса `1`. Поэтому попробуем воспользоваться различными методиками ресемплирования, предварительно отобрав в тестовый датаест по 50 представителей того и другого классов


In [194]:
X0_train, X0_test, y0_train, y0_test = train_test_split(data[data.Y==0].iloc[:,:-1],\
                                                        data[data.Y==0].iloc[:,-1], test_size=50, random_state=123)

In [195]:
X1_train, X1_test, y1_train, y1_test = train_test_split(data[data.Y==1].iloc[:,:-1],\
                                                        data[data.Y==1].iloc[:,-1], test_size=50, random_state=123)

In [197]:
X_train, X_test, y_train, y_test = pd.concat([X0_train, X1_train]), pd.concat([X0_test, X1_test]),\
                                    pd.concat([y0_train, y1_train]), pd.concat([y0_test, y1_test])

Функция ниже будет использоваться в дальнейшем для тестирования разных вариантов сэмплирования

In [404]:
def boosted_classifier(X, y):
    model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='auc', gamma=1, min_child_weight=1, max_depth=6, \
                              learning_rate=0.01, n_estimators=100, use_label_encoder=False, \
                              subsample = 0.8, colsample_bytree=1, 
                              gpu_id=0, tree_method='gpu_hist')
    X_train_, X_test_, y_train_, y_test_ = train_test_split(X, y, test_size=0.2)
    model.fit(X_train_, y_train_)
    res = model.predict_proba(X_test_)[:,1]
    print(roc_auc_score(y_test_, res), 'auc score')
    print(roc_auc_score(y_test, model.predict(X_test)), 'auc test score')
#     print(accuracy_score(y_test, model.predict(X_test)), 'accuracy')
    return model

Воспользуемся методами, предлагаемыми в библиотеке imblearn, являющейся частью sklearn

In [405]:
from imblearn.over_sampling import RandomOverSampler

ran=RandomOverSampler()
X_ran,y_ran= ran.fit_resample(X_train,y_train)
first = boosted_classifier(X_ran, y_ran)

0.902369244617882 auc score
0.81 auc test score


In [401]:
from imblearn.under_sampling import RandomUnderSampler

ran=RandomUnderSampler()
X_rs,y_rs = ran.fit_resample(X_train,y_train)
third = boosted_classifier(X_rs, y_rs)

0.8913135669362083 auc score
0.83 auc test score


In [402]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='minority')
X_sm, y_sm = smote.fit_resample(X_train, y_train)

fourth = boosted_classifier(X_sm, y_sm)

0.9171818984185642 auc score
0.86 auc test score


In [403]:
from imblearn.over_sampling import ADASYN
adasyn = ADASYN(sampling_strategy='minority')
X_ad, y_ad = adasyn.fit_resample(X_train, y_train)
fifth = boosted_classifier(X_ad, y_ad)

0.9171223699010295 auc score
0.84 auc test score


Попробуем теперь произвести оптимизацию гиперпараметров

In [102]:
import warnings
warnings.filterwarnings('ignore')

In [103]:
from optuna import create_study
from optuna.samplers import TPESampler
from optuna.integration import XGBoostPruningCallback

In [133]:
def evaluate_model_skf(model, X_df, y_df, n_splits=5):
    X_values = X_df.values
    y_values = y_df.values
    skf = StratifiedKFold(
        n_splits=n_splits
    )
    y_pred = np.zeros((len(y_values), 2))
    
    for train_index, test_index in skf.split(X_values, y_values):
        X_A, X_B = X_values[train_index, :], X_values[test_index, :]
        y_A = y_values[train_index]
        model.fit(
            X_A, y_A,
        )
        y_pred[test_index] = model.predict_proba(X_B)
    return roc_auc_score(y_values, y_pred[:,1])

In [142]:
def objective(
    trial,
    X,
    y,
    random_state=22,
    n_splits=3,
    n_repeats=2,
    n_jobs=1,
    early_stopping_rounds=50,
):
    # XGBoost parameters
    params = {
        "verbosity": 0,  # 0 (silent) - 3 (debug)
        "objective": "binary:logistic",
        "n_estimators": 1000,
        "max_depth": trial.suggest_int("max_depth", 1, 12),
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.05),
        "colsample_bytree": trial.suggest_loguniform("colsample_bytree", 0.2, 0.6),
        "subsample": trial.suggest_loguniform("subsample", 0.4, 0.8),
        "alpha": trial.suggest_loguniform("alpha", 0.01, 10.0),
        "gamma": trial.suggest_loguniform("lambda", 1e-8, 10.0),
        "min_child_weight": trial.suggest_loguniform("min_child_weight", 10, 1000),
        "seed": random_state,
        "n_jobs": n_jobs,
    }

    model = xgb.XGBClassifier(**params, use_label_encoder=False)
    pruning_callback = XGBoostPruningCallback(trial, "validation_0-auc")
    skf = StratifiedKFold(
        n_splits=n_splits
    )
    X_values = X.values
    y_values = y.values
    y_pred = np.zeros((len(y_values), 2))
    for train_index, test_index in skf.split(X_values, y_values):
        X_A, X_B = X_values[train_index, :], X_values[test_index, :]
        y_A, y_B = y_values[train_index], y_values[test_index]
        model.fit(
            X_A,
            y_A,
            eval_set=[(X_B, y_B)],
            eval_metric="auc",
            verbose=0,
            callbacks=[pruning_callback],
            early_stopping_rounds=early_stopping_rounds,
        )
        y_pred[test_index] = model.predict_proba(X_B)
    return roc_auc_score(y_values, y_pred[:,1])

In [272]:
sampler = TPESampler(seed=0, multivariate=True)
study = create_study(direction="maximize", sampler=sampler)
study.optimize(
    lambda trial: objective(
        trial,
        X_rs,
        y_rs,
        random_state=0,
        n_splits=5,
        n_repeats=2,
        n_jobs=8,
        early_stopping_rounds=50,
    ),
    n_trials=100,
    n_jobs=1,
)

hp = study.best_params
for key, value in hp.items():
    print(f"{key:>20s} : {value}")
print(f"{'best objective value':>20s} : {study.best_value}")

[32m[I 2021-11-24 20:14:25,053][0m A new study created in memory with name: no-name-f4c446b3-5291-4cde-92a2-97015e40e911[0m
[32m[I 2021-11-24 20:14:26,444][0m Trial 0 finished with value: 0.8230645538230581 and parameters: {'max_depth': 7, 'learning_rate': 0.02595131508847582, 'colsample_bytree': 0.38781197383981786, 'subsample': 0.5835608840550383, 'alpha': 0.18662266976517972, 'lambda': 0.006502000785097662, 'min_child_weight': 75.01954443620116}. Best is trial 0 with value: 0.8230645538230581.[0m
[32m[I 2021-11-24 20:14:26,914][0m Trial 1 finished with value: 0.5 and parameters: {'max_depth': 11, 'learning_rate': 0.04598675496284059, 'colsample_bytree': 0.3047741258042684, 'subsample': 0.6924572650469156, 'alpha': 0.3860866271460545, 'lambda': 0.001295391520396607, 'min_child_weight': 709.8936257405899}. Best is trial 0 with value: 0.8230645538230581.[0m
[32m[I 2021-11-24 20:14:27,370][0m Trial 2 finished with value: 0.5 and parameters: {'max_depth': 1, 'learning_rate': 0

           max_depth : 9
       learning_rate : 0.006969716702685446
    colsample_bytree : 0.5781140370083396
           subsample : 0.4796833363820795
               alpha : 0.22533807494938662
              lambda : 6.182474161714489e-06
    min_child_weight : 10.183845679943673
best objective value : 0.8522574209324172


In [271]:
hp['num_estimators'] = 1000
model_optimized = xgb.XGBClassifier(**hp).fit(X_ran, y_ran)
res = model_optimized.predict(X_test)
accuracy_score(y_test, res)

0.81

Будем пользоваться моделью, давшей лучший результат на тестовой выборке (пятой)

In [341]:
predictions = fifth.predict_proba(test)

In [358]:
def foo(x):
    if 0.2 <= abs(x) <= 0.4:
        return 0.5
    else:
        if x > 0:
            return 0
        else:
            return 1

In [353]:
pred = pd.DataFrame(predictions)

In [362]:
pred['diff'] = pred.apply(lambda x: x[0]-x[1], axis=1)

In [364]:
pred['res'] = pred['diff'].apply(foo)

In [372]:
pred['res'].to_csv('prediction.csv')