In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector as selector
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import RFE
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import BernoulliNB, CategoricalNB, ComplementNB, GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, RandomTreesEmbedding, BaggingClassifier, StackingClassifier, VotingClassifier
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split

import xgboost as xgb
import optuna


# Импорт библиотек

In [None]:
# Load the training and test sets from the current working directory.
df = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# Preview three random training rows (unseeded, so the sample changes between runs).
df.sample(3)

## Выставление настроек

In [None]:
# NOTE(review): placeholder expression; this cell does not configure any settings.
2+2

# Обработка данных

Начнем с того, что приведем все названия колонок к нижнему регистру

In [None]:
# Normalise the column labels of both frames to lower case.
df.columns = df.columns.map(str.lower)
test.columns = test.columns.map(str.lower)

Переведем колонку с индексом в индекс датафрейма

In [None]:
# Use the passenger id as the row index of both frames.
# Not re-runnable: after the first run 'passengerid' is no longer a column.
df = df.set_index('passengerid')
test = test.set_index('passengerid')

Выведем общую информацию об обучающем и тестовом датасетах

In [None]:
def full_info(df):
    """Display a per-column overview of ``df``.

    The table has one row per column of ``df`` with its non-null count,
    null count and dtype, joined with the transposed ``df.describe()``
    statistics. Cells without a statistic are shown as empty strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    None
        The table is rendered through ``display``.
    """
    null_mask = df.isna()
    overview = pd.DataFrame({'Non-Null Count': df.notna().sum(),
                             'Null Count': null_mask.sum(),
                             'Dtype': df.dtypes})
    statistics = df.describe().T
    report = overview.join(statistics)
    # Label the column axis and blank out the statistics that do not exist.
    report = report.rename_axis('Feature', axis=1).fillna('')
    display(report)

In [None]:
# Missing values, dtypes and descriptive statistics of the training set.
full_info(df)

In [None]:
# The same overview for the test set.
full_info(test)

Выводы следующие:
* В обучающем датасете 891 наблюдение
* Пропуски есть в колонках с возрастом и номером кабины(?). Пропуски есть также и в тестовом датасете, поэтому модель должна уметь работать с ними
* Не все типы данных оптимальны
* есть небольшой дисбаланс в пользу погибших

Удалим признак с номером кабины, так как восстановить эту информацию и использовать далее не удастся

In [None]:
# Drop the cabin column from both frames.
# errors='ignore' keeps the cell re-runnable without a bare `except`, which
# would also have hidden unrelated failures (typos, a missing frame, ...).
if 'cabin' not in df.columns and 'cabin' not in test.columns:
    print('Признака уже нет')
df = df.drop(columns='cabin', errors='ignore')
test = test.drop(columns='cabin', errors='ignore')

Поменяем типы данных

In [None]:
# Replace the numeric ticket class by a text label so it is treated as categorical.
_PCLASS_LABELS = {1: 'first', 2: 'second', 3: 'third'}
df['pclass'] = df['pclass'].replace(_PCLASS_LABELS)
test['pclass'] = test['pclass'].replace(_PCLASS_LABELS)

In [None]:
# Encode sex as a boolean flag: 'male' -> 0 -> False, 'female' -> 1 -> True.
df['sex'] = df['sex'].replace({'male': 0, 'female': 1}).astype('bool')
test['sex'] = test['sex'].replace({'male': 0, 'female': 1}).astype('bool')

Добавим новый признак, равный длине имени. Позже проверим, как это влияет на шансы выжить

In [None]:
# New feature: number of characters in the passenger's name.
df['name_length'] = df['name'].map(len)
test['name_length'] = test['name'].map(len)

Посмотрим на признак "номер билета"

In [None]:
# How often each ticket number occurs in the training set.
df['ticket'].value_counts()

Совпадающих значений мало, удалим признак

In [None]:
# Remove the ticket number from both frames.
df = df.drop(columns='ticket')
test = test.drop(columns='ticket')

# Исследование данных

Построим по порядку для всех признаков графики, позволяющие пока визуально оценить влияние признака на целевую переменную

## Номер класса

In [None]:
# Share of each outcome within every passenger class, rounded to two decimals.
df.groupby('pclass')['survived'].value_counts(normalize=True).round(2)

In [None]:
# Passenger counts per (class, outcome), computed once and reused by both panels.
# Grouping by 'pclass' alone is sufficient: value_counts() already adds
# 'survived' as the second index level, so listing it as a grouping key as well
# is redundant. The explicit rename keeps the value column called 'count'.
class_counts = df.groupby('pclass')['survived'].value_counts().rename('count')

fig, [ax1, ax2] = plt.subplots(1, 2, figsize=(16, 6))
fig.suptitle('Барчарты выживаемости для пассажиров разного класса')

# Left panel: pandas/matplotlib, one bar per outcome within each class.
class_counts.unstack().plot.barh(ax=ax1);
ax1.legend(loc='lower right');
ax1.set_title('матплотлиб');

# Right panel: the same table drawn with seaborn.
sns.barplot(data=class_counts.reset_index(),
            y='pclass',
            order=['third', 'second', 'first'],
            x='count',
            hue='survived',
            orient='h',
            ax=ax2);
ax2.set_title('сиборн');

Предсказуемо, шансы выжить у пассажиров первого класса были выше

## Имя

Проверим, существует ли связь между длиной имени и вероятностью выжить.

In [None]:
# Overlay one semi-transparent histogram of name length per value of 'survived'.
df.groupby('survived')['name_length'].plot(bins=20, alpha=0.5, kind='hist', grid=False);
plt.legend();
plt.title('Зависимость длины имени от вероятности выжить')
plt.xlabel('Длина имени в билете');

Удалим признак с именем

In [None]:
# The raw name is no longer needed once name_length exists. Drop it from BOTH
# frames so the test set keeps the same columns as the training set;
# errors='ignore' makes the cell safe to re-run.
df = df.drop(columns='name', errors='ignore')
test = test.drop(columns='name', errors='ignore')

## Пол

In [None]:
# Annotated heatmap of passenger counts for every (sex, survived) combination.
sns.heatmap(df.groupby(['sex', 'survived'])['survived'].count().unstack(), annot=True, fmt='.3g');

Среди женщин выжило примерно 75%, среди мужчин - 20%

## Возраст

In [None]:
fig, [ax1, ax2] = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: overlaid age histograms, one per outcome.
df.groupby('survived')['age'].plot(bins=20, alpha=0.5, kind='hist', grid=False, ax=ax1);
ax1.legend();
ax1.set_title('Гистограмма возраста');
ax1.set_xlabel('Возраст');

# Right panel: age box plot per outcome.
df.boxplot(column='age', by='survived', grid=False, ax=ax2);
ax2.set_title('Диаграмма размаха возраста');
ax2.set_xlabel('Выжил/не выжил');
ax2.set_ylabel('Возраст');

# DataFrame.boxplot(by=...) writes its own figure-level title, so the real
# title is set last to replace it.
fig.suptitle('Распределение возраста выживших и погибших');

# Обучение моделей

## Разделение датасета на признаки и таргет, на обучающую и тестовую выборки

In [None]:
# Separate the features from the target, then hold out 20% of the rows with a fixed seed.
x = df.drop('survived', axis=1)
y = df['survived']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=123)

## Линейные модели

Построим пайплайн преобразования признаков для линейной модели

In [None]:
# Non-object columns: fill gaps with the column mean, then standardise.
num_ct = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')),
                         ('scaler', StandardScaler())])

# Object columns: fill gaps with the most frequent value, then one-hot encode
# while dropping the first level of every feature.
cat_ct = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                         ('ohe', OneHotEncoder(drop='first'))])

# Columns are routed to the two sub-pipelines by dtype.
transformer = ColumnTransformer(transformers=[('nums', num_ct, selector(dtype_exclude="object")),
                                              ('cats', cat_ct, selector(dtype_include="object"))],
                                remainder='passthrough')
# Display-only wrapper; the resulting object is not stored.
Pipeline(steps=[('', transformer)])

Посмотрим на получившиеся признаки. У них теперь нет имен, но по порядку преобразований можно увидеть, что последние 4 признака - это категориальные признаки, подвергнутые OHE.

In [None]:
# Fit the transformer on the training features and summarise the resulting matrix cast to float.
full_info(pd.DataFrame(transformer.fit_transform(x_train)).astype('float'))

Добавим в пайплайн классификатор

In [None]:
lr_pipe = Pipeline(steps=[('transformer', transformer), ('clf', LogisticRegression(random_state=123))])

# Classifier grid. "No penalty" is spelled None (the 'none' string is not
# accepted by current scikit-learn), and C is searched only for the l2 model:
# C has no effect without a penalty, so varying it would just repeat the same fit.
clf_grids = [{'clf__penalty': [None]},
             {'clf__penalty': ['l2'], 'clf__C': [0.1, 1, 10, 100]}]

# Imputer grid: mean/median SimpleImputer, or an IterativeImputer.
imputer_grids = [{'transformer__nums__imputer': [SimpleImputer()],
                  'transformer__nums__imputer__strategy': ['mean', 'median']},
                 {'transformer__nums__imputer': [IterativeImputer(random_state=123)]}]

# Every imputer option is combined with every classifier option.
params = [{**imputer_grid, **clf_grid}
          for imputer_grid in imputer_grids
          for clf_grid in clf_grids]

lr_gs = GridSearchCV(estimator=lr_pipe,
                     param_grid=params,
                     scoring='roc_auc',
                     refit=True,
                     cv=3,
                     n_jobs=-1,
                     verbose=10)
lr_gs.fit(x_train, y_train)
# refit=True: the best pipeline has been refitted on the whole training split.
lr_best = lr_gs.best_estimator_;

In [None]:
# Rank the cross-validation results once, show them, then print the winner.
lr_cv_table = pd.DataFrame(lr_gs.cv_results_).sort_values('rank_test_score')
display(lr_cv_table)
lr_cv_top = lr_cv_table.iloc[0]
print(f'параметры: {lr_cv_top["params"]}',
      f'\nroc-auc: {lr_cv_top["mean_test_score"]:.4f}')

In [None]:
# Short alias for the best pipeline found by the grid search.
lr = lr_gs.best_estimator_

Классификатор обучен и может уже что-то предсказывать. Посмотрим на важность признаков с точки зрения коэффициентов логистической регрессии и permutation_importance

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: one logistic-regression coefficient per transformed feature, sorted.
pd.Series(lr.named_steps['clf'].coef_.reshape(-1), index=lr.named_steps['transformer'].get_feature_names_out()).sort_values().plot.barh(ax=ax1);
ax1.set_title('LR coefs');

# Right panel: permutation importance of the raw input columns, measured by
# ROC AUC on the training split over 30 shuffles; error bars show the std.
r = permutation_importance(lr, x_train, y_train, scoring='roc_auc', n_repeats=30, random_state=123)
pd.Series(r.importances_mean, index=x_train.columns).sort_values().plot.barh(xerr=r.importances_std, ax=ax2);
ax2.set_title('Permutation importances');

### Добавление полиномиальных признаков

In [None]:
# Numeric branch: iterative imputation, standardisation, then polynomial
# expansion with PolynomialFeatures' default settings.
num_ct_poly = Pipeline(steps=[('imputer', IterativeImputer(random_state=123)),
                              ('scaler', StandardScaler()),
                              ('poly', PolynomialFeatures())])

# Categorical branch: most-frequent imputation and one-hot encoding.
cat_ct = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                         ('ohe', OneHotEncoder(drop='first'))])

transformer_poly = ColumnTransformer(transformers=[('nums', num_ct_poly, selector(dtype_exclude="object")),
                                                   ('cats', cat_ct, selector(dtype_include="object"))],
                                                    remainder='passthrough')

# Plain logistic regression; it is handed to RFE in the next cell.
clf_lr = LogisticRegression()
# lr_pipe = Pipeline(steps=[('transformer', transformer_poly), ('clf', clf_lr)])
#
# lr_pipe.fit(x, y)

In [None]:
# Recursive feature elimination around the logistic regression, with RFE's default settings.
rfe = RFE(clf_lr)

In [None]:
# Build a frame of the features produced by the full preprocessing
# (rfe.support_ could have been used instead).
# The transformer is fitted once and its output reused; previously
# fit_transform() was re-run for every column inside the comprehension.
x_train_poly = transformer_poly.fit_transform(x_train)
# Column 0 is left out, exactly as before.
# NOTE(review): presumably this is the constant bias column of PolynomialFeatures - confirm.
poly_features = pd.DataFrame({f'x{i}': x_train_poly[:, i] for i in range(1, x_train_poly.shape[1])})
poly_features.sample()

In [None]:
# Run the elimination on the transformed training features.
rfe.fit(poly_features, y_train)

In [None]:
# Elimination rank assigned to every feature.
rfe.ranking_

In [None]:
# Coefficients of the estimator held by the fitted RFE object.
rfe.estimator_.coef_

In [None]:
# Names of the features RFE kept, followed by the shape of that array.
rfe_imp_features = rfe.get_feature_names_out()
rfe_imp_features.shape

In [None]:
# Mean 3-fold ROC AUC of the logistic regression restricted to the RFE-selected features.
cross_val_score(clf_lr, poly_features[rfe_imp_features], y_train, cv=3, scoring='roc_auc').mean()

### Встраивание RFE в Gridsearch

In [None]:
# Pipeline subclass that exposes the logistic-regression coefficients of its last step
# https://stackoverflow.com/questions/36683230/grid-search-with-recursive-feature-elimination-in-scikit-learn-pipeline-returns

class MyPipe(Pipeline):
    """Pipeline that republishes the final step's ``coef_`` on itself after fitting."""

    def fit(self, X, y=None, **fit_params):
        """Fit the pipeline, then copy ``coef_`` from the last step.

        Parameters
        ----------
        X, y, **fit_params
            Forwarded unchanged to ``Pipeline.fit``.

        Returns
        -------
        MyPipe
            The fitted pipeline itself, with ``self.coef_`` set.

        Raises
        ------
        AttributeError
            If the last step has no ``coef_`` attribute after fitting.
        """
        super().fit(X, y, **fit_params)
        # self.steps holds (name, estimator) pairs; only the estimator is needed.
        _, final_estimator = self.steps[-1]
        self.coef_ = final_estimator.coef_
        return self

In [None]:
# Polynomial preprocessing, rebuilt from scratch for this experiment.
num_ct_poly = Pipeline(steps=[('imputer', IterativeImputer(random_state=123)),
                              ('scaler', StandardScaler()),
                              ('poly', PolynomialFeatures())])

cat_ct = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                         ('ohe', OneHotEncoder(drop='first'))])

transformer_poly = ColumnTransformer(transformers=[('nums', num_ct_poly, selector(dtype_exclude="object")),
                                                   ('cats', cat_ct, selector(dtype_include="object"))],
                                                    remainder='passthrough')

# NOTE(review): x_poly is not referenced again in this cell; the grid search
# below is fitted on the untransformed x_train.
x_poly = transformer_poly.fit_transform(x_train)

# Search over the wrapped regression's regularisation and the number of features RFE keeps (5..19).
params = [{'rfe__estimator__penalty': [None, 'l2'],
           'rfe__estimator__C': [0.1, 1, 10, 100],
           'rfe__n_features_to_select': range(5, 20)
          }]

clf_lr = LogisticRegression()
rfe = RFE(estimator=clf_lr, verbose=99)

# RFE is the final step of the pipeline, so it is the object being scored.
lr_pipe_poly = Pipeline(steps=[('transformer', transformer_poly), ('rfe', rfe)])

lr_gs_poly=GridSearchCV(estimator=lr_pipe_poly,
                        param_grid=params,
                        scoring='roc_auc',
                        refit=True,
                        cv=3,
                        n_jobs=-1,
                        verbose=10)

lr_gs_poly.fit(x_train, y_train)

In [None]:
# The 33 best-ranked parameter combinations of the RFE grid search.
pd.DataFrame(lr_gs_poly.cv_results_).sort_values('rank_test_score').head(33)

НИЧЕГО НЕ ПОЛУЧИЛОСЬ

### Метод опорных векторов

#### SVC

In [None]:
%%time
# Numeric branch: iterative imputation plus scaling (the scaler itself is part of the search).
num_ct = Pipeline(steps=[('imputer', IterativeImputer(random_state=123)),
                         ('scaler', StandardScaler())])

cat_ct = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                         ('ohe', OneHotEncoder(drop='first'))])

transformer = ColumnTransformer(transformers=[('nums', num_ct, selector(dtype_exclude="object")),
                                              ('cats', cat_ct, selector(dtype_include="object"))],
                                remainder='passthrough')

# probability=True makes predict_proba available; it is needed for ROC curves and soft voting.
svm_pipe = Pipeline(steps=[('transformer', transformer), ('estimator', SVC(random_state=123, probability=True))])

# gamma starts at 0.0001, not 0: with gamma = 0 the RBF kernel exp(-gamma * d^2)
# equals 1 for every pair of points, so that grid point cannot separate anything.
params = {'transformer__nums__scaler': [StandardScaler(), MinMaxScaler()],
          'estimator__C': range(1, 20),
          'estimator__gamma': [i/10000 for i in range(1, 10)]}

gs_svm = GridSearchCV(svm_pipe, param_grid=params, scoring='roc_auc', cv=3, n_jobs=-1, refit=True)

gs_svm.fit(x_train, y_train)
# An earlier run selected estimator__C = 9 and estimator__gamma = 0.009.
# NOTE(review): 0.009 is the upper edge of the gamma grid - consider widening it.
svm_best = gs_svm.best_estimator_


In [None]:
# Rank the cross-validation results once, show them, then print the winner.
svm_cv_table = pd.DataFrame(gs_svm.cv_results_).sort_values('rank_test_score')
display(svm_cv_table)
svm_cv_top = svm_cv_table.iloc[0]
print(f'параметры: {svm_cv_top["params"]}',
      f'\nroc-auc: {svm_cv_top["mean_test_score"]:.4f}')

#### LinearSVC

In [None]:
%%time
# Same preprocessing as for SVC: iterative imputation and scaling for non-object
# columns, most-frequent imputation and one-hot encoding for object columns.
num_ct = Pipeline(steps=[('imputer', IterativeImputer(random_state=123)),
                         ('scaler', StandardScaler())])

cat_ct = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                         ('ohe', OneHotEncoder(drop='first'))])

transformer = ColumnTransformer(transformers=[('nums', num_ct, selector(dtype_exclude="object")),
                                              ('cats', cat_ct, selector(dtype_include="object"))],
                                                    remainder='passthrough')


# Rebinds svm_pipe, which the SVC cell also used; svm_best already holds the fitted SVC pipeline.
svm_pipe = Pipeline(steps=[('transformer', transformer), ('estimator', LinearSVC(max_iter=100000, random_state=123))])

# C runs from 0.02 to 1.98 in steps of 0.02.
params = {'transformer__nums__scaler': [StandardScaler(), MinMaxScaler()],
          'estimator__C': [2*i/100 for i in range(1, 100)]}

# error_score='raise' stops the search on the first failing fit instead of recording a score for it.
gs_lsvm = GridSearchCV(svm_pipe, param_grid=params, scoring='roc_auc', cv=3, n_jobs=-1, refit=True, error_score='raise')
gs_lsvm.fit(x_train, y_train);
lsvm_best = gs_lsvm.best_estimator_

In [None]:
# Rank the cross-validation results once, show them, then print the winner.
lsvm_cv_table = pd.DataFrame(gs_lsvm.cv_results_).sort_values('rank_test_score')
display(lsvm_cv_table)
lsvm_cv_top = lsvm_cv_table.iloc[0]
print(f'параметры: {lsvm_cv_top["params"]}',
      f'\nroc-auc: {lsvm_cv_top["mean_test_score"]:.4f}')

## Наивный Байес

In [None]:
%%time
# Numeric branch uses MinMaxScaler here instead of StandardScaler.
num_ct = Pipeline(steps=[('imputer', IterativeImputer(random_state=123)),
                         ('scaler', MinMaxScaler())])

cat_ct = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                         ('ohe', OneHotEncoder(drop='first'))])

# Despite its name, this transformer has no polynomial step; the assignment
# overwrites the transformer_poly defined in earlier cells.
transformer_poly = ColumnTransformer(transformers=[('nums', num_ct, selector(dtype_exclude="object")),
                                                   ('cats', cat_ct, selector(dtype_include="object"))],
                                                    remainder='passthrough')

bayes_pipe = Pipeline(steps=[('transformer', transformer_poly), ('estimator', BernoulliNB())])

# The estimator itself is a search dimension; alpha is searched only for the
# three variants listed with it, GaussianNB is tried with its defaults.
# NOTE(review): CategoricalNB and ComplementNB may constrain their inputs
# (category codes / non-negative values) - confirm every candidate is actually
# scored rather than ending up with a NaN score.
params = [{'estimator': [BernoulliNB(), CategoricalNB(), ComplementNB()],
           'estimator__alpha': [0.1, 1, 10]},

          {'estimator': [GaussianNB()]}
         ]

gs_bayes = GridSearchCV(bayes_pipe, param_grid=params, scoring='roc_auc', cv=3, n_jobs=-1)

gs_bayes.fit(x_train, y_train);

In [None]:
# Rank the cross-validation results once, show them, then print the winner.
bayes_cv_table = pd.DataFrame(gs_bayes.cv_results_).sort_values('rank_test_score')
display(bayes_cv_table)
bayes_cv_top = bayes_cv_table.iloc[0]
print(f'параметры: {bayes_cv_top["params"]}',
      f'\nroc-auc: {bayes_cv_top["mean_test_score"]:.4f}')

## Деревянные модели

### Дерево решений

In [None]:
%%time
# No scaling for tree models: the numeric branch only imputes.
num_ct = Pipeline(steps=[('imputer', IterativeImputer(random_state=123))])

cat_ct = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                         ('encoder', OneHotEncoder(drop='first'))])

transformer = ColumnTransformer(transformers=[('nums', num_ct, selector(dtype_exclude="object")),
                                              ('cats', cat_ct, selector(dtype_include="object"))],
                                remainder='passthrough')

tree_pipe = Pipeline(steps=[('transformer', transformer), ('clf', DecisionTreeClassifier(random_state=123))])

# The integer-coded alternative to one-hot encoding is OrdinalEncoder.
# LabelEncoder encodes a single target vector and cannot serve as a feature
# step inside a Pipeline, so every candidate using it would fail to fit.
tree_params = {'transformer__cats__encoder': [OneHotEncoder(drop='first'), OrdinalEncoder()],
               'clf__max_depth': [2, 3, 4, 5, 7, 9],
               'clf__min_samples_split': [2, 3, 4, 5, 6, 8, 9, 10, 11],
               'clf__min_samples_leaf': [3, 5, 8, 9, 10, 11, 12, 14, 16]}

tree_gs = GridSearchCV(tree_pipe, param_grid=tree_params, scoring='roc_auc', cv=3, n_jobs=-1)
tree_gs.fit(x_train, y_train);
tree_best = tree_gs.best_estimator_

In [None]:
# Rank the cross-validation results once, show them, then print the winner.
tree_cv_table = pd.DataFrame(tree_gs.cv_results_).sort_values('rank_test_score')
display(tree_cv_table)
tree_cv_top = tree_cv_table.iloc[0]
print(f'параметры: {tree_cv_top["params"]}',
      f'\nroc-auc: {tree_cv_top["mean_test_score"]:.4f}')

### Случайный лес

In [None]:
%%time
# Tree-model preprocessing: imputation only for non-object columns,
# most-frequent imputation plus one-hot encoding for object columns.
num_ct = Pipeline(steps=[('imputer', IterativeImputer(random_state=123))])

cat_ct = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                         ('encoder', OneHotEncoder(drop='first'))])

transformer = ColumnTransformer(transformers=[('nums', num_ct, selector(dtype_exclude="object")),
                                              ('cats', cat_ct, selector(dtype_include="object"))],
                                                remainder='passthrough')

rf_pipe = Pipeline(steps=[('transformer', transformer), ('clf', RandomForestClassifier(random_state=123))])

# 5 * 5 * 5 * 7 = 875 parameter combinations, each evaluated with 3-fold CV.
rf_params = {'clf__n_estimators': [10, 15, 20, 25, 30],
             'clf__max_depth': [5, 6, 7, 8, 9],
             'clf__min_samples_split': [2, 3, 4, 5, 6],
             'clf__min_samples_leaf': [2, 3, 4, 5, 6, 7, 8]}

rf_gs = GridSearchCV(rf_pipe, param_grid=rf_params, scoring='roc_auc', cv=3, n_jobs=-1, refit=True)
rf_gs.fit(x_train, y_train);
rf_best = rf_gs.best_estimator_

In [None]:
# Rank the cross-validation results once, show them, then print the winner.
rf_cv_table = pd.DataFrame(rf_gs.cv_results_).sort_values('rank_test_score')
display(rf_cv_table)
rf_cv_top = rf_cv_table.iloc[0]
print(f'параметры: {rf_cv_top["params"]}',
      f'\nroc-auc: {rf_cv_top["mean_test_score"]:.4f}')

### ExtraTreesClassifier

In [None]:
%%time
# Same preprocessing and parameter grid as the random-forest cell; only the classifier differs.
num_ct = Pipeline(steps=[('imputer', IterativeImputer(random_state=123))])

cat_ct = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                         ('encoder', OneHotEncoder(drop='first'))])

transformer = ColumnTransformer(transformers=[('nums', num_ct, selector(dtype_exclude="object")),
                                              ('cats', cat_ct, selector(dtype_include="object"))],
                                                remainder='passthrough')

et_pipe = Pipeline(steps=[('transformer', transformer), ('clf', ExtraTreesClassifier(random_state=123))])

et_params = {'clf__n_estimators': [10, 15, 20, 25, 30],
             'clf__max_depth': [5, 6, 7, 8, 9],
             'clf__min_samples_split': [2, 3, 4, 5, 6],
             'clf__min_samples_leaf': [2, 3, 4, 5, 6, 7, 8]}

et_gs = GridSearchCV(et_pipe, param_grid=et_params, scoring='roc_auc', cv=3, n_jobs=-1, refit=True)
et_gs.fit(x_train, y_train);
et_best = et_gs.best_estimator_

In [None]:
# Rank the cross-validation results once, show them, then print the winner.
et_cv_table = pd.DataFrame(et_gs.cv_results_).sort_values('rank_test_score')
display(et_cv_table)
et_cv_top = et_cv_table.iloc[0]
print(f'параметры: {et_cv_top["params"]}',
      f'\nroc-auc: {et_cv_top["mean_test_score"]:.4f}')

### RandomTreesEmbedding

In [None]:
%%time
num_ct = Pipeline(steps=[('imputer', IterativeImputer(random_state=123))])

cat_ct = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                         ('encoder', OneHotEncoder(drop='first'))])

transformer = ColumnTransformer(transformers=[('nums', num_ct, selector(dtype_exclude="object")),
                                              ('cats', cat_ct, selector(dtype_include="object"))],
                                remainder='passthrough')

# RandomTreesEmbedding is an unsupervised transformer: it has neither
# predict_proba nor decision_function, so it cannot be the scored final step
# of a pipeline. It is used here as a feature-embedding step feeding a
# logistic regression.
rte_pipe = Pipeline(steps=[('transformer', transformer),
                           ('rte', RandomTreesEmbedding(random_state=123)),
                           ('clf', LogisticRegression(max_iter=1000, random_state=123))])

# The searched values are unchanged; they now address the embedding step.
rte_params = {'rte__n_estimators': [10, 15, 20, 25, 30],
              'rte__max_depth': [5, 6, 7, 8, 9],
              'rte__min_samples_split': [2, 3, 4, 5, 6],
              'rte__min_samples_leaf': [2, 3, 4, 5, 6, 7, 8]}

rte_gs = GridSearchCV(rte_pipe, param_grid=rte_params, scoring='roc_auc', cv=3, n_jobs=-1)
rte_gs.fit(x_train, y_train);

In [None]:
# Rank the cross-validation results once, show them, then print the winner.
rte_cv_table = pd.DataFrame(rte_gs.cv_results_).sort_values('rank_test_score')
display(rte_cv_table)
rte_cv_top = rte_cv_table.iloc[0]
print(f'параметры: {rte_cv_top["params"]}',
      f'\nroc-auc: {rte_cv_top["mean_test_score"]:.4f}')

## Ансамбли

### BaggingClassifier

BaggingClassifier преобразует входной DataFrame в массив NumPy, прежде чем передать его базовым моделям, поэтому выбор колонок по типам данных внутри пайплайна перестает работать. Перепишем трансформер и пересоберем пайплайн, указав трансформируемые признаки индексами

In [None]:
num_ct = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')),
                         ('scaler', StandardScaler())])

cat_ct = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                         ('ohe', OneHotEncoder(drop='first'))])

# Columns are addressed by position so the pipeline also works on plain arrays.
# NOTE(review): positions 0 and 6 are assumed to be the two object columns of
# x_train and the remaining ones numeric/boolean - confirm against x_train.columns.
transformer = ColumnTransformer(transformers=[('nums', num_ct, (1, 2, 3, 4, 5, 7)),
                                              ('cats', cat_ct, [0, 6])],
                                remainder='passthrough')

lr_pipe = Pipeline(steps=[('transformer', transformer), ('clf', LogisticRegression(random_state=123))])

# Classifier grid. "No penalty" is spelled None (the 'none' string is not
# accepted by current scikit-learn), and C is searched only for the l2 model:
# C has no effect without a penalty, so varying it would just repeat the same fit.
clf_grids = [{'clf__penalty': [None]},
             {'clf__penalty': ['l2'], 'clf__C': [0.1, 1, 10, 100]}]

# Imputer grid: mean/median SimpleImputer, or an IterativeImputer.
imputer_grids = [{'transformer__nums__imputer': [SimpleImputer()],
                  'transformer__nums__imputer__strategy': ['mean', 'median']},
                 {'transformer__nums__imputer': [IterativeImputer(random_state=123)]}]

# Every imputer option is combined with every classifier option.
params = [{**imputer_grid, **clf_grid}
          for imputer_grid in imputer_grids
          for clf_grid in clf_grids]

lr_gs = GridSearchCV(estimator=lr_pipe,
                     param_grid=params,
                     scoring='roc_auc',
                     refit=True,
                     cv=3,
                     n_jobs=-1,
                     verbose=10)
lr_gs.fit(x_train, y_train)
# Overwrites the lr_best found by the dtype-based pipeline earlier in the notebook.
lr_best = lr_gs.best_estimator_;

In [None]:
# Bag ten copies of the best logistic-regression pipeline and report the mean 5-fold ROC AUC.
bc = BaggingClassifier(estimator=lr_best, n_estimators=10, random_state=123)
cross_val_score(bc, x_train, y_train, scoring='roc_auc', cv=5, n_jobs=-1).mean().round(4)

ну хз

### StackingClassifier

In [None]:
# Stack the tuned random forest and extra trees under a small random-forest
# meta-model. The meta-model is seeded like every other model in the notebook;
# without a seed the reported score changes from run to run.
sc = StackingClassifier(estimators=[('rf', rf_best), ('et', et_best)],
                        final_estimator=RandomForestClassifier(n_estimators=15, max_depth=7, random_state=123),
                        cv=3,
                        n_jobs=-1)
cross_val_score(sc, x_train, y_train, scoring='roc_auc', cv=5, n_jobs=-1).mean().round(4)

### VotingClassifier

In [None]:
# Soft voting over the four tuned pipelines; prints the mean 3-fold ROC AUC.
vc = VotingClassifier(estimators=[('lr', lr_best),
                                  ('tree', tree_best),
                                  ('rf', rf_best),
                                  ('et', et_best)],
                      voting='soft', n_jobs=-1)
print(cross_val_score(vc, x_train, y_train, scoring='roc_auc', cv=3, n_jobs=-1).mean().round(4))

## Градиентный бустинг

### XGBoost

XGBoost не умеет работать с категориальными признаками напрямую, поэтому преобразуем данные заранее

In [None]:
%%time
num_ct = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')),
                         ('scaler', StandardScaler())])

cat_ct = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                         ('ohe', OneHotEncoder(drop='first'))])

# Columns are addressed by position, as in the bagging section.
transformer = ColumnTransformer(transformers=[('nums', num_ct, (1, 2, 3, 4, 5, 7)),
                                              ('cats', cat_ct, [0, 6])],
                                remainder='passthrough')

xgb_pipe = Pipeline(steps=[('transformer', transformer), ('clf', xgb.XGBClassifier())])

# The smallest learning rate is 0.01. It was written as `00.1`, which Python
# reads as 0.1, so the grid contained 0.1 twice and never tried 0.01.
params = {'clf__n_estimators': [15, 25, 50],
          'clf__max_depth': [3, 4, 5, 6, 7],
          'clf__learning_rate': [0.01, 0.1, 0.4]}

xgb_gs = GridSearchCV(estimator=xgb_pipe,
                      param_grid=params,
                      scoring='roc_auc',
                      refit=True,
                      cv=3,
                      n_jobs=-1,
                      verbose=10)
xgb_gs.fit(x_train, y_train)
xgb_best = xgb_gs.best_estimator_;

In [None]:
# Rank the cross-validation results once, show them, then print the winner.
xgb_cv_table = pd.DataFrame(xgb_gs.cv_results_).sort_values('rank_test_score')
display(xgb_cv_table)
xgb_cv_top = xgb_cv_table.iloc[0]
print(f'параметры: {xgb_cv_top["params"]}',
      f'\nroc-auc: {xgb_cv_top["mean_test_score"]:.4f}')

In [None]:
# NOTE(review): this cell repeats the XGBoost report of the previous cell verbatim.
display(pd.DataFrame(xgb_gs.cv_results_).sort_values('rank_test_score'))
print(f'параметры: {pd.DataFrame(xgb_gs.cv_results_).sort_values("rank_test_score").head(1)["params"].values[0]}',
      f'\nroc-auc: {pd.DataFrame(xgb_gs.cv_results_).sort_values("rank_test_score").head(1)["mean_test_score"].values[0]:.4f}')

Попробуем добавить регуляризацию при помощи early_stopping. Встроить такую конструкцию в ГридСерч не получилось

In [None]:
%%time
num_ct = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')),
                         ('scaler', StandardScaler())])

cat_ct = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                         ('ohe', OneHotEncoder(drop='first'))])

transformer = ColumnTransformer(transformers=[('nums', num_ct, (1, 2, 3, 4, 5, 7)),
                                              ('cats', cat_ct, [0, 6])],
                                remainder='passthrough')

# Learn imputation values, scaling and categories on the training split only,
# then apply that same fitted transformer to the held-out split. Calling
# fit_transform() on x_test would re-learn all of them from the test rows,
# leaving the two matrices on different scales.
x_train_tr = transformer.fit_transform(x_train)
x_test_tr = transformer.transform(x_test)

xgb_clf = xgb.XGBClassifier(n_estimators=500,
                            max_depth=3,
                            eval_metric="auc",
                            learning_rate=0.05,
                            early_stopping_rounds=50)

# NOTE(review): the held-out split serves as the early-stopping set here, so it
# is no longer an untouched test set for this model.
xgb_clf.fit(x_train_tr, y_train, eval_set=[(x_train_tr, y_train), (x_test_tr, y_test)]);

In [None]:
# Evaluation history recorded for the eval_set entries during fitting.
xgb_clf.evals_result_

Прикрутим Оптуну

In [None]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one XGBoost booster.

    Reads the notebook globals ``x_train_tr`` and ``y_train``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies the sampled ``eta`` and ``max_depth``.

    Returns
    -------
    float
        ROC AUC of the trained booster on the inner validation split.
    """
    # Define the search space for hyperparameters
    param = {
        'objective': 'binary:logitraw',
        'eval_metric': 'auc',
        'eta': trial.suggest_float('eta', 0.01, 0.3),
        # 'num_boost_round': 100000, # Fix the boosting round and use early stopping
        'max_depth': trial.suggest_int('max_depth', 3, 10)
        # 'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        # 'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        # 'gamma': trial.suggest_float('gamma', 0.0, 10.0),
        # 'min_child_weight': trial.suggest_float('min_child_weight', 0.1, 10.0),
        # 'lambda': trial.suggest_float('lambda', 0.1, 10.0),
        # 'alpha': trial.suggest_float('alpha', 0.0, 10.0),
    }

    # Split the training data further; the fixed seed gives every trial the same split
    train_data, valid_data, train_target, valid_target = train_test_split(x_train_tr, y_train, test_size=0.2, random_state=42)

    # Convert the data into DMatrix format
    dtrain = xgb.DMatrix(train_data, label=train_target)
    dvalid = xgb.DMatrix(valid_data, label=valid_target)

    # Train with early stopping on the validation AUC. No Optuna pruning
    # callback is attached; if one is added it has to watch 'validation-auc',
    # the only metric this evals list reports.
    model = xgb.train(param, dtrain, evals=[(dvalid, 'validation')], num_boost_round=1000000, early_stopping_rounds=50)

    # Score the validation split (raw margins are sufficient for ROC AUC)
    y_pred = model.predict(dvalid)

    return roc_auc_score(valid_target, y_pred)

# Create an Optuna study and optimize the objective function
study = optuna.create_study(direction='maximize')
# NOTE(review): verbosity is lowered only after the study exists, so messages
# emitted by create_study() itself are presumably still shown - confirm.
optuna.logging.set_verbosity(optuna.logging.WARNING)
# The run is capped by both n_trials=100 and timeout=10; no sampler seed is set.
study.optimize(objective, n_trials=100, timeout=10, n_jobs=-1) # Control the number of trials

# Print the best hyperparameters and the best score
best_params = study.best_params
best_roc_auc = study.best_value
print("Best Hyperparameters: ", best_params)
print("Best roc_auc: ", best_roc_auc)

## Построение графиков roc-auc кривых

In [None]:
fig, ax = plt.subplots(figsize=(15, 6))
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])

# Diagonal: a classifier with no skill.
ax.plot([0, 1], [0, 1], linestyle='--', label='Случайный классификатор');

for model, label in zip((vc, sc, bc, rf_best),
                        ('VotingClassifier', 'StackingClassifier', 'BaggingClassifier', 'RandomForest')):
    # Fit on the training split and score the held-out split: a curve drawn on
    # the rows the model was fitted on says nothing about generalisation.
    # Column 1 of predict_proba is the positive class (assuming survived is
    # coded 0/1); column 0 would draw the curve for "did not survive".
    scores = model.fit(x_train, y_train).predict_proba(x_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, scores)
    # ROC convention: false positive rate on x, true positive rate on y.
    ax.plot(fpr, tpr, label=label);

ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC-кривые на отложенной выборке')
# One legend call, placed after all curves have been drawn.
ax.legend(loc='lower right');

In [None]:
# ROC AUC needs continuous scores, so predicted probabilities of the positive
# class are used instead of hard 0/1 labels, and the model is scored on the
# held-out split rather than on the rows it was fitted on.
roc_auc_score(y_test, vc.fit(x_train, y_train).predict_proba(x_test)[:, 1])

In [None]:
# ROC AUC needs continuous scores, so predicted probabilities of the positive
# class are used instead of hard 0/1 labels, and the model is scored on the
# held-out split rather than on the rows it was fitted on.
roc_auc_score(y_test, rf_best.fit(x_train, y_train).predict_proba(x_test)[:, 1])