In [68]:
import pandas as pd
import numpy as np
import seaborn as sns

# отключим предупреждения Anaconda
import warnings
warnings.simplefilter('ignore')

%pylab inline
%config InlineBackend.figure_format = 'svg' 
%config InlineBackend.figure_format = 'retina'

from pylab import rcParams
rcParams['figure.figsize'] = 14,9

Populating the interactive namespace from numpy and matplotlib


In [69]:
def load_titanic():
    """
    Загружает данные Титаника и возвращает с виде одного датасета 
    с признаком is_test и данные выживания для тестовой выборки
    """
    X_train = pd.read_csv('titanic/train.csv')
    X_test = pd.read_csv('titanic/test.csv')
    X_train['is_test'] = 0
    X_test['is_test'] = 1    
    y_train = X_train.Survived
    X_train.drop('Survived', axis=1, inplace=True)
    return pd.concat([X_train, X_test]), y_train

In [70]:
def get_title(name): 
    return name.split(',')[1].split('.')[0].strip()

# Объединяем звания
titles_map = {
    "Jonkheer":    "Royalty",
    "Don":         "Royalty",
    "Sir" :        "Royalty",   
    "Lady" :       "Royalty", 
    "Dona":        "Royalty",    
    "the Countess":"Royalty",
    
    "Capt":       "Officer",
    "Col":        "Officer",
    "Major":      "Officer",    
    "Dr":         "Officer",
    "Rev":        "Officer",

    "Mr" :        "Mr",    
    
    "Mrs" :       "Mrs",        
    "Ms":         "Mrs",
    
    "Mlle":       "Miss",
    "Mme":        "Miss",
    "Miss" :      "Miss",  

    "Master" :    "Master",
}

def clean_data(X, silent=True):
    """Очищает исхожные данные и кодирует нечисловые значения"""
    _log = print if not silent else lambda *args: args
    
    # Сохраняем ID пассажира для создания файла с ответами для постинга на Kaggle
    # и is_test для последующего разделения датасета на тестовый и тренировочный
    # в отдельных pd.Series и удаляем эти колонки из датасета
    PassengerId = X[X.is_test == 1].PassengerId
    is_test = X.is_test
    X = X.drop(['is_test', 'PassengerId'], axis=1)
    
    # Пол
    _log('\n------Sex')
    _log(X.Sex.value_counts())     
    X.Sex = X.Sex.map({"male": 1, "female":0})
    
    # Обращение
    X['Title'] = X.Name.map(get_title).map(titles_map)
    _log('\n------Titles')
    _log(X.Title.value_counts())
    
    # Возраст. Если не Master, берём медиану по полу и классу
    age_master = X[X.Title == 'Master'].Age.median()
    age_by_pclass_and_sex = X.groupby(["Pclass", "Sex"])["Age"].median()
    def fill_age(row):
        if not pd.isnull(row.Age):
            return row.Age
        if row.Title == 'Master':
            return age_master
        return age_by_pclass_and_sex[row.Pclass][row.Sex]
    
    X.Age = X.apply(fill_age, axis=1)
    X['AgeLte15'] = X.Age.map(lambda age: int(age <= 15))   
        
    # Стоимость (для отсутствующих значений берём медиану по классу)
    X.Fare = X.Fare.map(lambda fare: fare if fare else float('nan'))
    fare_by_pclass = X.groupby("Pclass")["Fare"].median()
    def fill_fare(row):
        if not pd.isnull(row.Fare):
            return row.Fare
        return fare_by_pclass[row.Pclass]
    X.Fare = X.apply(fill_fare, axis=1)
    
    # Класс стоимости
    def get_fare_class(fare):
        if pd.isnull(fare):
            return fare
        if fare < 100: return '0-100'
        if fare < 500: return '100-500'
        return '>500'
    X['FareClass'] = X.Fare.map(get_fare_class)     
    
    # Размер семьи
    _log('\n------FamilySize')
    X['FamilySize'] = X.SibSp + X.Parch + 1
    _log(X.FamilySize.value_counts())   
    
    # Одиночки
    _log('\n------Alone')
    X['Alone'] = X.FamilySize.map(lambda size: int(size == 1))
    _log(X.Alone.value_counts()) 
    
    # Большая семья
    _log('\n------BigFamily')    
    X['BigFamily'] = X.FamilySize.map(lambda size: int(size > 4))
    _log(X.BigFamily.value_counts())  
    
    # Класс каюты
    _log('\n------Pclass')
    _log(X.Pclass.value_counts())     
    # Помимо dummy колонок для Pclass оставим сами значения, т.к. их можно сравнивать. 
    # Только меняем порядок 0,1,2
    X['Pclass2'] = 3 - X.Pclass  
    
    # Порт посадки
    _log('\n------Embarked')
    X.Embarked = X.Embarked.fillna('S')
    _log(X.Embarked.value_counts()) 
    
    
    # Есть каюта
    _log('\n------HasCabin')
    X['HasCabin'] = X.Cabin.notnull().astype(int)
    _log(X.HasCabin.value_counts()) 
    
    # Палуба из номера каюты
    _log('\n------Deck')
    X['Deck'] = X.Cabin.map(lambda cabin: cabin if pd.isnull(cabin) else cabin[0])
    _log(X.Deck.value_counts()) 
    
    # Длина имени
    _log('\n------NameLen')
    X['NameLen'] = (X.Name.str.len() / 10).astype(int)    
    X.NameLen = X.NameLen.where(X.NameLen < 4, 4)
    _log(X.NameLen.value_counts()) 
    
    X = pd.get_dummies(X, columns=['Embarked', 'Title', 'Pclass', 'Deck', 'FareClass', 'NameLen'])

    # Удаляем номер билета, каюты и имя пассажира
    X = X.drop(['Cabin', 'Ticket', 'Name'], axis=1)
    
    return X, is_test, PassengerId

In [71]:
from sklearn.preprocessing import StandardScaler

def normalize_data(X):
    """Возвращает нормализованные значения"""
    index = X.index
    X = pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns)
    X.index = index
    return X

In [72]:
def split_data(X, is_test):
    """Разделяет данные на тренировочную и тестовую выбрки по сохранённому флагу is_test"""
    X_train = X[is_test == 0]
    X_test = X[is_test == 1]
    return X_train, X_test

In [73]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from operator import itemgetter

def tune_model(model, X, y, params, n_splits=3, scoring='accuracy', n_jobs=-1):
    """Тюнит модель и возвращает лучшую модель и [топ-5 лучших параметров]"""    
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1)
    grid = GridSearchCV(model, params, cv=cv, scoring=scoring, refit=True, n_jobs=n_jobs)
    grid.fit(X, y)
    top_params = sorted(
        zip(grid.cv_results_['params'], grid.cv_results_['mean_test_score']), 
        key=itemgetter(1),
        reverse=True
    )
    top_5_params = list(map(itemgetter(0), top_params[:5]))
    return grid.best_estimator_, top_5_params

In [74]:
def test_model(model, X, y, n_splits=3, scoring='accuracy', n_jobs=-1):
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1)
    return cross_val_score(model, X, y, cv=cv,  scoring=scoring, n_jobs=n_jobs)

In [75]:
from sklearn.model_selection import cross_val_score

def get_prediction(model, X, y, X_test):
    """Тренирует модель и возвращает предсказания"""
    model.fit(X, y)
    predict = model.predict(X_test)
    predict_proba = model.predict_proba(X_test)
    return predict, predict_proba

In [76]:
from sklearn.feature_selection import SelectFromModel

def select_best_features(model, X, y, threshold='mean'):
    """Выбирает лучшие фичи на основе модели с помощью SelectFromModel"""
    feature_select = SelectFromModel(model, threshold)
    X_best = feature_select.fit_transform(X, y)
    best_features = list(
        column for selected, column 
        in zip(feature_select.get_support(), X_train.columns) 
        if selected
    )
    print("Best featues ({}): {}".format(len(best_features), ', '.join(map(str, best_features))))
    return best_features

In [77]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_selection.from_model import _get_feature_importances

def get_scored_features(X, y):
    """Возвращает датафрейм feature/score с момощью SelectKBest отсортированный по score """
    selector = SelectKBest(f_classif, k='all')
    selector.fit(X, y)
    return pd.DataFrame(
        list(zip(X_train.columns, -np.log10(selector.pvalues_))),
        columns=['feature', 'score']
    ).set_index('feature').sort_values('score', ascending=False)

def get_scored_features_by_model(X, model):
    """Возвращает датафрейм feature/score по с фичами которые модель посчитала наиболее важными для себя"""
    try:
        feature_importances = _get_feature_importances(model)
    except ValueError:
        return None
    
    return pd.DataFrame(
        list(zip(X.columns, feature_importances)),
        columns=['feature', 'score']
    ).set_index('feature').sort_values('score', ascending=False)

def plot_features(feature_scores):
    """Рисует график фич по датасету feature/score"""
    features_range = range(len(feature_scores))
    plt.bar(features_range, feature_scores.score.ravel())
    plt.xticks(features_range, feature_scores.index.ravel(), rotation='vertical')
    
# plot_features(get_scored_features(X, y))
# plot_features(get_scored_features_by_model(X, model))

In [78]:
def get_feature_importance(model, X):
    return pd.DataFrame(
        list(zip(X.columns, model.feature_importances_)),
        columns = ['feature', 'importance']
    ).set_index(
        'feature'
    ).sort_values(
        'importance', ascending=False
    )  

def get_feature_importance_bar(model, X):
    return get_feature_importance(model, X).style.bar()

In [79]:
from sklearn.metrics import roc_curve, roc_auc_score

def plot_roc_curve(y, y_predict_proba):
    """Рисует кривую ROC"""
    fpr, tpr, thres = roc_curve(y, y_predict_proba[:, 1])
    plt.plot(fpr, tpr);
    plt.title('ROC: {:.3f}'.format(roc_auc_score(y, y_predict_proba[:, 1])))

In [80]:
def save_submission(file_name, PassengerId, predict):
    """Сохраняет результат в файл"""
    submussion = 'PassengerId,Survived\n'
    submussion += "\n".join(["{},{}".format(pid, prediction) for pid, prediction in zip(PassengerId, predict)])
    
    with open(file_name, 'w') as file:
        file.write(submussion) 

In [81]:
from collections import namedtuple

class TuneResult(object):
    def __init__(self, name, params, features, score):
        self.name = name
        self.score_mean = score.mean()
        self.score_std = score.std()
        self.params = params
        self.features = features
        
    @property
    def min_score(self):
        return self.score_mean - self.score_std
    
    def __str__(self):
        return '{:.5f} ±{:.3f}'.format(self.score_mean, self.score_std)
    
    def __repr__(self):
        return 'TuneResult "{}" {}'.format(self.name, self)
        

def tune_model_and_get_predictions(
    model, 
    X, y, X_test, 
    params, 
    scoring='accuracy',
    n_best_features = [10, 11, 12, 13],
    n_best_model_featues = [10, 11, 12, 13],
    tune_for_each_features_subset=True
):

    results = []
    best_result = None
    def add_result(name, model, features, score):
        nonlocal best_result
        model_params = {param: value for param, value in model.get_params().items() if param in params}
        result = TuneResult(name, model_params, features, score)
        print("Score ({}): {}".format(scoring, result))
        results.append(result)
        if not best_result or best_result.min_score < result.min_score:
            best_result = result
            
    def test_model_add_result(name, model, X, y, features=None):
        X = X if not features else X[features]
        score = test_model(model, X, y, scoring=scoring)
        add_result(name, model, features, score)
        
    def best_model():
        model.set_params(**best_result.params)
        return model
    
    def test_with_features(name, features):
        if tune_for_each_features_subset:
            model_tuned, _ = tune_model(model, X[features], y, params, scoring=scoring)
            test_model_add_result('{}_{}_tuned'.format(name, len(features)), model_tuned, X, y, features)
        else:
            model_tuned = best_model()
            model_tuned.fit(X[features], y)
            test_model_add_result('{}_{}'.format(name, len(features)), model_tuned, X, y, features)
        
    print(model.__class__.__name__)
    
    print('\n1. Default params')
    test_model_add_result('default', model, X, y)
    
    print('\n2. Tune model')
    model_tuned, best_params = tune_model(model, X, y, params, scoring=scoring)
    print('best params:')
    print("\n".join(map(str, best_params[:3])))
    test_model_add_result('tuned', model_tuned, X, y)
    
    print('\n3. Selecting best features')
    
    print('\n3.1. Testing best features from KBest')
    best_features = get_scored_features(X, y).index.ravel()
    for n_features in n_best_features:
        print('n features = {}'.format(n_features))
        test_with_features('best_features', list(best_features[:n_features]))
        
    print('\n3.2. Testing best features from model')
    model = best_model()
    model.fit(X, y)
    best_model_features = get_scored_features_by_model(X, best_model())
    if best_model_features is not None:
        best_model_features = best_model_features.index.ravel()
        for n_features in n_best_model_featues:
            print('n features = {}'.format(n_features))
            features = list(best_model_features[:n_features])
            test_with_features('best_model_features', list(best_model_features[:n_features]))
    else:
        print('\n{} has no coef_ or feature_importances_, can\'t select best features'.format(
            model.__class__.__name__
        ))
        
    if not tune_for_each_features_subset:
        print('\n5. Final model tuning')
        model = best_model()
        features = best_result.features
        if features:
            model_tuned, best_params = tune_model(model, X[features], y, params, scoring=scoring)
        else:
            model_tuned, best_params = tune_model(model, X, y, params, scoring=scoring)

        print('best params:')
        print("\n".join(map(str, best_params[:3])))
        test_model_add_result('final_tuned', model_tuned, X, y, features) 
        
    print('\nBEST: {} {}'.format(best_result.name, best_result))
    print("Params: {}".format(best_result.params))
    
    if best_result.features:
        print("Features: {}".format(', '.join(map(str, best_result.features))))
        X = X[best_result.features]
        X_test = X_test[best_result.features]
    else:
        print("Features: ALL")
        
    return best_result, get_prediction(best_model(), X, y, X_test)

In [82]:
X, y_train = load_titanic()
X, is_test, PassengerId = clean_data(X)
X = normalize_data(X)
X_train, X_test = split_data(X, is_test)

In [83]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [84]:
result, (predict, predict_proba) = tune_model_and_get_predictions(
    DecisionTreeClassifier(random_state=1),
    X_train, y_train, X_test,
    {
        'max_depth': list(range(2, 10)),
        "min_samples_split": [2, 3, 4, 5, 6, 8, 10],
        "min_samples_leaf": [1, 2, 4]        
    },
)
result

DecisionTreeClassifier

1. Default params
Score (accuracy): 0.78339 ±0.006

2. Tune model
best params:
{'max_depth': 6, 'min_samples_leaf': 1, 'min_samples_split': 8}
{'max_depth': 6, 'min_samples_leaf': 1, 'min_samples_split': 10}
{'max_depth': 6, 'min_samples_leaf': 1, 'min_samples_split': 6}
Score (accuracy): 0.82267 ±0.009

3. Selecting best features

3.1. Testing best features from KBest
n features = 10
Score (accuracy): 0.82492 ±0.016
n features = 11
Score (accuracy): 0.82828 ±0.019
n features = 12
Score (accuracy): 0.83277 ±0.010
n features = 13
Score (accuracy): 0.83277 ±0.010

3.2. Testing best features from model
n features = 10
Score (accuracy): 0.82828 ±0.005
n features = 11
Score (accuracy): 0.82716 ±0.009
n features = 12
Score (accuracy): 0.82828 ±0.005
n features = 13
Score (accuracy): 0.82492 ±0.007

BEST: best_model_features_10_tuned 0.82828 ±0.005
Params: {'max_depth': 7, 'min_samples_leaf': 1, 'min_samples_split': 8}
Features: Title_Mr, FamilySize, Pclass2, Fare, Tit

TuneResult "best_model_features_10_tuned" 0.82828 ±0.005

In [85]:
# {'max_depth': 7, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 150}
score, (predict, predict_proba) = tune_model_and_get_predictions(
    RandomForestClassifier(random_state=1), 
    X_train, y_train, X_test, dict(
        n_estimators=[150 , 300, 500],
        min_samples_split=[4, 5, 6, 8, 10], 
        min_samples_leaf=[1, 2, 4],
        max_depth=[7, 8, 9],        
    ),
    tune_for_each_features_subset=False    
)
score

RandomForestClassifier

1. Default params
Score (accuracy): 0.83165 ±0.027

2. Tune model
best params:
{'max_depth': 7, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 150}
{'max_depth': 7, 'min_samples_leaf': 1, 'min_samples_split': 6, 'n_estimators': 150}
{'max_depth': 7, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 150}
Score (accuracy): 0.83389 ±0.004

3. Selecting best features

3.1. Testing best features from KBest
n features = 10
Score (accuracy): 0.80022 ±0.012
n features = 11
Score (accuracy): 0.80584 ±0.009
n features = 12
Score (accuracy): 0.80584 ±0.019
n features = 13
Score (accuracy): 0.80471 ±0.021

3.2. Testing best features from model
n features = 10
Score (accuracy): 0.82941 ±0.010
n features = 11
Score (accuracy): 0.82043 ±0.013
n features = 12
Score (accuracy): 0.82492 ±0.013
n features = 13
Score (accuracy): 0.81818 ±0.008

5. Final model tuning
best params:
{'max_depth': 7, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estima

TuneResult "tuned" 0.83389 ±0.004

In [86]:
score, (predict, predict_proba) = tune_model_and_get_predictions(
    KNeighborsClassifier(), 
    X_train, y_train, X_test, {
        'n_neighbors': [3,4,5,6,7]
    }, 
    n_best_features = [10, 11, 12 ,13, 14, 15, 16],
)
score

KNeighborsClassifier

1. Default params
Score (accuracy): 0.80696 ±0.011

2. Tune model
best params:
{'n_neighbors': 5}
{'n_neighbors': 7}
{'n_neighbors': 6}
Score (accuracy): 0.80696 ±0.011

3. Selecting best features

3.1. Testing best features from KBest
n features = 10
Score (accuracy): 0.80247 ±0.002
n features = 11
Score (accuracy): 0.80584 ±0.007
n features = 12
Score (accuracy): 0.80584 ±0.014
n features = 13
Score (accuracy): 0.80808 ±0.013
n features = 14
Score (accuracy): 0.81481 ±0.013
n features = 15
Score (accuracy): 0.81481 ±0.013
n features = 16
Score (accuracy): 0.81481 ±0.010

3.2. Testing best features from model

KNeighborsClassifier has no coef_ or feature_importances_, can't select best features

BEST: best_features_16_tuned 0.81481 ±0.010
Params: {'n_neighbors': 5}
Features: Title_Mr, Sex, Title_Mrs, Pclass2, Title_Miss, Pclass_3, HasCabin, Pclass_1, NameLen_4, Fare, Alone, NameLen_1, FareClass_0-100, Deck_B, FareClass_100-500, Embarked_C


TuneResult "best_features_16_tuned" 0.81481 ±0.010

In [87]:
score, (predict, predict_proba) = tune_model_and_get_predictions(
    SVC(probability=True, random_state=1), 
    X_train, y_train, X_test, dict(
        C=[0.8, 1, 1.5, 2, 5, 10],
        kernel=['rbf'],
        gamma=[0.01, 0.001, 0.0001],
        degree=[2, 3, 5], 
    ),
    tune_for_each_features_subset=False
)
score

SVC

1. Default params
Score (accuracy): 0.83053 ±0.010

2. Tune model
best params:
{'C': 2, 'degree': 2, 'gamma': 0.01, 'kernel': 'rbf'}
{'C': 2, 'degree': 3, 'gamma': 0.01, 'kernel': 'rbf'}
{'C': 2, 'degree': 5, 'gamma': 0.01, 'kernel': 'rbf'}
Score (accuracy): 0.83389 ±0.014

3. Selecting best features

3.1. Testing best features from KBest
n features = 10
Score (accuracy): 0.78563 ±0.010
n features = 11
Score (accuracy): 0.79686 ±0.004
n features = 12
Score (accuracy): 0.79461 ±0.005
n features = 13
Score (accuracy): 0.79686 ±0.004

3.2. Testing best features from model

SVC has no coef_ or feature_importances_, can't select best features

5. Final model tuning
best params:
{'C': 2, 'degree': 2, 'gamma': 0.01, 'kernel': 'rbf'}
{'C': 2, 'degree': 3, 'gamma': 0.01, 'kernel': 'rbf'}
{'C': 2, 'degree': 5, 'gamma': 0.01, 'kernel': 'rbf'}
Score (accuracy): 0.83389 ±0.014

BEST: default 0.83053 ±0.010
Params: {'C': 1.0, 'degree': 3, 'gamma': 'auto', 'kernel': 'rbf'}
Features: ALL


TuneResult "default" 0.83053 ±0.010

In [88]:
score, (predict, predict_proba) = tune_model_and_get_predictions(
    LogisticRegression(random_state=1), 
    X_train, y_train, X_test, dict(
        penalty=['l2', 'l1'],
        C=[0.5, 0.8, 1.0, 1.2, 1.5],
        tol=[0.0001, 0.001, 0.01, 0.1]
    ),
    n_best_model_featues=[10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
)
score

LogisticRegression

1. Default params
Score (accuracy): 0.82492 ±0.003

2. Tune model
best params:
{'C': 0.5, 'penalty': 'l1', 'tol': 0.0001}
{'C': 0.5, 'penalty': 'l1', 'tol': 0.001}
{'C': 0.5, 'penalty': 'l1', 'tol': 0.01}
Score (accuracy): 0.82828 ±0.007

3. Selecting best features

3.1. Testing best features from KBest
n features = 10
Score (accuracy): 0.78676 ±0.019
n features = 11
Score (accuracy): 0.78563 ±0.011
n features = 12
Score (accuracy): 0.79012 ±0.018
n features = 13
Score (accuracy): 0.78563 ±0.011

3.2. Testing best features from model
n features = 10
Score (accuracy): 0.83389 ±0.004
n features = 11
Score (accuracy): 0.83389 ±0.006
n features = 12
Score (accuracy): 0.83389 ±0.010
n features = 13
Score (accuracy): 0.83277 ±0.012
n features = 14
Score (accuracy): 0.83726 ±0.010
n features = 15
Score (accuracy): 0.83502 ±0.010
n features = 16
Score (accuracy): 0.84175 ±0.005
n features = 17
Score (accuracy): 0.84175 ±0.005
n features = 18
Score (accuracy): 0.84175 ±0.005

TuneResult "best_model_features_16_tuned" 0.84175 ±0.005

In [89]:
import xgboost
from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll.base import scope
from sklearn.metrics import accuracy_score

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)

# в этой функции мы проверяем, как ведёт себя модель при заданных параметрах
def score(params):
    print("Training with params : ")
    print(params)
    params['n_estimators'] = int(params['n_estimators'])
    model = xgboost.XGBClassifier(**params)
    scores = []
    for train_idx, test_idx in cv.split(X_train, y_train):
        X_train_fold, X_test_fold = X_train.iloc[train_idx], X_train.iloc[test_idx]
        y_train_fold, y_test_fold = y_train.iloc[train_idx], y_train.iloc[test_idx]
        model.fit(X_train_fold, y_train_fold)
        preds = model.predict(X_test_fold)
        score = accuracy_score(y_test_fold, preds)
        scores.append(score)
    score = np.mean(scores)
    result = {'loss': 1 - score, 'status': STATUS_OK}
    print(result)
    return result

# это наша главная функция, в которой мы задаём параметры
def optimize(trials):
    space = {
        'n_estimators' : hp.quniform('n_estimators', 100, 1000, 5), # (название параметра, от, до, шаг)
        'learning_rate' : hp.quniform('learning_rate', 0.025, 0.5, 0.025),
        'max_depth' : scope.int(hp.quniform('max_depth', 1, 10, 1)),
        'min_child_weight' : hp.quniform('min_child_weight', 1, 6, 1),
        'subsample' : hp.quniform('subsample', 0.5, 1, 0.05),
        'gamma' : hp.quniform('gamma', 0.5, 1, 0.05),
        'colsample_bytree' : hp.quniform('colsample_bytree', 0.5, 1, 0.05),
        'objective': 'reg:linear',
        'silent' : 1,
        'scale_pos_weight': hp.quniform('scale_pos_weight', 0.5, 10., 0.5),
        'reg_alpha': 0.0,
        'reg_lambda': 1.0
    }

    best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=250)
    print('Best: ')
    print(best)

#сюда будет записана
trials = Trials()

optimize(trials)

Training with params : 
{'colsample_bytree': 0.5, 'gamma': 0.7000000000000001, 'learning_rate': 0.45, 'max_depth': 8, 'min_child_weight': 4.0, 'n_estimators': 525.0, 'objective': 'reg:linear', 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'scale_pos_weight': 0.5, 'silent': 1, 'subsample': 0.65}
{'loss': 0.18181818181818177, 'status': 'ok'}
Training with params : 
{'colsample_bytree': 0.65, 'gamma': 0.65, 'learning_rate': 0.42500000000000004, 'max_depth': 8, 'min_child_weight': 2.0, 'n_estimators': 330.0, 'objective': 'reg:linear', 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'scale_pos_weight': 4.0, 'silent': 1, 'subsample': 0.7000000000000001}
{'loss': 0.22671156004489335, 'status': 'ok'}
Training with params : 
{'colsample_bytree': 0.55, 'gamma': 0.5, 'learning_rate': 0.15000000000000002, 'max_depth': 3, 'min_child_weight': 2.0, 'n_estimators': 375.0, 'objective': 'reg:linear', 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'scale_pos_weight': 2.5, 'silent': 1, 'subsample': 1.0}
{'loss': 0.19191919191919193, '

{'loss': 0.17396184062850739, 'status': 'ok'}
Training with params : 
{'colsample_bytree': 0.8500000000000001, 'gamma': 0.75, 'learning_rate': 0.275, 'max_depth': 4, 'min_child_weight': 3.0, 'n_estimators': 580.0, 'objective': 'reg:linear', 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'scale_pos_weight': 6.0, 'silent': 1, 'subsample': 0.9500000000000001}
{'loss': 0.23120089786756459, 'status': 'ok'}
Training with params : 
{'colsample_bytree': 0.9500000000000001, 'gamma': 0.8500000000000001, 'learning_rate': 0.25, 'max_depth': 3, 'min_child_weight': 6.0, 'n_estimators': 660.0, 'objective': 'reg:linear', 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'scale_pos_weight': 2.0, 'silent': 1, 'subsample': 0.8500000000000001}
{'loss': 0.18406285072951734, 'status': 'ok'}
Training with params : 
{'colsample_bytree': 0.8500000000000001, 'gamma': 0.7000000000000001, 'learning_rate': 0.325, 'max_depth': 6, 'min_child_weight': 3.0, 'n_estimators': 750.0, 'objective': 'reg:linear', 'reg_alpha': 0.0, 'reg_lambda': 1.

{'loss': 0.1728395061728395, 'status': 'ok'}
Training with params : 
{'colsample_bytree': 0.7000000000000001, 'gamma': 0.5, 'learning_rate': 0.2, 'max_depth': 2, 'min_child_weight': 4.0, 'n_estimators': 135.0, 'objective': 'reg:linear', 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'scale_pos_weight': 9.5, 'silent': 1, 'subsample': 0.8}
{'loss': 0.36139169472502797, 'status': 'ok'}
Training with params : 
{'colsample_bytree': 0.55, 'gamma': 0.9500000000000001, 'learning_rate': 0.15000000000000002, 'max_depth': 1, 'min_child_weight': 6.0, 'n_estimators': 290.0, 'objective': 'reg:linear', 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'scale_pos_weight': 3.0, 'silent': 1, 'subsample': 0.7000000000000001}
{'loss': 0.2008978675645342, 'status': 'ok'}
Training with params : 
{'colsample_bytree': 0.6000000000000001, 'gamma': 0.8500000000000001, 'learning_rate': 0.025, 'max_depth': 4, 'min_child_weight': 6.0, 'n_estimators': 410.0, 'objective': 'reg:linear', 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'scale_pos_weigh

{'loss': 0.16722783389450058, 'status': 'ok'}
Training with params : 
{'colsample_bytree': 0.8, 'gamma': 0.9, 'learning_rate': 0.35000000000000003, 'max_depth': 1, 'min_child_weight': 5.0, 'n_estimators': 475.0, 'objective': 'reg:linear', 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'scale_pos_weight': 2.5, 'silent': 1, 'subsample': 0.6000000000000001}
{'loss': 0.18294051627384966, 'status': 'ok'}
Training with params : 
{'colsample_bytree': 0.8, 'gamma': 0.9500000000000001, 'learning_rate': 0.375, 'max_depth': 1, 'min_child_weight': 4.0, 'n_estimators': 315.0, 'objective': 'reg:linear', 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'scale_pos_weight': 2.0, 'silent': 1, 'subsample': 0.65}
{'loss': 0.18294051627384966, 'status': 'ok'}
Training with params : 
{'colsample_bytree': 0.8500000000000001, 'gamma': 0.8500000000000001, 'learning_rate': 0.30000000000000004, 'max_depth': 2, 'min_child_weight': 3.0, 'n_estimators': 355.0, 'objective': 'reg:linear', 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'scale_pos_we

{'loss': 0.19079685746352404, 'status': 'ok'}
Training with params : 
{'colsample_bytree': 0.8, 'gamma': 1.0, 'learning_rate': 0.325, 'max_depth': 8, 'min_child_weight': 3.0, 'n_estimators': 400.0, 'objective': 'reg:linear', 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'scale_pos_weight': 4.0, 'silent': 1, 'subsample': 0.75}
{'loss': 0.20987654320987659, 'status': 'ok'}
Training with params : 
{'colsample_bytree': 0.7000000000000001, 'gamma': 0.8, 'learning_rate': 0.275, 'max_depth': 4, 'min_child_weight': 4.0, 'n_estimators': 455.0, 'objective': 'reg:linear', 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'scale_pos_weight': 6.5, 'silent': 1, 'subsample': 0.55}
{'loss': 0.24691358024691346, 'status': 'ok'}
Training with params : 
{'colsample_bytree': 0.6000000000000001, 'gamma': 0.9, 'learning_rate': 0.225, 'max_depth': 5, 'min_child_weight': 5.0, 'n_estimators': 545.0, 'objective': 'reg:linear', 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'scale_pos_weight': 0.5, 'silent': 1, 'subsample': 0.6000000000000001}

{'loss': 0.23793490460157118, 'status': 'ok'}
Training with params : 
{'colsample_bytree': 0.9, 'gamma': 0.9500000000000001, 'learning_rate': 0.325, 'max_depth': 7, 'min_child_weight': 1.0, 'n_estimators': 900.0, 'objective': 'reg:linear', 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'scale_pos_weight': 3.0, 'silent': 1, 'subsample': 0.8}
{'loss': 0.20089786756453432, 'status': 'ok'}
Training with params : 
{'colsample_bytree': 0.8500000000000001, 'gamma': 1.0, 'learning_rate': 0.5, 'max_depth': 1, 'min_child_weight': 2.0, 'n_estimators': 660.0, 'objective': 'reg:linear', 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'scale_pos_weight': 5.0, 'silent': 1, 'subsample': 0.9}
{'loss': 0.27609427609427606, 'status': 'ok'}
Training with params : 
{'colsample_bytree': 0.8, 'gamma': 0.9, 'learning_rate': 0.30000000000000004, 'max_depth': 2, 'min_child_weight': 3.0, 'n_estimators': 210.0, 'objective': 'reg:linear', 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'scale_pos_weight': 1.0, 'silent': 1, 'subsample': 0.7000000

{'loss': 0.16610549943883279, 'status': 'ok'}
Training with params : 
{'colsample_bytree': 0.8500000000000001, 'gamma': 0.9500000000000001, 'learning_rate': 0.30000000000000004, 'max_depth': 2, 'min_child_weight': 2.0, 'n_estimators': 730.0, 'objective': 'reg:linear', 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'scale_pos_weight': 1.5, 'silent': 1, 'subsample': 0.75}
{'loss': 0.17171717171717171, 'status': 'ok'}
Training with params : 
{'colsample_bytree': 0.8, 'gamma': 0.9500000000000001, 'learning_rate': 0.275, 'max_depth': 1, 'min_child_weight': 3.0, 'n_estimators': 680.0, 'objective': 'reg:linear', 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'scale_pos_weight': 2.0, 'silent': 1, 'subsample': 0.7000000000000001}
{'loss': 0.17508417508417506, 'status': 'ok'}
Training with params : 
{'colsample_bytree': 0.8, 'gamma': 1.0, 'learning_rate': 0.25, 'max_depth': 2, 'min_child_weight': 3.0, 'n_estimators': 650.0, 'objective': 'reg:linear', 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'scale_pos_weight': 2.5, 'si

{'loss': 0.25364758698092038, 'status': 'ok'}
Training with params : 
{'colsample_bytree': 0.8500000000000001, 'gamma': 0.9, 'learning_rate': 0.2, 'max_depth': 4, 'min_child_weight': 6.0, 'n_estimators': 810.0, 'objective': 'reg:linear', 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'scale_pos_weight': 4.0, 'silent': 1, 'subsample': 0.7000000000000001}
{'loss': 0.21997755331088664, 'status': 'ok'}
Training with params : 
{'colsample_bytree': 0.65, 'gamma': 0.8500000000000001, 'learning_rate': 0.07500000000000001, 'max_depth': 4, 'min_child_weight': 4.0, 'n_estimators': 865.0, 'objective': 'reg:linear', 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'scale_pos_weight': 5.5, 'silent': 1, 'subsample': 0.65}
{'loss': 0.24130190796857465, 'status': 'ok'}
Training with params : 
{'colsample_bytree': 0.8, 'gamma': 1.0, 'learning_rate': 0.30000000000000004, 'max_depth': 2, 'min_child_weight': 3.0, 'n_estimators': 385.0, 'objective': 'reg:linear', 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'scale_pos_weight': 0.5, 'sil

{'loss': 0.23007856341189681, 'status': 'ok'}
Training with params : 
{'colsample_bytree': 0.8, 'gamma': 0.7000000000000001, 'learning_rate': 0.025, 'max_depth': 2, 'min_child_weight': 5.0, 'n_estimators': 850.0, 'objective': 'reg:linear', 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'scale_pos_weight': 5.0, 'silent': 1, 'subsample': 1.0}
{'loss': 0.25140291806958481, 'status': 'ok'}
Training with params : 
{'colsample_bytree': 0.7000000000000001, 'gamma': 0.55, 'learning_rate': 0.07500000000000001, 'max_depth': 5, 'min_child_weight': 5.0, 'n_estimators': 725.0, 'objective': 'reg:linear', 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'scale_pos_weight': 1.0, 'silent': 1, 'subsample': 0.8500000000000001}
{'loss': 0.16722783389450058, 'status': 'ok'}
Training with params : 
{'colsample_bytree': 0.7000000000000001, 'gamma': 1.0, 'learning_rate': 0.275, 'max_depth': 5, 'min_child_weight': 6.0, 'n_estimators': 365.0, 'objective': 'reg:linear', 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'scale_pos_weight': 0.5, 's

{'loss': 0.1863075196408529, 'status': 'ok'}
Training with params : 
{'colsample_bytree': 0.5, 'gamma': 1.0, 'learning_rate': 0.17500000000000002, 'max_depth': 1, 'min_child_weight': 6.0, 'n_estimators': 690.0, 'objective': 'reg:linear', 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'scale_pos_weight': 2.0, 'silent': 1, 'subsample': 0.8500000000000001}
{'loss': 0.16386083052749723, 'status': 'ok'}
Training with params : 
{'colsample_bytree': 0.5, 'gamma': 1.0, 'learning_rate': 0.1, 'max_depth': 4, 'min_child_weight': 6.0, 'n_estimators': 780.0, 'objective': 'reg:linear', 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'scale_pos_weight': 2.0, 'silent': 1, 'subsample': 0.9}
{'loss': 0.17396184062850717, 'status': 'ok'}
Training with params : 
{'colsample_bytree': 0.5, 'gamma': 0.55, 'learning_rate': 0.15000000000000002, 'max_depth': 5, 'min_child_weight': 6.0, 'n_estimators': 585.0, 'objective': 'reg:linear', 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'scale_pos_weight': 3.5, 'silent': 1, 'subsample': 0.95000000

{'loss': 0.17508417508417506, 'status': 'ok'}
Training with params : 
{'colsample_bytree': 0.65, 'gamma': 0.8500000000000001, 'learning_rate': 0.225, 'max_depth': 6, 'min_child_weight': 4.0, 'n_estimators': 790.0, 'objective': 'reg:linear', 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'scale_pos_weight': 3.0, 'silent': 1, 'subsample': 1.0}
{'loss': 0.18855218855218858, 'status': 'ok'}
Training with params : 
{'colsample_bytree': 0.5, 'gamma': 0.9500000000000001, 'learning_rate': 0.07500000000000001, 'max_depth': 3, 'min_child_weight': 5.0, 'n_estimators': 740.0, 'objective': 'reg:linear', 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'scale_pos_weight': 0.5, 'silent': 1, 'subsample': 0.9}
{'loss': 0.18406285072951734, 'status': 'ok'}
Best: 
{'colsample_bytree': 0.65, 'gamma': 0.75, 'learning_rate': 0.1, 'max_depth': 3.0, 'min_child_weight': 6.0, 'n_estimators': 570.0, 'scale_pos_weight': 1.0, 'subsample': 0.65}


In [94]:
xgboost_param = {
    'colsample_bytree': 0.65, 
    'gamma': 0.75, 
    'learning_rate': 0.1, 
    'max_depth': 3, 
    'min_child_weight': 6, 
    'n_estimators': 570, 
    'scale_pos_weight': 1, 
    'subsample': 0.65
}
score = test_model(xgboost.XGBClassifier(**xgboost_param), X_train, y_train, n_jobs=1)

# (0.82512050426399675, 0.056796792398344419)
score.mean(), score.std()

(0.82491582491582494, 0.0072735585840716906)

In [91]:
score = test_model(xgboost.XGBClassifier(**{
    'colsample_bytree': 0.7, 
    'gamma': 0.7, 
    'learning_rate': 0.1, 
    'max_depth': 3, 
    'min_child_weight': 1, 
    'n_estimators': 505, 
    'scale_pos_weight': 1, 
    'subsample': 0.8
}), X_train, y_train, n_jobs=1)

#(0.82845878136200701, 0.05527740770697482)
score.mean(), score.std()

(0.82267115600448937, 0.0079361030436200779)

In [92]:
from mlxtend.classifier import StackingClassifier
from sklearn.pipeline import make_pipeline

from sklearn.base import BaseEstimator

class ColumnSelector(BaseEstimator):
    def __init__(self, cols=None):
        self.cols = cols

    def fit_transform(self, X, y=None):
        return self.transform(X=X, y=y)

    def transform(self, X, y=None):
        if self.cols:
            return X[self.cols]
        else:
            return X

    def fit(self, X, y=None):
        return self

In [171]:
import xgboost
random_state = 1
models = {}

models['DecisionTree'] = make_pipeline(
    ColumnSelector(),
    DecisionTreeClassifier(**{
        'random_state': random_state,
        'max_depth': 6, 
        'min_samples_leaf': 1, 
        'min_samples_split': 8
    })
)

models['RandomForest'] = make_pipeline(
    ColumnSelector(cols=[
        'Title_Mr', 'Sex', 'Fare', 'Pclass2', 'Title_Miss', 'Age', 'Title_Mrs', 
        'Pclass_3', 'FamilySize', 'HasCabin', 'NameLen_4', 'BigFamily'    
    ]),
    RandomForestClassifier(**{
        'random_state': random_state,
        'max_depth': 7, 
        'min_samples_leaf': 2, 
        'min_samples_split': 10, 
        'n_estimators': 150
    })
)

models['KNeighbors'] = make_pipeline(
    ColumnSelector(),
    KNeighborsClassifier(**{
        'n_neighbors': 5
    })
)

models['LogisticRegression'] = make_pipeline(
    ColumnSelector(cols=[
        'Sex', 'BigFamily', 'Title_Master', 'Title_Mr', 'Age', 'Fare', 'Alone', 
        'Pclass2', 'Pclass_3', 'Deck_E', 'Title_Mrs', 'Pclass_1', 'HasCabin', 
        'NameLen_4', 'Deck_D', 'FareClass_>500'    
    ]),
    LogisticRegression(**{
        'random_state': random_state,
        'C': 0.5, 
        'penalty': 'l1', 
        'tol': 0.0001
    })
)

models['SVC'] = make_pipeline(
    ColumnSelector(),
    SVC(**{
        'random_state': random_state,
        'probability': True,    
        'C': 2, 
        'degree': 2, 
        'gamma': 0.01, 
        'kernel': 'rbf'
    })
)

models['XGBoost'] = make_pipeline(
    ColumnSelector(),
    xgboost.XGBClassifier(**{
        'colsample_bytree': 0.7, 
        'gamma': 0.7, 
        'learning_rate': 0.1, 
        'max_depth': 3, 
        'min_child_weight': 1, 
        'n_estimators': 505, 
        'scale_pos_weight': 1, 
        'subsample': 0.8
    })
)


selected_model_names = ['DecisionTree', 'RandomForest', 'KNeighbors', 'SVC', 'LogisticRegression', 'XGBoost']
selected_models = [models[name] for name in selected_model_names]
sclf = StackingClassifier(
    classifiers=selected_models, 
    meta_classifier=LogisticRegression()
)

In [172]:
sclf.fit(X_train, y_train)

StackingClassifier(average_probas=False,
          classifiers=[Pipeline(memory=None,
     steps=[('columnselector', ColumnSelector(cols=None)), ('decisiontreeclassifier', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=6,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_sp...stic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.8))])],
          meta_classifier=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          use_features_in_secondary=False, use_probas=False, verbose=0)

In [146]:
# 0.81248 ±0.042 [DecisionTree]
# 0.83624 ±0.056 [RandomForest]
# 0.81600 ±0.044 [KNeighbors]
# 0.82382 ±0.059 [SVC]
# 0.83413 ±0.056 [LogisticRegression]
# 0.82846 ±0.055 [XGBoost]
# 0.82846 ±0.055 [StackingClassifier]

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state)
for clf, label in zip(selected_models + [sclf], selected_model_names + ['StackingClassifier']):
    scores = cross_val_score(clf, X_train, y_train, cv=cv, scoring='accuracy')
    print('{:.5f} ±{:.3f} [{}]'.format(scores.mean(), scores.std(), label))

0.80695 ±0.044 [DecisionTree]
0.83287 ±0.057 [RandomForest]
0.81715 ±0.046 [KNeighbors]
0.81700 ±0.055 [SVC]
0.83413 ±0.056 [LogisticRegression]
0.82864 ±0.055 [XGBoost]
0.82864 ±0.055 [StackingClassifier]


In [118]:
y_test_predict, y_test_predict_proba = get_prediction(models['LogisticRegression'], X_train, y_train, X_test)

In [119]:
save_submission('submission.txt', PassengerId, y_test_predict)

In [110]:
model = xgboost.XGBClassifier(**{
        'colsample_bytree': 0.7, 
        'gamma': 0.7, 
        'learning_rate': 0.1, 
        'max_depth': 3, 
        'min_child_weight': 1, 
        'n_estimators': 505, 
        'scale_pos_weight': 1, 
        'subsample': 0.8
    })

In [111]:
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.7,
       gamma=0.7, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=505, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.8)

In [112]:
y_test_predict, y_test_predict_proba = get_prediction(model, X_train, y_train, X_test)

In [113]:
save_submission('submission.txt', PassengerId, y_test_predict)

In [166]:
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

In [32]:
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

efs = EFS(
    KNeighborsClassifier(n_neighbors=3), 
    min_features=5,
    max_features=5,#len(X.columns),
    scoring='accuracy',
    print_progress=True,
    cv=2,
    n_jobs=-1
)

efs.fit(X_train.as_matrix(), y_train.as_matrix())

print('Best negtive mean squared error: %.2f' % efs.best_score_)
## Print the IDX of the best features 
print('Best subset:', efs.best_idx_)

df = pd.DataFrame.from_dict(efs.get_metric_dict()).T
df.sort_values('avg_score', inplace=True, ascending=False)
df['features'] = df.feature_idx.map(lambda idx: itemgetter(*idx)(X.columns))
df[['avg_score', 'std_err', 'features']].head(15)

KeyboardInterrupt: 

### KNeighborsClassifier(n_neighbors=3)
 - 3 **Sex, BigFamily, Title_Master** 0.828279	0.00355973
 - 4 **Sex, BigFamily, Title_Master, FareClass_>500** 0.830526	0.00131254

In [61]:
features = ['Sex', 'BigFamily', 'Title_Master', 'FareClass_>500']
score = test_model(KNeighborsClassifier(n_neighbors=3), X_train[features], y_train, n_splits=3)

# (0.82512050426399675, 0.056796792398344419)
score.mean(), score.std()

(0.82940516273849607, 0.011110544261068089)

In [56]:
score

array([ 0.80645161,  0.87096774,  0.74193548,  0.90322581,  0.80645161,
        0.83870968,  0.80645161,  0.83870968,  0.70967742,  0.9       ,
        0.83333333,  0.76666667,  0.79310345,  0.75862069,  0.86206897,
        0.86206897,  0.93103448,  0.79310345,  0.79310345,  0.79310345,
        0.82758621,  0.86206897,  0.89655172,  0.72413793,  0.93103448,
        0.82758621,  0.82758621,  0.89655172,  0.86206897,  0.75862069])

In [67]:
from sklearn.model_selection import KFold
cv = KFold(n_splits=2, shuffle=True, random_state=1)
list(cv.split(list(range(10))))

[(array([1, 3, 5, 7, 8]), array([0, 2, 4, 6, 9])),
 (array([0, 2, 4, 6, 9]), array([1, 3, 5, 7, 8]))]