In [2]:
import numpy as np
import pandas as pd
from  matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

%config InlineBackend.figure_format = 'retina'
from pylab import rcParams
rcParams['figure.figsize'] = (9, 6)

from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler

import xgboost
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict

from sklearn.metrics import roc_auc_score

from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, LogisticRegression

import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn.feature_selection import SelectFromModel

from sklearn.ensemble import GradientBoostingClassifier

In [3]:
# Read the data once more
train = pd.read_csv('titanic/train.csv')
test = pd.read_csv('titanic/test.csv')

# Take the target out of train: pop() returns the column and removes it in place
y_train = train.pop('Survived')

# Mark which sample every row belongs to
train['is_test'], test['is_test'] = 0, 1

# Stack both samples into a single frame
df = pd.concat((train, test))

# Encode sex numerically instead of as a string
df['IsMale'] = df['Sex'].replace(to_replace={'male': 1, 'female': 0})

# Optional HaveCabin feature (flags whether a cabin is recorded) - currently disabled.
#df['HaveCabin'] = df.Cabin.isnull()
#df.HaveCabin.replace({True : 1, False : 0},inplace=True)

# Optional feature-engineering helpers - currently disabled.
#get_titles(df)
#process_age(df)
#process_ticket(df)
#process_family(df)
#process_cabin(df)

In [11]:
# One-hot encode the categorical columns.
# Variant that also encodes 'Title' (needs the Title column to exist first):
#df_dummies = pd.get_dummies(df, columns=['Pclass','Title','Embarked'])
df_dummies = pd.get_dummies(df, columns=['Pclass', 'Embarked'])

#df_dummies.head()
# Remove identifier / free-text columns that are not fed to the models
df_dummies = df_dummies.drop(['PassengerId', 'Name', 'Sex', 'Ticket', 'Cabin'], axis=1)

# Split back into the training and the test sample
X_train = df_dummies.loc[df_dummies['is_test'] == 0].drop('is_test', axis=1)
X_test = df_dummies.loc[df_dummies['is_test'] == 1].drop('is_test', axis=1)

# Fill missing values with the column means learned on the training sample.
# NOTE(review): `Imputer` is no longer shipped by recent scikit-learn releases
# (`sklearn.impute.SimpleImputer` replaces it) - confirm the pinned version.
columns = X_train.columns
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0, verbose=1, copy=True)

X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train), columns=columns)
X_test_imputed = pd.DataFrame(imputer.transform(X_test), columns=columns)

# Standardise the features; the scaler is fitted on the training sample only
scaler = StandardScaler()
X_train_i_s = pd.DataFrame(scaler.fit_transform(X_train_imputed), columns=columns)

# Apply the same imputation and scaling to the test sample
X_test_i_s = pd.DataFrame(scaler.transform(imputer.transform(X_test)), columns=columns)

# Library

In [4]:
def process_cabin(df):
    """Replace the ``Cabin`` column of ``df`` with deck-letter indicator columns.

    Missing cabins are labelled ``'U'`` (unknown), every cabin value is reduced
    to its first character, and one ``Cabin_<letter>`` dummy column is added
    for each distinct letter. The original ``Cabin`` column is then dropped.

    The frame is modified in place and nothing is returned.
    """
    # Missing cabins become 'U' (for Unknown); only the leading letter is kept.
    cabin_letter = df['Cabin'].fillna('U').map(lambda c: c[0])

    # dummy encoding ...
    cabin_dummies = pd.get_dummies(cabin_letter, prefix='Cabin')

    # Assign column by column so the caller's frame object itself is updated;
    # rebinding `df` to a concatenated copy would not be visible to the caller.
    # `.values` keeps the assignment positional, which also works when the
    # index contains duplicate labels.
    for column in cabin_dummies.columns:
        df[column] = cabin_dummies[column].values

    df.drop('Cabin', axis=1, inplace=True)

In [5]:
def process_family(df):
    """Add family-size features to ``df`` in place.

    ``FamilySize`` is ``Parch + SibSp + 1`` (the passenger is counted too).
    Three 0/1 flags are derived from it: ``Singleton`` (size 1),
    ``SmallFamily`` (size 2 to 4) and ``LargeFamily`` (size 5 or more).
    """
    # the passenger counts as a member of his or her own family
    df['FamilySize'] = df['Parch'] + df['SibSp'] + 1

    # flag name -> test deciding whether a family size falls into that bucket
    size_buckets = {
        'Singleton': lambda size: size == 1,
        'SmallFamily': lambda size: 2 <= size <= 4,
        'LargeFamily': lambda size: 5 <= size,
    }
    for flag_name, in_bucket in size_buckets.items():
        df[flag_name] = df['FamilySize'].map(
            lambda size, in_bucket=in_bucket: 1 if in_bucket(size) else 0
        )

In [6]:
def process_ticket(df):
    """Replace the ``Ticket`` column of ``df`` with ticket-prefix indicator columns.

    Every ticket is reduced to its prefix (see ``cleanTicket``), one
    ``Ticket_<prefix>`` dummy column is added per distinct prefix, and the
    original ``Ticket`` column is dropped.

    The frame is modified in place and nothing is returned.
    """

    def cleanTicket(ticket):
        """Return the first non-numeric token of ``ticket``, or 'XXX' if there is none.

        Dots and slashes are removed before the ticket is split on whitespace.
        """
        tokens = ticket.replace('.', '').replace('/', '').split()
        prefixes = [token for token in tokens if not token.isdigit()]
        return prefixes[0] if prefixes else 'XXX'

    # Extracting dummy variables from tickets:
    ticket_prefix = df['Ticket'].map(cleanTicket)
    tickets_dummies = pd.get_dummies(ticket_prefix, prefix='Ticket')

    # Assign column by column so the caller's frame object itself is updated;
    # rebinding `df` to a concatenated copy would not be visible to the caller.
    # `.values` keeps the assignment positional, which also works when the
    # index contains duplicate labels.
    for column in tickets_dummies.columns:
        df[column] = tickets_dummies[column].values

    df.drop('Ticket', inplace=True, axis=1)

In [7]:
def process_age(df):
    """Fill missing ``Age`` values with the median age of comparable passengers.

    Passengers are grouped by ``(Sex, Pclass, Title)``; every missing age is
    replaced by the median of the known ages in the passenger's own group.
    Rows whose group has no known age (or whose grouping keys are missing)
    keep a missing age.

    The ``Title`` column must already exist (it is created by ``get_titles``).
    The frame is modified in place and nothing is returned.
    """
    group_keys = ['Sex', 'Pclass', 'Title']

    # Median of the Age column only, broadcast back to one value per row.
    # Restricting the aggregation to 'Age' keeps text columns out of median().
    group_median_age = df.groupby(group_keys)['Age'].transform('median')

    # Work on raw arrays so the fill is positional and does not depend on the
    # index labels being unique.
    age_missing = df['Age'].isnull().values
    df['Age'] = np.where(age_missing, group_median_age.values, df['Age'].values)
    
    #df.iloc[891:].Age = df.iloc[891:].apply(lambda r : fillAges(r, grouped_median_test) if np.isnan(r['Age']) 
     #                                                 else r['Age'], axis=1)
    

In [8]:
def get_titles(df):
    """Add a ``Title`` column to ``df`` derived from the ``Name`` column.

    The raw title is the text between the first comma and the following dot
    of the name, stripped of surrounding whitespace. Raw titles are then
    collapsed into broader categories; a raw title that is not listed below
    ends up as a missing value.

    The frame is modified in place and nothing is returned.
    """
    # aggregated category -> raw titles that belong to it
    grouped_titles = {
        "Officer": ("Capt", "Col", "Major", "Dr", "Rev"),
        "Royalty": ("Jonkheer", "Don", "Sir", "the Countess", "Dona", "Lady"),
        "Mrs": ("Mme", "Ms", "Mrs"),
        "Miss": ("Mlle", "Miss"),
        "Mr": ("Mr",),
        "Master": ("Master",),
    }
    # flatten into a raw title -> category lookup
    title_lookup = {
        raw_title: category
        for category, raw_titles in grouped_titles.items()
        for raw_title in raw_titles
    }

    def extract_raw_title(name):
        """Return the text between the first ',' and the next '.' of ``name``."""
        after_comma = name.split(',')[1]
        return after_comma.split('.')[0].strip()

    # extract the raw title from each name, then map it to its category
    df['Title'] = df['Name'].map(extract_raw_title).map(title_lookup)

In [9]:
def get_gboost_best_model(X_train,y_train, score_type,N_est = 200):
    """Tune a ``GradientBoostingClassifier`` stage by stage and return its parameters.

    Four grid searches are run one after another, each starting from the
    parameters of the best estimator found by the previous one:

    1. ensemble parameters (``n_estimators``, ``max_features``) at ``max_depth=5``
    2. ``subsample``
    3. ``max_depth``
    4. ``learning_rate``

    All searches share one 2-fold shuffled ``KFold`` splitter and print their
    best cross-validated score.

    Parameters
    ----------
    X_train, y_train
        Training features and target, passed unchanged to ``GridSearchCV.fit``.
    score_type
        Forwarded as the ``scoring`` argument of ``GridSearchCV``.
    N_est : int, optional
        Currently unused; kept so that existing calls keep working.

    Returns
    -------
    dict
        ``get_params()`` of the best estimator of the last stage.
    """
    # (label used in the printed message, grid searched at that stage)
    stages = [
        ('1', {
            # ensemble parameters
            'n_estimators': [50, 100, 200, 400],
            #'n_estimators': [N_est],
            'max_depth': [5],
            'warm_start': [True],
            'max_features': ['sqrt', 'log2', 0.5, 0.7],
        }),
        ('subsample', {'subsample': [0.1, 0.3, 0.5, 0.7, 0.9]}),
        ('max_depth', {'max_depth': list(range(3, 10))}),
        ('learning_rate', {'learning_rate': [0.01, 0.03, 0.05, 0.07, 0.09, 0.11]}),
    ]

    # No random_state is set, so the folds differ between runs.
    cv = KFold(n_splits=2, shuffle=True)

    # The first stage starts from the library defaults (empty override dict).
    best_params = {}
    for label, param_grid in stages:
        clf = GradientBoostingClassifier(**best_params)
        gs = GridSearchCV(clf, param_grid, scoring=score_type, cv=cv, verbose=0)
        gs.fit(X_train, y_train)
        print('Best score {0} ({1}):{2} '.format(label, score_type, gs.best_score_))
        best_params = gs.best_estimator_.get_params()

    return best_params

In [10]:
def get_xgboost_best_model(X_train,y_train, score_type, ones_ratio = 1):
    """Tune an ``xgboost.XGBClassifier`` in six sequential steps.

    Steps 1-5 each run a ``GridSearchCV`` over a small group of parameters,
    starting from the parameters of the best estimator of the previous step:

    1. ``n_estimators`` with a fixed learning rate and fixed tree parameters
    2. ``max_depth`` and ``min_child_weight``
    3. ``gamma``
    4. ``subsample`` and ``colsample_bytree``
    5. ``reg_alpha`` and ``reg_lambda``

    Step 6 tries ``n_estimators`` / ``learning_rate`` pairs whose product stays
    equal to the product found so far and keeps the best-scoring pair.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; rows are selected with ``.iloc`` in step 6.
    y_train : pandas.Series
        Target compared against 0 and 1; rows are selected with ``.iloc``.
    score_type : str, callable or None
        Forwarded as ``scoring`` to ``GridSearchCV`` and used for the manual
        cross-validation of step 6 as well.
    ones_ratio : number, optional
        Ignored: the value is always recomputed from ``y_train``. The
        parameter is kept so that existing calls keep working.

    Returns
    -------
    dict
        Parameters of the best estimator of step 5 with ``n_estimators`` and
        ``learning_rate`` replaced by the winners of step 6.
    """
    from sklearn.metrics import get_scorer

    # Ratio between the classes: number of ones divided by number of zeros.
    # NOTE(review): this value is passed as `scale_pos_weight` below; the
    # XGBoost docs suggest negatives / positives for that parameter, i.e. the
    # inverse of this ratio - confirm which direction is intended.
    ones_ratio = y_train[y_train == 1].shape[0] * 1.0 / y_train[y_train == 0].shape[0]

    # One splitter for every step. No random_state is set, so the folds
    # differ between runs and between individual searches.
    cv = KFold(n_splits=2, shuffle=True)

    def run_search(step, base_params, param_grid, verbose=0):
        """Grid-search ``param_grid`` on top of ``base_params``; return (best params, best score)."""
        clf = xgboost.XGBClassifier(**base_params)
        gs = GridSearchCV(clf, param_grid, scoring=score_type, cv=cv, verbose=verbose)
        gs.fit(X_train, y_train)
        print('Best score {0} ({1}):{2} '.format(step, score_type, gs.best_score_))
        return gs.best_estimator_.get_params(), gs.best_score_

    # Step 1: fix learning_rate and the tree parameters, pick n_estimators.
    #   max_depth        - usually varied between 3 and 10 (task dependent);
    #                      5 is a common starting value.
    #   min_child_weight - 1 for a strongly imbalanced sample, otherwise 2.
    #   gamma            - usually fixed somewhere in 0..0.2; can be tuned
    #                      separately later.
    #   subsample, colsample_bytree - fixed at 0.8 (0.5-0.9 is a usual range).
    #   scale_pos_weight - set from the class ratio of the sample and fixed.
    best_params_1, _ = run_search(1, {}, {
        # ensemble parameters
        #'n_estimators': [10, 30, 50, 100, 200, 400, 600, 1000],
        'n_estimators': [400, 600, 1000, 1200],
        'learning_rate': [0.1, ],

        # tree parameters
        'max_depth': [5],
        'min_child_weight': [2],
        'gamma': [0.1],
        'subsample': [0.8],
        'colsample_bytree': [0.8],
        'scale_pos_weight': [ones_ratio],

        # regularisation parameters
        'reg_alpha': [0.0],
        'reg_lambda': [1.0]
    })

    # Step 2: tree parameters - max_depth 3..9 (step 1) and
    # min_child_weight 1, 3, 5, starting from the best model of step 1.
    best_params_2, _ = run_search(2, best_params_1, {
        'max_depth': range(3, 10, 1),
        'min_child_weight': range(1, 6, 2)
    }, verbose=1)

    # Step 3: gamma (the criterion for creating a subtree), 0 to 0.5 in steps of 0.1.
    best_params_3, _ = run_search(3, best_params_2, {
        'gamma': [0.1*i for i in range(6)]
    })

    # Step 4: subsample and colsample_bytree, each 0.5 to 1.0 in steps of 0.1.
    best_params_4, _ = run_search(4, best_params_3, {
        'subsample': [0.5 + 0.1*i for i in range(6)],
        'colsample_bytree': [0.5 + 0.1*i for i in range(6)]
    })

    # Step 5: regularisation.
    best_params_5, best_score = run_search(5, best_params_4, {
        'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100],
        'reg_lambda': [1e-5, 1e-2, 0.1, 1, 100]
    }, verbose=1)

    # Step 6: learning rate.
    # The fewer estimators the ensemble has, the larger each boosting step has
    # to be, so learning_rate is varied together with n_estimators in a way
    # that keeps n_estimators * learning_rate constant.
    best_params_6 = best_params_5.copy()
    clf = xgboost.XGBClassifier(**best_params_5)
    current_params = clf.get_params()
    best_n_estimators = current_params['n_estimators']    # best value so far
    best_learning_rate = current_params['learning_rate']  # best value so far
    invariant_composition = best_n_estimators * best_learning_rate
    n_estimators_range = [10, 30, 100, 200, 400, 600, 800, 1000, 1200, 1400]

    # Score the folds with the same metric the grid searches used, so that the
    # comparison against the best score of step 5 is like for like.
    if score_type is None:
        def fold_scorer(model, X, y):
            return model.score(X, y)
    else:
        fold_scorer = get_scorer(score_type)

    for n_estimators in n_estimators_range:
        learning_rate = invariant_composition / n_estimators
        clf.set_params(n_estimators=n_estimators, learning_rate=learning_rate)
        fold_scores = []
        for train_idx, test_idx in cv.split(X_train):
            X_train_fold, X_test_fold = X_train.iloc[train_idx], X_train.iloc[test_idx]
            y_train_fold, y_test_fold = y_train.iloc[train_idx], y_train.iloc[test_idx]
            clf.fit(X_train_fold, y_train_fold)
            fold_scores.append(fold_scorer(clf, X_test_fold, y_test_fold))
        mean_score = np.mean(fold_scores)
        if mean_score > best_score:
            best_n_estimators = n_estimators
            best_learning_rate = learning_rate
            best_score = mean_score

    best_params_6['n_estimators'] = best_n_estimators
    best_params_6['learning_rate'] = best_learning_rate

    print('Best score 6: ', best_score)

    return best_params_6