# Presets

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
from IPython.display import display, HTML

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import metrics

# Hyperparams tuning
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn_genetic import GASearchCV
from sklearn_genetic.space import Continuous, Categorical, Integer

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier,
                              ExtraTreesClassifier, VotingClassifier)
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
np.random.seed(42)

In [2]:
def basic_descriptives(df):
    """Render a quick overview of ``df`` in the notebook output.

    The first rows and the ``describe()`` summary are shown as HTML tables;
    the row count, column count and per-column dtypes are printed as text.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    """
    # Each table is built lazily, right before it is displayed.
    html_sections = (
        ("<p>Head:</p>", df.head),
        ("<p>Summary stats:</p>", df.describe),
    )
    for caption, build_table in html_sections:
        display(HTML(caption))
        display(HTML(build_table().to_html()))

    text_parts = [
        "\nNrows: ", df.shape[0], "\n",
        "\nNcols: ", df.shape[1], "\n",
        "\nData types:\n", df.dtypes.to_string(), "\n",
    ]
    print("".join(str(part) for part in text_parts))
    
def show_basic_plots(df, vars_subset=None):
    """Build a collection of exploratory Plotly figures for ``df``.

    Columns are split by dtype into numeric (int/float) and categorical
    (``object``/``category``) variables; figures are created but not shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot. Every figure is built from this frame only.
    vars_subset : list of str, optional
        Columns to consider. Defaults to all columns of ``df``.

    Returns
    -------
    dict
        Maps a plot family name to a list of figures:

        - ``'dist_cat'``: histogram per categorical variable
        - ``'dist_mlt_cat'``: histogram per ordered pair of categorical
          variables (x = first, colour = second)
        - ``'dist_num'``: density curve per numeric variable
        - ``'violin_num'``: violin plot per numeric variable
        - ``'scatter_num'``: scatter + OLS trendline per ordered pair of
          numeric variables
        - ``'dist_mlt_num'``: 2D density contour per ordered pair of numeric
          variables
        - ``'violin_mix'``: violin of each numeric variable split by each
          categorical variable
    """
    # Presets:
    if vars_subset is None:
        vars_subset = df.columns.tolist()

    color_background = '#F5F5F5'
    color_gridlines = '#DCDCDC'
    colors_in_use = ['#2C3E50', '#537EA2', '#858F84', '#42A593',
                     '#873E23', '#CFD1A1', '#6A744F', '#BDBDC5',
                     '#7EA253', '#EDB676', '#C26D40']+px.colors.qualitative.Safe

    dtypes_num = ['int64', 'int32', 'int16', 'float64', 'float32', 'float16']
    dtypes_str = ['object', 'category']
    selected = df.loc[:, vars_subset]
    vars_num = selected.select_dtypes(include=dtypes_num).columns
    vars_str = selected.select_dtypes(include=dtypes_str).columns

    # For categorical variables:
    fig_str_lst = []
    for col in vars_str:
        fig = go.Figure()
        fig.add_trace(go.Histogram(x=df.loc[:, col],
                                   name=col,
                                   showlegend=True))
        fig.update_traces(marker_color=colors_in_use[0],
                          marker_line_color='rgb(8,48,107)',
                          marker_line_width=1.5,
                          opacity=0.8)
        fig.update_layout(xaxis_type='category',
                          xaxis_title=col,
                          paper_bgcolor=color_background,
                          plot_bgcolor=color_background)
        fig.update_yaxes(gridcolor=color_gridlines)
        fig.update_xaxes(linecolor=color_gridlines)
        fig_str_lst.append(fig)

    # For 2 categorical variables (every ordered pair, x != colour):
    fig_str_lst_mix = []
    for i, col_x in enumerate(vars_str):
        for j, col_color in enumerate(vars_str):
            if i == j:
                continue
            fig = px.histogram(df, x=col_x, color=col_color,
                               color_discrete_sequence=colors_in_use[1:])
            fig.update_traces(marker_line_color='rgb(8,48,107)',
                              marker_line_width=1.5,
                              opacity=0.8)
            fig.update_layout(xaxis_type='category',
                              xaxis_title=col_x,
                              paper_bgcolor=color_background,
                              plot_bgcolor=color_background)
            fig.update_yaxes(gridcolor=color_gridlines, title='')
            fig.update_xaxes(linecolor=color_gridlines)
            fig_str_lst_mix.append(fig)

    # For numerical variables:
    fig_num_lst_dist = []
    for col in vars_num:
        # The density estimate cannot handle missing values, so drop them.
        fig = ff.create_distplot(hist_data=[df[col].dropna()],
                                 group_labels=[col],
                                 show_hist=False,
                                 show_rug=False,
                                 colors=colors_in_use)
        fig.update_layout(paper_bgcolor=color_background,
                          plot_bgcolor=color_background)
        fig.update_yaxes(gridcolor=color_gridlines,
                         zerolinecolor=color_gridlines,
                         title='')
        fig.update_xaxes(gridcolor=color_gridlines,
                         title=col)
        fig_num_lst_dist.append(fig)

    fig_num_lst_violin = []
    for col in vars_num:
        fig = px.violin(df, y=col, box=True, points='outliers')
        fig.update_traces(marker_color=colors_in_use[0],
                          opacity=0.8,
                          name=col,
                          showlegend=True)
        fig.update_layout(paper_bgcolor=color_background,
                          plot_bgcolor=color_background)
        fig.update_yaxes(gridcolor=color_gridlines,
                         zerolinecolor=color_gridlines,
                         title='')
        fig.update_xaxes(title=col)
        fig_num_lst_violin.append(fig)

    # For 2 numerical variables (every ordered pair, x != y):
    fig_num_lst_mix = []
    fig_num_lst_scat = []
    for i, col_x in enumerate(vars_num):
        for j, col_y in enumerate(vars_num):
            if i == j:
                continue
            fig = go.Figure()
            fig.add_trace(go.Histogram2dContour(x=df[col_x],
                                                y=df[col_y],
                                                colorscale='deep'))
            fig.update_layout(paper_bgcolor=color_background,
                              plot_bgcolor=color_background)
            fig.update_yaxes(title=col_y)
            fig.update_xaxes(title=col_x)
            fig_num_lst_mix.append(fig)

            # Plot from the ``df`` argument, not from a notebook-level
            # global, so the scatter always matches the frame passed in.
            fig = px.scatter(df, x=col_x, y=col_y, trendline='ols')
            fig.update_layout(paper_bgcolor=color_background,
                              plot_bgcolor=color_background)
            fig.update_traces(marker_color=colors_in_use[0],
                              opacity=0.8)
            fig.update_yaxes(gridcolor=color_gridlines,
                             zerolinecolor=color_gridlines,
                             title=col_y)
            fig.update_xaxes(gridcolor=color_gridlines,
                             zerolinecolor=color_gridlines,
                             title=col_x)
            fig_num_lst_scat.append(fig)

    # 1 categorical, 1 numeric:
    fig_all_violin_mix = []
    for col_cat in vars_str:
        for col_num in vars_num:
            fig = px.violin(df, y=col_num, color=col_cat,
                            color_discrete_sequence=colors_in_use,
                            box=True, points='outliers')
            fig.update_traces(opacity=0.8)
            fig.update_layout(xaxis_title=col_num,
                              showlegend=True,
                              paper_bgcolor=color_background,
                              plot_bgcolor=color_background)
            fig.update_yaxes(gridcolor=color_gridlines, title='')
            fig.update_xaxes(linecolor=color_gridlines)
            fig_all_violin_mix.append(fig)

    return {
        # categorical
        'dist_cat': fig_str_lst,
        'dist_mlt_cat': fig_str_lst_mix,
        # numeric
        'dist_num': fig_num_lst_dist,
        'violin_num': fig_num_lst_violin,
        'scatter_num': fig_num_lst_scat,
        'dist_mlt_num': fig_num_lst_mix,
        # all
        'violin_mix': fig_all_violin_mix,
    }

def corr_heatmap(df):
    """Return an annotated heatmap of the pairwise correlations in ``df``.

    Only numeric and boolean columns take part in the correlation; other
    columns (strings, categories, ...) are ignored. Selecting them explicitly
    keeps the function working on pandas versions where ``DataFrame.corr``
    no longer drops non-numeric columns on its own.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns are correlated.

    Returns
    -------
    plotly.graph_objects.Figure
        Heatmap of the correlation matrix, rounded to 3 decimals.
    """
    color_background = '#F5F5F5'
    color_gridlines = '#DCDCDC'

    corr_matrix = df.select_dtypes(include=['number', 'bool']).corr().round(3)

    fig = px.imshow(corr_matrix, text_auto=True, color_continuous_scale='deep')
    fig.update_traces(opacity=0.8)
    fig.update_layout(
        coloraxis_showscale=False,
        paper_bgcolor=color_background,
        plot_bgcolor=color_background)
    fig.update_yaxes(gridcolor=color_gridlines, title='')
    fig.update_xaxes(linecolor=color_gridlines)
    return fig

In [3]:
def show_model_grid_search_cv(model_grid, classifier, name, cv=3):
    """Grid-search ``classifier`` inside a preprocessing pipeline and print the result.

    The pipeline is ``preprocessor`` -> ``classifier``. ``preprocessor``,
    ``X_train`` and ``y_train`` are read from the notebook's global namespace
    and must exist before this is called (``preprocessor`` is not defined in
    the visible part of this notebook).

    Parameters
    ----------
    model_grid : dict or list of dict
        ``param_grid`` for ``GridSearchCV``.
    classifier : estimator
        Final step of the pipeline.
    name : str
        Label printed with the results.
    cv : int, default 3
        Cross-validation setting passed to ``GridSearchCV``.
    """
    pipeline_steps = [("preprocessor", preprocessor), ("classifier", classifier)]
    search = GridSearchCV(
        estimator=Pipeline(steps=pipeline_steps),
        param_grid=model_grid,
        cv=cv,
        n_jobs=-1,
        verbose=2,
    )
    search.fit(X_train, y_train)

    print("\nModel:", name, "\n")
    for label, value in (("Accuracy:", search.best_score_),
                         ("Best params", search.best_params_)):
        print(label, value, "\n")

In [4]:
def show_model_ga_search_cv(model_grid, classifier, name, cv=10, popsize=50):
    """Tune ``classifier`` with a genetic-algorithm search and print the result.

    The search is fitted on ``X_train`` / ``y_train`` taken from the notebook's
    global namespace and is scored by accuracy.

    Parameters
    ----------
    model_grid : dict
        Search space passed to ``GASearchCV`` as ``param_grid``.
    classifier : estimator
        Estimator to tune.
    name : str
        Label printed with the results.
    cv : int, default 10
        Cross-validation setting passed to ``GASearchCV``.
    popsize : int, default 50
        Population size of the genetic algorithm.
    """
    ga_settings = dict(
        estimator=classifier,
        cv=cv,
        scoring='accuracy',
        population_size=popsize,
        generations=40,
        tournament_size=3,
        elitism=True,
        crossover_probability=0.8,
        mutation_probability=0.1,
        param_grid=model_grid,
        criteria='max',
        algorithm='eaMuPlusLambda',
        n_jobs=-1,
        verbose=True,
        keep_top_k=4,
    )
    ga_search = GASearchCV(**ga_settings)
    ga_search.fit(X_train, y_train)

    print("\nModel:", name, "\n")
    for label, value in (("Accuracy:", ga_search.best_score_),
                         ("Best params", ga_search.best_params_)):
        print(label, value, "\n")

In [5]:
data = pd.read_csv('train.csv')

# EDA

## Initial

In [None]:
basic_descriptives(data)  

In [None]:
tmp = show_basic_plots(data)

In [None]:
print(tmp.keys()) # options what to show

In [None]:
for i in tmp['dist_num']: 
    i.show()
del tmp

Main technical clues here:
- Name and PassengerId look unimportant as predictors (at least, it is hard to see why they would matter)
- Pclass, Survived, Parch and SibSp are actually categorical variables, although they are stored as numbers
- The distributions of Cabin and Ticket are almost unimodal

From the analytical perspective:
- Most passengers embarked at S (Southampton)
- There were more men than women on board
- Fare has some large outliers
- Age has many missing values

So, recode & drop to explore better:

## EDA continuation

In [None]:
data_cleared = data.copy()
data_cleared['Survived'] = data_cleared.Survived.astype(str)
data_cleared['Pclass'] = data_cleared.Pclass.astype(str)
data_cleared['SibSp'] = data_cleared.SibSp.astype(str)
data_cleared['Parch'] = data_cleared.Parch.astype(str)
data_cleared.drop(columns=['PassengerId', 'Cabin', 'Ticket', 'Name'], inplace=True)

In [None]:
data_cleared.head()

In [None]:
corr_heatmap(data_cleared)

In [None]:
# Cast the recoded categorical columns back to integers so that they take
# part in the correlation matrix (``astype`` returns a new frame, so
# ``data_cleared`` is left untouched).
int_columns = ['Survived', 'Pclass', 'SibSp', 'Parch']
data_corr = data_cleared.astype({col: np.int64 for col in int_columns})

# Correlations are only defined for numeric columns; the remaining string
# columns are left out explicitly.
corr_heatmap(data_corr.select_dtypes(include='number'))

In [None]:
# Store the figures under the name that this cell and the following one read.
data_cleared_plots = show_basic_plots(data_cleared)
print(data_cleared_plots.keys())

In [None]:
for i in data_cleared_plots['violin_num']: 
    i.show()
#del tmp

Main clues:
- Most passengers travelled in 3rd class
- Most passengers travelled alone (no siblings or spouses) or with at most one spouse/child. Other cases are rather outliers that could (maybe) be grouped together
- Among the survivors, the proportion of women was much higher. The survival rate also differs between Pclass values
- The Fare/Age distribution is almost uniform
- Passengers who survived paid a higher Fare
- Age is distributed more uniformly in 1st class, and the mean Age is also higher there
- The numeric variables are not correlated with each other
- Parch & SibSp are fairly strongly correlated

# Feature creation / transformation

## Functions

In [6]:
def bin_family(size):
    """Map a family size to a categorical label.

    Sizes 1 to 4 get their own label; every other value is ``'large'``.
    """
    labelled_sizes = (
        (1, 'alone'),
        (2, 'pair'),
        (3, 'pair_1_child'),
        (4, 'pair_2_child'),
    )
    for known_size, label in labelled_sizes:
        if size == known_size:
            return label
    return 'large'

def bin_age(age):
    """Map an age in years to an age-group label.

    Each bracket includes its upper bound. Any value that is not ``<=`` the
    last bound (including NaN, for which every comparison is false) is
    labelled ``'very_old'``.
    """
    upper_bounds = (
        (1, 'infant'),
        (4, 'toddler'),
        (13, 'child'),
        (18, 'teenager'),
        (35, 'young_adult'),
        (45, 'adult'),
        (55, 'middle_aged'),
        (65, 'senior'),
    )
    for limit, label in upper_bounds:
        if age <= limit:
            return label
    return 'very_old'

def bin_fare(fare):
    """Map a fare amount to a fare-group label.

    Each bracket includes its upper bound. Any value that is not ``<=`` the
    last bound (including NaN) is labelled ``'very_high'``.
    """
    upper_bounds = ((4, 'Very_low'), (10, 'low'), (20, 'mid'), (45, 'high'))
    matching = (label for limit, label in upper_bounds if fare <= limit)
    return next(matching, "very_high")

In [7]:
def titanic_transform(X):
    """Engineer the model features from a raw Titanic passenger frame.

    The input frame is not modified; a transformed copy is returned.

    Steps:

    1. ``Title``: the honorific between the comma and the first period of
       ``Name`` is extracted and grouped into Mr / Mrs / Miss / Master;
       anything else (or a name without a recognisable title) is ``Officer``.
    2. ``Deck``: first letter of ``Cabin`` grouped into ABC / DE / FG, with
       ``M`` for a missing cabin.
    3. ``Embarked``: missing values are filled with ``'S'``.
    4. ``AgeMissing`` flag; ``Age`` is imputed with the median of the
       passenger's (Sex, Pclass) group and then binned with ``bin_age``.
    5. ``FamilyBin``: ``SibSp + Parch + 1`` binned with ``bin_family``.
    6. ``FareBin``: missing ``Fare`` is filled with the median fare of
       3rd-class passengers travelling alone, divided by the family size and
       binned with ``bin_fare``.
    7. Raw columns are dropped and the categorical features are one-hot
       encoded.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain Name, PassengerId, Ticket, Cabin, Embarked, Sex, Pclass,
        Age, SibSp, Parch and Fare.

    Returns
    -------
    pandas.DataFrame
        ``AgeMissing`` plus one dummy column per category observed in ``X``.
        The set of dummy columns therefore depends on the data passed in.
    """
    X = X.copy()  # never mutate the caller's frame

    # --- Title -----------------------------------------------------------
    # Anchor the title to the "Surname, Title. Given names" layout; a bare
    # substring search would also hit surnames such as "Donohoe" or "Collyer".
    title_groups = {
        'Mr': 'Mr', 'Don': 'Mr', 'Rev': 'Mr', 'Sir': 'Mr',
        'Mrs': 'Mrs', 'Mme': 'Mrs', 'Dona': 'Mrs', 'Countess': 'Mrs',
        'Miss': 'Miss', 'Ms': 'Miss', 'Mlle': 'Miss', 'Lady': 'Miss',
        'Master': 'Master',
    }
    raw_title = X['Name'].str.extract(r',\s*(?:the\s+)?([A-Za-z]+)\.',
                                      expand=False)
    # Col, Jonkheer, Dr, Major, any other title and missing titles -> Officer.
    X['Title'] = raw_title.map(title_groups).fillna('Officer')

    # --- Deck ------------------------------------------------------------
    deck_groups = {'A': 'ABC', 'B': 'ABC', 'C': 'ABC', 'T': 'ABC',
                   'D': 'DE', 'E': 'DE',
                   'F': 'FG', 'G': 'FG'}
    first_letter = X['Cabin'].map(lambda c: c[0] if pd.notnull(c) else 'M')
    X['Deck'] = first_letter.replace(deck_groups)

    # --- Embarked ----------------------------------------------------------
    X['Embarked'] = X['Embarked'].fillna('S')

    # --- Age ---------------------------------------------------------------
    X['AgeMissing'] = np.where(X['Age'].isnull(), 1, 0)
    # ``transform`` keeps the original index, so the result always aligns
    # with X regardless of how ``groupby.apply`` labels its output.
    group_median_age = X.groupby(['Sex', 'Pclass'])['Age'].transform('median')
    # Fall back to the overall median when a whole group has no known age;
    # otherwise the NaN would silently land in the oldest age bin.
    X['Age'] = X['Age'].fillna(group_median_age).fillna(X['Age'].median())
    X['AgeBin'] = X['Age'].map(bin_age)

    # --- Family ------------------------------------------------------------
    X['FamilySize'] = X['SibSp'] + X['Parch'] + 1
    X['FamilyBin'] = X['FamilySize'].map(bin_family)

    # --- Fare --------------------------------------------------------------
    # Reference group for imputation: 3rd class, no parents/children, no
    # siblings/spouse.
    alone_third_class = (X['Pclass'] == 3) & (X['Parch'] == 0) & (X['SibSp'] == 0)
    median_fare = X.loc[alone_third_class, 'Fare'].median()
    if pd.isnull(median_fare):
        # Reference group absent from this frame: use the overall median.
        median_fare = X['Fare'].median()
    X['Fare'] = X['Fare'].fillna(median_fare)
    X['FarePerFamily'] = X['Fare'] / X['FamilySize']
    X['FareBin'] = X['FarePerFamily'].map(bin_fare)

    # --- Encoding ----------------------------------------------------------
    X['Pclass'] = X['Pclass'].astype(str)
    cols_to_drop = ['Name', 'PassengerId', 'Ticket', 'Cabin',
                    'FamilySize', 'Age', 'Fare', 'FarePerFamily', 'SibSp', 'Parch']
    X = X.drop(columns=cols_to_drop)

    cols_to_dummy = ['Pclass', 'Sex', 'Embarked', 'Title',
                     'AgeBin', 'FamilyBin', 'FareBin', 'Deck']
    X = pd.get_dummies(X, columns=cols_to_dummy, prefix=cols_to_dummy)

    return X

In [8]:
X = data.copy()
y = X['Survived']
X.drop(columns=['Survived'], inplace=True)
X = titanic_transform(X)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Models calibration: GASearch

## Grids

In [None]:
# Search spaces for GASearchCV, one per model family.

model_grid_ga_logistic = {
    'C': Continuous(0.1, 1000, distribution='uniform')
}

model_grid_ga_logistic_net = {
    'C': Continuous(0.1, 1000, distribution='uniform'),
    'l1_ratio': Continuous(0, 1, distribution='uniform')
}

model_grid_ga_knn = {
    'n_neighbors': Integer(3, 20),
    'leaf_size': Integer(20, 50)
}

model_grid_ga_svm = {
    'C': Continuous(0.1, 1000, distribution='uniform'),
    'gamma': Continuous(0.0001, 1, distribution='uniform'),
}

model_grid_ga_rf = {
    'max_depth': Integer(10, 80),
    'max_features': Integer(1, 7),
    'min_samples_leaf': Integer(1, 7),
    'min_samples_split': Integer(2, 10),
    'n_estimators': Integer(25, 500)
}

model_grid_ga_extra_trees = {
    'max_depth': Integer(10, 80),
    'max_features': Integer(1, 7),
    'min_samples_leaf': Integer(1, 7),
    'min_samples_split': Integer(2, 10),
    'n_estimators': Integer(25, 500)
}

model_grid_ga_adaboost = {
    "learning_rate": Continuous(0.01, 0.50, distribution='uniform'),
    "n_estimators": Integer(25, 500)
}

model_grid_ga_xgboost = {
    "subsample": Continuous(0.75, 1, distribution='uniform'),
    "colsample_bytree": Continuous(0.75, 1, distribution='uniform'),
    "max_depth": Integer(2, 16),
    "min_child_weight": Integer(2, 15),
    "learning_rate": Continuous(0.01, 0.50, distribution='uniform'),
    "n_estimators": Integer(25, 500)
}

# LightGBM treats ``n_estimators`` as an alias of ``num_iterations``, so only
# one of them is tuned; searching both would add a dimension with no effect.
model_grid_ga_lgbm = {
    "learning_rate": Continuous(0.01, 0.50, distribution='uniform'),
    "num_iterations": Integer(100, 1000),
    "lambda_l2": Integer(0, 3),
    "bagging_fraction": Continuous(0.8, 1, distribution='uniform'),
    "min_data_in_leaf": Integer(10, 40),
    "num_leaves": Integer(21, 51),
}

model_grid_ga_nnet = {
    'learning_rate_init': Continuous(0.01, 0.50, distribution='uniform'),
    'max_iter': Integer(200, 2000),
    'hidden_layer_sizes': Integer(100, 1000)
}

## Logistic regression

In [None]:
# NOTE(review): this searches over C for a model built with penalty='none'.
# C is the regularisation strength, so it presumably has no effect without a
# penalty and every candidate would be the same model - confirm against the
# scikit-learn docs and consider a single cross-validated fit instead.
show_model_ga_search_cv(model_grid_ga_logistic,
                        LogisticRegression(penalty='none'),
                        'logistic_reg')

In [None]:
show_model_ga_search_cv(model_grid_ga_logistic_net,
                        LogisticRegression(solver='saga', penalty='elasticnet'),
                        'logistic_reg')

In [None]:
show_model_ga_search_cv(model_grid_ga_logistic,
                        LogisticRegression(penalty='l1', solver='saga'),
                        'logistic_reg')

In [None]:
show_model_ga_search_cv(model_grid_ga_logistic, LogisticRegression(penalty='l2'), 'logistic_reg')

## KNN

In [None]:
show_model_ga_search_cv(model_grid_ga_knn, KNeighborsClassifier(), 'knn')

## SVM

In [None]:
show_model_ga_search_cv(model_grid_ga_svm, SVC(kernel='linear'), 'svm_linear')

In [None]:
show_model_ga_search_cv(model_grid_ga_svm, SVC(kernel='rbf'), 'svm_rbf')

## Random Forest

In [None]:
show_model_ga_search_cv(model_grid_ga_rf, RandomForestClassifier(), 'random_forest')

## Extra trees

In [None]:
show_model_ga_search_cv(model_grid_ga_extra_trees, ExtraTreesClassifier(), 'extra_trees')

## AdaBoost

In [None]:
#show_model_ga_search_cv(model_grid_ga_adaboost, AdaBoostClassifier(), "adaboost")

pretty time-consuming

## xgboost

In [None]:
show_model_ga_search_cv(model_grid_ga_xgboost, XGBClassifier(), 'xgboost')

## lightGBM

In [None]:
show_model_ga_search_cv(model_grid_ga_lgbm, LGBMClassifier(boosting_type='dart'), 'light_gbm')

In [None]:
show_model_ga_search_cv(model_grid_ga_lgbm, LGBMClassifier(boosting_type='gbdt'), 'light_gbm')

## NNET

In [None]:
show_model_ga_search_cv(model_grid_ga_nnet,
                        MLPClassifier(learning_rate='invscaling'),
                        'nnet')

In [None]:
show_model_ga_search_cv(model_grid_ga_nnet,
                        MLPClassifier(learning_rate='constant'),
                        'nnet')

In [None]:
show_model_ga_search_cv(model_grid_ga_nnet,
                        MLPClassifier(learning_rate='adaptive'),
                        'nnet')

## Results

the best models are xgboost, svm

## Ensembling best models

In [10]:
# Hyperparameters used for the final XGBoost model. (An earlier candidate set
# was assigned here and immediately overwritten, so it never reached the
# model and has been removed.)
best_grid = {
    'subsample': 0.9506464198949532,
    'colsample_bytree': 0.8263276691741828,
    'max_depth': 2,
    'min_child_weight': 4,
    'learning_rate': 0.0866270841576895,
    'n_estimators': 32
}

model_xgb = XGBClassifier(**best_grid)
model_xgb.fit(X, y)
print('best xgb')

best xgb


In [11]:
best_grid = {
    'C': 326.7,#0658550096834,
    'gamma': 0.74#82090936576415
}
model_svc = SVC(kernel='rbf', **best_grid)
model_svc.fit(X, y)
print('best svc')

best svc


In [12]:
best_grid = {
    'max_depth': 36,
    'max_features': 7,
    'min_samples_leaf': 6,
    'min_samples_split': 6,
    'n_estimators': 135
}
model_rf = RandomForestClassifier(**best_grid)
model_rf.fit(X, y)
print('best rf')

best rf


In [13]:
estimators = [('xgb', model_xgb), ('svc', model_svc), ('rf', model_rf)]
ensemble = VotingClassifier(estimators, voting='hard')
ensemble.fit(X, y)
print('best ensemble')

best ensemble


# Prediction on new data (Kaggle submission)

In [14]:
# Read the test set once; the transform gets a copy so that PassengerId is
# still available for the submission file.
X_test_ps = pd.read_csv('test.csv')
X_test_new = titanic_transform(X_test_ps.copy())

# The dummy columns depend on the categories present in each file, so align
# the test features with the training features the ensemble was fitted on:
# same columns, same order, and 0 for categories missing from the test set.
X_test_new = X_test_new.reindex(columns=X.columns, fill_value=0)
predictions = ensemble.predict(X_test_new)

submission = pd.DataFrame({'PassengerId': X_test_ps['PassengerId'],
                           'Survived': predictions})
submission.to_csv('submission.csv', index=False)