# Presets

In [108]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
from IPython.display import display, HTML

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import metrics

# Hyperparams tuning
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn_genetic import GASearchCV
from sklearn_genetic.space import Continuous, Categorical, Integer

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier,
                              ExtraTreesClassifier, VotingClassifier)
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
np.random.seed(42)

In [2]:
def basic_descriptives(df):
    display(HTML("<p>Head:</p>"))
    display(HTML(df.head().to_html()))
    display(HTML("<p>Summary stats:</p>"))
    display(HTML(df.describe().to_html()))
    print(
        "\nNrows: ", df.shape[0], "\n",
        "\nNcols: ", df.shape[1], "\n",
        "\nData types:\n", df.dtypes.to_string(), "\n",
        sep=''
    )
    
def show_basic_plots(df, vars_subset=None):
    # Presets:
    if vars_subset is None:
        vars_subset = df.columns.tolist()
    
    color_background = '#F5F5F5'
    color_gridlines = '#DCDCDC'
    colors_in_use = ['#2C3E50', '#537EA2', '#858F84', '#42A593',
                     '#873E23', '#CFD1A1', '#6A744F', '#BDBDC5',
                     '#7EA253', '#EDB676', '#C26D40']+px.colors.qualitative.Safe
    
    dtypes_num = ['int64', 'int32', 'int16', 'float64', 'float32', 'float16']
    dtypes_str = ['object', 'category']
    vars_num = df.loc[:, vars_subset].select_dtypes(include=dtypes_num).columns
    vars_str = df.loc[:, vars_subset].select_dtypes(include=dtypes_str).columns
    
    # For categorical variables:
    fig_str_lst = [None]*len(vars_str)
    for i in range(0, len(vars_str)):
        fig = go.Figure()
        fig.add_trace(go.Histogram(x=df.loc[:, vars_str[i]],
                                   name=vars_str[i],
                                   showlegend=True))
        fig.update_traces(marker_color=colors_in_use[0],
                          marker_line_color='rgb(8,48,107)',
                          marker_line_width=1.5,
                          opacity=0.8)
        fig.update_layout(xaxis_type='category',
                          xaxis_title=vars_str[i],
                          paper_bgcolor=color_background,
                          plot_bgcolor=color_background)
        fig.update_yaxes(gridcolor=color_gridlines)
        fig.update_xaxes(linecolor=color_gridlines)
        fig_str_lst[i] = fig
        
    # For 2 categorical variables:
    fig_str_lst_mix = []
    for i in range(0, len(vars_str)):
        for j in range(0, len(vars_str)):
            if i == j:
                continue
            else:
                fig = px.histogram(df, x=vars_str[i], color=vars_str[j],
                                   color_discrete_sequence=colors_in_use[1:])
                fig.update_traces(marker_line_color='rgb(8,48,107)',
                                  marker_line_width=1.5,
                                  opacity=0.8)
                fig.update_layout(xaxis_type='category',
                                  xaxis_title=vars_str[i],
                                  paper_bgcolor=color_background,
                                  plot_bgcolor=color_background)
                fig.update_yaxes(gridcolor=color_gridlines, title='')
                fig.update_xaxes(linecolor=color_gridlines)
                fig_str_lst_mix.append(fig)
            
    # For numerical variables:
    fig_num_lst_dist = [None]*len(vars_num)
    for i in range(0, len(vars_num)):
        fig = ff.create_distplot(hist_data=[df[vars_num[i]].dropna()],
                                 group_labels=[vars_num[i]],
                                 show_hist=False,
                                 show_rug=False,
                                 colors = colors_in_use)
        fig.update_layout(paper_bgcolor=color_background,
                              plot_bgcolor=color_background)
        fig.update_yaxes(gridcolor=color_gridlines,
                         zerolinecolor=color_gridlines,
                         title='')
        fig.update_xaxes(gridcolor=color_gridlines,
                         title=vars_num[i])
        fig_num_lst_dist[i] = fig
        
    fig_num_lst_violin = [None]*len(vars_num)
    for i in range(0, len(vars_num)):
        fig = px.violin(df, y=vars_num[i], box=True, points='outliers')
        fig.update_traces(marker_color = colors_in_use[0],
                          opacity=0.8,
                          name=vars_num[i],
                          showlegend=True)
        fig.update_layout(paper_bgcolor=color_background,
                          plot_bgcolor=color_background)
        fig.update_yaxes(gridcolor=color_gridlines,
                         zerolinecolor=color_gridlines,
                         title='')
        fig.update_xaxes(title=vars_num[i])
        fig_num_lst_violin[i] = fig
    
        # For 2 numerical variables:
    fig_num_lst_mix = []
    fig_num_lst_scat = []
    for i in range(0, len(vars_num)):
        for j in range(0, len(vars_num)):
            if i == j:
                continue
            else:
                fig = go.Figure()
                fig.add_trace(go.Histogram2dContour(x = df[vars_num[i]],
                                                        y = df[vars_num[j]],
                                                        colorscale='deep'))
                fig.update_layout(paper_bgcolor=color_background,
                                  plot_bgcolor=color_background)
                fig.update_yaxes(title=vars_num[j])
                fig.update_xaxes(title=vars_num[i])
                fig_num_lst_mix.append(fig)
                
                fig = px.scatter(data,x=data[vars_num[i]],
                                 y=data[vars_num[j]],
                                 trendline='ols')
                fig.update_layout(paper_bgcolor=color_background,
                                  plot_bgcolor=color_background)
                fig.update_traces(marker_color=colors_in_use[0],
                                  opacity=0.8)
                fig.update_yaxes(gridcolor=color_gridlines,
                                 zerolinecolor=color_gridlines,
                                 title=vars_num[j])
                fig.update_xaxes(gridcolor=color_gridlines,
                                 zerolinecolor=color_gridlines,
                                 title=vars_num[i])
                fig_num_lst_scat.append(fig)
        
    # 1 categorical, 1 numeric:
    fig_all_violin_mix = []
    for i in range(0, len(vars_str)):
        for j in range(0, len(vars_num)):
            fig = px.violin(df, y=vars_num[j], color=vars_str[i],
                            color_discrete_sequence=colors_in_use,
                            box=True, points='outliers')
            fig.update_traces(opacity=0.8)
            fig.update_layout(xaxis_title=vars_num[j],
                              showlegend=True,
                              paper_bgcolor=color_background,
                              plot_bgcolor=color_background)
            fig.update_yaxes(gridcolor=color_gridlines, title='')
            fig.update_xaxes(linecolor=color_gridlines)
            fig_all_violin_mix.append(fig)
            
    final_dict = {
        # categorical+target
        'dist_cat': fig_str_lst,
        'dist_mlt_cat': fig_str_lst_mix,
        # numeric+target
        'dist_num': fig_num_lst_dist,
        'violin_num': fig_num_lst_violin,
        'scatter_num': fig_num_lst_scat,
        'dist_mlt_num': fig_num_lst_mix,
        # all
        'violin_mix': fig_all_violin_mix
        #'scatter_mix': None,
        #'dist_mlt_mix': None,
    }
    return(final_dict) 

def corr_heatmap(df):
    color_background = '#F5F5F5'
    color_gridlines = '#DCDCDC'
        
    fig = px.imshow(df.corr().round(3), text_auto=True, color_continuous_scale='deep')
    fig.update_traces(opacity=0.8)
    fig.update_layout(
        coloraxis_showscale=False,
        paper_bgcolor=color_background,
        plot_bgcolor=color_background)
    fig.update_yaxes(gridcolor=color_gridlines, title='')
    fig.update_xaxes(linecolor=color_gridlines)
    return(fig)

In [4]:
def show_model_grid_search_cv(model_grid, classifier, name, cv=3):
    model_pipeline = Pipeline(
        steps=[("preprocessor", preprocessor), ("classifier", classifier)]
    )
    
    model_grid_search_cv = GridSearchCV(
        estimator = model_pipeline,
        param_grid = model_grid,
        cv = cv,
        n_jobs = -1,
        verbose = 2
    ).fit(X_train, y_train)
    print("\nModel:", name, "\n")
    print("Accuracy:", model_grid_search_cv.best_score_, "\n")
    print("Best params", model_grid_search_cv.best_params_, "\n")

In [23]:
def show_model_ga_search_cv(model_grid, classifier, name, cv=10, popsize=50):
    model_grid_search_cv = GASearchCV(
        estimator=classifier,
        cv=cv,
        scoring='accuracy',
        population_size=popsize,
        generations=40,
        tournament_size=3,
        elitism=True,
        crossover_probability=0.8,
        mutation_probability=0.1,
        param_grid=model_grid,
        criteria='max',
        algorithm='eaMuPlusLambda',
        n_jobs=-1,
        verbose=True,
        keep_top_k=4
    ).fit(X_train, y_train)
    print("\nModel:", name, "\n")
    print("Accuracy:", model_grid_search_cv.best_score_, "\n")
    print("Best params", model_grid_search_cv.best_params_, "\n")

In [6]:
data = pd.read_csv('train.csv')

# EDA

## Initial

In [7]:
basic_descriptives(data)  

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292



Nrows: 891

Ncols: 12

Data types:
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object



In [11]:
tmp = show_basic_plots(data)

In [None]:
print(tmp.keys()) # options what to show

In [None]:
for i in tmp['dist_num']: 
    i.show()
del tmp

Main technical clues here:
- Name, PassengerId are rather unimportant as predictors (or at least it is hard to see some reasons to think so)
- Pclass, Survided, Parch, SibSp are actually character variables
- Distribuion of Cabins and Tickets are almost unimodal

From the analytical perspective:
- Most of passangers embarked in S (Southampton)
- There were more men than women on the board
- Fare have some big outliers
- Age have a lot of missings

So, recode & drop to explore better:

## EDA continuation

In [None]:
data_cleared = data.copy()
data_cleared['Survived'] = data_cleared.Survived.astype(str)
data_cleared['Pclass'] = data_cleared.Pclass.astype(str)
data_cleared['SibSp'] = data_cleared.SibSp.astype(str)
data_cleared['Parch'] = data_cleared.Parch.astype(str)
data_cleared.drop(columns=['PassengerId', 'Cabin', 'Ticket', 'Name'], inplace=True)

In [None]:
data_cleared.head()

In [None]:
corr_heatmap(data_cleared)

In [None]:
data_corr = data_cleared.copy()
data_corr = DataFrameExtended(data_corr)
data_corr['Survived'] = data_corr.Survived.astype(np.int64)
data_corr['Pclass'] = data_corr.Pclass.astype(np.int64)
data_corr['SibSp'] = data_corr.SibSp.astype(np.int64)
data_corr['Parch'] = data_corr.Parch.astype(np.int64)
corr_heatmap(data_corr)

In [None]:
tmp = show_basic_plots(data_cleared)
print(data_cleared_plots.keys())

In [None]:
for i in data_cleared_plots['violin_num']: 
    i.show()
#del tmp

Main clues:
- Most passengers travelled 3rd class
- Mostly, passengers travelled alone (no siblings, spouces) or maximum with 1 spouce/child. Other examples are rather outliers that (maybe) can be grouped
- Among the survived, the proportion of women was much bigger. Moreover, the proportion of survation between Pclass also differs
- Fare/Age distribution almost uniform
- Fare of people survived was bigger
- More uniform distribution of Age in the 1st class, mean Age is also upper
- Numeric variables are not correlated between each other
- Parch & SibSp are pretty much correlated

# Feature creation / transformation

## Functions

In [24]:
def bin_family(size):
    family_group = ''
    if (size == 1):
        family_group = 'alone'
    elif (size == 2):
        family_group = 'pair'
    elif (size == 3):
        family_group = 'pair_1_child'
    elif (size == 4):
        family_group = 'pair_2_child'
    else:
        family_group = 'large'
    return family_group

def bin_age(age):
    age_group = ''
    if age <= 1:
        age_group = 'infant'
    elif age <= 4: 
        age_group = 'toddler'
    elif age <= 13:
        age_group = 'child'
    elif age <= 18:
        age_group = 'teenager'
    elif age <= 35:
        age_group = 'young_adult'
    elif age <= 45:
        age_group = 'adult'
    elif age <= 55:
        age_group = 'middle_aged'
    elif age <= 65:
        age_group = 'senior'
    else:
        age_group = 'very_old'
    return age_group

def bin_fare(fare):
    fare_group = ''
    if fare <= 4:
        fare_group = 'Very_low'
    elif fare <= 10:
        fare_group = 'low'
    elif fare <= 20:
        fare_group = 'mid'
    elif fare <= 45:
        fare_group = 'high'
    else:
        fare_group = "very_high"
    return fare_group

In [35]:
def titanic_transform(X):
    name_recoded = X.Name.str.extract(
        r'(Mrs|Miss|Lady|Ms|Mme|Mlle|Dona|Mr|Don|Master|Rev|Col|Jonkheer|Dr|Major)'
    )
    name_recoded = name_recoded.fillna('Officer')
    X['Title'] = name_recoded
    X['Title'] = X['Title'].str.replace('Ms|Mlle|Lady', 'Miss', regex=True)
    X['Title'] = X['Title'].str.replace('Mme|Dona|Countess', 'Mrs', regex=True)
    X['Title'] = X['Title'].str.replace('Don|Rev|Sir', 'Mr', regex=True) 
    X['Title'] = X['Title'].str.replace('Col|Jonkheer|Dr|Major', 'Officer', regex=True)
    
    
    #X['LastName'] = X['Name'].apply(lambda x: str.split(x, ',')[0])
    # df['Last_Name'] = df['Name'].apply(lambda x: str.split(x, ',')[0])
    # create family survival
    # check Age binned with qbin
    # check Fare binned with qbin

    X['DeckMissing'] = np.where(X['Cabin'].isnull(), 1, 0)
    #X['Cabin'].str.extract(r'([A-z])').fillna('Unknown')
    X['Embarked'] = X['Embarked'].fillna('S')
    
    X['AgeMissing'] = np.where(X['Age'].isnull(), 1, 0)
    X['Age'].fillna(X['Age'].median(), inplace=True)
    X['AgeBin'] = X['Age'].map(bin_age)
    
    X['FamilySize'] = X['SibSp'] + X['Parch'] + 1
    X['FamilyBin'] = X['FamilySize'].map(bin_family)
    
    X['FarePerFamily'] = X['Fare']/X['FamilySize']
    X['FareBin'] = X['FarePerFamily'].map(bin_fare)
    
    X['Pclass'] = X.Pclass.astype(str)
    cols_to_drop = ['Name', 'PassengerId', 'Ticket', 'Cabin',
                    'FamilySize', 'Age', 'Fare', 'FarePerFamily', 'SibSp', 'Parch']
    X.drop(columns=cols_to_drop, inplace=True)
    
    
    cols_to_dummy = ['Pclass', 'Sex', 'Embarked', 'Title', 'AgeBin', 'FamilyBin', 'FareBin']
    X = pd.get_dummies(X, columns=cols_to_dummy, prefix=cols_to_dummy)
    
    return X

In [36]:
X = data.copy()
y = X['Survived']
X.drop(columns=['Survived'], inplace=True)
X = titanic_transform(X)

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Visualizations of new variables

# Models calibration: GASearch

## Grids

In [106]:
model_grid_ga_logistic = {
    'C': Continuous(0.1, 1000, distribution='uniform')
}

model_grid_ga_logistic_net = {
    'C': Continuous(0.1, 1000, distribution='uniform'),
    'l1_ratio': Continuous(0, 1, distribution='uniform')
}

model_grid_ga_knn = {
    'n_neighbors': Integer(3, 20),
    'leaf_size': Integer(20, 50)
}

model_grid_ga_svm = {
    'C': Continuous(0.1, 1000, distribution='uniform'),
    'gamma': Continuous(0.0001, 1, distribution='uniform'),
}

model_grid_ga_rf = {
    'max_depth': Integer(10, 80),
    'max_features': Integer(1, 7),
    'min_samples_leaf': Integer(1, 7),
    'min_samples_split': Integer(2, 10),
    'n_estimators': Integer(25, 500)#,
}

model_grid_ga_extra_trees = {
    'max_depth': Integer(10, 80),
    'max_features': Integer(1, 7),
    'min_samples_leaf': Integer(1, 7),
    'min_samples_split': Integer(2, 10),
    'n_estimators': Integer(25, 500)#,
}

model_grid_ga_adaboost = {
    "learning_rate": Continuous(0.01, 0.50, distribution='uniform'),
    "n_estimators": Integer(25, 500)
}

model_grid_ga_xgboost = {
    "subsample": Continuous(0.75, 1, distribution='uniform'),
    "colsample_bytree": Continuous(0.75, 1, distribution='uniform'),
    "max_depth": Integer(2, 16),
    "min_child_weight": Integer(2, 15),
    "learning_rate": Continuous(0.01, 0.50, distribution='uniform'),
    "n_estimators": Integer(25, 500)
}

model_grid_ga_lgbm = {
    "learning_rate": Continuous(0.01, 0.50, distribution='uniform'),
    "n_estimators": Integer(25, 500),
    "num_iterations": Integer(100, 1000),
    "lambda_l2": Integer(0, 3),
    "bagging_fraction": Continuous(0.8, 1, distribution='uniform'),
    "min_data_in_leaf": Integer(10, 40),
    "num_leaves": Integer(21, 51),
}

model_grid_ga_nnet = {
    'learning_rate_init': Continuous(0.01, 0.50, distribution='uniform'),
    'max_iter': Integer(200, 2000),
    'hidden_layer_sizes': Integer(100, 1000)
}

## Logistic regression

In [85]:
show_model_ga_search_cv(model_grid_ga_logistic,
                        LogisticRegression(penalty='none'),
                        'logistic_reg')





gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	50    	0.828717	1.11022e-16	0.828717   	0.828717   
1  	89    	0.828717	1.11022e-16	0.828717   	0.828717   
2  	87    	0.828717	1.11022e-16	0.828717   	0.828717   
3  	92    	0.828717	1.11022e-16	0.828717   	0.828717   
4  	94    	0.828717	1.11022e-16	0.828717   	0.828717   
5  	86    	0.828717	1.11022e-16	0.828717   	0.828717   
6  	88    	0.828717	1.11022e-16	0.828717   	0.828717   
7  	85    	0.828717	1.11022e-16	0.828717   	0.828717   
8  	92    	0.828717	1.11022e-16	0.828717   	0.828717   
9  	89    	0.828717	1.11022e-16	0.828717   	0.828717   
10 	91    	0.828717	1.11022e-16	0.828717   	0.828717   
11 	90    	0.828717	1.11022e-16	0.828717   	0.828717   
12 	90    	0.828717	1.11022e-16	0.828717   	0.828717   
13 	95    	0.828717	1.11022e-16	0.828717   	0.828717   
14 	91    	0.828717	1.11022e-16	0.828717   	0.828717   
15 	87    	0.828717	1.11022e-16	0.828717   	0.828717   
16 	92    	0.828717	1.11022e-16	0.828717   	0.82


Setting penalty='none' will ignore the C and l1_ratio parameters


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



In [84]:
show_model_ga_search_cv(model_grid_ga_logistic_net,
                        LogisticRegression(solver='saga', penalty='elasticnet'),
                        'logistic_reg')


A class named 'FitnessMax' has already been created and it will be overwritten. Consider deleting previous creation of that class or rename it.


A class named 'Individual' has already been created and it will be overwritten. Consider deleting previous creation of that class or rename it.



gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	50    	0.827363	0.000272352	0.828697   	0.827289   
1  	90    	0.827392	0.000329843	0.828697   	0.827308   
2  	89    	0.827614	0.000575342	0.828697   	0.827308   
3  	88    	0.827781	0.000657929	0.828697   	0.827308   
4  	91    	0.828281	0.000636469	0.828697   	0.827308   
5  	86    	0.828669	0.000194444	0.828697   	0.827308   
6  	91    	0.828697	1.11022e-16	0.828697   	0.828697   
7  	92    	0.828697	1.11022e-16	0.828697   	0.828697   
8  	94    	0.828697	1.11022e-16	0.828697   	0.828697   
9  	93    	0.828697	1.11022e-16	0.828697   	0.828697   
10 	87    	0.828697	1.11022e-16	0.828697   	0.828697   
11 	88    	0.828697	1.11022e-16	0.828697   	0.828697   
12 	87    	0.828697	1.11022e-16	0.828697   	0.828697   
13 	88    	0.828697	1.11022e-16	0.828697   	0.828697   
14 	88    	0.828697	1.11022e-16	0.828697   	0.828697   
15 	91    	0.828697	1.11022e-16	0.828697   	0.828697   
16 	92    	0.828697	1.11022e-16	0.828697   	0.82


The max_iter was reached which means the coef_ did not converge



In [72]:
show_model_ga_search_cv(model_grid_ga_logistic,
                        LogisticRegression(penalty='l1', solver='saga'),
                        'logistic_reg')




A class named 'FitnessMax' has already been created and it will be overwritten. Consider deleting previous creation of that class or rename it.


A class named 'Individual' has already been created and it will be overwritten. Consider deleting previous creation of that class or rename it.



gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	50    	0.82728	0.000199922	0.827308   	0.82588    
1  	92    	0.827308	1.11022e-16	0.827308   	0.827308   
2  	88    	0.827308	1.11022e-16	0.827308   	0.827308   
3  	90    	0.827308	1.11022e-16	0.827308   	0.827308   
4  	89    	0.827308	1.11022e-16	0.827308   	0.827308   
5  	93    	0.827308	1.11022e-16	0.827308   	0.827308   
6  	88    	0.827336	0.000194444	0.828697   	0.827308   
7  	87    	0.827503	0.000481926	0.828697   	0.827308   
8  	94    	0.827642	0.000593171	0.828697   	0.827308   
9  	90    	0.828114	0.000685498	0.828697   	0.827308   
10 	88    	0.828503	0.000481926	0.828697   	0.827308   
11 	91    	0.828697	1.11022e-16	0.828697   	0.828697   
12 	90    	0.828697	1.11022e-16	0.828697   	0.828697   
13 	95    	0.828697	1.11022e-16	0.828697   	0.828697   
14 	93    	0.828697	1.11022e-16	0.828697   	0.828697   
15 	88    	0.828697	1.11022e-16	0.828697   	0.828697   
16 	89    	0.828697	1.11022e-16	0.828697   	0.8286


The max_iter was reached which means the coef_ did not converge



In [64]:
show_model_ga_search_cv(model_grid_ga_logistic, LogisticRegression(penalty='l2'), 'logistic_reg')





gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	50    	0.828264	0.000715424	0.828717   	0.82588    
1  	91    	0.828689	0.000197183	0.828717   	0.827308   
2  	87    	0.828717	1.11022e-16	0.828717   	0.828717   
3  	93    	0.828717	1.11022e-16	0.828717   	0.828717   
4  	91    	0.828717	1.11022e-16	0.828717   	0.828717   
5  	89    	0.828717	1.11022e-16	0.828717   	0.828717   
6  	88    	0.828717	1.11022e-16	0.828717   	0.828717   
7  	91    	0.828717	1.11022e-16	0.828717   	0.828717   
8  	94    	0.828717	1.11022e-16	0.828717   	0.828717   
9  	87    	0.828717	1.11022e-16	0.828717   	0.828717   
10 	85    	0.828717	1.11022e-16	0.828717   	0.828717   
11 	85    	0.828717	1.11022e-16	0.828717   	0.828717   
12 	92    	0.828717	1.11022e-16	0.828717   	0.828717   
13 	89    	0.828717	1.11022e-16	0.828717   	0.828717   
14 	88    	0.828717	1.11022e-16	0.828717   	0.828717   
15 	87    	0.828717	1.11022e-16	0.828717   	0.828717   
16 	87    	0.828717	1.11022e-16	0.828717   	0.82

## KNN

In [58]:
show_model_ga_search_cv(model_grid_ga_knn, KNeighborsClassifier(), 'knn')


A class named 'FitnessMax' has already been created and it will be overwritten. Consider deleting previous creation of that class or rename it.


A class named 'Individual' has already been created and it will be overwritten. Consider deleting previous creation of that class or rename it.



gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	50    	0.802011	0.00550473 	0.810407   	0.790728   
1  	91    	0.80756 	0.00235624 	0.810407   	0.7991     
2  	91    	0.809438	0.00144731 	0.810407   	0.806221   
3  	90    	0.810352	0.00038615 	0.810407   	0.807649   
4  	94    	0.810407	1.11022e-16	0.810407   	0.810407   
5  	93    	0.810407	1.11022e-16	0.810407   	0.810407   
6  	90    	0.810407	1.11022e-16	0.810407   	0.810407   
7  	86    	0.810407	1.11022e-16	0.810407   	0.810407   
8  	92    	0.810407	1.11022e-16	0.810407   	0.810407   
9  	89    	0.810407	1.11022e-16	0.810407   	0.810407   
10 	92    	0.810407	1.11022e-16	0.810407   	0.810407   
11 	84    	0.810407	1.11022e-16	0.810407   	0.810407   
12 	92    	0.810407	1.11022e-16	0.810407   	0.810407   
13 	91    	0.810407	1.11022e-16	0.810407   	0.810407   
14 	87    	0.810407	1.11022e-16	0.810407   	0.810407   
15 	91    	0.810407	1.11022e-16	0.810407   	0.810407   
16 	94    	0.810407	1.11022e-16	0.810407   	0.81

## SVM

In [48]:
show_model_ga_search_cv(model_grid_ga_svm, SVC(kernel='linear'), 'svm_linear')

gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	50    	0.828736	2.22045e-16	0.828736   	0.828736   
1  	96    	0.828736	2.22045e-16	0.828736   	0.828736   
2  	91    	0.828736	2.22045e-16	0.828736   	0.828736   
3  	97    	0.828736	2.22045e-16	0.828736   	0.828736   
4  	94    	0.828736	2.22045e-16	0.828736   	0.828736   
5  	91    	0.828736	2.22045e-16	0.828736   	0.828736   
6  	94    	0.828736	2.22045e-16	0.828736   	0.828736   
7  	92    	0.828736	2.22045e-16	0.828736   	0.828736   
8  	91    	0.828736	2.22045e-16	0.828736   	0.828736   
9  	93    	0.828736	2.22045e-16	0.828736   	0.828736   
10 	90    	0.828736	2.22045e-16	0.828736   	0.828736   
11 	89    	0.828736	2.22045e-16	0.828736   	0.828736   
12 	93    	0.828736	2.22045e-16	0.828736   	0.828736   
13 	94    	0.828736	2.22045e-16	0.828736   	0.828736   
14 	92    	0.828736	2.22045e-16	0.828736   	0.828736   
15 	92    	0.828736	2.22045e-16	0.828736   	0.828736   
16 	89    	0.828736	2.22045e-16	0.828736   	0.82

In [49]:
show_model_ga_search_cv(model_grid_ga_svm, SVC(kernel='rbf'), 'svm_rbf')

gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	50    	0.80245	0.00659638 	0.8259     	0.786463   
1  	90    	0.807931	0.0044431  	0.8259     	0.79779    
2  	91    	0.809168	0.00363554 	0.8259     	0.801976   
3  	89    	0.811674	0.00686683 	0.834351   	0.80759    
4  	87    	0.813196	0.00807862 	0.834351   	0.808998   
5  	94    	0.816041	0.00920571 	0.834351   	0.808998   
6  	91    	0.818576	0.00991533 	0.834351   	0.808998   
7  	89    	0.822773	0.00912037 	0.834351   	0.808998   
8  	93    	0.828717	0.00377927 	0.834351   	0.8259     
9  	90    	0.833196	0.00224806 	0.834351   	0.8259     
10 	84    	0.83421 	0.000422535	0.834351   	0.832942   
11 	87    	0.834351	1.11022e-16	0.834351   	0.834351   
12 	88    	0.834351	1.11022e-16	0.834351   	0.834351   
13 	93    	0.834351	1.11022e-16	0.834351   	0.834351   
14 	97    	0.834351	1.11022e-16	0.834351   	0.834351   
15 	89    	0.834351	1.11022e-16	0.834351   	0.834351   
16 	82    	0.834351	1.11022e-16	0.834351   	0.8343

## Random Forest

In [39]:
show_model_ga_search_cv(model_grid_ga_rf, RandomForestClassifier(), 'random_forest')


A class named 'FitnessMax' has already been created and it will be overwritten. Consider deleting previous creation of that class or rename it.


A class named 'Individual' has already been created and it will be overwritten. Consider deleting previous creation of that class or rename it.



gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	50    	0.812748	0.00509    	0.823005   	0.797868   
1  	90    	0.816771	0.00269815 	0.823005   	0.810466   
2  	88    	0.818093	0.00251222 	0.823005   	0.811894   
3  	90    	0.818569	0.0025247  	0.823005   	0.811835   
4  	91    	0.819048	0.00257088 	0.823044   	0.813282   
5  	92    	0.819894	0.00256784 	0.824433   	0.813322   
6  	92    	0.820088	0.00288673 	0.824433   	0.813263   
7  	93    	0.819979	0.00307164 	0.824433   	0.813263   
8  	92    	0.821189	0.00245725 	0.82725    	0.81608    
9  	89    	0.820571	0.00325079 	0.82725    	0.813263   
10 	89    	0.821554	0.00276856 	0.82725    	0.814632   
11 	89    	0.820682	0.00302874 	0.82725    	0.814574   
12 	87    	0.820703	0.00316401 	0.824433   	0.814574   
13 	90    	0.820788	0.00286526 	0.824433   	0.81473    
14 	95    	0.820618	0.00334678 	0.824433   	0.813224   
15 	94    	0.821671	0.0030982  	0.825822   	0.813263   
16 	92    	0.82189 	0.0027042  	0.82725    	0.81

## Extra trees

In [88]:
show_model_ga_search_cv(model_grid_ga_extra_trees, ExtraTreesClassifier(), 'extra_trees')

gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	50    	0.811575	0.00458875 	0.818916   	0.79779    
1  	91    	0.814591	0.00298448 	0.818916   	0.806182   
2  	87    	0.816817	0.00204961 	0.820246   	0.811835   
3  	88    	0.81811 	0.00156795 	0.823083   	0.814632   
4  	85    	0.818701	0.00157765 	0.823083   	0.814632   
5  	91    	0.81901 	0.0019478  	0.823083   	0.814652   
6  	87    	0.819516	0.0015899  	0.823083   	0.81608    
7  	93    	0.819544	0.00219758 	0.823103   	0.813263   
8  	93    	0.819319	0.00278453 	0.823103   	0.811854   
9  	86    	0.820305	0.00238099 	0.823103   	0.813224   
10 	87    	0.820673	0.00240656 	0.823103   	0.816041   
11 	85    	0.8207  	0.00245801 	0.823103   	0.814652   
12 	92    	0.820953	0.00217031 	0.823103   	0.81606    
13 	92    	0.820953	0.00238225 	0.823103   	0.816021   
14 	87    	0.821321	0.00264457 	0.824511   	0.814652   
15 	90    	0.820702	0.00311838 	0.824511   	0.813243   
16 	92    	0.82039 	0.00296894 	0.824511   	0.81

## AdaBoost

In [92]:
#show_model_ga_search_cv(model_grid_ga_adaboost, AdaBoostClassifier(), "adaboost")

gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	50    	0.821097	0.00340737 	0.830106   	0.81606    
1  	94    	0.82357 	0.00265787 	0.830106   	0.817488   
2  	87    	0.825426	0.00223941 	0.830106   	0.821674   
3  	92    	0.82638 	0.00225638 	0.831514   	0.821674   
4  	86    	0.828065	0.00261864 	0.831514   	0.821674   
5  	90    	0.829355	0.00224723 	0.831514   	0.821674   
6  	94    	0.829493	0.00266627 	0.831514   	0.821674   
7  	88    	0.831064	0.000713559	0.831514   	0.828717   
8  	92    	0.831149	0.00217307 	0.831514   	0.81606    
9  	86    	0.831514	0          	0.831514   	0.831514   
10 	86    	0.831514	0          	0.831514   	0.831514   
11 	89    	0.831514	0          	0.831514   	0.831514   
12 	91    	0.831514	0          	0.831514   	0.831514   
13 	87    	0.831514	0          	0.831514   	0.831514   
14 	92    	0.831514	0          	0.831514   	0.831514   
15 	94    	0.831514	0          	0.831514   	0.831514   
16 	90    	0.831514	0          	0.831514   	0.83

pretty time-consuming

## xgboost

In [42]:
show_model_ga_search_cv(model_grid_ga_xgboost, XGBClassifier(), 'xgboost')


A class named 'FitnessMax' has already been created and it will be overwritten. Consider deleting previous creation of that class or rename it.


A class named 'Individual' has already been created and it will be overwritten. Consider deleting previous creation of that class or rename it.



gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	50    	0.809142	0.00588586 	0.820305   	0.794894   
1  	85    	0.813752	0.00368218 	0.821596   	0.806221   
2  	92    	0.816063	0.00269351 	0.821596   	0.810446   
3  	91    	0.818011	0.00271812 	0.827308   	0.813282   
4  	92    	0.818445	0.00284205 	0.827308   	0.813263   
5  	91    	0.819008	0.00252252 	0.827308   	0.811835   
6  	91    	0.820316	0.00346716 	0.831534   	0.811854   
7  	91    	0.82361 	0.0048598  	0.832923   	0.81608    
8  	90    	0.826969	0.00474934 	0.832923   	0.818858   
9  	93    	0.830573	0.00244402 	0.832923   	0.823103   
10 	87    	0.831558	0.00117912 	0.832923   	0.824452   
11 	87    	0.832061	0.000674148	0.832923   	0.831534   
12 	85    	0.832589	0.000593171	0.832923   	0.831534   
13 	86    	0.832727	0.000685552	0.832923   	0.828717   
14 	92    	0.832867	0.000272166	0.832923   	0.831534   
15 	87    	0.832923	0          	0.832923   	0.832923   
16 	88    	0.832951	0.000197183	0.834331   	0.83

## lightGBM

In [44]:
show_model_ga_search_cv(model_grid_ga_lgbm, LGBMClassifier(boosting_type='dart'), 'light_gbm')


A class named 'FitnessMax' has already been created and it will be overwritten. Consider deleting previous creation of that class or rename it.


A class named 'Individual' has already been created and it will be overwritten. Consider deleting previous creation of that class or rename it.



gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	50    	0.808228	0.00719228 	0.823103   	0.794894   
1  	89    	0.812807	0.00575651 	0.823103   	0.803345   
2  	90    	0.819108	0.00474662 	0.827269   	0.80757    
3  	92    	0.822481	0.00362311 	0.830125   	0.813224   
4  	93    	0.824137	0.00249851 	0.828658   	0.814671   
5  	92    	0.825914	0.00195957 	0.828678   	0.823024   
6  	92    	0.826844	0.00211983 	0.828678   	0.81741    
7  	87    	0.827038	0.00349932 	0.828678   	0.810426   
8  	93    	0.827486	0.00347349 	0.828678   	0.810426   
9  	93    	0.828578	0.000430713	0.828678   	0.82588    
10 	92    	0.828664	9.12512e-06	0.828678   	0.828658   
11 	91    	0.828669	9.6549e-06 	0.828678   	0.828658   
12 	86    	0.828478	0.00119732 	0.828678   	0.820207   
13 	87    	0.828678	1.11022e-16	0.828678   	0.828678   
14 	95    	0.828678	1.11022e-16	0.828678   	0.828678   
15 	86    	0.828678	1.11022e-16	0.828678   	0.828678   
16 	88    	0.828678	1.11022e-16	0.828678   	0.82


Found `num_iterations` in params. Will use it instead of argument




Model: light_gbm 

Accuracy: 0.8301251956181532 

Best params {'learning_rate': 0.010766901543821723, 'n_estimators': 282, 'num_iterations': 876, 'lambda_l2': 3, 'bagging_fraction': 0.8174185533591544, 'min_data_in_leaf': 20, 'num_leaves': 37} 



In [46]:
show_model_ga_search_cv(model_grid_ga_lgbm, LGBMClassifier(boosting_type='gbdt'), 'light_gbm')


A class named 'FitnessMax' has already been created and it will be overwritten. Consider deleting previous creation of that class or rename it.


A class named 'Individual' has already been created and it will be overwritten. Consider deleting previous creation of that class or rename it.



gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	50    	0.800436	0.0084322  	0.821635   	0.787911   
1  	90    	0.808866	0.00636555 	0.821635   	0.794953   
2  	90    	0.813703	0.00415456 	0.821635   	0.803365   
3  	90    	0.817477	0.00305386 	0.823083   	0.813165   
4  	94    	0.819958	0.00285629 	0.82588    	0.814613   
5  	85    	0.822127	0.00269901 	0.827289   	0.814613   
6  	85    	0.824304	0.0020826  	0.827289   	0.817449   
7  	90    	0.825686	0.00108586 	0.827289   	0.821674   
8  	91    	0.826473	0.000799263	0.827289   	0.824491   
9  	86    	0.826894	0.000632393	0.827289   	0.82588    
10 	88    	0.827289	1.11022e-16	0.827289   	0.827289   
11 	92    	0.827289	1.11022e-16	0.827289   	0.827289   
12 	92    	0.827289	1.11022e-16	0.827289   	0.827289   
13 	93    	0.827289	1.11022e-16	0.827289   	0.827289   
14 	86    	0.827289	1.11022e-16	0.827289   	0.827289   
15 	92    	0.827289	1.11022e-16	0.827289   	0.827289   
16 	93    	0.827289	1.11022e-16	0.827289   	0.82


Found `num_iterations` in params. Will use it instead of argument



## NNET

In [119]:
show_model_ga_search_cv(model_grid_ga_nnet,
                        MLPClassifier(learning_rate='invscaling'),
                        'nnet')


A class named 'FitnessMax' has already been created and it will be overwritten. Consider deleting previous creation of that class or rename it.


A class named 'Individual' has already been created and it will be overwritten. Consider deleting previous creation of that class or rename it.



gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	50    	0.796898	0.00835422 	0.813204   	0.77813    
1  	90    	0.802733	0.00551564 	0.813204   	0.788048   
2  	86    	0.807564	0.00550082 	0.823142   	0.7893     
3  	94    	0.808262	0.00610343 	0.823142   	0.79511    
4  	92    	0.810047	0.00746986 	0.823142   	0.788009   
5  	89    	0.810717	0.00703664 	0.823142   	0.788009   
6  	93    	0.813029	0.00695653 	0.823142   	0.799218   
7  	91    	0.812185	0.00830172 	0.823142   	0.792156   
8  	91    	0.815236	0.00703466 	0.823142   	0.797868   
9  	95    	0.816138	0.00784926 	0.823142   	0.793545   
10 	91    	0.817822	0.00622713 	0.825939   	0.803482   
11 	92    	0.817262	0.00654253 	0.8259     	0.803365   
12 	83    	0.818773	0.00606083 	0.8259     	0.799159   
13 	93    	0.817281	0.00776736 	0.827308   	0.79644    
14 	88    	0.819605	0.00603253 	0.827308   	0.804773   
15 	89    	0.818987	0.00733472 	0.827308   	0.799178   
16 	91    	0.819811	0.00669861 	0.827308   	0.80

In [120]:
show_model_ga_search_cv(model_grid_ga_nnet,
                        MLPClassifier(learning_rate='constant'),
                        'nnet')

gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	50    	0.796768	0.0110448  	0.832942   	0.765532   
1  	91    	0.802444	0.00785906 	0.832942   	0.790826   
2  	88    	0.805655	0.00737293 	0.832942   	0.796342   
3  	92    	0.805726	0.00634879 	0.832942   	0.790806   
4  	85    	0.809351	0.00708984 	0.832942   	0.799159   
5  	89    	0.810555	0.00823491 	0.832942   	0.786581   
6  	86    	0.81543 	0.00969156 	0.832942   	0.799198   
7  	90    	0.817166	0.0103342  	0.832942   	0.799257   
8  	87    	0.81927 	0.0098839  	0.832942   	0.804773   
9  	93    	0.817966	0.00781265 	0.832942   	0.804832   
10 	87    	0.819156	0.00986549 	0.832942   	0.800606   
11 	94    	0.817183	0.0101856  	0.832942   	0.794992   
12 	88    	0.815897	0.0105446  	0.832942   	0.797809   
13 	90    	0.81463 	0.0102775  	0.832942   	0.797731   
14 	87    	0.818547	0.0102567  	0.832942   	0.800626   
15 	90    	0.817334	0.0115275  	0.832942   	0.796342   
16 	94    	0.818795	0.0127544  	0.832942   	0.79


A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



25 	95    	0.8215  	0.0129336  	0.832942   	0.794953   
26 	89    	0.825094	0.00988623 	0.832942   	0.80757    
28 	91    	0.820931	0.0118421  	0.832942   	0.79777    

sklearn-genetic-opt closed prematurely. Will use the current best model.
INFO: Stopping the algorithm

Model: nnet 

Accuracy: 0.832942097026604 

Best params {'learning_rate_init': 0.3215659511077361, 'max_iter': 1011, 'hidden_layer_sizes': 764} 



In [122]:
show_model_ga_search_cv(model_grid_ga_nnet,
                        MLPClassifier(learning_rate='adaptive'),
                        'nnet')

## Results

the best models are xgboost, svm

## Ensembling best models

In [113]:
best_grid = {
    'subsample': 0.816261607081251,
    'colsample_bytree': 0.8565208459725928,
    'max_depth': 2,
    'min_child_weight': 2,
    'learning_rate': 0.017752334965135397,
    'n_estimators': 178
}
model_xgb = XGBClassifier()
model_xgb.fit(X, y)
print('best xgb')

best xgb


In [114]:
best_grid = {
    'C': 326.70658550096834,
    'gamma': 0.7482090936576415
}
model_svc = SVC(kernel='rbf')
model_svc.fit(X, y)
print('best svc')

best svc


In [115]:
estimators = [('xgb', model_xgb), ('svc', model_svc)]
ensemble = VotingClassifier(estimators, voting='hard')
ensemble.fit(X, y)
print('best ensemble')

In [None]:
#{'learning_rate_init': 0.4047953483157897, 'max_iter': 782, 'hidden_layer_sizes': 182} 

# Prediction on new data (Kaggle submission)

In [117]:
X_test_new = pd.read_csv('test.csv')
X_test_new = titanic_transform(X_test_new)

predictions = ensemble.predict(X_test_new)

X_test_ps = pd.read_csv('test.csv')
submission = pd.DataFrame({'PassengerId': X_test_ps.PassengerId, 'Survived': predictions})
submission.to_csv('submission.csv', index=False)