In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
titanic_train_original = pd.read_csv('titanic/train.csv')
titanic_test_original = pd.read_csv('titanic/test.csv')

### Meta info

In [3]:
titanic_train = titanic_train_original.copy()

In [4]:
titanic_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
titanic_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
titanic_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
cat_features = ['Pclass', 'Sex', 'Embarked']
str_features = ['Name', 'Ticket', 'Cabin']
num_features = ['Age', 'SibSp', 'Parch', 'Fare']

### Imputing

In [8]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')
titanic_train['Age'] = imputer.fit_transform(titanic_train.Age.values.reshape(-1, 1))
imputer = SimpleImputer(strategy='constant', fill_value='Empty')
titanic_train['Cabin'] = imputer.fit_transform(titanic_train.Cabin.values.reshape(-1, 1))
titanic_train['Embarked'] = imputer.fit_transform(titanic_train.Embarked.values.reshape(-1, 1))

### Encoding

In [9]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

ticket_encoder = OrdinalEncoder()
titanic_train['Ticket_ord'] = ticket_encoder.fit_transform(titanic_train.Ticket.values.reshape(-1, 1))

In [43]:
def get_encoded_features(df, features):
    encoder = OneHotEncoder(sparse=False, drop='first')
    encoded_features = encoder.fit_transform(df[features])
    
    categories = []
    for feature, category in zip(features, encoder.categories_):
        categories.extend(feature + '_' + str(cat) for cat in category[1:])
        
    return pd.DataFrame(encoded_features, columns=categories, index=df.index)

In [44]:
cat_encoded = get_encoded_features(titanic_train, cat_features)

In [45]:
titanic_train = pd.concat([
    titanic_train,
    cat_encoded
], axis=1)

In [46]:
titanic_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Ticket_ord,Pclass_2,Pclass_3,...,Embarked_Empty,Embarked_Q,Embarked_S,cabin_known,Pclass_2.1,Pclass_3.1,Sex_male,Embarked_Empty.1,Embarked_Q.1,Embarked_S.1
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,...,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208,338.52862,0.20651,0.551066,...,0.002245,0.08642,0.722783,0.228956,0.20651,0.551066,0.647587,0.002245,0.08642,0.722783
std,257.353842,0.486592,0.836071,13.002015,1.102743,0.806057,49.693429,200.850657,0.405028,0.497665,...,0.047351,0.281141,0.447876,0.420397,0.405028,0.497665,0.47799,0.047351,0.281141,0.447876
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104,158.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,446.0,0.0,3.0,29.699118,0.0,0.0,14.4542,337.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0,519.5,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292,680.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


- Cabin: imuting using different techniques, feature engineering, scatter with all others 
- Same with Ticket and Name

### Cabin feature presence

In [15]:
titanic_train['cabin_known'] = titanic_train.Cabin.apply(
    lambda x: 0 if x == 'Empty' else 1
)

### Name analysis

In [39]:
names = pd.DataFrame(index=titanic_train.index)
names['full_name'] = titanic_train.Name
names['last_name'] = names.full_name.apply(lambda x: x.split()[0][:-1])
names['title_name'] = names.full_name.apply(lambda x: x.split()[1])
names['title_name'] = names.title_name.apply(
    lambda x: x if names.title_name.value_counts()[x] > 6 else 'Other'
)
names['first_letter'] = names.full_name.apply(lambda x: x[0])

In [220]:
encoded_name_features = get_encoded_features(names, ['title_name', 'first_letter'])
for feature in encoded_name_features.columns:
    print(feature, np.corrcoef(encoded_name_features[feature], y_train)[0,1])

title_name_Master. 0.08522056083929422
title_name_Miss. 0.32999928131271145
title_name_Mr. -0.5290078593371816
title_name_Mrs. 0.3405722968353644
title_name_Other -0.023098516517696215
first_letter_B 0.062353755171776465
first_letter_C 0.004447879429616431
first_letter_D 0.056744075678256546
first_letter_E -0.032155505992781386
first_letter_F 0.0012720764472610147
first_letter_G -0.06319609287597487
first_letter_H 0.0476184738935401
first_letter_I -0.008551007323102105
first_letter_J -0.0065909991737594644
first_letter_K -0.023115589520008386
first_letter_L -0.00433668790339465
first_letter_M 0.038435080296566324
first_letter_N 0.05031406729446108
first_letter_O -0.04116020555112222
first_letter_P -0.05203836914329196
first_letter_Q 0.06009484737835673
first_letter_R -0.03015165698836123
first_letter_S -0.03133910757347408
first_letter_T 0.05738046339508236
first_letter_U -0.026456468796962264
first_letter_V -0.09603981150428546
first_letter_W 0.004073395790013073
first_letter_Y -0.017

### Cabin first letter

In [120]:
titanic_train['cabin_first_letter'] = titanic_train.Cabin.apply(
    lambda x: '0' if x == 'Empty' else x[0]
)
encoder = OneHotEncoder(sparse=False, drop='first')
cabin_features = encoder.fit_transform(titanic_train.cabin_first_letter.values.reshape(-1,1))
categories = ['cabin_first_letter_' + str(cat) for cat in encoder.categories_[0][1:]]
cabin_features = pd.DataFrame(cabin_features, index=titanic_train.index, columns=categories)

In [121]:
cabin_features

Unnamed: 0,cabin_first_letter_A,cabin_first_letter_B,cabin_first_letter_C,cabin_first_letter_D,cabin_first_letter_E,cabin_first_letter_F,cabin_first_letter_G,cabin_first_letter_T
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
886,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
887,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
888,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
889,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


### X y

In [127]:
y_train = titanic_train['Survived']

X_train = pd.concat([
    titanic_train[num_features],
    cat_encoded,
    encoded_name_features,
    cabin_features
], axis=1)

### Scaling

In [128]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

### Trying simple models

In [137]:
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score

preds = cross_val_predict(LogisticRegression(max_iter=1000), X_train, y_train, cv=10)

In [138]:
confusion_matrix(y_train, preds)

array([[481,  68],
       [ 90, 252]])

In [139]:
f1_score(y_train, preds)

0.7613293051359517

In [140]:
accuracy_score(y_train, preds)

0.8226711560044894

### Gradient Boosting

In [67]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    'loss': ['deviance', 'exponential'],
    'learning_rate': [1e-2, 1e-3, 1e-4],
    'n_estimators': [100, 250, 500],
    'max_depth': np.arange(1, 15),
    'max_features': [.33, 'sqrt', 'log2', None]
}

grid_search = GridSearchCV(
    GradientBoostingClassifier(random_state=1, verbose=1),
    param_grid, verbose=1, cv=10, n_jobs=-1
)

In [69]:
grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 1008 candidates, totalling 10080 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:   29.5s
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 2426 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 3176 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done 4026 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done 4976 tasks      | elapsed: 11.7min
[Parallel(n_jobs=-1)]: Done 6026 tasks      | elapsed: 13.0min
[Parallel(n_jobs=-1)]: Done 7176 tasks      | elapsed: 16.3min
[Parallel(n_jobs=-1)]: Done 8426 tasks      | elapsed: 20.0min
[Parallel(n_jobs=-1)]: Done 9776 tasks      | elapsed: 22.5min
[Parallel(n_jobs=-1)]: Done 10080 out of 1008

      Iter       Train Loss   Remaining Time 
         1           1.3200            1.24s
         2           1.3084            1.17s
         3           1.2969            1.13s
         4           1.2858            1.11s
         5           1.2749            1.09s
         6           1.2643            1.07s
         7           1.2539            1.06s
         8           1.2437            1.06s
         9           1.2337            1.06s
        10           1.2238            1.06s
        20           1.1341            1.01s
        30           1.0593            0.98s
        40           0.9944            0.96s
        50           0.9380            0.93s
        60           0.8894            0.91s
        70           0.8470            0.89s
        80           0.8081            0.87s
        90           0.7734            0.85s
       100           0.7418            0.85s
       200           0.5279            0.65s
       300           0.4396            0.43s
       40

GridSearchCV(cv=10,
             estimator=GradientBoostingClassifier(random_state=1, verbose=1),
             n_jobs=-1,
             param_grid={'learning_rate': [0.01, 0.001, 0.0001],
                         'loss': ['deviance', 'exponential'],
                         'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]),
                         'max_features': [0.33, 'sqrt', 'log2', None],
                         'n_estimators': [100, 250, 500]},
             verbose=1)

In [70]:
grid_search.best_params_

{'learning_rate': 0.01,
 'loss': 'deviance',
 'max_depth': 6,
 'max_features': None,
 'n_estimators': 500}

In [87]:
best_est = GradientBoostingClassifier(**grid_search.best_estimator_.get_params())
best_est.n_estimators = 1500

preds = cross_val_predict(best_est, X_train, y_train, cv=10)

      Iter       Train Loss   Remaining Time 
         1           1.3190            3.66s
         2           1.3069            3.41s
         3           1.2952            3.22s
         4           1.2836            3.17s
         5           1.2723            3.14s
         6           1.2613            3.13s
         7           1.2505            3.13s
         8           1.2397            3.16s
         9           1.2291            3.14s
        10           1.2189            3.16s
        20           1.1255            3.15s
        30           1.0473            3.08s
        40           0.9808            3.03s
        50           0.9233            2.98s
        60           0.8741            2.94s
        70           0.8321            2.90s
        80           0.7948            2.87s
        90           0.7594            2.84s
       100           0.7281            2.82s
       200           0.5047            2.67s
       300           0.4072            2.42s
       40

       100           0.7237            3.00s
       200           0.5224            2.73s
       300           0.4333            2.47s
       400           0.3642            2.28s
       500           0.3148            2.05s
       600           0.2759            1.83s
       700           0.2431            1.61s
       800           0.2192            1.40s
       900           0.1945            1.19s
      1000           0.1737            0.99s
      Iter       Train Loss   Remaining Time 
         1           1.3198            2.83s
         2           1.3078            2.94s
         3           1.2961            2.92s
         4           1.2846            2.93s
         5           1.2734            2.92s
         6           1.2624            2.92s
         7           1.2517            2.91s
         8           1.2411            2.93s
         9           1.2308            2.91s
        10           1.2207            2.90s
        20           1.1301            2.85s
        3

In [88]:
confusion_matrix(y_train, preds)

array([[488,  61],
       [ 88, 254]])

In [89]:
f1_score(y_train, preds)

0.7732115677321156

### Random Forest

In [92]:
from sklearn.ensemble import RandomForestClassifier

param_grid = {
    'n_estimators': [100, 250, 500, 1000],
    'max_depth': np.arange(1, 15),
    'max_features': [.33, 'sqrt', 'log2', None]
}

grid_search = GridSearchCV(
    RandomForestClassifier(random_state=1, verbose=1),
    param_grid, verbose=1, cv=10, n_jobs=-1
)

In [93]:
grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 224 candidates, totalling 2240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   27.7s
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:   53.1s
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 2240 out of 2240 | elapsed:  3.0min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished


GridSearchCV(cv=10, estimator=RandomForestClassifier(random_state=1, verbose=1),
             n_jobs=-1,
             param_grid={'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]),
                         'max_features': [0.33, 'sqrt', 'log2', None],
                         'n_estimators': [100, 250, 500, 1000]},
             verbose=1)

In [94]:
grid_search.best_params_

{'max_depth': 7, 'max_features': 0.33, 'n_estimators': 100}

In [102]:
preds = cross_val_predict(grid_search.best_estimator_, X_train, y_train, cv=10)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_j

In [103]:
confusion_matrix(y_train, preds)

array([[501,  48],
       [ 95, 247]])

In [104]:
f1_score(y_train, preds)

0.7755102040816327

In [133]:
from sklearn.metrics import accuracy_score
accuracy_score(y_train, preds)

0.8226711560044894

## Pipeline

In [141]:
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin, BaseEstimator

### Imputer

In [232]:
class CustomImputer(BaseEstimator, TransformerMixin):
    """
    Performs specified imputing for each of Age, Cabin and Embarked features.
    """
    def __init__(self, age_strategy='mean', cabin_strategy='constant', embarked_strategy='drop_observations'):
        self.feature_names = ['Age', 'Cabin', 'Embarked']
        self.strategies = [age_strategy, cabin_strategy, embarked_strategy]
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        for feature_name, strategy in zip(self.feature_names, self.strategies):
            if strategy == 'drop_feature':
                X.drop([feature_name], axis=1, inplace=True)
            elif strategy == 'drop_observations':
                X.dropna(subset=[feature_name], inplace=True)
            else:
                imputer = SimpleImputer(strategy=strategy, fill_value='Empty') if strategy != 'drop' else 'drop'
                X[feature_name] = imputer.fit_transform(X[feature_name].values.reshape(-1, 1))
        return X

### Ticket Processor

In [233]:
class TicketProcessor(TransformerMixin, BaseEstimator):
    """
    Splits ticket to number and series.
    """
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X['Ticket_series'] = X.Ticket.apply(self._get_ticket_series)
        X['Ticket_number'] = X.Ticket.apply(self._get_ticket_number)
        encoded_series = self._get_encoded_series(X.Ticket_series)
        X.drop(['Ticket'], axis=1, inplace=True)
        X.drop(['Ticket_series'], axis=1, inplace=True)
        return pd.concat([X, encoded_series], axis=1)

    def _get_ticket_series(self, ticket):
        splitted = ticket.split()
        if len(splitted) > 1:
            return '_'.join(splitted[:-1])
        elif splitted[0] == 'LINE':
            return 'LINE'
        else:
            return 'Empty'

    def _get_ticket_number(self, ticket):
        splitted = ticket.split()
        if len(splitted) > 1:
            return int(splitted[-1])
        elif splitted[0] == 'LINE':
            return 0
        else:
            return int(splitted[0])

    def _get_encoded_series(self, ticket_series):
        encoder = OneHotEncoder(sparse=False, drop='first')
        encoded_series = encoder.fit_transform(ticket_series.values.reshape(-1, 1))
        categories = ['Ticket_series_' + cat for cat in encoder.categories_[0][1:]]
        return pd.DataFrame(encoded_series, index=ticket_series.index, columns=categories)

### Encoder

In [240]:
class CustomEncoder(BaseEstimator, TransformerMixin):
    """
    Encodes provided features.
    """
    def __init__(self, feature_names=None):
        self.feature_names = feature_names or ['Pclass', 'Sex', 'Embarked']
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        encoder = OneHotEncoder(sparse=False, drop='first')
        encoded_features = encoder.fit_transform(X[self.feature_names])

        categories = []
        for feature, category in zip(self.feature_names, encoder.categories_):
            categories.extend(feature + '_' + str(cat) for cat in category[1:])

        X = pd.concat([
            X, pd.DataFrame(encoded_features, columns=categories, index=X.index)
        ], axis=1)

        X.drop(self.feature_names, axis=1, inplace=True)
        return X

### Cabin Encoder

In [252]:
class CabinEncoder(BaseEstimator, TransformerMixin):
    """
    Get some features from Cabin feature.
    """
    def __init__(self, encode_by='first_letter', known=True):
        self.encode_by = encode_by
        self.known = known
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        encoder = OneHotEncoder(sparse=False, drop='first')
        if self.encode_by == 'first_letter':
            cabin_first_letter = X.Cabin.apply(
                lambda x: '0' if x == 'Empty' else x[0]
            )
            cabin_features = encoder.fit_transform(cabin_first_letter.values.reshape(-1,1))
            categories = ['Cabin_first_letter_' + cat for cat in encoder.categories_[0][1:]]
            cabin_features = pd.DataFrame(cabin_features, index=X.index, columns=categories)
        else:
            cabin_features = encoder.fit_transform(X.Cabin.values.reshape(-1,1))
            categories = ['Cabin_' + cat for cat in encoder.categories_[0][1:]]
            cabin_features = pd.DataFrame(cabin_features, index=X.index, columns=categories)
        if self.known:
            X['Cabin_known'] = X.Cabin.apply(lambda x: 0 if x == 'Empty' else 1)
        X.drop(['Cabin'], axis=1, inplace=True)
        return pd.concat([X, cabin_features], axis=1)

### Name Processor

In [260]:
class NameProcessor(BaseEstimator, TransformerMixin):
    def __init__(self, first_letter=True, in_braces=True, title=True):
        self.first_letter = first_letter
        self.in_braces = in_braces
        self.title = title
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        if self.first_letter:
            X['Name_first_letter'] = X.Name.apply(lambda x: x[0])
        if self.title:
            X['Name_title'] = X.Name.apply(lambda x: x.split()[1])
            X['Name_title'] = X.Name_title.apply(
                lambda x: x if X.Name_title.value_counts()[x] > 6 else 'Other'
            )
        if self.in_braces:
            X['Name_in_braces'] = X.Name.apply(
                lambda x: x.split('(', 1)[1].split(')')[0] if '(' in x else 'Empty'
            )
        X.drop(['Name'], axis=1, inplace=True)
        return X

In [262]:
temp = titanic_train_original.copy()
temp = CustomImputer().transform(temp)
temp = TicketProcessor().transform(temp)
temp = CustomEncoder().transform(temp)
temp = CabinEncoder().transform(temp)
temp = NameProcessor().transform(temp)

In [280]:
temp.shape

(889, 68)

### Full Pipeline

In [295]:
full_pipeline = Pipeline([
    ('imputer', CustomImputer()),
    ('ticket_processor', TicketProcessor()),
    ('encoder', CustomEncoder()),
    ('cabin_encoder', CabinEncoder()),
    ('name_processor', NameProcessor()),
    ('estimator', RandomForestClassifier())
])

### Grid Search

In [286]:
SEED = 1

In [288]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(
    titanic_train_original, test_size=100,
    random_state=SEED, shuffle=True,
    stratify=titanic_train_original.Survived
)

X_train = train.drop(['Survived'], axis=1)
X_test = test.drop(['Survived'], axis=1)
y_train = train.Survived
y_test = test.Survived

In [296]:
param_grid = {
    'imputer__age_strategy': ['mean', 'most_frequent', 'median'],
    'imputer__embarked_strategy': ['most_frequent'],
    'cabin_encoder__encode_by': ['first_letter', 'whole'],
    'name_processor__first_letter': [True, False],
    'name_processor__in_braces': [True, False],
    'name_processor__title': [True, False],
    
    #'estimator__n_estimators': [100, 200, 500, 1000],
    #'estimator__criterion': ['gini', 'entropy'],
    #'estimator__max_depth': np.arange(1, 16),
    #'estimator__max_features': [.33, 'sqrt', 'log2', None],
}

grid_search = GridSearchCV(
    full_pipeline,
    param_grid,
    scoring='accuracy',
    n_jobs=10,
    cv=6,
    verbose=2,
)

In [297]:
grid_search.fit(X_train, y_train)

Fitting 6 folds for each of 48 candidates, totalling 288 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  21 tasks      | elapsed:    0.6s
[Parallel(n_jobs=10)]: Done 288 out of 288 | elapsed:    1.1s finished


RuntimeError: Cannot clone object CustomImputer(cabin_strategy=None, embarked_strategy='most_frequent'), as the constructor either does not set or modifies parameter age_strategy

### Submition

In [135]:
submition = pd.read_csv('titanic/gender_submission.csv')

In [136]:
submition

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
