#### No anomaly detection, feature selection

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
titanic_train_original = pd.read_csv('dataset/train.csv')
titanic_test_original = pd.read_csv('dataset/test.csv')

## Pipeline

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin, BaseEstimator, clone

### Imputer

In [4]:
from sklearn.impute import SimpleImputer

class CustomImputer(BaseEstimator, TransformerMixin):
    """
    Performs specified imputing for each of Age, Cabin and Embarked features.
    """
    def __init__(self, age_strategy='mean', cabin_strategy='constant', embarked_strategy='most_frequent'):
        self.age_strategy = age_strategy
        self.cabin_strategy = cabin_strategy
        self.embarked_strategy = embarked_strategy
        self.fare_strategy = 'mean'
        self.feature_names = ['Age', 'Fare', 'Cabin', 'Embarked']
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        strategies = [self.age_strategy, self.fare_strategy, self.cabin_strategy, self.embarked_strategy]
        for feature_name, strategy in zip(self.feature_names, strategies):
            imputer = SimpleImputer(strategy=strategy, fill_value='Empty')
            imputed = imputer.fit_transform(X[feature_name].values.reshape(-1, 1))
            X = X.assign(**{feature_name: imputed})
        return X

### Ticket Processor

In [5]:
from sklearn.preprocessing import OneHotEncoder

class TicketProcessor(TransformerMixin, BaseEstimator):
    """
    Splits ticket to number and series.
    """
    def fit(self, X, y=None):
        self.encoder_ = OneHotEncoder(sparse=False, handle_unknown='ignore')
        ticket_series = X.Ticket.apply(self._get_ticket_series)
        self.encoder_.fit(ticket_series.values.reshape(-1, 1))
        return self

    def transform(self, X, y=None):
        ticket_series = X.Ticket.apply(self._get_ticket_series)
        encoded_series = self._get_encoded_series(ticket_series)
        X = X.assign(Ticket_number=X.Ticket.apply(self._get_ticket_number))
        X.drop(['Ticket'], axis=1, inplace=True)
        return pd.concat([X, encoded_series], axis=1)

    def _get_ticket_series(self, ticket):
        splitted = ticket.split()
        if len(splitted) > 1:
            return '_'.join(splitted[:-1])
        elif splitted[0] == 'LINE':
            return 'LINE'
        else:
            return 'Empty'

    def _get_ticket_number(self, ticket):
        splitted = ticket.split()
        if len(splitted) > 1:
            return int(splitted[-1])
        elif splitted[0] == 'LINE':
            return 0
        else:
            return int(splitted[0])

    def _get_encoded_series(self, ticket_series):
        encoded_series = self.encoder_.transform(ticket_series.values.reshape(-1, 1))
        categories = ['Ticket_series_' + cat for cat in self.encoder_.categories_[0]]
        return pd.DataFrame(encoded_series, index=ticket_series.index, columns=categories)

### Encoder

In [6]:
class CustomEncoder(BaseEstimator, TransformerMixin):
    """
    Encodes provided features.
    """
    def __init__(self, feature_names=None):
        self.feature_names = feature_names or ['Pclass', 'Sex', 'Embarked']

    def fit(self, X, y=None):
        self.encoder_ = OneHotEncoder(sparse=False, handle_unknown='ignore')
        self.encoder_.fit(X[self.feature_names])
        return self

    def transform(self, X, y=None):
        encoded_features = self.encoder_.transform(X[self.feature_names])

        categories = []
        for feature, category in zip(self.feature_names, self.encoder_.categories_):
            categories.extend(feature + '_' + str(cat) for cat in category)

        X = pd.concat([
            X, pd.DataFrame(encoded_features, columns=categories, index=X.index)
        ], axis=1)

        X.drop(self.feature_names, axis=1, inplace=True)
        return X

### Cabin Encoder

In [7]:
class CabinEncoder(BaseEstimator, TransformerMixin):
    """
    Get some features from Cabin feature.
    """
    def __init__(self, encode_by='first_letter', known=True):
        self.encode_by = encode_by
        self.known = known

    def fit(self, X, y=None):
        self.encoder_ = OneHotEncoder(sparse=False, handle_unknown='ignore')
        if self.encode_by == 'first_letter':
            cabin_first_letter = X.Cabin.apply(
                lambda x: '0' if x == 'Empty' else x[0]
            )
            self.encoder_.fit(cabin_first_letter.values.reshape(-1,1))
        else:
            self.encoder_.fit(X.Cabin.values.reshape(-1,1))
        return self

    def transform(self, X, y=None):
        if self.encode_by == 'first_letter':
            cabin_first_letter = X.Cabin.apply(
                lambda x: '0' if x == 'Empty' else x[0]
            )
            cabin_features = self.encoder_.transform(cabin_first_letter.values.reshape(-1,1))
            categories = ['Cabin_first_letter_' + cat for cat in self.encoder_.categories_[0]]
            cabin_features = pd.DataFrame(cabin_features, index=X.index, columns=categories)
        else:
            cabin_features = self.encoder_.transform(X.Cabin.values.reshape(-1,1))
            categories = ['Cabin_' + cat for cat in self.encoder_.categories_[0]]
            cabin_features = pd.DataFrame(cabin_features, index=X.index, columns=categories)
        if self.known:
            X = X.assign(Cabin_known=X.Cabin.apply(lambda x: 0 if x == 'Empty' else 1))
        X.drop(['Cabin'], axis=1, inplace=True)
        return pd.concat([X, cabin_features], axis=1)

### Name Processor

In [8]:
class NameProcessor(BaseEstimator, TransformerMixin):
    def __init__(self, first_letter=True, in_braces=True, title=True):
        self.first_letter = first_letter
        self.in_braces = in_braces
        self.title = title

    def fit(self, X, y=None):
        if self.first_letter:
            name_first_letter = X.Name.apply(lambda x: x[0])
            self.first_letter_encoder_ = OneHotEncoder(sparse=False, handle_unknown='ignore')
            self.first_letter_encoder_.fit(name_first_letter.values.reshape(-1,1))
        if self.title:
            name_title = X.Name.apply(lambda x: x.split()[1])
            name_title = name_title.apply(
                lambda x: x if name_title.value_counts()[x] > 6 else 'Other'
            )
            self.title_encoder_ = OneHotEncoder(sparse=False, handle_unknown='ignore')
            self.title_encoder_.fit(name_title.values.reshape(-1,1))
        if self.in_braces:
            name_in_braces = X.Name.apply(
                lambda x: x.split('(', 1)[1].split(')')[0] if '(' in x else 'Empty'
            )
            self.in_braces_encoder_ = OneHotEncoder(sparse=False, handle_unknown='ignore')
            self.in_braces_encoder_.fit(name_in_braces.values.reshape(-1,1))
        return self

    def transform(self, X, y=None):
        encoder = OneHotEncoder(sparse=False, drop='first')
        if self.first_letter:
            name_first_letter = X.Name.apply(lambda x: x[0])
            first_letter_features = self.first_letter_encoder_.transform(name_first_letter.values.reshape(-1,1))
            categories = ['Name_first_letter' + cat for cat in self.first_letter_encoder_.categories_[0]]
            first_letter_features = pd.DataFrame(first_letter_features, index=X.index, columns=categories)
            X = pd.concat([X, first_letter_features], axis=1)
        if self.title:
            name_title = X.Name.apply(lambda x: x.split()[1])
            name_title = name_title.apply(
                lambda x: x if name_title.value_counts()[x] > 6 else 'Other'
            )
            title_features = self.title_encoder_.transform(name_title.values.reshape(-1,1))
            categories = ['Name_title_' + cat for cat in self.title_encoder_.categories_[0]]
            title_features = pd.DataFrame(title_features, index=X.index, columns=categories)
            X = pd.concat([X, title_features], axis=1)
        if self.in_braces:
            name_in_braces = X.Name.apply(
                lambda x: x.split('(', 1)[1].split(')')[0] if '(' in x else 'Empty'
            )
            in_braces_features = self.in_braces_encoder_.transform(name_in_braces.values.reshape(-1,1))
            categories = ['Name_in_braces_' + cat for cat in self.in_braces_encoder_.categories_[0]]
            in_braces_features = pd.DataFrame(in_braces_features, index=X.index, columns=categories)
            X = pd.concat([X, in_braces_features], axis=1)
        X.drop(['Name'], axis=1, inplace=True)
        return X

### Full Pipeline

In [9]:
from sklearn.ensemble import RandomForestClassifier

SEED = 4

In [10]:
full_pipeline = Pipeline([
    ('imputer', CustomImputer()),
    ('ticket_processor', TicketProcessor()),
    ('encoder', CustomEncoder()),
    ('cabin_encoder', CabinEncoder()),
    ('name_processor', NameProcessor()),
    ('estimator', RandomForestClassifier(random_state=SEED))
])

### Grid Search

In [11]:
from sklearn.model_selection import GridSearchCV

In [12]:
from sklearn.model_selection import StratifiedKFold

cv = StratifiedKFold(n_splits=6, shuffle=True, random_state=SEED)
X = titanic_train_original.drop(['Survived'], axis=1)
y = titanic_train_original.Survived.copy()

In [13]:
param_grid = {
    'imputer__age_strategy': ['mean', 'most_frequent', 'median'],
    'cabin_encoder__encode_by': ['first_letter', 'whole'],
    'name_processor__first_letter': [True, False],
    'name_processor__in_braces': [True, False],
    'name_processor__title': [True, False],
    
    'estimator__n_estimators': [100, 200, 500, 1000],
    'estimator__criterion': ['gini', 'entropy'],
    'estimator__max_depth': np.arange(1, 16),
    'estimator__max_features': [.33, 'sqrt', 'log2', None],
}

grid_search = GridSearchCV(
    full_pipeline,
    param_grid,
    scoring='accuracy',
    n_jobs=4,
    cv=cv,
    verbose=3,
)

In [14]:
grid_search.fit(X, y)

Fitting 6 folds for each of 23040 candidates, totalling 138240 fits


KeyboardInterrupt: 

In [None]:
rank = grid_search.cv_results_['rank_test_score']
sort_idx = np.argsort(rank)
best_params = np.array(grid_search.cv_results_['params'])[sort_idx]
scores = grid_search.cv_results_['mean_test_score'][sort_idx]

for i in range(5):
    print(i+1)
    print(scores[i])
    print(best_params[i])
    print()

### Submition

In [None]:
best_estimators = [clone(full_pipeline).set_params(**params).fit(X, y) for params in best_params[:3]]

In [None]:
preds = []
for estimator in best_estimators:
    preds.append(estimator.predict(titanic_test_original))

In [None]:
submition = pd.read_csv('dataset/gender_submission.csv')

In [None]:
submition