In [9]:
import numpy as np
import pandas as pd
from numpy import interp
from matplotlib import pyplot as plt

pd.set_option("display.max_columns", None)

from sklearn.compose import ColumnTransformer, make_column_selector

from sklearn.preprocessing import StandardScaler, FunctionTransformer, PolynomialFeatures

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from sklearn.metrics import plot_confusion_matrix, roc_auc_score,  auc, \
    precision_recall_fscore_support, classification_report, roc_curve, plot_roc_curve

from sklearn import set_config
set_config(display='diagram')   

from catboost import CatBoostClassifier


from itertools import cycle
from time import time

from lightgbm import LGBMClassifier

# Import data

In [10]:
df_train = pd.read_csv('data/train.csv')

In [11]:
df_test = pd.read_csv('data/test.csv')

In [12]:
sample_sub = pd.read_csv('data/sample_submission.csv')

In [13]:
df_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,1,1,"Oconnor, Frankie",male,,2,0,209245,27.14,C12239,S
1,1,0,3,"Bryan, Drew",male,,0,0,27323,13.35,,S
2,2,0,3,"Owens, Kenneth",male,0.33,1,2,CA 457703,71.29,,S
3,3,0,3,"Kramer, James",male,19.00,0,0,A. 10866,13.04,,S
4,4,1,3,"Bond, Michael",male,25.00,0,0,427635,7.76,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99995,1,2,"Bell, Adele",female,62.00,0,0,PC 15008,14.86,D17243,C
99996,99996,0,2,"Brown, Herman",male,66.00,0,0,13273,11.15,,S
99997,99997,0,3,"Childress, Charles",male,37.00,0,0,,9.95,,S
99998,99998,0,3,"Caughlin, Thomas",male,51.00,0,1,458654,30.92,,S


In [14]:
df_test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,100000,3,"Holliday, Daniel",male,19.0,0,0,24745,63.01,,S
1,100001,3,"Nguyen, Lorraine",female,53.0,0,0,13264,5.81,,S
2,100002,1,"Harris, Heather",female,19.0,0,0,25990,38.91,B15315,C
3,100003,2,"Larsen, Eric",male,25.0,0,0,314011,12.93,,S
4,100004,1,"Cleary, Sarah",female,17.0,0,2,26203,26.89,B22515,C
...,...,...,...,...,...,...,...,...,...,...,...
99995,199995,3,"Cash, Cheryle",female,27.0,0,0,7686,10.12,,Q
99996,199996,1,"Brown, Howard",male,59.0,1,0,13004,68.31,,S
99997,199997,3,"Lightfoot, Cameron",male,47.0,0,0,4383317,10.87,,S
99998,199998,1,"Jacobsen, Margaret",female,49.0,1,2,PC 26988,29.68,B20828,C


In [15]:
sample_sub

Unnamed: 0,PassengerId,Survived
0,100000,1
1,100001,1
2,100002,1
3,100003,1
4,100004,1
...,...,...
99995,199995,1
99996,199996,1
99997,199997,1
99998,199998,1


## Basic preprocess

In [16]:
X_train = df_train[[col for col in df_train.columns if col != 'Survived']]
X_train.columns = [c.lower() for c in X_train.columns]
X_train.set_index('passengerid', inplace=True)
X_train.head()

Unnamed: 0_level_0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,1,"Oconnor, Frankie",male,,2,0,209245,27.14,C12239,S
1,3,"Bryan, Drew",male,,0,0,27323,13.35,,S
2,3,"Owens, Kenneth",male,0.33,1,2,CA 457703,71.29,,S
3,3,"Kramer, James",male,19.0,0,0,A. 10866,13.04,,S
4,3,"Bond, Michael",male,25.0,0,0,427635,7.76,,S


In [17]:
y_train = df_train[['PassengerId','Survived']]
y_train.columns = [c.lower() for c in y_train.columns]
y_train.set_index('passengerid', inplace=True)
y_train = y_train['survived']
y_train.head()

passengerid
0    1
1    0
2    0
3    0
4    1
Name: survived, dtype: int64

In [18]:
X_test = df_test[[col for col in df_test.columns if col != 'Survived']]
X_test.columns = [c.lower() for c in X_test.columns]
X_test.set_index('passengerid', inplace=True)
X_test.head()

Unnamed: 0_level_0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
100000,3,"Holliday, Daniel",male,19.0,0,0,24745,63.01,,S
100001,3,"Nguyen, Lorraine",female,53.0,0,0,13264,5.81,,S
100002,1,"Harris, Heather",female,19.0,0,0,25990,38.91,B15315,C
100003,2,"Larsen, Eric",male,25.0,0,0,314011,12.93,,S
100004,1,"Cleary, Sarah",female,17.0,0,2,26203,26.89,B22515,C


## pipeline simple

In [19]:
text_features = ['name', 'ticket']
cat_features = ['sex', 'embarked', 'cabin']

In [20]:
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB

In [21]:
original_features = X_train.columns.to_list()

In [22]:
def transform_name(df_base, col):
    df = df_base[col].apply(lambda x: x.split(',')).to_frame('list_name')
    df['surname'] = df['list_name'].apply(lambda x: x[0])
    df['forename'] = df['list_name'].apply(lambda x: x[1])
    return df[['surname', 'forename']]

In [23]:
def get_tck_str(tck):
    try:
        x = tck.split()
        try: 
            a = int(x[0])
            return None
        except:
            return x[0]
    except:
        return None
    
def get_tck_num(tck):
    try:
        x = tck.split()
        try:
            a = int(x[0])
            return a
        except:
            return None
    except:
        return None

def transform_ticket(df_base, col):
    df_ticket = df_base[col].to_frame('ticket')
    df_ticket['ticket_str'] = df_base[col].apply(get_tck_str)
    df_ticket['ticket_num'] = df_base[col].apply(get_tck_num)
    return df_ticket[['ticket_str', 'ticket_num']]

In [24]:
def transform_cabin(df_base, col):
    df_cabin = df_base[col].to_frame('cabin')
    df_cabin_clean = df_cabin[~df_cabin['cabin'].isna()].copy()
    df_cabin_clean['cabin_str'] = df_cabin_clean['cabin'].apply(lambda x: x[0])
    df_cabin_clean['cabin_num'] = df_cabin_clean['cabin'].apply(lambda x: x[1:]).astype(int)
    return df_cabin.join(df_cabin_clean[['cabin_str', 'cabin_num']])[['cabin_str', 'cabin_num']]

In [25]:
from sklearn.base import TransformerMixin, BaseEstimator
class PassNameTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        self.X = transform_name(X, 'name')
        return self.X

    # I have corrected the output here, See point 2
    def get_feature_names(self):
        return self.X.columns.tolist()

class TicketTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        self.X = transform_ticket(X, 'ticket')
        return self.X

    # I have corrected the output here, See point 2
    def get_feature_names(self):
        return self.X.columns.tolist()
    
class CabinTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        self.X = transform_cabin(X, 'cabin')
        return self.X

    # I have corrected the output here, See point 2
    def get_feature_names(self):
        return self.X.columns.tolist()
class ColumnSelectTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        super(ColumnSelectTransformer).__init__()
        self.columns=columns
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.columns]

In [26]:
def lower_values(df):
    o_features = df.dtypes[df.dtypes=='object'].index.to_list()
    df_copy = df.copy()
    for c in o_features:
        df_copy[c] = df_copy[c].apply(lambda x: x.lower() if type(x)==str else None)
    return df_copy
class LowerTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self,X):
        return lower_values(X)

In [27]:
from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer, make_column_selector
from tqdm.auto import tqdm

class featureUnion(FeatureUnion):
    def _hstack(self, Xs):
        cols = [X.columns.tolist() for X in Xs]
        dtypes = []
        for X in Xs:
            dtypes.append([str(X[col].dtype) for col in X])
        cols = np.hstack(cols)
        dtypes = np.hstack(dtypes)
        data = pd.DataFrame(super()._hstack(Xs), columns = cols)
        print('====Converting columns types====')
        for col, dtype in tqdm(zip(cols, dtypes)):
            data[col] = data[col].astype(dtype)
        return data

class columnTransformer(ColumnTransformer):
    def _hstack(self, Xs):
        cols = [X.columns.tolist() for X in Xs]
        dtypes = []
        print(cols)
        print([X.shape for X in Xs])
        for X in Xs:
            dtypes.append([str(X[col].dtype) for col in X])
        cols = np.hstack(cols)
        dtypes = np.hstack(dtypes)
        data = pd.DataFrame(super()._hstack(Xs), columns = cols)
        print('====Converting columns types====')
        for col, dtype in tqdm(zip(cols, dtypes)):
            data[col] = data[col].astype(dtype)
        return data

In [28]:
col_processor = featureUnion(transformer_list=[
    ('pass_name', PassNameTransformer()),
    ('ticket', TicketTransformer()),
    ('cabin', CabinTransformer()),
    ('others', ColumnSelectTransformer(columns=[c for c in original_features if c not in ('name', 'ticket', 'cabin')]))
]
)
pl = Pipeline(steps=[
    ('raw_data_processor', col_processor),
    ('lower_text_values', LowerTransformer())
])

In [29]:
pl

In [30]:
df2 = pl.fit_transform(X_train)

====Converting columns types====


0it [00:00, ?it/s]

In [31]:
df2

Unnamed: 0,surname,forename,ticket_str,ticket_num,cabin_str,cabin_num,pclass,sex,age,sibsp,parch,fare,embarked
0,oconnor,frankie,,209245.0,c,12239.0,1,male,,2,0,27.14,s
1,bryan,drew,,27323.0,,,3,male,,0,0,13.35,s
2,owens,kenneth,ca,,,,3,male,0.33,1,2,71.29,s
3,kramer,james,a.,,,,3,male,19.00,0,0,13.04,s
4,bond,michael,,427635.0,,,3,male,25.00,0,0,7.76,s
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,bell,adele,pc,,d,17243.0,2,female,62.00,0,0,14.86,c
99996,brown,herman,,13273.0,,,2,male,66.00,0,0,11.15,s
99997,childress,charles,,,,,3,male,37.00,0,0,9.95,s
99998,caughlin,thomas,,458654.0,,,3,male,51.00,0,1,30.92,s


In [32]:
from sklearn.impute import SimpleImputer

class simpleImputer(SimpleImputer):
    def fit(self, X, y=None):
        self._cols = X.columns.tolist()
        self._dtypes = [str(X[col].dtype) for col in X.columns]
        super().fit(X, y)
        return self
        
    def transform(self, X):
        X_ = super().transform(X)
        data = pd.DataFrame(X_, columns = self._cols)
        for col, dtype in tqdm(zip(self._cols, self._dtypes)):
            data[col] = data[col].astype(dtype)
        return data
        

In [None]:
col_handle_na = columnTransformer(
    transformers=[
        ('text_features', simpleImputer(missing_values=None, strategy='constant', fill_value='unk'), make_column_selector(dtype_include=['object'])),
        ('float_features', simpleImputer(strategy='median'), make_column_selector(dtype_include=['float64'])),
        ('count_features', simpleImputer(strategy='most_frequent'), make_column_selector(dtype_include=['int64']))
    ],
    remainder='passthrough'
)

In [None]:
pl_simple = Pipeline(steps=[
    ('pl', pl),
    ('null_handling', col_handle_na)
])

In [None]:
pl_simple

In [None]:
df3 = pl_simple.fit_transform(X_train)

In [None]:
df3

In [None]:
class PipelineLogger(object):
    def __init__(self):
        pass
        
    def log_start(self):
        self.start_time = time()
        print(f'======== {self.__class__.__name__} - START ========')
        return None
        
    def log_finish(self):
        self.duration = time() - self.start_time
        print(f'======== {self.__class__.__name__} - FINISH =======> Take: {self.duration:.6f}(s)')


class ExperimentBase(BaseEstimator):
    def evaluate(self, X_test, y_test):
        print('Evaluating model')
        print(classification_report(y_true=y_test, y_pred=self.predict(X_test)))
        metrics = self.auc_report(X_test, y_test)
        metrics['precision'], metrics['recall'], metrics['f1_score'], metrics['support'] = precision_recall_fscore_support(y_test, self.predict(X_test))
        return metrics
    
    def auc_report(self, X, y_true):
        classes = self.classes_
        y_pred_classes = self.predict_proba(X)
        n_classes = len(classes)

        lw = 2
        for i in range(len(classes)):
            print(f"""{classes[i]}: {roc_auc_score(y_true=(y_true==classes[i]).astype(int), y_score=y_pred_classes[:,i])}""")

        # Compute ROC curve and ROC area for each class
        fpr = dict()
        tpr = dict()
        roc_auc = dict()

        for i in range(n_classes):
            fpr[i], tpr[i], _ = roc_curve(y_true=(y_true==classes[i]).astype(int), y_score=y_pred_classes[:,i])
            roc_auc[i] = auc(fpr[i], tpr[i])

        all_fpr = np.unique(np.concatenate([fpr[i] for i in range(len(classes))]))

        # Then interpolate all ROC curves at this points
        mean_tpr = np.zeros_like(all_fpr)
        for i in range(len(classes)):
            mean_tpr += interp(all_fpr, fpr[i], tpr[i])

        # Finally average it and compute AUC
        mean_tpr /= n_classes

        fpr["macro"] = all_fpr
        tpr["macro"] = mean_tpr
        roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

        # Plot all ROC curves
        plt.figure()

        plt.plot(fpr["macro"], tpr["macro"],
                 label='macro-average ROC curve (area = {0:0.2f})'
                       ''.format(roc_auc["macro"]),
                 color='navy', linestyle=':', linewidth=4)

        colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
        for i, color in zip(range(n_classes), colors):
            plt.plot(fpr[i], tpr[i], color=color, lw=lw,
                     label='ROC curve of class {0} (area = {1:0.2f})'
                     ''.format(classes[i], roc_auc[i]))

        plt.plot([0, 1], [0, 1], 'k--', lw=lw)
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Some extension of Receiver operating characteristic to multi-class')
        plt.legend(loc="lower right")
        plt.show()
        metrics = {
            'macro_auc': roc_auc["macro"]
        }
        for i in range(n_classes):
            metrics[f'auc_{classes[i]}'] = roc_auc[i]
        return metrics


In [None]:
class CustomCatBoostClassifier(CatBoostClassifier, ExperimentBase, PipelineLogger):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        
    def fit(self, X, y, self_evaluate=True, **kwargs):
        self.log_start()
        if self._init_params.get('cat_features') is not None:
            cat_features_ = [c for c in self._init_params['cat_features'] if c in X.columns]
            self._init_params['cat_features'] = cat_features_
        else:
            cat_features_ = None
        if self._init_params.get('text_features') is not None:
            text_features_ = [c for c in self._init_params['text_features'] if c in X.columns]
            self._init_params['text_features'] = text_features_
        else:
            text_features_ = None
        
        X_t, X_e, y_t, y_e = self.train_eval_split(X, y, cat_features_, text_features_)
        super().fit(X_t, y_t, eval_set=(X_e, y_e), cat_features=cat_features_, text_features=text_features_)
        if self_evaluate:
            _ = self.evaluate(X_e, y_e)
        self.log_finish()
        return self
        
    def train_eval_split(self, X, y, cat_features_, text_features_, eval_frac=0.1, add_na_Xy=False, na_label=0):
        X_e = X.sample(frac=eval_frac, random_state=42)
        y_e = y.loc[X_e.index]
        X_t = X.drop(X_e.index)
        y_t = y.loc[X_t.index]
        if add_na_Xy:
            X_t = pd.concat([X_t, pd.DataFrame([[np.nan] * X_t.shape[1]], columns=X_t.columns)], ignore_index=True)
            y_t = pd.concat([y_t, pd.Series([na_label])], ignore_index=True)
        if cat_features_ is not None:
            cat_features_ = [c for c in cat_features_ if c in X.columns]
            X_t[cat_features_] = X_t[cat_features_].fillna('unk')
            X_e[cat_features_] = X_e[cat_features_].fillna('unk')
        if text_features_ is not None:
            text_features_ = [c for c in text_features_ if c in X.columns]
            X_t[text_features_] = X_t[text_features_].fillna('unk')
            X_e[text_features_] = X_e[text_features_].fillna('unk')
        
        return X_t, X_e, y_t, y_e


In [None]:
df3.head(2)

In [None]:
cat_features = ['ticket_str', 'cabin_str', 'embarked', 'sex'] + ['surname', 'forename']
# text_features = 
cb_cfg = {
    'iterations': 1000,
    'task_type': 'CPU',
    'cat_features': cat_features,
#     'text_features': text_features,
    'use_best_model': True,
    'early_stopping_rounds':50,
    'verbose': True,
    'metric_period': 25
}

In [None]:
pl4 = Pipeline(steps=[
    ('pl_simple', pl_simple),
    ('catboost_simple', CustomCatBoostClassifier(**cb_cfg))
])
pl4

In [None]:
pl4.fit(X_train, y_train)

In [None]:
b = pl4.predict(X_test)

In [None]:
df2[['surname', 'forename', 'ticket_str', 'cabin_str', 'sex', 'embarked']]

In [None]:
from sklearn.preprocessing import LabelEncoder
class CustomLabelEncoder(PipelineLogger,TransformerMixin, BaseEstimator):
    def __init__(self, columns=None):
        super(CustomLabelEncoder).__init__()
        self.columns = columns
        self.label_encoders = {}
        
    def fit(self, X, y=None):
        X_ = X[self.columns]
        X_ = pd.concat([X_, pd.DataFrame([['unk'] * X_.shape[1]], columns=X_.columns)], ignore_index=True)
        for c in self.columns:
            self.label_encoders[c] = LabelEncoder().fit(X_[c])
        print(self.label_encoders)
        return self
    
    def transform(self, X, y=None):
        X_ = pd.DataFrame()
        self.dummy_dicts = {}
        for c in self.columns:
            dd = {}
            for cl in self.label_encoders[c].classes_:
                dd[cl] = True
            sr = X[c].map(lambda s: 'unk' if dd.get(s) is None else s)
            X_[c] = self.label_encoders[c].transform(sr)
        return X_
            
    def inverse_transform(self, X_encode, y=None):
        X_decode = pd.DataFrame()
        for c in self.columns:
            X_decode[c] = self.label_encoders[c].inverse_transform(X_encode[c])
        return X_decode

In [None]:
mul_le = CustomLabelEncoder(
        columns=['surname', 'forename', 'ticket_str', 'cabin_str', 'sex', 'embarked']
)

In [None]:
label_encoding = columnTransformer(transformers=[
    ('category_encoder', mul_le, ['surname', 'forename', 'ticket_str', 'cabin_str', 'sex', 'embarked'])
], remainder='passthrough')

In [None]:
mul_le.fit_transform(df2)

In [None]:
df2

In [None]:
class CustomLGBMClassifier(ExperimentBase, LGBMClassifier):
    pass

In [None]:
pl_lgbm = Pipeline(steps=[
    ('prepro', pl),
    ('label_encoding', label_encoding),
    ('lgbm', CustomLGBMClassifier())
])

In [None]:
pl_lgbm.fit(X_train, y_train)

In [None]:
pl_lgbm.predict(X_test)

In [None]:
a = pl_lgbm.predict(X_train)

In [None]:
print(classification_report(y_train, a))

In [None]:
a

In [None]:
b = pl4.predict(X_train)

In [None]:
b

In [None]:
(~(a == b)).sum()

In [None]:
pl_lgbm['lgbm'].feature_importances_

In [None]:
from lightgbm import plot_importance as lgbm_importance
lgbm_importance(pl_lgbm['lgbm'])

In [None]:
pl_lgbm.predict(X_test)

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
pl_ada = Pipeline(steps=[
    ('prepro', pl),
    ('label_encoding', label_encoding),
    ('na_handler', col_handle_na),
    ('ada', AdaBoostClassifier(n_estimators=300, learning_rate=0.5))
])

In [None]:
pl_ada.fit(X_train, y_train)

In [None]:
pl_ada.predict(X_test)

## Pipeline ensemble

In [None]:
from copy import deepcopy

In [None]:
pl_est_cb = Pipeline(steps=[
    ('null_handling', deepcopy(col_handle_na)),
    ('catboost_simple', CustomCatBoostClassifier(**cb_cfg))
])
pl_est_cb

In [None]:
pl__cb_simple = Pipeline(steps=[
    ('pl', pl),
    ('pl_est_cb', pl_est_cb)
])
pl__cb_simple.fit(X_train, y_train)
y_pred_cb = pl__cb_simple.predict(X_test)

In [None]:
pl_est_lgbm = Pipeline(steps=[
    ('label_encoding', deepcopy(label_encoding)),
    ('lgbm', CustomLGBMClassifier())
])
pl_est_lgbm

In [None]:
pl__lgbm = Pipeline(steps=[
    ('pl', pl),
    ('pl_est_lgbm', pl_est_lgbm)
])
pl__lgbm.fit(X_train, y_train)
y_pred_lgbm = pl__lgbm.predict(X_test)


In [None]:
pl_est_ada = Pipeline(steps=[
    ('label_encoding', deepcopy(label_encoding)),
    ('na_handler', deepcopy(col_handle_na)),
    ('ada', AdaBoostClassifier(n_estimators=300, learning_rate=0.5))
])
pl_est_ada

In [None]:
pl__ada = Pipeline(steps=[
    ('pl', pl),
    ('pl_est_ada', pl_est_ada)
])
pl__ada.fit(X_train, y_train)
y_pred_ada = pl__ada.predict(X_test)

In [None]:
y_ensemble_3_trees = ((y_pred_cb + y_pred_lgbm + y_pred_ada) > 1.5).astype(int)
y_ensemble_3_trees

In [None]:
Y_test_pred = X_test[['name']].copy()

In [None]:
Y_test_pred['cb_simple'] = pl__cb_simple.predict(X_test)
Y_test_pred[[f"cb_simple_{x}" for x in pl__cb_simple['pl_est_cb']['catboost_simple'].classes_]] = \
pl__cb_simple.predict_proba(X_test)

In [None]:
Y_test_pred['lgbm'] = pl__lgbm.predict(X_test)
Y_test_pred[[f"lgbm_{x}" for x in pl__lgbm['pl_est_lgbm']['lgbm'].classes_]] = \
pl__lgbm.predict_proba(X_test)

In [None]:
Y_test_pred['ada'] = pl__ada.predict(X_test)
Y_test_pred[[f"ada_{x}" for x in pl__ada['pl_est_ada']['ada'].classes_]] = \
pl__ada.predict_proba(X_test)

In [None]:
Y_test_pred

In [None]:
Y_test_pred['voting_hard'] = ((Y_test_pred['cb_simple'] + Y_test_pred['lgbm'] + Y_test_pred['ada']) > 1.5).astype(int)

In [None]:
Y_test_pred['voting_soft'] = ((Y_test_pred['cb_simple_1'] + Y_test_pred['lgbm_1'] + Y_test_pred['ada_1']) > 1.5).astype(int)

In [None]:
Y_test_pred

In [None]:
Y_test_pred[Y_test_pred['voting_hard']!=Y_test_pred['voting_soft']]

In [None]:
sm4 = Y_test_pred['cb_simple'].to_frame().reset_index().rename(columns={'passengerid':'PassengerId', 'cb_simple':'Survived'})
sm4.to_csv('sm__cb_simple.csv', index=False)

In [None]:
sm5 = Y_test_pred['voting_soft'].to_frame('survived').reset_index().rename(columns={'passengerid':'PassengerId', 'survived':'Survived'})
sm5.to_csv('sm__voting_soft_1.csv', index=False)

In [None]:
from xgboost import XGBClassifier

In [None]:
print('Done')

In [None]:
a = XGBClassifier()

In [None]:
Y_pred['Survived'] = y_ensemble_3_trees

In [None]:
Y_pred

In [None]:
m = Y_pred['Survived'].to_frame().reset_index()
m.columns = ['PassengerId', 'Survived']

m.to_csv('submission_v2.csv', index=False)

In [None]:
Y_pred

## Error analysis

In [None]:
Y_train_pred = X_train[['name']].copy()

In [None]:
Y_train_pred['y_true'] = y_train

In [None]:
Y_train_pred['cb_simple'] = pl__cb_simple.predict(X_train)
Y_train_pred[[f"cb_simple_{x}" for x in pl__cb_simple['pl_est_cb']['catboost_simple'].classes_]] = \
pl__cb_simple.predict_proba(X_train)

In [None]:
Y_train_pred['lgbm'] = pl__lgbm.predict(X_train)
Y_train_pred[[f"lgbm_{x}" for x in pl__lgbm['pl_est_lgbm']['lgbm'].classes_]] = \
pl__lgbm.predict_proba(X_train)

In [None]:
Y_train_pred['ada'] = pl__ada.predict(X_train)
Y_train_pred[[f"ada_{x}" for x in pl__ada['pl_est_ada']['ada'].classes_]] = \
pl__ada.predict_proba(X_train)

In [None]:
Y_train_pred

In [None]:
Y_train_pred.describe()

In [None]:
Y_train_pred[Y_train_pred['cb_simple'] == Y_train_pred['lgbm']].groupby('cb_simple')['cb_simple_1']\
.agg(['min', 'max', 'mean', 'median', 'std'])

In [None]:
Y_train_pred[Y_train_pred['cb_simple'] != Y_train_pred['lgbm']].groupby('cb_simple')['cb_simple_1']\
.agg(['min', 'max', 'mean', 'median', 'std'])

In [None]:
Y_train_pred[Y_train_pred['cb_simple'] == Y_train_pred['lgbm']].groupby('lgbm')['lgbm_1']\
.agg(['min', 'max', 'mean', 'median', 'std'])

In [None]:
Y_train_pred[Y_train_pred['cb_simple'] != Y_train_pred['lgbm']].groupby('lgbm')['lgbm_1']\
.agg(['min', 'max', 'mean', 'median', 'std'])

In [None]:
Y_train_pred['voting_hard'] = ((Y_train_pred['cb_simple'] + Y_train_pred['lgbm'] + Y_train_pred['ada']) > 1.5).astype(int)

In [None]:
Y_train_pred['voting_soft'] = ((Y_train_pred['cb_simple_1'] + Y_train_pred['lgbm_1'] + Y_train_pred['ada_1']) > 1.5).astype(int)

In [None]:
b = Y_train_pred[Y_train_pred['voting_hard'] != Y_train_pred['voting_soft']]
(b['voting_hard'] == b['y_true']).sum()

In [None]:
(b['voting_soft'] == b['y_true']).sum()

In [None]:
print(classification_report(Y_train_pred['y_true'], Y_train_pred['voting_hard']))

In [None]:
print(classification_report(Y_train_pred['y_true'], Y_train_pred['voting_soft']))

In [None]:
Y_train_pred['cb_simple'] = pl__cb_simple.predict(X_train)
Y_train_pred[[f"cb_simple_{x}" for x in pl__cb_simple['pl_est_cb']['catboost_simple'].classes_]] = \
pl__cb_simple.predict_proba(X_train)
Y_train_pred['lgbm'] = pl__lgbm.predict(X_train)
Y_train_pred[[f"lgbm_{x}" for x in pl__lgbm['pl_est_lgbm']['lgbm'].classes_]] = \
pl__lgbm.predict_proba(X_train)
Y_train_pred['ada'] = pl__ada.predict(X_train)
Y_train_pred[[f"ada_{x}" for x in pl__ada['pl_est_ada']['ada'].classes_]] = \
pl__ada.predict_proba(X_train)

In [None]:
# Try to submit catboost simple to see if it outperform 
m = Y_pred['Survived'].to_frame().reset_index()
m.columns = ['PassengerId', 'Survived']

m.to_csv('submission_v2.csv', index=False)

# Catboost Encoder

In [33]:
X_all = pd.concat([X_train, X_test])

In [34]:
df2 = pl.fit_transform(X_all)

====Converting columns types====


0it [00:00, ?it/s]

In [35]:
df2

Unnamed: 0,surname,forename,ticket_str,ticket_num,cabin_str,cabin_num,pclass,sex,age,sibsp,parch,fare,embarked
0,oconnor,frankie,,209245.0,c,12239.0,1,male,,2,0,27.14,s
1,bryan,drew,,27323.0,,,3,male,,0,0,13.35,s
2,owens,kenneth,ca,,,,3,male,0.33,1,2,71.29,s
3,kramer,james,a.,,,,3,male,19.00,0,0,13.04,s
4,bond,michael,,427635.0,,,3,male,25.00,0,0,7.76,s
...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,cash,cheryle,,7686.0,,,3,female,27.00,0,0,10.12,q
199996,brown,howard,,13004.0,,,1,male,59.00,1,0,68.31,s
199997,lightfoot,cameron,,4383317.0,,,3,male,47.00,0,0,10.87,s
199998,jacobsen,margaret,pc,,b,20828.0,1,female,49.00,1,2,29.68,c


In [36]:
from catboost import CatBoostRegressor, CatBoostClassifier

In [37]:
def fill_unk(sr):
    return sr.fillna('unk')

In [38]:
def fill_mean(sr):
    return sr.fillna(sr.mean())

In [39]:
def fill_0(sr):
    return sr.fillna(0)

In [40]:
def fill_popular(sr):
    popular = sr.value_counts().index[0]
    return sr.fillna(popular)

In [41]:
((df2['ticket_str'].value_counts().max() / df2['ticket_str'].value_counts()) ** 0.3).to_dict()

{'pc': 1.0,
 'a.': 1.298597870231504,
 'c.a.': 1.5657620862440709,
 'sc/paris': 1.8380554509089366,
 'ston/o': 1.8894533171027126,
 'a/5.': 2.185284861311075,
 'a/5': 2.234059204823897,
 'pp': 2.2491340279863996,
 'soton/o.q.': 2.2734889936742184,
 'w./c.': 2.27527282276212,
 'f.c.c.': 2.41385944154945,
 'sc/ah': 2.539093689228038,
 's.o.c.': 2.573622157663341,
 'ca.': 2.6897859011757315,
 'ston/o2.': 2.8026576818025823,
 'a/4': 2.8026576818025823,
 's.c./paris': 3.0198711076275746,
 's.o./p.p.': 3.042580570368284,
 'soton/o2': 3.0638830614016856,
 'f.c.': 3.0813997867156724,
 'c': 3.120116910258576,
 'soton/oq': 3.178575350659462,
 'ca': 3.2673873902403923,
 'w.e.p.': 3.2935550331876913,
 'we/p': 3.3845342463709795,
 'sc': 3.571303920489865,
 'a./5.': 3.6884017547174666,
 'a/4.': 3.6884017547174666,
 'p/pp': 3.9008817403085616,
 'a.5.': 4.303013360334782,
 'sco/w': 4.3813903923044935,
 'aq/4': 4.5350231939195655,
 'sc/a4': 4.72921289032816,
 'lp': 4.881082091712631,
 'sc/a.3': 5.05663

In [42]:
((df2['cabin_str'].value_counts().max() / df2['cabin_str'].value_counts()) ** 0.3).to_dict()

{'c': 1.0,
 'b': 1.0588542033595938,
 'a': 1.104304059955087,
 'd': 1.398106110707252,
 'e': 1.6443298990431572,
 'f': 1.7371811100708827,
 'g': 2.77587393719455,
 't': 5.666875105902016}

In [43]:
impute_plan = {
    'features_plan': {
        'fare': {
            'model': CatBoostRegressor,
            'predictors': [c for c in df2.columns if c not in ['fare']],
            'catboost_kwgs': {'iterations':200}
        },
        'embarked': {
            'model': CatBoostClassifier,
            'predictors': [c for c in df2.columns if c not in ['embarked']],
            'catboost_kwgs': {'iterations': 200}
        },
        'age': {
            'model': CatBoostRegressor,
            'predictors': [c for c in df2.columns if c not in ['age']],
            'catboost_kwgs': {'iterations': 200}
        },
        'ticket_num':{
            'model': CatBoostRegressor,
            'predictors': [c for c in df2.columns if c not in ['age']],
            'catboost_kwgs': {'iterations': 200}
        },
        'ticket_str':{
            'model': CatBoostClassifier,
            'predictors': [c for c in df2.columns if c not in ['embarked']],
            'catboost_kwgs': {
                'iterations': 200, 
                'class_weights': ((df2['ticket_str'].value_counts().max() / df2['ticket_str'].value_counts()) ** 0.3).to_dict()
            }
        },
        'cabin_num':{
            'model': CatBoostRegressor,
            'predictors': [c for c in df2.columns if c not in ['age']],
            'catboost_kwgs': {
                'iterations': 200
            }
        },
        'cabin_str':{
            'model': CatBoostClassifier,
            'predictors': [c for c in df2.columns if c not in ['embarked']],
            'catboost_kwgs': {
                'iterations': 200,
                'class_weights': ((df2['cabin_str'].value_counts().max() / df2['cabin_str'].value_counts()) ** 0.3).to_dict()
            }
        }
    },
    'if_na': {
        'ticket_str':fill_unk,
        'ticket_num':fill_mean,
        'cabin_str':fill_unk,
        'cabin_num':fill_mean,
        'age':fill_mean,
        'embarked':fill_popular,
        'fare': fill_mean
    },
    'eval_frac': 0.1,
    'cat_features': ['surname', 'forename', 'ticket_str', 'cabin_str', 'sex', 'embarked']
}

In [44]:
from sklearn.model_selection import train_test_split

class CatBoostImputor(BaseEstimator, TransformerMixin):
    def __init__(self, impute_plan, predictors_order):
        super(CatBoostImputor).__init__()
        self.impute_plan = impute_plan
        self.predictors_order = predictors_order
        
    def fit(self, X, y=None):
        return self
    
    def train_eval_test_split(self, x, y):
        eval_frac = self.impute_plan['eval_frac']
        y_test = y[y.isna()].copy()
        x_test = x[y.isna()].copy()
        x_t = x[~y.isna()].copy()
        y_t = y[~y.isna()].copy()
        x_train, x_eval, y_train, y_eval = train_test_split(x_t, y_t, test_size=eval_frac)
        return x_train, y_train, x_eval, y_eval, x_test, y_test
        
    def transform(self, X):
        self.X = X
        self.X_impute = X.copy()
        for col in self.predictors_order:
            predictor_list = self.impute_plan['features_plan'][col]['predictors']
            cat_features=[c for c in predictor_list if c in self.impute_plan['cat_features']]
            # Prepare data to fit
            print(f'Prepare data to fit, feature: {col}')
            x = self.X_impute[predictor_list].copy()
            y = self.X_impute[col]
            na_cols = [col for col in x.columns if x[col].isna().sum() > 0]
            for c in na_cols:
                x[c] = self.impute_plan['if_na'][c](x[c])
            x_train, y_train, x_eval, y_eval, x_test, y_test = self.train_eval_test_split(x, y)
            
            # Create model
            ModelClass = self.impute_plan['features_plan'][col]['model']
            model = ModelClass(**self.impute_plan['features_plan'][col]['catboost_kwgs'], early_stopping_rounds=50, verbose=True, metric_period=50, use_best_model=True)
            # Fit model
            print(f'Fit model: {col}')
            model.fit(x_train, y_train, eval_set=(x_eval, y_eval), cat_features=cat_features)
            x_test[col] = model.predict(x_test)
            print(f'Predicted for {x_test.shape[0]} unknown examples')
            self.X_impute.loc[y.isna(),col] = x_test[col]
            print(f'Imputed done for {col}')
        print(f'Imputed done for all dataset, num null left: {self.X_impute.isna().sum()}')
        return self.X_impute
            
            

In [45]:
cbi = CatBoostImputor(impute_plan, ['fare', 'embarked', 'age', 'ticket_num', 'ticket_str', 'cabin_num', 'cabin_str'])

In [46]:
imputed_X_all = cbi.fit_transform(df2)

Prepare data to fit, feature: fare
Fit model: fare
Learning rate set to 0.335029




0:	learn: 59.4983218	test: 59.0623694	best: 59.0623694 (0)	total: 120ms	remaining: 24s
50:	learn: 51.1302180	test: 51.3088347	best: 51.2533689 (30)	total: 2.15s	remaining: 6.28s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 51.25336889
bestIteration = 30

Shrink model to first 31 iterations.
Predicted for 267 unknown examples
Imputed done for fare
Prepare data to fit, feature: embarked
Fit model: embarked
Learning rate set to 0.221522




0:	learn: 0.9278676	test: 0.9279810	best: 0.9279810 (0)	total: 127ms	remaining: 25.3s
50:	learn: 0.6115885	test: 0.6189250	best: 0.6189250 (50)	total: 5.61s	remaining: 16.4s
100:	learn: 0.6037331	test: 0.6143046	best: 0.6143046 (100)	total: 11.1s	remaining: 10.8s
150:	learn: 0.5990681	test: 0.6127092	best: 0.6127092 (150)	total: 16.5s	remaining: 5.34s
199:	learn: 0.5960023	test: 0.6125794	best: 0.6124769 (165)	total: 22s	remaining: 0us

bestTest = 0.6124768675
bestIteration = 165

Shrink model to first 166 iterations.
Predicted for 527 unknown examples
Imputed done for embarked
Prepare data to fit, feature: age
Fit model: age
Learning rate set to 0.332936
0:	learn: 16.0280617	test: 16.0313075	best: 16.0313075 (0)	total: 45.8ms	remaining: 9.11s




50:	learn: 15.0491609	test: 15.1258605	best: 15.1258605 (50)	total: 2.25s	remaining: 6.57s
100:	learn: 14.9914318	test: 15.1162922	best: 15.1162922 (100)	total: 4.15s	remaining: 4.07s
150:	learn: 14.9450624	test: 15.1128036	best: 15.1100615 (130)	total: 6.16s	remaining: 2s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 15.11006153
bestIteration = 130

Shrink model to first 131 iterations.
Predicted for 6779 unknown examples
Imputed done for age
Prepare data to fit, feature: ticket_num
Fit model: ticket_num
Learning rate set to 0.313478
0:	learn: 594225.5709626	test: 581641.5007189	best: 581641.5007189 (0)	total: 36ms	remaining: 7.16s




50:	learn: 81206.5906139	test: 78712.1527370	best: 78712.1527370 (50)	total: 1.48s	remaining: 4.31s
100:	learn: 74345.6317695	test: 77874.1743259	best: 77170.3001097 (94)	total: 2.89s	remaining: 2.83s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 77170.30011
bestIteration = 94

Shrink model to first 95 iterations.
Predicted for 59501 unknown examples
Imputed done for ticket_num
Prepare data to fit, feature: ticket_str
Fit model: ticket_str
Learning rate set to 0.21545




0:	learn: 1.8275040	test: 1.8544913	best: 1.8544913 (0)	total: 5.11s	remaining: 16m 56s
50:	learn: 0.0534475	test: 0.0335655	best: 0.0335655 (50)	total: 5m 43s	remaining: 16m 44s
100:	learn: 0.0300609	test: 0.0204671	best: 0.0204671 (100)	total: 12m 13s	remaining: 11m 59s
150:	learn: 0.0198630	test: 0.0149170	best: 0.0149170 (150)	total: 18m 26s	remaining: 5m 59s
199:	learn: 0.0141246	test: 0.0109911	best: 0.0109911 (199)	total: 24m 21s	remaining: 0us

bestTest = 0.01099113226
bestIteration = 199

Predicted for 150303 unknown examples
Imputed done for ticket_str
Prepare data to fit, feature: cabin_num
Fit model: cabin_num
Learning rate set to 0.267997
0:	learn: 4263.7101042	test: 4267.5842280	best: 4267.5842280 (0)	total: 21.6ms	remaining: 4.3s




50:	learn: 101.3957593	test: 101.6658688	best: 101.6658688 (50)	total: 856ms	remaining: 2.5s
100:	learn: 56.9070749	test: 56.5507879	best: 56.5507879 (100)	total: 1.7s	remaining: 1.67s
150:	learn: 42.3045591	test: 41.8517287	best: 41.8517287 (150)	total: 2.5s	remaining: 811ms
199:	learn: 36.2012512	test: 36.3492558	best: 36.3492558 (199)	total: 3.29s	remaining: 0us

bestTest = 36.34925582
bestIteration = 199

Predicted for 138697 unknown examples
Imputed done for cabin_num
Prepare data to fit, feature: cabin_str
Fit model: cabin_str
Learning rate set to 0.216356
0:	learn: 0.8203747	test: 0.8217365	best: 0.8217365 (0)	total: 139ms	remaining: 27.6s




50:	learn: 0.0012447	test: 0.0010260	best: 0.0010260 (50)	total: 8.67s	remaining: 25.3s
100:	learn: 0.0004766	test: 0.0003763	best: 0.0003763 (100)	total: 17.3s	remaining: 17s
150:	learn: 0.0003139	test: 0.0002468	best: 0.0002468 (150)	total: 26.1s	remaining: 8.46s
199:	learn: 0.0001953	test: 0.0001493	best: 0.0001493 (199)	total: 34.6s	remaining: 0us

bestTest = 0.000149295075
bestIteration = 199

Predicted for 138697 unknown examples
Imputed done for cabin_str
Imputed done for all dataset, num null left: surname       0
forename      0
ticket_str    0
ticket_num    0
cabin_str     0
cabin_num     0
pclass        0
sex           0
age           0
sibsp         0
parch         0
fare          0
embarked      0
dtype: int64


In [47]:
print('Done')

Done


In [48]:
imputed_X_all.to_parquet('data/imputed_X_all.parquet')

In [49]:
print('Done')

Done


In [None]:
y_train

In [None]:
X_train_t, X_train_e, y_train_t, y_train_e = train_test_split(clean_X_train, y_train, test_size=0.1)

In [None]:
model_clean = CatBoostClassifier(iterations=1000, cat_features=['surname', 'forename', 'ticket_str', 'cabin_str', 'sex', 'embarked'], 
                                 early_stopping_rounds=50, verbose=True, metric_period=50, use_best_model=True)

In [None]:
X_train_t

In [None]:
model_clean.fit(X_train_t, y_train_t, eval_set = (X_train_e, y_train_e))

In [None]:
y_train_e_pred = model_clean.predict(X_train_e)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_train_e, model_clean.predict(X_train_e)))

In [None]:
print(classification_report(y_train_t, model_clean.predict(X_train_t)))

In [None]:
def ft_imp(model):
    fi = pd.DataFrame({
        'feature':X_train_e.columns,
        'importance': model.feature_importances_
    })

    return fi.sort_values(by='importance',ascending=False).iloc[:50].style.bar()

In [None]:
ft_imp(model_clean)

In [None]:
X_test_2 = pl.fit_transform(X_test)

In [None]:
X_test_2

In [None]:
cbi_test = CatBoostImputor(impute_plan, ['fare', 'embarked', 'age', 'ticket_num', 'ticket_str', 'cabin_num', 'cabin_str'])

In [None]:
X_test_clean = cbi_test.fit_transform(X_test_2)

In [None]:
X_test_clean.to_parquet('data/clean_X_test.parquet')

In [None]:
X_test_2.ticket_num.hist(bins=50)
plt.show()
X_test_clean.ticket_num.hist(bins=50)
plt.show()

In [None]:
X_test_2.cabin_num.hist(bins=50)
plt.show()
X_test_clean.cabin_num.hist(bins=50)
plt.show()

In [None]:
X_test_2.age.hist(bins=50)
plt.show()
X_test_clean.age.hist(bins=50)
plt.show()

In [None]:
X_test_2.cabin_str.value_counts()[:20]

In [None]:
X_test_clean.cabin_str.value_counts()[:20]

## Extent pipeline

In [None]:
pl

In [None]:
from copy import deepcopy

In [None]:
impute_plan2 = deepcopy(impute_plan)
wgt = 0
impute_plan2['features_plan']['ticket_str']['catboost_kwgs']['class_weights'] = ((df2['ticket_str'].value_counts().max() / df2['ticket_str'].value_counts()) ** wgt).to_dict()
impute_plan2['features_plan']['cabin_str']['catboost_kwgs']['class_weights'] = ((df2['cabin_str'].value_counts().max() / df2['cabin_str'].value_counts()) ** wgt).to_dict()

In [None]:
pl3 = Pipeline(steps = [
    ('aaa', pl), 
    ('catboost_imputor', CatBoostImputor(impute_plan2,  ['fare', 'embarked', 'age', 'ticket_num', 'ticket_str', 'cabin_num', 'cabin_str']))
])

In [None]:
pl3

In [None]:
pl2 = Pipeline(steps=[
    ('raw_data_processor', col_processor),
    ('lower_text_values', LowerTransformer()),
    ('catboost_imputor', CatBoostImputor(impute_plan2,  ['fare', 'embarked', 'age', 'ticket_num', 'ticket_str', 'cabin_num', 'cabin_str']))
])

In [None]:
pl2

In [None]:
clean_X_train2 = pl2.fit_transform(X_train)

In [None]:
pl2.__class__.__name__

In [None]:
a = CatBoostImputor(impute_plan, predictors_order=['fare', 'embarked', 'age', 'ticket_num', 'ticket_str', 'cabin_num', 'cabin_str'])

In [None]:
a.__name__

In [None]:
a.__class__.__name__

In [None]:
a.__class__.__name__

In [None]:
CatBoostImputor.__name__

In [None]:
from time import time

In [None]:
time()

In [None]:
from time import time, sleep
class PipelineLogger:
    def __init__(self):
        pass
        
    def log_start(self):
        self.start_time = time()
        print(f'======== {self.__class__.__name__} - START ========')
        return None
        
    def log_finish(self):
        self.duration = time() - self.start_time
        print(f'======== {self.__class__.__name__} - FINISH =======> Take: {self.duration:.6f}(s)')

In [None]:
class JustSleep(PipelineLogger):
    def __init__(self):
        pass
    
    def sleep(self):
        self.log_start()
        sleep(1)
        self.log_finish()

In [None]:
js = JustSleep()

In [None]:
js.sleep()

In [None]:
pl += Pipeline(steps=[('catboost_imputor', CatBoostImputor(impute_plan,  ['fare', 'embarked', 'age', 'ticket_num', 'ticket_str', 'cabin_num', 'cabin_str']))])

# Catboost without fillna

In [None]:
X_train

In [None]:
X_test

In [None]:
y_train

In [None]:
y_train.value_counts()

In [None]:
df2.info()

In [None]:
X_train_fillsimple = df2.copy()
X_train_fillsimple[['ticket_str', 'cabin_str', 'embarked']] = X_train_fillsimple[['ticket_str', 'cabin_str', 'embarked']].fillna('unk')

In [None]:
X_train_fs_t, X_train_fs_e, y_train_fs_t, y_train_fs_e = train_test_split(X_train_fillsimple, y_train, test_size=0.1)

In [None]:
model_fs = CatBoostClassifier(iterations=1000, cat_features=['surname', 'forename', 'ticket_str', 'cabin_str', 'sex', 'embarked'], 
                                 early_stopping_rounds=50, verbose=True, metric_period=50, use_best_model=True, auto_class_weights=None)

In [None]:
model_fs.fit(X_train_fs_t, y_train_fs_t, eval_set = (X_train_fs_e, y_train_fs_e), )

In [None]:
X_test_fillsimple = X_test_2.copy()
X_test_fillsimple[['ticket_str', 'cabin_str', 'embarked']] = X_test_fillsimple[['ticket_str', 'cabin_str', 'embarked']].fillna('unk')

In [None]:
print(classification_report(y_train_fs_t, model_fs.predict(X_train_fs_t)))

In [None]:
print(classification_report(y_train_fs_e, model_fs.predict(X_train_fs_e)))

In [None]:
X_test_fillsimple

In [None]:
ft_imp(model_fs)

# Submit

## Model based on catboost imputed data

In [None]:
X_test['Survived'] = model_clean.predict(X_test_clean)

In [None]:
X_test

In [None]:
m = X_test['Survived'].to_frame().reset_index()
m.columns = ['PassengerId', 'Survived']

In [None]:
m.to_csv('submission/v1.csv', index=False)

# Model based on simple imputed data

In [None]:
X_test['Survived_0_2'] = model_fs.predict(X_test_fillsimple)
m = X_test['Survived_0_2'].to_frame().reset_index()
m.columns = ['PassengerId', 'Survived']
m.to_csv('submission/v0_2.csv', index=False)