In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from xgboost import XGBClassifier

In [2]:
train = pd.read_csv("D:/py_ml/titanic/data/train.csv")
test = pd.read_csv("D:/py_ml/titanic/data/test.csv")

In [3]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
train["Pclass"] = train["Pclass"].astype(str)

In [6]:
train.pivot_table(values="Survived",
                  index="Sex",
                  columns="Pclass",
                  margins=True)

Pclass,1,2,3,All
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.968085,0.921053,0.5,0.742038
male,0.368852,0.157407,0.135447,0.188908
All,0.62963,0.472826,0.242363,0.383838


In [7]:
train_exp = (train
             .copy()
             .assign(Title=train["Name"]
                     .astype(str)
                     .str.extract(r"\,\s([a-zA-Z\s]*)\.\s")))

In [8]:
# (train_exp
# .groupby(["Title", "Sex"])["PassengerId"]
# .count()
# .unstack()
# .fillna(0))

In [9]:
(train_exp
 .pivot_table(values="PassengerId",
              index="Title",
              columns="Sex",
              aggfunc="count",
              fill_value=0))

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,2
Don,0,1
Dr,1,6
Jonkheer,0,1
Lady,1,0
Major,0,2
Master,0,40
Miss,182,0
Mlle,2,0


In [10]:
features_subset = ["Pclass",
                   "Name",
                   "Sex",
                   "Age",
                   "SibSp",
                   "Parch",
                   "Embarked"]

train_subset = train.loc[~train["Age"].isna(), features_subset]

y_subset = train_subset["Age"]
X_subset = train_subset.drop(["Age"], axis=1)

model_subset = RandomForestRegressor(random_state=1234)


def parse_name(df):
    return (df.assign(Title=df["Name"]
                      .astype(str)
                      .str.extract(r"\,\s([a-zA-Z\s]*)\.\s")))


def family_counter(df):
    return (df.assign(w_family=df.apply(
        lambda r: int(r["SibSp"] + r["Parch"] > 0),
        axis=1)))


construct_features = Pipeline(
    steps=[
        ("add_title", FunctionTransformer(parse_name)),
        ("add_family", FunctionTransformer(family_counter))])

preprocess_cats = Pipeline(
    steps=[
        ("imputing", SimpleImputer(strategy="most_frequent")),
        ("encoding", OneHotEncoder(handle_unknown="ignore"))])

pre_processor = ColumnTransformer(
    transformers=[
        ("drop_cols", "drop", ["Name", "SibSp", "Parch"]),
        ("process_cats",
         preprocess_cats,
         ["Pclass", "Sex", "Embarked", "Title"])])

filling_pipe = Pipeline(
    steps=[
        ("f_engineering", construct_features),
        ("preprocessing", pre_processor),
        ("modeling", model_subset)])

rmse_subset = -1 * cross_val_score(filling_pipe,
                                   X_subset,
                                   y_subset,
                                   scoring="neg_root_mean_squared_error")

print("RMSE: {} ({})".format(rmse_subset.mean(),
                             rmse_subset.std()))

RMSE: 11.517536502747475 (0.2599960895294339)


In [11]:
missed_subset = (train
                 .loc[train["Age"].isna(), features_subset]
                 .drop(["Age"], axis=1))

filling_pipe.fit(X_subset, y_subset)

train.loc[train["Age"].isna(), "Age"] = (filling_pipe
                                         .predict(missed_subset))

In [12]:
train["Age_desc"], age_bins = pd.qcut(train["Age"],
                                      q=6,
                                      precision=0,
                                      retbins=True)

In [13]:
features_class = ["Survived",
                  "Pclass",
                  "Name",
                  "Sex",
                  "Age_desc",
                  "SibSp",
                  "Parch",
                  "Embarked"]

train_class = train[features_class]

y_class = train_class["Survived"]
X_class = train_class.drop(["Survived"], axis=1)

y_count = train["Survived"].value_counts()
spw = y_count[0] / y_count[1]

model_class = XGBClassifier(random_state=1234,
                            objective="binary:logistic",
                            scale_pos_weight=spw,
                            njobs=-1)

pre_processor_class = ColumnTransformer(
    transformers=[
        ("drop_cols", "drop", ["Name", "SibSp", "Parch"]),
        ("process_cats",
         preprocess_cats,
         ["Pclass", "Sex", "Age_desc", "Embarked", "Title"])])

class_pipe = Pipeline(
    steps=[
        ("f_engineering", construct_features),
        ("preprocessing", pre_processor_class),
        ("xgbr", model_class)])

In [14]:
param_grid = {"xgbr__n_estimators": [500, 750, 1000],
              "xgbr__learning_rate": [0.001, 0.01, 0.1]}

search_cv = GridSearchCV(class_pipe,
                         param_grid=param_grid,
                         scoring="roc_auc",
                         n_jobs=-1)
search_cv.fit(X_class,
              y_class)

print("Best roc_auc on CV: {}:".format(search_cv.best_score_))
print(search_cv.best_params_)

Best roc_auc on CV: 0.8682397468905249:
{'xgbr__learning_rate': 0.01, 'xgbr__n_estimators': 750}


In [15]:
class_pipe.set_params(xgbr__n_estimators=750,
                      xgbr__learning_rate=0.01)

class_pipe.fit(X_class,
               y_class)

Pipeline(memory=None,
         steps=[('f_engineering',
                 Pipeline(memory=None,
                          steps=[('add_title',
                                  FunctionTransformer(accept_sparse=False,
                                                      check_inverse=True,
                                                      func=<function parse_name at 0x0000026106435A68>,
                                                      inv_kw_args=None,
                                                      inverse_func=None,
                                                      kw_args=None,
                                                      validate=False)),
                                 ('add_family',
                                  FunctionTransformer(accept_sparse=False,
                                                      check_inverse=True,
                                                      func=<function family_...
                               colsample_by

In [16]:
test["Age_desc"] = pd.cut(test["Age"],
                          bins=age_bins,
                          precision=0)

predicts = class_pipe.predict(test[["Pclass",
                                    "Name",
                                    "Sex",
                                    "Age_desc",
                                    "SibSp",
                                    "Parch",
                                    "Embarked"]])

my_submission = pd.DataFrame(
    np.column_stack((test["PassengerId"], predicts)),
    columns=["PassengerId", "Survived"])

my_submission.to_csv("output.csv", index=False)