In [None]:
# This notebook targets Python 3
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
# Load the training and test tables from the input directory
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

In [None]:
# Build one combined feature frame: the training rows (without the target)
# stacked on top of the test rows, with the identifier column removed.
# NOTE(review): the row labels of both inputs are kept (no ignore_index), so
# labels repeat between the train and test parts of `full`.
train_features = train.drop(columns='Survived')
full = pd.concat([train_features, test], axis=0).drop(columns='PassengerId')

In [None]:
# Count the missing entries in every column of the combined frame
full.isna().sum()

In [None]:
# Impute the missing values: Embarked with 'S' (the mode, per the original
# note) and Fare with the column mean.
# The filled column is assigned back instead of calling
# full[col].fillna(..., inplace=True): that chained in-place form operates on
# a temporary Series, emits a FutureWarning in pandas 2.x and leaves `full`
# unchanged once Copy-on-Write is enabled.
full['Embarked'] = full['Embarked'].fillna('S')
full['Fare'] = full['Fare'].fillna(full['Fare'].mean())

In [None]:
# The number of missing fields per passenger correlates with survival,
# so it is kept as a feature instead of imputing Cabin and Age.
def null_count(df):
    """Return, for each row of ``df``, how many of Cabin and Age are missing.

    The result is an integer Series (0, 1 or 2) sharing ``df``'s index.
    """
    missing_flags = df[["Cabin", "Age"]].isna()
    return missing_flags.sum(axis=1)
train["nnull"] = null_count(train)
# Mean survival for each missing-field count
print(train[["nnull", "Survived"]].groupby("nnull").mean())
# Same feature on the combined train + test frame
full["nnull"] = null_count(full)

In [None]:
# The cabin type (first character of the Cabin value) also correlates with survival
def cabin_type(df):
    """Encode the first character of each Cabin value as an integer code.

    Cabin is cast to ``str`` before the first character is taken, so a missing
    value is encoded through the first character of its string form. Codes are
    handed out in order of first appearance within ``df``, which means they
    are only comparable between frames that meet the same characters in the
    same order.
    """
    letters = df['Cabin'].astype(str).str[0]
    code_of = {}
    for code, letter in enumerate(letters.unique()):
        code_of[letter] = code
    return letters.map(code_of)
train["Cabin_type"] = cabin_type(train)
# Mean survival for each encoded cabin letter (codes come from cabin_type)
print(train.groupby("Cabin_type")[["Survived"]].mean())
# Encode the combined frame from its OWN Cabin column. Assigning
# cabin_type(train) here would align on row labels and silently give every
# test row the cabin code of whichever training row shares its label.
# NOTE(review): cabin_type numbers letters in order of first appearance; the
# codes in `full` match those in `train` only if the training rows come first
# in `full` and already contain every cabin letter -- confirm.
full["Cabin_type"] = cabin_type(full)

In [None]:
# Cabin and Age have been replaced by derived features, so remove them
full = full.drop(columns=["Cabin", "Age"])
# Re-check the per-column missing counts; they should all be zero now
full.isna().sum()

In [None]:
# Titles are correlated to survival, but there are many of them, so they are
# collapsed into a handful of categories
def extract_titles(df):
    """Return the collapsed title category for every row of ``df``.

    The title is the first run of ASCII letters in ``df["Name"]`` that is
    preceded by a space and followed by a period (e.g. "Mr" in
    "Braund, Mr. Owen Harris"). It is then mapped onto one of the groups
    Mr, Mrs, Miss, Master, Officer or Royalty.

    Returns a Series aligned with ``df``; names without a matching title, or
    with a title missing from the table below, yield NaN.
    """
    titles = {
        "Mr":       "Mr",
        "Mme":      "Mrs",
        "Ms":       "Mrs",
        "Mrs":      "Mrs",
        "Master":   "Master",
        "Mlle":     "Miss",
        "Miss":     "Miss",
        "Capt":     "Officer",
        "Col":      "Officer",
        "Major":    "Officer",
        "Dr":       "Officer",
        "Rev":      "Officer",
        "Jonkheer": "Royalty",
        "Don":      "Royalty",
        "Sir":      "Royalty",
        "Countess": "Royalty",
        "Dona":     "Royalty",
        "Lady":     "Royalty",
    }
    # Raw string: "\." in a normal literal is an invalid escape sequence and
    # raises a SyntaxWarning on recent Python versions.
    raw_title = df["Name"].str.extract(r' ([A-Za-z]+)\.', expand=False)
    return raw_title.map(titles)
train["title"] = extract_titles(train)
# Mean survival for each collapsed title category
print(train[["title", "Survived"]].groupby("title").mean())
# Same feature on the combined train + test frame
full["title"] = extract_titles(full)

In [None]:
# Combine Parch and SibSp into a single family-size feature (row-wise sum)
full["Family_size"] = full.loc[:, ["Parch", "SibSp"]].sum(axis="columns")
# Remove the two source columns together with Name and Ticket
full = full.drop(columns=["Parch", "SibSp", 'Name', 'Ticket'])

In [None]:
# One-hot encode the categorical columns; all other columns pass through as-is
categorical_cols = ['Sex', "title", 'nnull', 'Cabin_type', 'Embarked']
dummies = pd.get_dummies(full, columns=categorical_cols)
display(dummies.head())

In [None]:
# The first len(train) rows of `dummies` are the training passengers and the
# remainder are the test passengers, so split them back apart by position.
n_train = len(train)
X = dummies.iloc[:n_train]
new_X = dummies.iloc[n_train:]
y = train["Survived"]

In [None]:
from xgboost import XGBClassifier
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
def select_features(X, y):
    """Choose feature columns by cross-validated recursive feature elimination.

    Fits an ``RFECV`` selector (20 folds) wrapped around a 40-tree
    ``XGBClassifier`` on ``X`` and ``y``, prints the retained column names and
    returns them as a list.
    """
    estimator = XGBClassifier(n_estimators=40)
    selector = RFECV(estimator, cv=20)
    selector.fit(X, y)
    # support_ is the boolean mask of columns the selector kept
    kept_columns = X.columns[selector.support_].tolist()
    banner = "Best Columns \n" + "-" * 12
    print(banner + "\n{}\n".format(kept_columns))
    return kept_columns


cols = select_features(X, y)

In [None]:
from sklearn.model_selection import train_test_split
# Stratified 70/30 hold-out split restricted to the selected feature columns.
# NOTE(review): none of these four names is used by the later cells visible
# in this notebook -- confirm whether the split is still needed.
holdout_parts = train_test_split(
    X[cols], y, test_size=0.3, random_state=1, stratify=y)
X_train, X_test, y_train, y_test = holdout_parts

In [None]:
from sklearn.model_selection import GridSearchCV
# Hyper-parameter grid: 8 * 4 * 5 * 5 = 800 candidate settings
gbm_param_grid = {
    'n_estimators': [5, 10, 15, 20, 25, 30, 40, 45],
    'max_depth': range(6, 10),
    'learning_rate': [.4, .45, .5, .55, .6],
    'colsample_bytree': [.6, .7, .8, .9, 1],
}

# Base estimator: an XGBoost classifier with default settings
gbm = XGBClassifier()

# Exhaustive grid search, each candidate scored with 8-fold cross-validation
xgb_grid = GridSearchCV(
    estimator=gbm,
    param_grid=gbm_param_grid,
    cv=8,
    verbose=1,
)

# NOTE(review): the search is fitted on every column of X; the `cols` chosen
# by select_features are not applied here -- confirm that this is intended.
xgb_grid.fit(X, y)

# Report the winning parameter set and its mean cross-validated score
# (presumably accuracy, the default classifier score -- TODO confirm)
print("Best parameters found: ", xgb_grid.best_params_)
print("Best accuracy found: ", xgb_grid.best_score_)

In [None]:
# Predict the held-out passengers with the refitted best estimator
xgb_pred = xgb_grid.predict(new_X)
# Pair every test PassengerId with its predicted label and write the CSV
submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Survived": xgb_pred,
})
submission.to_csv('titanic_submission.csv', index=False, header=True)