In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
%matplotlib inline 

In [2]:
# Read the labelled training file and the unlabelled test file.
train, test = pd.read_csv('../input/train.csv'), pd.read_csv('../input/test.csv')

# Show which columns the training file provides.
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [3]:
# Stack the training features (target removed) on top of the test features so
# both sets go through exactly the same cleaning steps.
# ignore_index=True gives every row a unique 0..n-1 label; without it the test
# rows reuse the training labels, which makes label-based lookups ambiguous.
X_full = pd.concat([train.drop('Survived', axis=1), test], axis=0, ignore_index=True)

In [4]:
# Sanity check: (rows, columns) of the combined train + test feature frame.
X_full.shape

(1309, 11)

Clean X_full. Afterwards, we will split it back up into training and test sets.

In [5]:
# Remove the passenger identifier column from the feature frame.
X_full = X_full.drop('PassengerId', axis=1)

In [6]:
# Number of missing values in each column.
pd.isnull(X_full).sum()

Pclass         0
Name           0
Sex            0
Age          263
SibSp          0
Parch          0
Ticket         0
Fare           1
Cabin       1014
Embarked       2
dtype: int64

In [7]:
# Number of passengers for whom both Age and Cabin are missing.
X_full[['Age', 'Cabin']].isnull().all(axis=1).sum()

240

I would guess that these people died, so we couldn't collect their information. 

In [8]:
# Overall survival rate in the training set.
train['Survived'].mean()

0.3838383838383838

In [9]:
# Fraction of training passengers with a recorded cabin.
train['Cabin'].notnull().mean()

0.22895622895622897

Coincidence? Maybe not.

In [10]:
# Share of all training passengers who have no recorded cabin AND did not survive.
(train['Cabin'].isnull() & train['Survived'].eq(0)).mean()

0.53984287317620649

In [11]:
# Rows where both Cabin and Age are missing.
selector = train[['Cabin', 'Age']].isnull().all(axis=1)

# Survival rate within that group.
train.loc[selector, 'Survived'].mean()

0.25949367088607594

In [12]:
# Overall survival rate, repeated here as the baseline for comparison.
train['Survived'].mean()

0.3838383838383838

In [13]:
# Rows where Cabin is missing.
selector = train['Cabin'].isnull()

# Survival rate within that group.
train.loc[selector, 'Survived'].mean()

0.29985443959243085

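We can conclude that a missing cabin (cabin_null) is a good indicator of not surviving (about 30% survival versus 38% overall), and a missing cabin together with a missing age (cabin_null and age_null) is an even stronger one (about 26% survival).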

In [14]:
# Per-passenger count of missing fields among Cabin and Age (0, 1 or 2).
X_full['Nulls'] = (
    X_full['Cabin'].isnull().astype('int')
    .add(X_full['Age'].isnull().astype('int'))
)

We can further divide the cabin category by simply extracting the first letter.

In [15]:
# The first character of each cabin string is its letter. Missing cabins are
# NaN, which astype(str) renders as 'nan', so they fall under the key 'n'.
cabin_letters = X_full['Cabin'].astype(str).str[0]

# Number the letters in order of first appearance.
cabin_dict = {letter: code for code, letter in enumerate(cabin_letters.unique())}

# Assign the column directly so it is created with an integer dtype; writing
# through .loc[:, col] into an existing string column can leave it as object.
X_full['Cabin_mapped'] = cabin_letters.map(cabin_dict)

In [16]:
# Show the letter -> integer code mapping ('n' is the bucket for missing cabins).
cabin_dict

{'A': 5, 'B': 6, 'C': 1, 'D': 4, 'E': 2, 'F': 7, 'G': 3, 'T': 8, 'n': 0}

In [17]:
# Columns currently in the feature frame, including the two engineered ones.
X_full.columns

Index(['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
       'Cabin', 'Embarked', 'Nulls', 'Cabin_mapped'],
      dtype='object')

In [18]:
# Age and Cabin are replaced by the Nulls and Cabin_mapped features, so drop the raw columns.
X_full = X_full.drop(['Age', 'Cabin'], axis=1)

The passenger with the missing fare was in the lowest class (Pclass 3). Assume he paid the average third-class fare.

In [19]:
# Mean fare among third-class passengers (missing values are skipped by mean()).
fare_mean = X_full.loc[X_full['Pclass'] == 3, 'Fare'].mean()

# Assign the filled column back to the frame. Calling fillna(inplace=True) on
# X_full['Fare'] operates on an intermediate object and does not reliably
# update X_full itself (it is a no-op under pandas Copy-on-Write).
X_full['Fare'] = X_full['Fare'].fillna(fare_mean)

In [20]:
# Re-check missing values per column after filling Fare.
pd.isnull(X_full).sum()

Pclass          0
Name            0
Sex             0
SibSp           0
Parch           0
Ticket          0
Fare            0
Embarked        2
Nulls           0
Cabin_mapped    0
dtype: int64

In [21]:
# Show the rows whose port of embarkation is missing.
X_full.loc[X_full['Embarked'].isnull()]

Unnamed: 0,Pclass,Name,Sex,SibSp,Parch,Ticket,Fare,Embarked,Nulls,Cabin_mapped
61,1,"Icard, Miss. Amelie",female,0,0,113572,80.0,,0,6
829,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,0,0,113572,80.0,,0,6


In [22]:
# Distribution of embarkation ports among first-class passengers.
X_full.loc[X_full['Pclass'] == 1, 'Embarked'].value_counts()

S    177
C    141
Q      3
Name: Embarked, dtype: int64

In [23]:
# Fill the missing ports with 'S', the hard-coded value chosen for this step.
# Assign the result back to the frame: fillna(inplace=True) on X_full['Embarked']
# acts on an intermediate object and is a no-op under pandas Copy-on-Write.
X_full['Embarked'] = X_full['Embarked'].fillna('S')

In [24]:
# Confirm that no missing values remain in any column.
pd.isnull(X_full).sum()

Pclass          0
Name            0
Sex             0
SibSp           0
Parch           0
Ticket          0
Fare            0
Embarked        0
Nulls           0
Cabin_mapped    0
dtype: int64

In [25]:
# Name and Ticket are not used as features; remove them.
X_full = X_full.drop(['Name', 'Ticket'], axis=1)

In [26]:
# Inspect column dtypes to see which features still need encoding.
X_full.dtypes

SyntaxError: invalid syntax (<ipython-input-26-6631e7f51560>, line 1)

In [None]:
# One-hot encode the categorical features; drop_first removes the first level
# of each so the indicator columns are not redundant.
categorical_cols = ['Sex', 'Nulls', 'Cabin_mapped', 'Embarked']
X_dummies = pd.get_dummies(X_full, columns=categorical_cols, drop_first=True)

In [None]:
# Check the dtypes of the encoded feature frame.
X_dummies.dtypes

Now let's train.

In [None]:
# The first len(train) rows are the labelled training passengers; the rest are
# the competition test passengers to predict.
n_train = len(train)
X = X_dummies[:n_train]
new_X = X_dummies[n_train:]

# Target vector for the labelled rows.
y = train['Survived']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the labelled rows for evaluation, keeping the class balance
# of y in both parts and fixing the seed so the split is repeatable.
split_options = dict(test_size=.3, random_state=5, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Fix the seed so the fitted forest, and therefore the reported accuracy,
# is the same on every run.
rf = RandomForestClassifier(random_state=5)
rf.fit(X_train, y_train)

# Mean accuracy on the held-out split.
rf.score(X_test, y_test)

In [None]:
from xgboost import XGBClassifier

In [None]:
# Gradient-boosted tree classifier with its default settings.
xgb = XGBClassifier()

In [None]:
# Fit the classifier on the training portion of the split.
xgb.fit(X_train, y_train)

In [None]:
# Score the fitted classifier on the held-out portion of the split.
xgb.score(X_test, y_test)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression baseline; fit() returns the estimator itself.
lg = LogisticRegression().fit(X_train, y_train)

# Mean accuracy on the held-out split.
lg.score(X_test, y_test)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# XGBClassifier is already imported in an earlier cell. The xgboost module
# itself is not needed here, and importing it under the alias `xgb` would
# rebind the fitted classifier of the same name.

# Hyper-parameter values the search samples from.
gbm_param_grid = {
    'n_estimators': range(8, 20),
    'max_depth': range(6, 10),
    'learning_rate': [.4, .45, .5, .55, .6],
    'colsample_bytree': [.6, .7, .8, .9, 1]
}

# Base classifier; n_estimators is overridden by the values sampled above.
gbm = XGBClassifier(n_estimators=10)

# Randomized search: 50 sampled settings, 4-fold cross-validation, scored by
# accuracy. random_state pins which settings are sampled so the search result
# is repeatable.
xgb_random = RandomizedSearchCV(estimator=gbm,
                                param_distributions=gbm_param_grid,
                                scoring="accuracy",
                                n_iter=50,
                                cv=4,
                                verbose=1,
                                random_state=5)

# Search over the full labelled set; with the default refit=True the best
# setting is refit on all of it afterwards.
xgb_random.fit(X, y)

# Report the best setting and its mean cross-validated accuracy.
print("Best parameters found: ", xgb_random.best_params_)
print("Best accuracy found: ", xgb_random.best_score_)

In [None]:
# Predict survival for the competition test rows with the tuned search object.
xgb_pred = xgb_random.predict(new_X)

In [None]:
# Put the predictions next to the test passenger ids, side by side.
predictions = pd.DataFrame(xgb_pred)
submission = pd.concat([test['PassengerId'], predictions], axis=1)

In [None]:
# Name the two columns for the submission file.
submission.columns = ["PassengerId", "Survived"]

In [None]:
# Write the submission file with a header row and without the DataFrame index.
submission.to_csv('titanic_submission.csv', header = True, index = False)