In [104]:
import pandas as pd
import numpy as np

In [55]:
test_data = pd.read_csv('Data/test.csv')
train_data = pd.read_csv('Data/train.csv')
all_data = pd.concat([test_data, train_data])

# Engineer Features

In [56]:
Title_Dictionary = {
                        "Capt":       "Officer",
                        "Col":        "Officer",
                        "Major":      "Officer",
                        "Jonkheer":   "Royalty",
                        "Don":        "Royalty",
                        "Sir" :       "Royalty",
                        "Dr":         "Officer",
                        "Rev":        "Officer",
                        "the Countess":"Royalty",
                        "Dona":       "Royalty",
                        "Mme":        "Mrs",
                        "Mlle":       "Miss",
                        "Ms":         "Mrs",
                        "Mr" :        "Mr",
                        "Mrs" :       "Mrs",
                        "Miss" :      "Miss",
                        "Master" :    "Master",
                        "Lady" :      "Royalty"

                        }

In [57]:
all_data['Fare'] = all_data['Fare'].fillna(all_data['Fare'].mean())
sex_map = {'male' : 0, 'female' : 1}
all_data['Sex'] = all_data['Sex'].apply(lambda x: sex_map[x])
all_data['Title'] = all_data['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())

In [58]:
all_data['Status'] = all_data['Title'].apply(lambda x: Title_Dictionary[x])
median_ages = all_data.dropna().pivot_table(index=['Sex','Status','Pclass'], values='Age', aggfunc='median')

In [59]:
all_data['Cabin'] = all_data['Cabin'].fillna('N')
all_data['Cabin'] = all_data['Cabin'].apply(lambda x: x[0])

In [60]:
all_data['Family'] = all_data['Parch'] + all_data['SibSp'] + 1

#Fill with most common value
all_data['Embarked'].value_counts()

S    914
C    270
Q    123
Name: Embarked, dtype: int64

In [61]:
all_data['Embarked'] = all_data['Embarked'].fillna('S')

In [62]:
def add_missing_age(row):
    if pd.isnull(row['Age']):
        return median_ages[row['Sex'],row['Status'],row['Pclass']]
    else:
        return row['Age']
all_data['Age'] = all_data.apply(add_missing_age, axis=1)

In [63]:
dummy_columns = ['Status', 'Title', 'Pclass', 'Embarked', 'Cabin']

In [64]:
for dummy_column in dummy_columns:
    all_data = pd.concat([all_data,
                          pd.get_dummies(all_data[dummy_column],
                                         prefix=dummy_column)],
                         axis=1)
all_data.drop(labels=dummy_columns, axis=1, inplace=True)

In [65]:
drop_columns = ['Ticket', 'Name']
all_data.drop(labels=drop_columns, axis=1, inplace=True)

In [86]:
train = all_data[all_data['PassengerId'] < 892]
test = all_data[all_data['PassengerId'] >= 892]

In [87]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier
from sklearn.cross_validation import cross_val_score

In [88]:
features = list(all_data.columns)
features.remove('PassengerId')
features.remove('Survived')

In [89]:
model = RandomForestClassifier()

In [90]:
X = train[features]
y = train['Survived']

In [98]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
clf = ExtraTreesClassifier(n_estimators=200)
clf = clf.fit(X, y)

In [99]:
list(zip(features,clf.feature_importances_))

[('Age', 0.19124595772965794),
 ('Fare', 0.18977084724942297),
 ('Parch', 0.020113773166369961),
 ('Sex', 0.081734565534456433),
 ('SibSp', 0.033980761817226604),
 ('Family', 0.038216170969899287),
 ('Status_Master', 0.0067686302106832776),
 ('Status_Miss', 0.024197297450285558),
 ('Status_Mr', 0.065051332369629331),
 ('Status_Mrs', 0.023905751538485193),
 ('Status_Officer', 0.0063170543872246307),
 ('Status_Royalty', 0.0006390069424862825),
 ('Title_Capt', 0.00072319592842692393),
 ('Title_Col', 0.00038433167176785967),
 ('Title_Don', 0.00060698825998795763),
 ('Title_Dona', 0.0),
 ('Title_Dr', 0.0015273972304732813),
 ('Title_Jonkheer', 0.00053839740526596546),
 ('Title_Lady', 2.813072783609545e-05),
 ('Title_Major', 0.00076530645078431547),
 ('Title_Master', 0.0059052746437885661),
 ('Title_Miss', 0.019652057502809198),
 ('Title_Mlle', 0.00017324179081631104),
 ('Title_Mme', 1.2209301159981777e-05),
 ('Title_Mr', 0.086779515370670385),
 ('Title_Mrs', 0.016819250280267751),
 ('Title_

In [101]:
model = SelectFromModel(clf, prefit=True)
train_new = model.transform(X)
train_new.shape

(891, 11)

In [108]:
forest = RandomForestClassifier(max_features='sqrt')

parameters = {
                 'max_depth' : np.arange(1,10),
                 'n_estimators': np.arange(200,300,10),
                 'criterion': ['gini','entropy']
                 }

cross_validation = StratifiedKFold(y, n_folds=5)

grid_search = GridSearchCV(forest,
                           param_grid=parameters,
                           cv=cross_validation)

grid_search.fit(train_new, y)

print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))

Best score: 0.8361391694725028
Best parameters: {'criterion': 'gini', 'max_depth': 6, 'n_estimators': 230}


In [117]:
output = grid_search.predict(model.transform(test.drop(['Survived','PassengerId'], axis=1))).astype(int)
df_output = pd.DataFrame()
df_output['PassengerId'] = test['PassengerId']
df_output['Survived'] = output
df_output[['PassengerId','Survived']].to_csv('output.csv',index=False)