In [5]:
import pandas as pd
import numpy as np

df = pd.read_csv('train.csv')

In [6]:
df = df.drop(['Name', 'Ticket', 'Cabin'], axis=1)

age_mean = df['Age'].mean()

from scipy.stats import mode

mode_embarked = mode(df['Embarked'].dropna())[0][0]
df['Embarked'] = df['Embarked'].fillna(mode_embarked)

df['Gender'] = df['Sex'].map({'female': 0, 'male': 1}).astype(int)

pd.get_dummies(df['Embarked'], prefix='Embarked').head(10)
df = pd.concat([df, pd.get_dummies(df['Embarked'], prefix='Embarked')], axis=1)

df = df.drop(['Sex', 'Embarked'], axis=1)

cols = df.columns.tolist()
cols = [cols[1]] + cols[0:1] + cols[2:]

df = df[cols]



In [7]:
df = df.fillna(-1)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
Survived       891 non-null int64
PassengerId    891 non-null int64
Pclass         891 non-null int64
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Gender         891 non-null int32
Embarked_C     891 non-null uint8
Embarked_Q     891 non-null uint8
Embarked_S     891 non-null uint8
dtypes: float64(2), int32(1), int64(5), uint8(3)
memory usage: 54.9 KB


In [9]:
train_data = df.values

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV

imputer = Imputer(strategy = 'mean', missing_values = -1)

classifier = RandomForestClassifier(n_estimators=100)

pipeline = Pipeline([
    ('imp', imputer),
    ('clf', classifier)
])



In [11]:
parameter_grid = {
    'clf__max_features': [0.5, 1],
    'clf__max_depth': [5, None],
}

In [12]:
grid_search = GridSearchCV(pipeline, parameter_grid, cv=5, verbose=3)

In [13]:
grid_search.fit(train_data[0::,1::], train_data[0::,0])

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] clf__max_depth=5, clf__max_features=0.5 .........................
[CV]  clf__max_depth=5, clf__max_features=0.5, score=0.731844 -   0.1s
[CV] clf__max_depth=5, clf__max_features=0.5 .........................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s


[CV]  clf__max_depth=5, clf__max_features=0.5, score=0.815642 -   0.1s
[CV] clf__max_depth=5, clf__max_features=0.5 .........................
[CV]  clf__max_depth=5, clf__max_features=0.5, score=0.820225 -   0.1s
[CV] clf__max_depth=5, clf__max_features=0.5 .........................
[CV]  clf__max_depth=5, clf__max_features=0.5, score=0.792135 -   0.1s
[CV] clf__max_depth=5, clf__max_features=0.5 .........................
[CV]  clf__max_depth=5, clf__max_features=0.5, score=0.836158 -   0.1s
[CV] clf__max_depth=5, clf__max_features=1 ...........................
[CV] .. clf__max_depth=5, clf__max_features=1, score=0.687151 -   0.1s
[CV] clf__max_depth=5, clf__max_features=1 ...........................
[CV] .. clf__max_depth=5, clf__max_features=1, score=0.832402 -   0.0s
[CV] clf__max_depth=5, clf__max_features=1 ...........................
[CV] .. clf__max_depth=5, clf__max_features=1, score=0.853933 -   0.0s
[CV] clf__max_depth=5, clf__max_features=1 ...........................
[CV] .

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    3.7s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('imp', Imputer(axis=0, copy=True, missing_values=-1, strategy='mean', verbose=0)), ('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07...ators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'clf__max_features': [0.5, 1], 'clf__max_depth': [5, None]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=3)

In [14]:
sorted(grid_search.grid_scores_, key=lambda x: x.mean_validation_score)
grid_search.best_score_
grid_search.best_params_

{'clf__max_depth': 5, 'clf__max_features': 1}

In [15]:
df['Age'].describe()

count    891.000000
mean      23.600640
std       17.867496
min       -1.000000
25%        6.000000
50%       24.000000
75%       35.000000
max       80.000000
Name: Age, dtype: float64

In [16]:
train_data = df.values

In [17]:
model = RandomForestClassifier(n_estimators = 100, max_features=1,max_depth=5)
model = model.fit(train_data[0:,2:],train_data[0:,0])

In [18]:
df_test = pd.read_csv('test.csv')

df_test = df_test.drop(['Name', 'Ticket', 'Cabin'], axis=1)

In [19]:
df_test['Age'] = df_test['Age'].fillna(age_mean)

In [20]:
fare_means = df.pivot_table('Fare', index='Pclass', aggfunc='mean')
df_test['Fare'] = df_test[['Fare', 'Pclass']].apply(lambda x:
                            fare_means[x['Pclass']] if pd.isnull(x['Fare'])
                            else x['Fare'], axis=1)

df_test['Gender'] = df_test['Sex'].map({'female': 0, 'male': 1}).astype(int)
df_test = pd.concat([df_test, pd.get_dummies(df_test['Embarked'], prefix='Embarked')],
                axis=1)

df_test = df_test.drop(['Sex', 'Embarked'], axis=1)

test_data = df_test.values

output = model.predict(test_data[:,1:])

In [21]:
result = np.c_[test_data[:,0].astype(int), output.astype(int)]

df_result = pd.DataFrame(result[:,0:2], columns=['PassengerId', 'Survived'])
df_result.to_csv('submissions/pipelineSubmission.csv', index=False)