In [33]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [3]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
train_df.shape, test_df.shape

((891, 12), (418, 11))

In [4]:
train_df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [5]:
y = train_df['Survived']
x = train_df.drop(['Survived', 'PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
test_x = test_df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
x.shape, test_x.shape

((891, 7), (418, 7))

In [6]:
num_feat = x.select_dtypes('number').columns.values
cat_feat = x.select_dtypes('object').columns.values
x_num = x[num_feat]
x_cat = x[cat_feat]

x_num = (x_num - x_num.mean()) / x_num.std()
x_num = x_num.fillna(x_num.mean())

x_cat = pd.get_dummies(x_cat)

x = pd.concat([x_num, x_cat], axis=1)

In [39]:
x.head(3)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0.826913,-0.530005,0.43255,-0.473408,-0.502163,0,1,0,0,1
1,-1.565228,0.57143,0.43255,-0.473408,0.786404,1,0,1,0,0
2,0.826913,-0.254646,-0.474279,-0.473408,-0.48858,1,0,0,0,1


In [7]:
num_feat = test_x.select_dtypes('number').columns.values
cat_feat = test_x.select_dtypes('object').columns.values
test_x_num = test_x[num_feat]
test_x_cat = test_x[cat_feat]

test_x_num = (test_x_num - test_x_num.mean()) / test_x_num.std()
test_x_num = test_x_num.fillna(test_x_num.mean())

test_x_cat = pd.get_dummies(test_x_cat)

test_x = pd.concat([test_x_num, test_x_cat], axis=1)

In [40]:
test_x.head(3)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0.872436,0.298099,-0.498872,-0.399769,-0.497213,0,1,0,1,0
1,0.872436,1.179547,0.616254,-0.399769,-0.512045,1,0,0,0,1
2,-0.315441,2.237285,-0.498872,-0.399769,-0.463974,0,1,0,1,0


In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=17)

In [12]:
dt = DecisionTreeClassifier(random_state=17)
np.mean(cross_val_score(dt, x_train, y_train, cv=5))

0.7721228878648233

In [13]:
knn = KNeighborsClassifier()
np.mean(cross_val_score(knn, x_train, y_train, cv=5))

0.8105497183819764

In [15]:
tree_params = {'max_depth': np.arange(1, 11), 'max_features': [0.5, 0.7, 1.0]}

In [17]:
tree_grid = GridSearchCV(dt, tree_params, cv=5, n_jobs=-1)

In [22]:
tree_grid.fit(x_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=17,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]), 'max_features': [0.5, 0.7, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [23]:
tree_grid.best_params_

{'max_depth': 3, 'max_features': 0.5}

In [24]:
tree_grid.best_score_

0.8330658105939005

In [25]:
knn_params = {'n_neighbors': list(range(9, 30)) + list(range(50, 100, 10))}

In [27]:
knn_grid = GridSearchCV(knn, knn_params, cv=5, n_jobs=-1)

In [28]:
%%time
knn_grid.fit(x_train, y_train)

CPU times: user 327 ms, sys: 8.87 ms, total: 335 ms
Wall time: 4.88 s


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_neighbors': [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 50, 60, 70, 80, 90]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [29]:
knn_grid.best_params_

{'n_neighbors': 12}

In [30]:
knn_grid.best_score_

0.8282504012841091

In [31]:
tree_grid.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=0.5, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=17,
            splitter='best')

In [32]:
tree_preds = tree_grid.predict(x_test)

In [34]:
accuracy_score(tree_preds, y_test)

0.7611940298507462

In [36]:
knn_grid.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=12, p=2,
           weights='uniform')

In [37]:
knn_preds = knn_grid.predict(x_test)

In [38]:
accuracy_score(knn_preds, y_test)

0.7686567164179104

In [35]:
1 - np.mean(y)

0.6161616161616161

In [None]:
dt.fit(x, y)
predictions = dt.predict(test_x)

In [0]:
output = pd.DataFrame({'PassengerId': test_df.PassengerId,
                       'Survived': predictions})
output.to_csv('submission.csv', index=False)