In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv('train.csv', dtype={'Age': np.float64})
test = pd.read_csv('test.csv', dtype={'Age': np.float64})
train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [3]:
train_corr = train.corr()
train_corr

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.036847,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,0.036847,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.057527,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


In [4]:
def correct_data(data):
    data.Age = data.Age.fillna(data.Age.median())
    data.Sex = data.Sex.replace(['male', 'female'], [0, 1])
    data.Embarked = data.Embarked.fillna('S')
    data.Embarked = data.Embarked.replace(['C', 'S', 'Q'], [0 ,1, 2])
    data.Fare = data.Fare.fillna(data.Fare.median())
    return data

train_data = correct_data(train)
test = correct_data(test)

In [5]:
train_corr = train.corr()
train_corr

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,1.0,-0.005007,-0.035144,-0.042939,0.034212,-0.057527,-0.001652,0.012658,-0.017443
Survived,-0.005007,1.0,-0.338481,0.543351,-0.06491,-0.035322,0.081629,0.257307,-0.125953
Pclass,-0.035144,-0.338481,1.0,-0.1319,-0.339898,0.083081,0.018443,-0.5495,0.305762
Sex,-0.042939,0.543351,-0.1319,1.0,-0.081163,0.114631,0.245489,0.182333,-0.022521
Age,0.034212,-0.06491,-0.339898,-0.081163,1.0,-0.233296,-0.172482,0.096688,-0.040166
SibSp,-0.057527,-0.035322,0.083081,0.114631,-0.233296,1.0,0.414838,0.159651,0.030874
Parch,-0.001652,0.081629,0.018443,0.245489,-0.172482,0.414838,1.0,0.216225,-0.035957
Fare,0.012658,0.257307,-0.5495,0.182333,0.096688,0.159651,0.216225,1.0,-0.268865
Embarked,-0.017443,-0.125953,0.305762,-0.022521,-0.040166,0.030874,-0.035957,-0.268865,1.0


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import cross_val_score

In [8]:
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
models = [
    ('LogisticRegression', LogisticRegression()),
    ('SVC', SVC()),
    ('LinearSVC', LinearSVC()),
    ('KNeighbors', KNeighborsClassifier()),
    ('DecisionTree', DecisionTreeClassifier()),
    ('RandomForest', RandomForestClassifier()),
    ('MLPClassifier', MLPClassifier(solver='lbfgs', random_state=0))
]

In [9]:
results = []
names = []
for name, model in models:
    result = cross_val_score(model, train_data[predictors], train_data['Survived'], cv=3)
    names.append(name)
    results.append(result)

In [10]:
for i, name in enumerate(names):
    print(name, results[i].mean())

LogisticRegression 0.785634118967
SVC 0.687991021324
LinearSVC 0.744107744108
KNeighbors 0.701459034792
DecisionTree 0.760942760943
RandomForest 0.791245791246
MLPClassifier 0.781144781145


In [11]:
from sklearn.grid_search import GridSearchCV

parameters = {
    'n_estimators': [5, 10, 20, 30, 50, 100, 300],
    'max_depth': [3, 5, 10, 15, 20, 25, 30, 40, 50, 100],
    'random_state': [0],
}
gsc = GridSearchCV(RandomForestClassifier(), parameters, cv=3)



In [12]:
gsc.fit(train_data[predictors], train_data['Survived'])

GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [5, 10, 20, 30, 50, 100, 300], 'max_depth': [3, 5, 10, 15, 20, 25, 30, 40, 50, 100], 'random_state': [0]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [13]:
res = _

In [14]:
res

GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [5, 10, 20, 30, 50, 100, 300], 'max_depth': [3, 5, 10, 15, 20, 25, 30, 40, 50, 100], 'random_state': [0]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [15]:
type(res)

sklearn.grid_search.GridSearchCV

In [16]:
gsc.grid_scores_

[mean: 0.77329, std: 0.02370, params: {'max_depth': 3, 'n_estimators': 5, 'random_state': 0},
 mean: 0.79012, std: 0.02781, params: {'max_depth': 3, 'n_estimators': 10, 'random_state': 0},
 mean: 0.80471, std: 0.03434, params: {'max_depth': 3, 'n_estimators': 20, 'random_state': 0},
 mean: 0.79798, std: 0.02520, params: {'max_depth': 3, 'n_estimators': 30, 'random_state': 0},
 mean: 0.81145, std: 0.03170, params: {'max_depth': 3, 'n_estimators': 50, 'random_state': 0},
 mean: 0.80696, std: 0.02402, params: {'max_depth': 3, 'n_estimators': 100, 'random_state': 0},
 mean: 0.80247, std: 0.02914, params: {'max_depth': 3, 'n_estimators': 300, 'random_state': 0},
 mean: 0.80247, std: 0.02402, params: {'max_depth': 5, 'n_estimators': 5, 'random_state': 0},
 mean: 0.81594, std: 0.02822, params: {'max_depth': 5, 'n_estimators': 10, 'random_state': 0},
 mean: 0.81818, std: 0.02790, params: {'max_depth': 5, 'n_estimators': 20, 'random_state': 0},
 mean: 0.81706, std: 0.02464, params: {'max_depth'

In [17]:
gsc.best_params_

{'max_depth': 10, 'n_estimators': 300, 'random_state': 0}

In [20]:
predictions = gsc.predict(test[predictors])
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': predictions
})
submission.to_csv('submission.csv', index=False)

In [21]:
submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
5,897,0
6,898,0
7,899,0
8,900,1
9,901,0


In [24]:
train.groupby(['Sex', 'Survived']).size()

Sex  Survived
0    0           468
     1           109
1    0            81
     1           233
dtype: int64

In [26]:
train[train.Sex == train.Survived].shape[0] / train.shape[0]

0.7867564534231201

In [27]:
pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': test['Sex']
}).to_csv('submission_only_sex.csv', index=False)

In [28]:
import xgboost as xgb

In [34]:
mod = xgb.XGBClassifier()
mod.fit(train_data[predictors], train_data['Survived'])

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [36]:
result = cross_val_score(xgb.XGBClassifier(), train_data[predictors], train_data['Survived'], cv=3)
result

array([ 0.79124579,  0.82154882,  0.83164983])

In [37]:
result.mean()

0.81481481481481488

In [38]:
params = {
    'max_depth': [3, 5, 10],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5, 10, 100],
    'subsample': [0.8, 0.85, 0.9, 0.95],
    'colsample_bytree': [0.5, 1.0]
}
cv = GridSearchCV(xgb.XGBClassifier(), params, cv=3)
cv.fit(train_data[predictors], train_data['Survived'])

GridSearchCV(cv=3, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': [3, 5, 10, 100], 'learning_rate': [0.05, 0.1], 'subsample': [0.8, 0.85, 0.9, 0.95], 'colsample_bytree': [0.5, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [39]:
cv.grid_scores_

[mean: 0.81033, std: 0.01789, params: {'colsample_bytree': 0.5, 'learning_rate': 0.05, 'max_depth': 3, 'subsample': 0.8},
 mean: 0.80584, std: 0.01789, params: {'colsample_bytree': 0.5, 'learning_rate': 0.05, 'max_depth': 3, 'subsample': 0.85},
 mean: 0.80920, std: 0.01931, params: {'colsample_bytree': 0.5, 'learning_rate': 0.05, 'max_depth': 3, 'subsample': 0.9},
 mean: 0.81033, std: 0.01680, params: {'colsample_bytree': 0.5, 'learning_rate': 0.05, 'max_depth': 3, 'subsample': 0.95},
 mean: 0.81818, std: 0.02520, params: {'colsample_bytree': 0.5, 'learning_rate': 0.05, 'max_depth': 5, 'subsample': 0.8},
 mean: 0.81818, std: 0.02250, params: {'colsample_bytree': 0.5, 'learning_rate': 0.05, 'max_depth': 5, 'subsample': 0.85},
 mean: 0.81033, std: 0.02338, params: {'colsample_bytree': 0.5, 'learning_rate': 0.05, 'max_depth': 5, 'subsample': 0.9},
 mean: 0.81706, std: 0.02670, params: {'colsample_bytree': 0.5, 'learning_rate': 0.05, 'max_depth': 5, 'subsample': 0.95},
 mean: 0.82604, std:

In [40]:
predictions = cv.predict(test[predictors])
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': predictions
})
submission.to_csv('submission_xgb.csv', index=False)

In [41]:
train_data.Survived.value_counts()

0    549
1    342
Name: Survived, dtype: int64