In [7]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import mglearn

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

from IPython.display import display

%matplotlib inline

In [8]:
# Load the training CSV, using the PassengerId column as the row index.
original = pd.read_csv('train.csv', index_col='PassengerId')
# Bare last expression so the notebook renders the frame.
original

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [9]:
# Work on a copy so feature engineering never touches `original`,
# which later cells reuse to show raw passenger rows.
data = original.copy()

def transform_data(data):
    """Turn a raw Titanic passenger frame into the numeric feature frame used for modelling.

    The input frame is left untouched; a transformed copy is returned.

    Steps, in order:
      * extract the honorific from ``Name`` (the word preceding the first
        ``.``), fold ``Mlle``/``Ms`` into ``Miss`` and ``Mme`` into ``Mrs``,
        and collapse a fixed list of uncommon honorifics into ``'Rare'``;
      * fill missing ``Age`` with the mean age of the passengers sharing the
        same title, falling back to the overall mean age when a title has no
        known ages at all;
      * fill missing ``Embarked`` with ``'S'``;
      * add 0.0/1.0 indicator columns for sex, title, port, passenger class,
        child/adult (threshold 18) and family-size bucket
        (``Parch + SibSp + 1``: 1, 2-4, 5+);
      * replace ``Fare`` and ``Age`` by their quartile index (0-3);
      * drop the raw columns that were encoded or are unused.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``Name``, ``Sex``, ``Age``, ``SibSp``,
        ``Parch``, ``Ticket``, ``Fare``, ``Cabin``, ``Embarked`` and
        ``Pclass``. Any other column (e.g. ``Survived``) passes through
        unchanged.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index as ``data``.

    Raises
    ------
    ValueError
        Propagated from ``pd.qcut`` when ``Fare`` or ``Age`` does not have
        enough distinct values to form four quantile bins.

    Notes
    -----
    The per-title mean ages and the quartile edges are computed from the
    frame that is passed in, so two separate calls (for instance one on the
    training file and one on the test file) use different statistics.
    """
    # Never modify the caller's frame: everything below works on a private copy.
    data = data.copy()

    # Raw string: '\.' is not a valid escape in a normal string literal.
    title = data.Name.str.extract(r'([A-Za-z]+)\.', expand=False)
    title = title.replace({'Mlle': 'Miss', 'Mme': 'Mrs', 'Ms': 'Miss'})

    rare_titles = ['Dr', 'Rev', 'Col', 'Major', 'Countess', 'Don', 'Jonkheer', 'Capt', 'Lady', 'Sir']
    title = title.replace(rare_titles, 'Rare')

    # Impute age from the mean of each title group. Mapping through the
    # group means works for whatever titles are present, so a frame without
    # (say) any 'Rare' passenger no longer raises KeyError.
    age_by_title = data.Age.groupby(title).mean()
    overall_mean_age = data.Age.mean()
    data['Age'] = data.Age.fillna(title.map(age_by_title)).fillna(overall_mean_age)

    data['Embarked'] = data.Embarked.fillna('S')

    family_size = data.Parch + data.SibSp + 1

    # Indicator columns are appended in a fixed order so that every frame
    # produced by this function has the same column layout.
    data['IsMale'] = (data.Sex == 'male').astype(float)
    for title_name in ('Mr', 'Mrs', 'Miss', 'Master', 'Rare'):
        data['Is' + title_name] = (title == title_name).astype(float)
    for port in ('C', 'Q', 'S'):
        data['Embarked' + port] = (data.Embarked == port).astype(float)

    data['Fare'] = pd.qcut(data['Fare'], 4, labels=[0, 1, 2, 3])

    for pclass in (1, 2, 3):
        data['Class%d' % pclass] = (data.Pclass == pclass).astype(float)

    # Child/adult flags must be derived before Age is replaced by its quartile.
    data['IsChild'] = (data.Age < 18).astype(float)
    data['IsAdult'] = (18 <= data.Age).astype(float)

    data['Age'] = pd.qcut(data['Age'], 4, labels=[0, 1, 2, 3])

    data['IsAlone'] = (family_size == 1).astype(float)
    data['IsSmallFamily'] = ((2 <= family_size) & (family_size < 5)).astype(float)
    data['IsLargeFamily'] = (5 <= family_size).astype(float)

    # Raw columns that have been encoded above or are not used as features.
    data = data.drop(
        ['Cabin', 'Parch', 'SibSp', 'Sex', 'Embarked', 'Name', 'Ticket', 'Pclass'],
        axis=1,
    )

    return data

# Replace the raw copy with its engineered-feature version.
data = transform_data(data)

In [10]:
# Target column first, then everything else as the feature matrix.
y = data.Survived
X = data.drop(['Survived'], axis=1)

# Hold-out split with a fixed seed, stratified on the target so both parts
# keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=0,
)

In [11]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression with its default settings, scored on both
# sides of the hold-out split.
model = LogisticRegression()
model.fit(X_train, y_train)
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)
print("train score:", train_accuracy)
print("test score: ", test_accuracy)

# 5-fold grid search over the regularisation strength C. The grid key is
# '<step name>__<parameter>', so it must match the pipeline step name below.
pipeline = Pipeline(steps=[('regression', LogisticRegression())])
grid = dict(regression__C=[0.001, 0.01, 1, 10, 100])

search = GridSearchCV(estimator=pipeline, param_grid=grid, cv=5)
search.fit(X_train, y_train)
search.best_params_

train score: 0.836826347305
test score:  0.816143497758


{'regression__C': 1}

In [12]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with a fixed seed, scored on both sides of the hold-out split.
best_RandomForest = RandomForestClassifier(
    n_estimators=10,
    max_features='sqrt',
    min_samples_split=20,
    n_jobs=8,
    random_state=0,
)
model = best_RandomForest.fit(X_train, y_train)

train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)
print("train score:", train_accuracy)
print("test score: ", test_accuracy)

train score: 0.853293413174
test score:  0.834080717489


In [13]:
import time

# Grid search over the number of trees of a random forest.
# The pipeline step is called 'regression' even though it wraps a classifier;
# the grid keys below ('regression__...') depend on that name.
pipeline = Pipeline([
    ('regression', RandomForestClassifier(random_state=0, n_jobs=8))
])

grid = {
    'regression__n_estimators': [10, 100, 1000, 2000, 3000],
    # Other max_features candidates that were left disabled: 'log2', 0.2, 0.4, 0.6, 0.8
    'regression__max_features': ['sqrt'],
    'regression__min_samples_split': [20],
}

search = GridSearchCV(pipeline, param_grid=grid, cv=5)

# Time only the search itself; perf_counter is a monotonic clock meant for
# measuring elapsed intervals.
start_time = time.perf_counter()
search.fit(X_train, y_train)
print(search.best_params_)
print("--- %s seconds ---" % (time.perf_counter() - start_time))

# Last expression of the cell, so the per-candidate results table is rendered.
pd.DataFrame(search.cv_results_)

{'regression__min_samples_split': 20, 'regression__n_estimators': 10, 'regression__max_features': 'sqrt'}
--- 51.690481185913086 seconds ---


Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_regression__max_features,param_regression__min_samples_split,param_regression__n_estimators,params,rank_test_score,split0_test_score,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,0.110185,0.10278,0.835329,0.857775,sqrt,20,10,"{'regression__min_samples_split': 20, 'regress...",1,0.859259,...,0.827068,0.857944,0.819549,0.865421,0.819549,0.863551,0.002179,0.000277,0.016618,0.006794
1,0.166689,0.103134,0.832335,0.860777,sqrt,20,100,"{'regression__min_samples_split': 20, 'regress...",4,0.851852,...,0.819549,0.859813,0.834586,0.861682,0.819549,0.859813,0.006212,0.000763,0.012063,0.003726
2,1.291487,0.203593,0.832335,0.861148,sqrt,20,1000,"{'regression__min_samples_split': 20, 'regress...",4,0.866667,...,0.819549,0.859813,0.819549,0.86729,0.819549,0.859813,0.007331,0.000466,0.018393,0.005722
3,2.458392,0.303967,0.833832,0.859651,sqrt,20,2000,"{'regression__min_samples_split': 20, 'regress...",2,0.866667,...,0.819549,0.856075,0.827068,0.865421,0.819549,0.861682,0.006244,0.000443,0.017579,0.005008
4,3.637843,0.505291,0.833832,0.859651,sqrt,20,3000,"{'regression__min_samples_split': 20, 'regress...",2,0.866667,...,0.819549,0.856075,0.827068,0.865421,0.819549,0.861682,0.004168,0.000455,0.017579,0.005008


In [14]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

# Standardise the features, then fit an SVC with probability estimates enabled.
pipeline = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('svm', SVC(probability=True)),
])

model = pipeline.fit(X_train, y_train)

train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)
print("train score:", train_accuracy)
print("test score: ", test_accuracy)

train score: 0.844311377246
test score:  0.820627802691


In [15]:
# Train the final model on all labelled data (no hold-out) before predicting
# the submission file.
# NOTE(review): X contains the X_test rows, so once this cell has run, any
# later cell that evaluates `model` on X_test is scoring rows the model was
# trained on, not held-out data -- confirm that this is intended.
# The trailing numbers look like submission scores for each variant -- TODO confirm.
# Alternative (SVM pipeline):
# model = pipeline.fit(X, y) #0.80382
model = best_RandomForest.fit(X, y) #0.80861 (grid-search-best) - 0.79904 (random-split-overfit)

In [16]:
# Load the unlabeled test file, indexed by PassengerId.
test = pd.read_csv('test.csv', index_col=['PassengerId'])
# Fill missing fares with the mean fare of this file before the transform
# bins Fare into quartiles.
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())
test = transform_data(test)

# Show the remaining null counts per column. As a bare mid-cell expression
# this result was silently discarded, so display it explicitly.
display(test.isnull().sum().sort_values())

predictions = model.predict(test)

# The PassengerId values are already the index of `test`; reuse them instead
# of reading the CSV a second time.
frame = pd.DataFrame({'Survived': predictions}, index=test.index)
frame.to_csv('predictions.csv')
frame

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,1
897,0
898,1
899,0
900,1
901,0


In [17]:
# Raw passenger rows from the X_test split that survived (1) but were predicted 0.
# NOTE(review): an earlier cell refits `model` on the full X, y (which contains
# the X_test rows), so these are errors on data the model was trained on --
# confirm that this is intended.

# Predict once and keep the passenger index, so predictions can be matched to
# rows by label instead of by position.
test_predictions = pd.Series(model.predict(X_test), index=X_test.index)

wrong_predictions = (test_predictions != y_test) & (y_test == 1)
wrong_predictions_index = y_test.index[wrong_predictions]

# Look the rows up directly in `original`; `data` (the engineered feature
# frame) is deliberately not rebound here. Rows come back in the order of
# `original.index`, and `assign` aligns the Series on PassengerId.
wrong_ids = original.index.intersection(wrong_predictions_index)
wrongly_predicted = original.loc[wrong_ids].assign(
    Predicted=test_predictions.loc[wrong_ids]
)
wrongly_predicted

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Predicted
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
24,1,1,"Sloper, Mr. William Thompson",male,28.0,0,0,113788,35.5,A6,S,0
37,1,3,"Mamee, Mr. Hanna",male,,0,0,2677,7.2292,,C,0
128,1,3,"Madsen, Mr. Fridtjof Arne",male,24.0,0,0,C 17369,7.1417,,S,0
208,1,3,"Albimona, Mr. Nassef Cassem",male,26.0,0,0,2699,18.7875,,C,0
210,1,1,"Blank, Mr. Henry",male,40.0,0,0,112277,31.0,A31,C,0
225,1,1,"Hoyt, Mr. Frederick Maxfield",male,38.0,1,0,19943,90.0,C93,S,0
234,1,3,"Asplund, Miss. Lillian Gertrud",female,5.0,4,2,347077,31.3875,,S,0
268,1,3,"Persson, Mr. Ernst Ulrik",male,25.0,1,0,347083,7.775,,S,0
284,1,3,"Dorking, Mr. Edward Arthur",male,19.0,0,0,A/5. 10482,8.05,,S,0
289,1,2,"Hosono, Mr. Masabumi",male,42.0,0,0,237798,13.0,,S,0


In [18]:
# Raw passenger rows from the X_test split that did not survive (0) but were predicted 1.
# NOTE(review): an earlier cell refits `model` on the full X, y (which contains
# the X_test rows), so these are errors on data the model was trained on --
# confirm that this is intended.

# Predict once and keep the passenger index, so predictions can be matched to
# rows by label instead of by position.
test_predictions = pd.Series(model.predict(X_test), index=X_test.index)

wrong_predictions = (test_predictions != y_test) & (y_test == 0)
wrong_predictions_index = y_test.index[wrong_predictions]

# Look the rows up directly in `original`; `data` (the engineered feature
# frame) is deliberately not rebound here. Rows come back in the order of
# `original.index`, and `assign` aligns the Series on PassengerId.
wrong_ids = original.index.intersection(wrong_predictions_index)
wrongly_predicted = original.loc[wrong_ids].assign(
    Predicted=test_predictions.loc[wrong_ids]
)
wrongly_predicted

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Predicted
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
19,0,3,"Vander Planke, Mrs. Julius (Emelia Maria Vande...",female,31.0,1,0,345763,18.0,,S,1
115,0,3,"Attalah, Miss. Malake",female,17.0,0,0,2627,14.4583,,C,1
200,0,2,"Yrois, Miss. Henriette (""Mrs Harbeck"")",female,24.0,0,0,248747,13.0,,S,1
420,0,3,"Van Impe, Miss. Catharina",female,10.0,0,2,345773,24.15,,S,1
499,0,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,1
594,0,3,"Bourke, Miss. Mary",female,,0,2,364848,7.75,,Q,1
655,0,3,"Hegarty, Miss. Hanora ""Nora""",female,18.0,0,0,365226,6.75,,Q,1


In [19]:
# Class-probability rows for the X_test passengers who survived but were
# predicted as not surviving.
test_probabilities = model.predict_proba(X_test)
missed_survivors = (model.predict(X_test) != y_test) & (y_test == 1)
test_probabilities[missed_survivors]

array([[ 0.81383838,  0.18616162],
       [ 0.81753681,  0.18246319],
       [ 0.95168692,  0.04831308],
       [ 0.8916666 ,  0.1083334 ],
       [ 0.61370075,  0.38629925],
       [ 0.52603844,  0.47396156],
       [ 0.84664501,  0.15335499],
       [ 0.56626783,  0.43373217],
       [ 0.59002699,  0.40997301],
       [ 0.85944914,  0.14055086],
       [ 0.78552868,  0.21447132],
       [ 0.65127411,  0.34872589],
       [ 0.91318903,  0.08681097],
       [ 0.90930309,  0.09069691],
       [ 0.92841261,  0.07158739],
       [ 0.84185277,  0.15814723],
       [ 0.65772689,  0.34227311],
       [ 0.78410566,  0.21589434],
       [ 0.97174242,  0.02825758],
       [ 0.61370075,  0.38629925],
       [ 0.92991174,  0.07008826],
       [ 0.84266261,  0.15733739],
       [ 0.65127411,  0.34872589],
       [ 0.60622461,  0.39377539],
       [ 0.76955051,  0.23044949]])