In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import GridSearchCV

In [2]:
cwd = os.getcwd()
path = os.path.join(cwd,"data")

fn =  os.path.join(path , "train.csv")
dataTrain = pd.read_csv(fn, index_col = "PassengerId")

fn = os.path.join(path , "test.csv")
X_test = pd.read_csv(fn)

TestPassengerId = X_test.PassengerId
X_test.set_index("PassengerId", inplace=True)

In [3]:
y_train = dataTrain["Survived"]
X_train = dataTrain.drop("Survived", axis = 1)

In [4]:
# I decided to preprocess both Test and Train datasets simultaneously

X = pd.concat([X_train, X_test])
X.isnull().sum()

Pclass         0
Name           0
Sex            0
Age          263
SibSp          0
Parch          0
Ticket         0
Fare           1
Cabin       1014
Embarked       2
dtype: int64

In [5]:
#Extracting Title from the name column

X["Title"] = X["Name"].str.extract("([A-Za-z]+)\.")
X["Title"].value_counts()

Mr          757
Miss        260
Mrs         197
Master       61
Dr            8
Rev           8
Col           4
Ms            2
Major         2
Mlle          2
Sir           1
Don           1
Lady          1
Capt          1
Dona          1
Mme           1
Jonkheer      1
Countess      1
Name: Title, dtype: int64

In [6]:
X[X["Age"].isnull()].groupby("Title")["Name"].count()

Title
Dr          1
Master      8
Miss       50
Mr        176
Mrs        27
Ms          1
Name: Name, dtype: int64

In [7]:
#Filling missing Age values with average age of people with the same title

X["Age"] = X.groupby(["Title"])["Age"].transform(lambda x: x.fillna(x.median())) 

In [8]:
# Merging some titles to common value

X["Title"].replace(to_replace = ["Rev","Col", "Major", "Capt"], value = "Crew", inplace = True)
X["Title"].replace(to_replace = ["Dona", "Jonkheer", "Countess", "Sir", "Lady", "Don", "Dr"], value = "Elite", inplace = True)
X["Title"].replace({"Mlle":"Miss", "Ms":"Miss", "Mme":"Mrs"}, inplace = True)

# Separating kids and seniors from others (Improved my Kaggle score by ~ 1%)

X.loc[X['Age'] <= 7, 'Title'] = "Kid"      # 18 dead, 36 survived
X.loc[X['Age'] >= 59, 'Title'] = 'Senior'  # 21 dead, 7 survived 

In [9]:
X["Title"].value_counts()

Mr        728
Miss      232
Mrs       187
Kid        74
Senior     43
Master     18
Elite      14
Crew       13
Name: Title, dtype: int64

In [10]:
X[X["Fare"].isnull()]

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S,Senior


In [11]:
# Filling missing Fare value with average price of the same traveling condition

X.at[1044, "Fare"] = X[(X["Pclass"] == 3) & (X["Embarked"] == "S") & (X["SibSp"] == 0) & (X["Parch"] == 0)]["Fare"].median()

In [12]:
# Deciding to keep only deck instead of whole cabin number. E.g. C86 => C; A13 => A

X["Cabin"] = X["Cabin"].str[0] 

In [13]:
# Passengers with the same ticket number probably were traveling in the same deck

X["Cabin"] = X[["Ticket", "Cabin"]].groupby("Ticket")["Cabin"].transform(lambda x: x.fillna(x.unique()[0]))

In [14]:
# Other missing deck values were filled out with PClass number

X["Cabin"].fillna(X["Pclass"], inplace = True)

In [15]:
X["Cabin"].value_counts()

3    691
2    253
C     99
B     67
1     60
D     46
E     42
F     23
A     22
G      5
T      1
Name: Cabin, dtype: int64

In [16]:
X[X["Embarked"].isnull()]

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
62,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B,,Miss
830,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B,,Senior


In [17]:
X[(X["Fare"] > 79.5) & (X["Fare"] < 80.5)]

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
62,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B,,Miss
263,1,"Taussig, Mr. Emil",male,52.0,1,1,110413,79.65,E,S,Mr
559,1,"Taussig, Mrs. Emil (Tillie Mandelbaum)",female,39.0,1,1,110413,79.65,E,S,Mrs
586,1,"Taussig, Miss. Ruth",female,18.0,0,2,110413,79.65,E,S,Miss
830,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B,,Senior


In [18]:
# Filling missing port of embarkation values basing on the price of ticket

X["Embarked"].fillna("S", inplace = True)

In [19]:
X["FamilySize"] = X["SibSp"] + X["Parch"]

In [20]:
OHE_Cabin = pd.get_dummies(X["Cabin"])
OHE_Cabin.drop([1,2,3], axis = 1, inplace = True)
OHE_Sex = pd.get_dummies(X["Sex"])
OHE_Title = pd.get_dummies(X["Title"])
OHE_Embarked = pd.get_dummies(X["Embarked"])

# There may be two features named "C" - OHE_Cabin and OHE_Embarked (XGBoost does not deal with it)

OHE_Embarked.rename(columns ={"C":"CH"}, inplace = True)
OHE_PClass = pd.get_dummies(X["Pclass"])

scaler = StandardScaler()
X[["Age","Fare", "FamilySize"]] = scaler.fit_transform(X[["Age","Fare", "FamilySize"]])
X_new = pd.concat([X[["Age","Fare", "FamilySize"]], OHE_PClass, OHE_Cabin, OHE_Sex, OHE_Title, OHE_Embarked], axis = 1, sort = False)
X_new.columns

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Index([       'Age',       'Fare', 'FamilySize',            1,            2,
                  3,          'A',          'B',          'C',          'D',
                'E',          'F',          'G',          'T',     'female',
             'male',       'Crew',      'Elite',        'Kid',     'Master',
             'Miss',         'Mr',        'Mrs',     'Senior',         'CH',
                'Q',          'S'],
      dtype='object')

In [21]:
X_train = X_new.iloc[:891]
X_test = X_new.iloc[891:]

## MLP Classifier

In [22]:
from sklearn.neural_network import MLPClassifier

mlpc = MLPClassifier(random_state=1912, max_iter = 500)

# parameter_space = {
#     "hidden_layer_sizes": [(50,50,50), (50,100,50), (100,), (50,100), (75,50), (25,50), (50,25)], 
#     "solver": ["sgd", "adam", "lbfgs"],
#     "alpha": [0.0001, 0.001, 0.01, 0.05, 0.1, 0.5, 1, 2, 3, 5, 7, 9]
# }

# Best pamams found after running GridSearch on set above:
parameter_space = {"alpha": [7], "hidden_layer_sizes": [(50, 100, 50)], "solver": ["lbfgs"]} 

mlpc_gs = GridSearchCV(mlpc, parameter_space, n_jobs=-1, cv=3)
mlpc_gs.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=1912, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'alpha': [7], 'hidden_layer_sizes': [(50, 100, 50)], 'solver': ['lbfgs']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [23]:
print("Best parameters found for MLPC: %r \nMax Score = %0.03f" %(mlpc_gs.best_params_, mlpc_gs.best_score_))

# means = mlpc_gs.cv_results_["mean_test_score"]
# stds = mlpc_gs.cv_results_["std_test_score"]
# for mean, std, params in zip(means, stds, mlpc_gs.cv_results_["params"]):
#     print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

Best parameters found for MLPC: {'alpha': 7, 'hidden_layer_sizes': (50, 100, 50), 'solver': 'lbfgs'} 
Max Score = 0.832


## SVC

In [24]:
from sklearn.svm import SVC

svc = SVC(random_state=1912)

# That's what left after running certain amount of GridSearches: 
parameter_space = {
    "C": [0.01],
    "gamma": [1],
    "kernel": ["poly"],
    "degree": [2]
}

svc_gs = GridSearchCV(svc, parameter_space, n_jobs=-1, cv=3)
svc_gs.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=1912,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'C': [0.01], 'gamma': [1], 'kernel': ['poly'], 'degree': [2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [25]:
print("Best parameters found for SVC: %r \nMax Score = %0.03f" %(svc_gs.best_params_, svc_gs.best_score_))

# means = svc_gs.cv_results_["mean_test_score"]
# stds = svc_gs.cv_results_["std_test_score"]
# for mean, std, params in zip(means, stds, svc_gs.cv_results_["params"]):
#     print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

Best parameters found for SVC: {'C': 0.01, 'degree': 2, 'gamma': 1, 'kernel': 'poly'} 
Max Score = 0.834


## GBC

In [26]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(random_state = 1912)

# That's what left after running certain amount of GridSearches: 
parameter_space = { 
    "learning_rate": [0.05],
    "max_depth": [3],
    "n_estimators": [410]
}

gbc_gs = GridSearchCV(gbc, parameter_space, n_jobs=-1, cv=3)
gbc_gs.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'learning_rate': [0.05], 'max_depth': [3], 'n_estimators': [410]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [27]:
print("Best parameters found for GBC: %r \nMax Score = %0.03f" %(gbc_gs.best_params_, gbc_gs.best_score_))

# means = gbc_gs.cv_results_["mean_test_score"]
# stds = gbc_gs.cv_results_["std_test_score"]
# for mean, std, params in zip(means, stds, gbc_gs.cv_results_["params"]):
#     print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

Best parameters found for GBC: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 410} 
Max Score = 0.835


## XGBC

In [28]:
from xgboost import XGBClassifier
xgbc = XGBClassifier(random_state = 1912)

# That's what left after running certain amount of GridSearches: 
parameter_space = {
    'n_estimators': [625],
    'learning_rate': [0.01],
    'subsample': [0.9],
    'max_depth': [8],
    'colsample_bytree': [0.5],
    'min_child_weight': [5],
}

xgbc_gs = GridSearchCV(xgbc, parameter_space, n_jobs=-1, cv=3)
xgbc_gs.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=1912, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [625], 'learning_rate': [0.01], 'subsample': [0.9], 'max_depth': [8], 'colsample_bytree': [0.5], 'min_child_weight': [5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [29]:
print("Best parameters found for XGBC: %r \nMax Score = %0.03f" %(xgbc_gs.best_params_, xgbc_gs.best_score_))

# means = xgbc_gs.cv_results_["mean_test_score"]
# stds = xgbc_gs.cv_results_["std_test_score"]
# for mean, std, params in zip(means, stds, xgbc_gs.cv_results_["params"]):
#     print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))


Best parameters found for XGBC: {'colsample_bytree': 0.5, 'learning_rate': 0.01, 'max_depth': 8, 'min_child_weight': 5, 'n_estimators': 625, 'subsample': 0.9} 
Max Score = 0.837


## Random Forest

In [30]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state = 1912)

# That's what left after running certain amount of GridSearches: 
parameter_space = {
    "criterion": ["gini"],
    "bootstrap": [True],
    "max_depth": [20],
    "max_features": ["auto"],
    "min_samples_leaf": [2],
    "min_samples_split": [7],
    "n_estimators": [25]
}
 
rf_gs = GridSearchCV(rf, parameter_space, n_jobs=-1, cv=3)
rf_gs.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=1912, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'criterion': ['gini'], 'bootstrap': [True], 'max_depth': [20], 'max_features': ['auto'], 'min_samples_leaf': [2], 'min_samples_split': [7], 'n_estimators': [25]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [31]:
print("Best parameters found for RF: %r \nMax Score = %0.03f" %(rf_gs.best_params_, rf_gs.best_score_))

# means = rf_gs.cv_results_["mean_test_score"]
# stds = rf_gs.cv_results_["std_test_score"]
# for mean, std, params in zip(means, stds, rf_gs.cv_results_["params"]):
#     print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

Best parameters found for RF: {'bootstrap': True, 'criterion': 'gini', 'max_depth': 20, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 7, 'n_estimators': 25} 
Max Score = 0.840


## Logistic Regression

In [32]:
from sklearn.linear_model import LogisticRegression

lrg = LogisticRegression(max_iter = 5000, random_state = 1912)

# That's what left after running certain amount of GridSearches: 
parameter_space = {
    "C": [0.4], 
    "tol": [0.1],
    "solver": ["lbfgs"]
}

lrg_gs = GridSearchCV(lrg, parameter_space, n_jobs=-1, cv=3)
lrg_gs.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=5000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=1912, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'C': [0.4], 'tol': [0.1], 'solver': ['lbfgs']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [33]:
print("Best parameters found for LR: %r \nMax Score = %0.03f" %(lrg_gs.best_params_, lrg_gs.best_score_))

# means = lrg_gs.cv_results_["mean_test_score"]
# stds = lrg_gs.cv_results_["std_test_score"]
# for mean, std, params in zip(means, stds, lrg_gs.cv_results_["params"]):
#     print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

Best parameters found for LR: {'C': 0.4, 'solver': 'lbfgs', 'tol': 0.1} 
Max Score = 0.829


## Model selection and prediction

In [34]:
scores = pd.DataFrame({"Accuracy": [mlpc_gs.best_score_, svc_gs.best_score_, gbc_gs.best_score_, 
                                       xgbc_gs.best_score_, rf_gs.best_score_, lrg_gs.best_score_]})

scores.index = ["MLPC", "SVC", "GBC", "XGBC", "RF", "LR"]
scores.sort_values(by = "Accuracy", ascending = False)

Unnamed: 0,Accuracy
RF,0.839506
XGBC,0.837262
GBC,0.835017
SVC,0.833895
MLPC,0.83165
LR,0.829405


<b><i>I'll use XGBoost as a main predictor. 
    
However, I decided to make a "hybryd" model - if XGBoost predicts other value than another 5 models, it is probably wrong.</i></b>

In [35]:
model_prediction = pd.DataFrame({"MLPC":mlpc_gs.best_estimator_.predict(X_test), "SVC":svc_gs.best_estimator_.predict(X_test), 
                                 "GBC":gbc_gs.best_estimator_.predict(X_test), "XGBC":xgbc_gs.best_estimator_.predict(X_test),
                                 "RF":rf_gs.best_estimator_.predict(X_test), "LR":lrg_gs.best_estimator_.predict(X_test)})

In [36]:
model_prediction["temp"] = (model_prediction["RF"] + model_prediction["GBC"] + model_prediction["SVC"] +
                            model_prediction["MLPC"] + model_prediction["SVC"])

In [37]:
model_prediction["Survived"] = model_prediction.apply(lambda x: 1 if x["XGBC"] == 0 and x["temp"] >= 5 else 
                                                      (0 if x["XGBC"] == 1 and x["temp"] == 0 else x["XGBC"]), axis = 1)

In [38]:
# Table shows where XGBoost is possibly wrong
model_prediction[model_prediction["XGBC"]!=model_prediction["Survived"]]

Unnamed: 0,MLPC,SVC,GBC,XGBC,RF,LR,temp,Survived
21,1,1,1,0,1,0,5,1
28,0,0,0,1,0,0,0,0
75,0,0,0,1,0,1,0,0
118,0,0,0,1,0,0,0,0
192,1,1,1,0,1,0,5,1
199,1,1,1,0,1,1,5,1
331,0,0,0,1,0,0,0,0
359,1,1,1,0,1,1,5,1
392,1,1,1,0,1,0,5,1


In [39]:
pred = model_prediction["Survived"]
output = pd.concat([TestPassengerId, pred], axis = 1)
output["Survived"] = output["Survived"].astype('str')
fn = os.path.join(path , 'predictions.csv')
output.to_csv(fn, index = False)