In [1]:
import matplotlib.pyplot as plt
import re
import numpy as np
import math
import pandas as pd
import random
import seaborn as sns
import warnings
import itertools
import sklearn.neighbors
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
import sklearn.linear_model
warnings.filterwarnings("ignore")

plt.style.use('ggplot')

In [119]:
train = pd.read_csv('train.csv').set_index('PassengerId')
test = pd.read_csv('test.csv').set_index('PassengerId')
test['Survived'] = -1

train_and_test = pd.concat([train, test])

train_and_test['Rank'] = train_and_test['Cabin'].str[0]
train_and_test = pd.get_dummies(train_and_test, columns=['Sex', 'Pclass', 'Rank', 'Embarked'])

def normalize(column):
    train_and_test[column] = (train_and_test[column] - train_and_test[column].min()) / (train_and_test[column].max() - train_and_test[column].min())
    
train_and_test['LastName'] = train_and_test['Name'].apply(lambda x: x.split(',')[0])

train_and_test['Age'] = train_and_test['Age'].fillna(30)
normalize('Age')
normalize('SibSp')
normalize('Parch')
normalize('Fare')
train_and_test['Fare'] = train_and_test['Fare'].fillna(0)

train_and_test = train_and_test.drop(['Name', 'Ticket', 'Cabin'], axis=1)

train_and_test['Survived'] = train_and_test.pop('Survived')

data = train_and_test[~train_and_test.index.isin(test.index)]
test = train_and_test[~train_and_test.index.isin(data.index)]

data = data.reset_index()
test = test.reset_index()

X = data.to_numpy()

data.set_index('PassengerId').to_csv('tmd_data.csv')

In [135]:
[name for name, survived in zip(train_and_test['LastName', 'Survived']) if survived == 1]

KeyError: ('LastName', 'Survived')

In [109]:
model = sklearn.linear_model.LogisticRegression()
param_grid = {}
lin_gscv = GridSearchCV(model, param_grid, cv=5)
lin_gscv.fit(X[:, 1:-1], X[:, -1])
print("%s: %s score: %s" %  ("LogReg", str(lin_gscv.best_params_), str(lin_gscv.best_score_)))

LogReg: {} score: 0.7968740192078339


In [100]:
model = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=2)
param_grid = {
    "n_estimators": range(500, 1000, 100),
    "max_depth": range(2, 5)
}
rfc_gscv = GridSearchCV(model, param_grid, cv=5, n_jobs=-1)
rfc_gscv.fit(X[:, 1:-1], X[:, -1])
print("%s: %s score: %s" %  ("Forest", str(rfc_gscv.best_params_), str(rfc_gscv.best_score_)))

Forest: {'max_depth': 4, 'n_estimators': 600} score: 0.8181783943255289


In [113]:
model = MLPClassifier(random_state=1, activation='relu')
param_grid = {
    'max_iter': np.arange(100, 500, step=100),
    'solver': ['lbfgs', 'adam'],
#    'hidden_layer_sizes': list(range(4, 10)) + list(itertools.product(range(4, 12), range(1, 8)))
    'hidden_layer_sizes': list(itertools.product(range(18, 24), range(4, 8)))
}
mlp_gscv = GridSearchCV(model, param_grid, cv=5, n_jobs=-1)
mlp_gscv.fit(X[:, 1:-1], X[:, -1])
print("%s: %s score: %s" %  ('MLP', str(mlp_gscv.best_params_), str(mlp_gscv.best_score_)))
# 0.8271

MLP: {'hidden_layer_sizes': (23, 5), 'max_iter': 300, 'solver': 'adam'} score: 0.827185989580064


In [114]:
def predict(model):
    test_data = test.to_numpy()
    predicted = mlp_gscv.predict(test_data[:, 1:-1])
    predicted = np.c_[test_data[:, 0], predicted]

    predicted = pd.DataFrame(data=predicted, columns=['PassengerId', 'Survived'])

    predicted['PassengerId'] = predicted['PassengerId'].astype(int)
    predicted['Survived'] = predicted['Survived'].astype(int)
    return predicted.set_index('PassengerId').sort_values(by=['PassengerId'])

predict_lin = predict(lin_gscv)
predict_rfc = predict(rfc_gscv)
predict_mpl = predict(mlp_gscv)

predict_mpl.to_csv('my_submission.csv')

predict_lin['From'] = 'lin'
predict_rfc['From'] = 'rfc'
predict_mpl['From'] = 'mlp'

pd.concat([predict_lin, predict_rfc, predict_mpl]).sort_values(by=['PassengerId']).drop_duplicates('Survived')

Unnamed: 0_level_0,Survived,From
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
892,0,lin
898,1,rfc
