In [119]:
import matplotlib.pyplot as plt
import re
import numpy as np
import math
import pandas as pd
import random
import seaborn as sns
import warnings
import itertools
import sklearn.neighbors
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
import sklearn.linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings("ignore")

plt.style.use('ggplot')

In [128]:
train = pd.read_csv('train.csv').set_index('PassengerId')
target = pd.read_csv('test.csv').set_index('PassengerId')
target['Survived'] = 0

data = pd.concat([train, target])

data['Rank'] = data['Cabin'].str[0]
data = pd.get_dummies(data, columns=['Sex', 'Pclass', 'Rank', 'Embarked'])

def normalize(column):
    data[column] = StandardScaler().fit_transform(data[column].to_numpy().reshape(-1, 1))
#     data[column] = (data[column] - data[column].mean()) / (data[column].max() - data[column].min())
    
data['LastName'] = data['Name'].apply(lambda x: x.split(',')[0])

families_total = dict(data.groupby('LastName')['Survived'].count())
families_survived = dict(data.groupby('LastName')['Survived'].sum())
data['FamilySize'] = data['LastName'].apply(lambda name: families_total.get(name, 0))
data['FamilySurvived'] = data['LastName'].apply(lambda name: families_survived.get(name, 0))
normalize('FamilySize')
normalize('FamilySurvived')


data['Age'] = data['Age'].fillna(data['Age'].mean())
normalize('Age')
normalize('SibSp')
normalize('Parch')
data['Fare'] = data['Fare'].fillna(data['Fare'].mean())
normalize('Fare')

data = data.drop(['Name', 'Ticket', 'Cabin', 'SibSp', 'Parch', 'LastName'], axis=1)

data['Survived'] = data.pop('Survived')
data.to_csv('tmp_data.csv')

train = data[data.index.isin(train.index)]
test = data[data.index.isin(target.index)]
test = test.drop('Survived', axis=1)

train = train.reset_index()
test = test.reset_index()

X_train, X_test, y_train, y_test = train_test_split(train.drop(['PassengerId', 'Survived'], axis=1).to_numpy(), train['Survived'].to_numpy(), test_size=0.01)

In [129]:
model = sklearn.linear_model.LogisticRegression()
param_grid = {}
lin_gscv = GridSearchCV(model, param_grid, cv=5)
lin_gscv.fit(X_train, y_train)
lin_gscv_score = lin_gscv.score(X_test, y_test)

print("%s: %s score: %s  (on train: %s)" %  ("LogReg", str(lin_gscv.best_params_), str(lin_gscv_score), str(lin_gscv.best_score_)))

LogReg: {} score: 0.8888888888888888  (on train: 0.9342321520287623)


In [133]:
model = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=2)
param_grid = {
    "n_estimators": range(10, 50, 5),
    "max_features": range(4, 100, 5),
    "max_depth": range(5, 10)
}
rfc_gscv = GridSearchCV(model, param_grid, cv=10, n_jobs=-1)
rfc_gscv.fit(X_train, y_train)
rfc_gscv_score = rfc_gscv.score(X_test, y_test)

print("%s: %s score: %s  (on train: %s)" %  ("Forest", str(rfc_gscv.best_params_), str(rfc_gscv_score), str(rfc_gscv.best_score_)))

Forest: {'max_depth': 8, 'max_features': 19, 'n_estimators': 15} score: 0.8888888888888888  (on train: 0.9659729315628193)


In [111]:
model = MLPClassifier(random_state=1, activation='relu')
param_grid = {
    'max_iter': np.arange(1, 100, step=10),
    'solver': ['lbfgs', 'adam'],
#    'hidden_layer_sizes': list(range(4, 10)) + list(itertools.product(range(4, 12), range(1, 8)))
    'hidden_layer_sizes': list(itertools.product(range(18, 24), range(4, 8)))
}
mlp_gscv = GridSearchCV(model, param_grid, cv=5, n_jobs=-1)
mlp_gscv.fit(X_train, y_train)
mlp_gscv_score = mlp_gscv.score(X_test, y_test)

print("%s: %s score: %s  (on train: %s)" %  ("MLP", str(mlp_gscv.best_params_), str(mlp_gscv_score), str(mlp_gscv.best_score_)))

MLP: {'hidden_layer_sizes': (22, 5), 'max_iter': 81, 'solver': 'lbfgs'} score: 0.8888888888888888  (on train: 0.9478171545968157)


In [131]:
#voting = VotingClassifier(estimators=[('lin', lin_gscv), ('rfc', rfc_gscv), ('mlp', mlp_gscv)])
#voting.fit(X_train, y_train)
#voting.score(X_test, y_test)

0.8888888888888888

In [132]:
def predict(model):
    predicted = model.predict(test.drop('PassengerId', axis=1).to_numpy())
    predicted = np.c_[test['PassengerId'], predicted]

    predicted = pd.DataFrame(data=predicted, columns=['PassengerId', 'Survived'])

    predicted['PassengerId'] = predicted['PassengerId'].astype(int)
    predicted['Survived'] = predicted['Survived'].astype(int)
    return predicted.set_index('PassengerId').sort_values(by=['PassengerId'])

predict_lin = predict(lin_gscv)
predict_rfc = predict(rfc_gscv)
predict_mpl = predict(mlp_gscv)
predict_voting = predict(voting)

predict_rfc.to_csv('my_submission.csv')

predict_lin['From'] = 'lin'
predict_rfc['From'] = 'rfc'
predict_mpl['From'] = 'mlp'
predict_voting['From'] = 'vote'

pd.concat([predict_lin, predict_rfc, predict_mpl, predict_voting]).pivot(columns='From').to_csv('all_results.csv')