In [7]:
import matplotlib.pyplot as plt
import re
import numpy as np
import math
import pandas as pd
import random
import seaborn as sns
import warnings
import itertools
import sklearn.neighbors
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
import sklearn.linear_model
warnings.filterwarnings("ignore")

plt.style.use('ggplot')

In [8]:
data = pd.read_csv('train.csv')

In [9]:
def prepare_data(_data):
    train_data = _data[['PassengerId', 'Sex', 'Age', 'Pclass', 'Fare', 'Survived']]
    train_data['FareExpensive'] = (train_data['Fare'] > 50).astype(int)
    train_data = train_data.drop('Fare', axis=1)

    train_data['Age'] = train_data['Age'] / 100

    train_data = pd.get_dummies(train_data, columns=['Sex', 'Pclass'])
    train_data['Survived'] = train_data.pop('Survived')

    X_with_age = train_data.dropna()
    X_no_age = train_data.drop('Age', axis=1).dropna()
    
    return X_with_age, X_no_age

In [10]:
X_with_age, X_no_age = prepare_data(data)
X_with_age = X_with_age.to_numpy()
X_no_age = X_no_age.to_numpy()

In [11]:
for name, X in [('age', X_with_age), ('no_age', X_no_age)]:
        model = sklearn.linear_model.LogisticRegression()
        param_grid = {}
        mlp_gscv = GridSearchCV(model, param_grid, cv=5)
        mlp_gscv.fit(X[:, 1:-1], X[:, -1])
        print("%s: %s score: %s" %  (name, str(mlp_gscv.best_params_), str(mlp_gscv.best_score_)))

age: {} score: 0.7759479956663056
no_age: {} score: 0.7722114117130123


In [12]:
models = {}
for name, X in [('age', X_with_age), ('no_age', X_no_age)]:
        model = MLPClassifier(random_state=1, activation='relu')
        param_grid = {
            'max_iter': np.arange(10, 100, step=20),
            'solver': ['lbfgs', 'adam'],
            'hidden_layer_sizes': list(range(4, 10)) + list(itertools.product(range(4, 12), range(1, 8)))
        }
        mlp_gscv = GridSearchCV(model, param_grid, cv=5, n_jobs=-1)
        mlp_gscv.fit(X[:, 1:-1], X[:, -1])
        print("%s: %s score: %s" %  (name, str(mlp_gscv.best_params_), str(mlp_gscv.best_score_)))
        models[name] = mlp_gscv

age: {'hidden_layer_sizes': (11, 7), 'max_iter': 90, 'solver': 'lbfgs'} score: 0.806746774352408
no_age: {'hidden_layer_sizes': (5, 1), 'max_iter': 90, 'solver': 'adam'} score: 0.7901073378946707


In [13]:
test = pd.read_csv('test.csv')
test['Survived'] = -1

test_with_age = test.dropna()
test_no_age = test[~test.set_index('PassengerId').index.isin(test_with_age.set_index('PassengerId').index)]

In [14]:
X_with_age_test, _ = prepare_data(test.dropna())
_, X_no_age_test = prepare_data(test_no_age)

print("Len with age: %d, no age: %d" % (len(X_with_age_test), len(X_no_age_test)))

X_with_age_test = X_with_age_test.to_numpy()
X_no_age_test = X_no_age_test.to_numpy()

Len with age: 87, no age: 331


In [15]:
def predict(name, test_data):
    predicted = models[name].predict(test_data[:, 1:-1])
    return np.c_[test_data[:, 0], predicted]

predict_with_age = predict('age', X_with_age_test)
predict_no_age = predict('no_age', X_no_age_test)
predict = np.concatenate((predict_with_age, predict_no_age), axis=0)
predict = pd.DataFrame(data=predict, columns=['PassengerId', 'Survived'])

predict['PassengerId'] = predict['PassengerId'].astype(int)
predict['Survived'] = predict['Survived'].astype(int)
predict = predict.set_index('PassengerId').sort_values(by=['PassengerId'])

predict.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,1
894,0
895,0
896,1


In [16]:
predict.to_csv('my_submission.csv')