In [218]:
import matplotlib.pyplot as plt
import re
import numpy as np
import math
import pandas as pd
import random
import seaborn as sns
import warnings
import itertools
import sklearn.neighbors
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
import sklearn.linear_model
warnings.filterwarnings("ignore")

plt.style.use('ggplot')

In [340]:
data = pd.read_csv('train.csv').set_index('PassengerId')
test = pd.read_csv('test.csv').set_index('PassengerId')
test['Survived'] = -1

data_and_train = pd.get_dummies(pd.concat([data, test]), columns=['Sex', 'Pclass', 'SibSp', 'Parch'])
data = data_and_train[~data_and_train.index.isin(test.index)].reset_index()
test = data_and_train[~data_and_train.index.isin(data.index)].reset_index()

In [341]:
cabin_data = data[['Cabin', 'Survived']]
cabin_data = cabin_data.dropna()
cabin_data['Cabin'] = cabin_data['Cabin'].str.split(' ')
cabin_data = cabin_data.explode('Cabin')

cabin_survived = cabin_data.groupby('Cabin').agg(Survived=('Survived', 'sum'), Total= ('Survived', 'count')).reset_index()
cabin_survived = cabin_survived[cabin_survived['Cabin'].str.contains('[A-Z].[0-9]+')]
cabin_survived['Ratio'] = cabin_survived['Survived'] / cabin_survived['Total']
cabin_survived['CabinType'] = cabin_survived['Cabin'].str[0]
cabin_survived['CabinNum'] = cabin_survived['Cabin'].str[1:].astype(int)

cabin_model = {}
for cabin_type in set(cabin_survived['CabinType']): 
    train_data = cabin_survived[cabin_survived['CabinType'] == cabin_type][['CabinNum', 'Ratio']].to_numpy()
    if len(train_data) > 5:
        model = sklearn.neighbors.KNeighborsRegressor()
        param_grid = {
            'n_neighbors': np.arange(1, 5)
        }
        knn_gscv = GridSearchCV(model, param_grid, cv=3)
        knn_gscv.fit(np.c_[train_data[:, 0]], train_data[:, 1])

        cabin_model[cabin_type] = knn_gscv

In [342]:
def prepare_data(_data):
    train_data = _data.drop('Name', axis=1).drop('Ticket', axis=1).drop('Embarked', axis=1)
    train_data['FareExpensive'] = (train_data['Fare'] > 50).astype(int)
    train_data = train_data.drop('Fare', axis=1)

    def cabin_ration(cabin):
        if type(cabin)  == float and math.isnan(cabin):
            return 0.5

        result = []
        for c in cabin.split(' '):
            if not re.match('[A-Z].[0-9]+', c):
                continue

            if c[0] not in cabin_model:
                continue
            result.append(cabin_model[c[0]].predict([[int(c[1:])]])[0])

        if len(result) == 0:
            return 0.5
        return sum(result) / len(result)


    train_data['CabinSurvivor'] = train_data['Cabin'].apply(cabin_ration)
    train_data = train_data.drop('Cabin', axis=1)

    train_data['Age'] = train_data['Age'] / 100

    train_data['Survived'] = train_data.pop('Survived')

    X_with_age = train_data.dropna()
    X_no_age = train_data.drop('Age', axis=1).dropna()
    
    return X_with_age, X_no_age

In [343]:
X_with_age, X_no_age = prepare_data(data)
X_with_age = X_with_age.to_numpy()
X_no_age = X_no_age.to_numpy()

In [344]:
for name, X in [('age', X_with_age), ('no_age', X_no_age)]:
        model = sklearn.linear_model.LogisticRegression()
        param_grid = {}
        mlp_gscv = GridSearchCV(model, param_grid, cv=5)
        mlp_gscv.fit(X[:, 1:-1], X[:, -1])
        print("%s: %s score: %s" %  (name, str(mlp_gscv.best_params_), str(mlp_gscv.best_score_)))

age: {} score: 0.8011425194523787
no_age: {} score: 0.8080911430544223


In [345]:
for name, X in [('age', X_with_age), ('no_age', X_no_age)]:
        model = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=2)
        param_grid = {
            "n_estimators": range(10, 100, 20),
            "max_depth": range(2, 5)
        }
        mlp_gscv = GridSearchCV(model, param_grid, cv=5, n_jobs=-1)
        mlp_gscv.fit(X[:, 1:-1], X[:, -1])
        print("%s: %s score: %s" %  (name, str(mlp_gscv.best_params_), str(mlp_gscv.best_score_)))

age: {'max_depth': 4, 'n_estimators': 10} score: 0.8179749827637153
no_age: {'max_depth': 4, 'n_estimators': 70} score: 0.8170736300295023


In [346]:
models = {}
for name, X in [('age', X_with_age), ('no_age', X_no_age)]:
        model = MLPClassifier(random_state=1, activation='relu')
        param_grid = {
            'max_iter': np.arange(10, 100, step=20),
            'solver': ['lbfgs', 'adam'],
            'hidden_layer_sizes': list(range(4, 10)) + list(itertools.product(range(4, 12), range(1, 8)))
        }
        mlp_gscv = GridSearchCV(model, param_grid, cv=5, n_jobs=-1)
        mlp_gscv.fit(X[:, 1:-1], X[:, -1])
        print("%s: %s score: %s" %  (name, str(mlp_gscv.best_params_), str(mlp_gscv.best_score_)))
        models[name] = mlp_gscv

age: {'hidden_layer_sizes': (11, 5), 'max_iter': 50, 'solver': 'lbfgs'} score: 0.8487737614498178
no_age: {'hidden_layer_sizes': (5, 5), 'max_iter': 70, 'solver': 'lbfgs'} score: 0.8282970309459545


In [353]:
test_with_age = test.dropna()
test_no_age = test[~test.set_index('PassengerId').index.isin(test_with_age.set_index('PassengerId').index)]

X_with_age_test, _ = prepare_data(test_with_age.dropna())
_, X_no_age_test = prepare_data(test_no_age)

print("Len with age: %d, no age: %d" % (len(X_with_age_test), len(X_no_age_test)))

X_with_age_test = X_with_age_test.to_numpy()
X_no_age_test = X_no_age_test.to_numpy()

Len with age: 87, no age: 332


In [354]:
def predict(name, test_data):
    predicted = models[name].predict(test_data[:, 1:-1])
    return np.c_[test_data[:, 0], predicted]

predict_with_age = predict('age', X_with_age_test)
predict_no_age = predict('no_age', X_no_age_test)
predict = np.concatenate((predict_with_age, predict_no_age), axis=0)
predict = pd.DataFrame(data=predict, columns=['PassengerId', 'Survived'])

predict['PassengerId'] = predict['PassengerId'].astype(int)
predict['Survived'] = predict['Survived'].astype(int)
predict = predict.set_index('PassengerId').sort_values(by=['PassengerId'])

predict

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
891,0
892,0
893,0
894,0
895,0
...,...
1305,0
1306,1
1307,0
1308,0


In [355]:
predict.to_csv('my_submission.csv')