In [9]:
import matplotlib.pyplot as plt
import re
import numpy as np
import math
import pandas as pd
import random
import seaborn as sns
import warnings
import itertools
import sklearn.neighbors
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
import sklearn.linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
# Silence every Python warning for the rest of the session. This is a blanket
# filter: warnings raised by pandas or the sklearn estimators used later in the
# notebook are hidden as well.
warnings.filterwarnings("ignore")

# Use the 'ggplot' style sheet for any matplotlib figures.
plt.style.use('ggplot')
# Never truncate rows when a DataFrame is displayed.
pd.set_option("display.max_rows", None)

In [10]:
def normalize(column):
    """Min-max scale one column of the notebook-level ``data`` frame in place.

    The scaler is fitted on the column's own values, so the result depends on
    whatever rows ``data`` holds at call time.

    Parameters
    ----------
    column : str
        Name of the column in ``data`` to overwrite with its scaled values.

    Returns
    -------
    None
        ``data[column]`` is replaced as a side effect.
    """
    # MinMaxScaler expects a 2-D (n_samples, n_features) array, hence the reshape.
    column_values = data[column].to_numpy().reshape(-1, 1)
    scaler = MinMaxScaler()
    data[column] = scaler.fit_transform(column_values)

In [11]:
# Load both CSV files, keyed by PassengerId.
train_orig = pd.read_csv('train.csv', index_col='PassengerId')
target_orig = pd.read_csv('test.csv', index_col='PassengerId')

# Give the unlabeled rows a placeholder Survived=0 so both frames share a
# schema. Anything computed from 'Survived' on the combined frame treats
# these rows as zeros.
target_orig = target_orig.assign(Survived=0)

# Stack labeled and unlabeled rows so feature engineering is applied once.
data = pd.concat([train_orig, target_orig])
data.sample(10)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
156,0,1,"Williams, Mr. Charles Duane",male,51.0,0,1,PC 17597,61.3792,,C
817,0,3,"Heininen, Miss. Wendla Maria",female,23.0,0,0,STON/O2. 3101290,7.925,,S
919,0,3,"Daher, Mr. Shedid",male,22.5,0,0,2698,7.225,,C
997,0,3,"Holthen, Mr. Johan Martin",male,28.0,0,0,C 4001,22.525,,S
638,0,2,"Collyer, Mr. Harvey",male,31.0,1,1,C.A. 31921,26.25,,S
151,0,2,"Bateman, Rev. Robert James",male,51.0,0,0,S.O.P. 1166,12.525,,S
146,0,2,"Nicholls, Mr. Joseph Charles",male,19.0,1,1,C.A. 33112,36.75,,S
1273,0,3,"Foley, Mr. Joseph",male,26.0,0,0,330910,7.8792,,Q
983,0,3,"Pedersen, Mr. Olaf",male,,0,0,345498,7.775,,S
615,0,3,"Brocklebank, Mr. William Alfred",male,35.0,0,0,364512,8.05,,S


In [12]:
# 'Rank' is the first character of the cabin string (e.g. 'B79' -> 'B');
# rows without a cabin stay missing.
data['Rank'] = data['Cabin'].str.get(0)

# Spot-check the extraction on rows that do have a cabin.
data.loc[data['Rank'].notna(), ['Rank', 'Cabin']].sample(10)

Unnamed: 0_level_0,Rank,Cabin
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
505,B,B79
773,E,E77
326,C,C32
458,D,D21
474,D,D
310,E,E36
1247,E,E60
1235,B,B51 B53 B55
1179,B,B45
586,E,E68


In [13]:
# Surname is everything before the first comma in 'Name'.
data['LastName'] = data['Name'].map(lambda full_name: full_name.partition(',')[0])

# Per-surname row count and summed 'Survived' over the combined frame.
survived_by_family = data.groupby('LastName')['Survived']
families_total = dict(survived_by_family.count())
families_survived = dict(survived_by_family.sum())

# Every surname in the column is a key of both dicts (they were built from the
# same column), so a plain dictionary lookup is sufficient.
data['FamilySize'] = data['LastName'].map(families_total)
# Subtract the passenger's own label so the feature only counts *other*
# rows sharing the surname.
data['FamilySurvived'] = data['LastName'].map(families_survived) - data['Survived']

# Both features are left unscaled: they are one-hot encoded later on.
data[['LastName', 'FamilySize', 'FamilySurvived']].sample(10)

Unnamed: 0_level_0,LastName,FamilySize,FamilySurvived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
93,Chaffee,2,0
435,Silvey,2,1
679,Goodwin,8,0
1110,Widener,3,0
731,Allen,2,0
1238,Botsford,1,0
1266,Dodge,3,1
1246,Dean,4,1
518,Ryan,2,0
1172,Oreskovic,3,0


In [14]:
# Fill missing Age/Fare with the column mean, then min-max scale each column.
for numeric_col in ('Age', 'Fare'):
    data[numeric_col] = data[numeric_col].fillna(data[numeric_col].mean())
    normalize(numeric_col)

# NOTE(review): SibSp and Parch are scaled here but dropped a few lines below,
# so this scaling does not reach the final feature matrix.
for count_col in ('SibSp', 'Parch'):
    normalize(count_col)

# One-hot encode the categorical features and discard raw/helper columns.
categorical_cols = ['Sex', 'Pclass', 'Rank', 'Embarked', 'FamilySize', 'FamilySurvived']
unused_cols = ['Name', 'Ticket', 'Cabin', 'LastName', 'SibSp', 'Parch']
data = pd.get_dummies(data, columns=categorical_cols).drop(columns=unused_cols)

# pop + re-assign moves the label to the last column.
data['Survived'] = data.pop('Survived')
data.sample(10)

Unnamed: 0_level_0,Age,Fare,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3,Rank_A,Rank_B,Rank_C,...,FamilySize_6,FamilySize_7,FamilySize_8,FamilySize_11,FamilySurvived_0,FamilySurvived_1,FamilySurvived_2,FamilySurvived_3,FamilySurvived_4,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1048,0.361142,0.432884,1,0,1,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
1086,0.098083,0.063436,0,1,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
67,0.361142,0.020495,1,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
64,0.047977,0.054457,0,1,0,0,1,0,0,0,...,1,0,0,0,1,0,0,0,0,0
1278,0.298509,0.015176,0,1,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1174,0.37218,0.015127,1,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
86,0.411249,0.030937,1,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
992,0.536515,0.108215,1,0,1,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
1275,0.235876,0.031425,1,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
220,0.373669,0.020495,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [15]:
# Persist the fully engineered frame for inspection outside the notebook.
data.to_csv('tmp_data.csv')

# Separate the labeled rows from the rows to be predicted, using the original
# PassengerId indexes; the placeholder label is removed from the latter.
train = data[data.index.isin(train_orig.index)]
test = data[data.index.isin(target_orig.index)]
test = test.drop('Survived', axis=1)

# Turn PassengerId back into a regular column.
train = train.reset_index()
test = test.reset_index()

# Fixed random_state: without it the split (and everything fitted on it)
# changes on every Restart & Run All.
# NOTE(review): test_size=0.002 keeps almost every labeled row for training,
# so X_test/y_test contain only a handful of rows and any score computed on
# them is a smoke check, not a usable accuracy estimate.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(['PassengerId', 'Survived'], axis=1).to_numpy(),
    train['Survived'].to_numpy(),
    test_size=0.002,
    random_state=1,
)
print(len(X_train))

889


In [16]:
# Number of input features; the hidden-layer search space is sized around it.
features = X_train.shape[1]
quarter = features // 4
model = MLPClassifier(random_state=1, activation='relu')

# Two-layer networks: first layer within +/-4 of the feature count, second
# layer around a quarter of it. 'max_iter' is searched as well.
param_grid = {
    'max_iter': np.arange(100, 1000, step=100),
    'solver': ['adam'],
    'hidden_layer_sizes': [
        (first_layer, second_layer)
        for first_layer in range(features - 4, features + 4)
        for second_layer in range(quarter - 2, quarter + 2)
    ],
}
mlp_gscv = GridSearchCV(model, param_grid, cv=10, n_jobs=-1)
mlp_gscv.fit(X_train, y_train)
# Score on the held-out split; best_score_ (printed as "on train") is the mean
# cross-validated score of the best parameter combination.
mlp_gscv_score = mlp_gscv.score(X_test, y_test)

print("%s: %s score: %s  (on train: %s)" % ("MLP", mlp_gscv.best_params_, mlp_gscv_score, mlp_gscv.best_score_))

# Predict the unlabeled rows and build the submission table:
# one 'Survived' column indexed by ascending PassengerId.
test_features = test.drop('PassengerId', axis=1).to_numpy()
predicted = pd.DataFrame({
    'PassengerId': test['PassengerId'].to_numpy(),
    'Survived': mlp_gscv.predict(test_features),
})
predicted = (
    predicted
    .astype({'PassengerId': int, 'Survived': int})
    .sort_values(by=['PassengerId'])
    .set_index('PassengerId')
)

predicted.to_csv('my_submission.csv')

# Compare against the labels stored in fake_100.csv (aligned on PassengerId).
fake_100 = pd.read_csv('fake_100.csv', index_col='PassengerId')

predicted['Survived100'] = fake_100['Survived']
# Fraction of rows where the prediction equals the reference label; rows with
# no reference value compare unequal and count as misses.
total = int((predicted['Survived'] == predicted['Survived100']).sum()) / len(predicted)
print("Total: " + str(total))

MLP: {'hidden_layer_sizes': (28, 6), 'max_iter': 300, 'solver': 'adam'} score: 1.0  (on train: 0.8323672114402452)
Total: 0.7870813397129187
