In [333]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Load data

In [334]:
# Read both CSV files, using the PassengerId column as the row index.
X = pd.read_csv('train.csv', index_col='PassengerId')
X_test = pd.read_csv('test.csv', index_col='PassengerId')

# Split the target off the training frame: pop() returns the 'Survived'
# column and removes it from X in the same step.
y = X.pop('Survived')

In [335]:
# Preview the first five rows of the training features.
X.head(n=5)

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [336]:
# Preview the first five rows of the test features.
X_test.head(n=5)

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


Check for missing values

In [337]:
# Per-column count of missing values in each frame.
na_X, na_X_test = X.isna().sum(), X_test.isna().sum()

# Row counts of both frames (len_X is read again by the NA-share filter
# in the following code cell).
len_X, len_X_test = X.shape[0], X_test.shape[0]

# Report only the columns that actually contain missing values.
print("NA values in train: ")
print(na_X.loc[na_X.gt(0)], end="\n\n")
print("NA values in test: ")
print(na_X_test.loc[na_X_test.gt(0)], end="\n\n")

print("Number of passengers in train: {n}".format(n=len_X))
print("Number of passengers in test: {n}".format(n=len_X_test))

NA values in train: 
Age         177
Cabin       687
Embarked      2
dtype: int64

NA values in test: 
Age       86
Fare       1
Cabin    327
dtype: int64

Number of passengers in train: 891
Number of passengers in test: 418


Drop every column in which more than 20% of the values are missing

In [338]:
PERC_FOR_DROP_NA = 0.2

# Share of missing values per training column (NA count / number of rows).
na_share = X.isnull().sum() / len_X

# Columns whose NA share exceeds the threshold, in X's column order.
na_col_names_for_drop = na_share[na_share > PERC_FOR_DROP_NA].index.tolist()

# The selection is made on the training frame only and then applied to both
# frames so that they keep the same columns.
X = X.drop(columns=na_col_names_for_drop)
X_test = X_test.drop(columns=na_col_names_for_drop)


Ordinal encoding. Categorical (object-dtype) columns with a cardinality greater than 10 are removed

In [339]:
MAX_CARDINALITY = 10

# Collect the object-dtype columns that have more than MAX_CARDINALITY
# distinct values in the training frame.
high_cardinality_col_names = []
for col in X.columns:
    is_object_col = X[col].dtype == 'object'
    if is_object_col and X[col].nunique() > MAX_CARDINALITY:
        high_cardinality_col_names.append(col)

# Remove the same columns from both frames.
X = X.drop(columns=high_cardinality_col_names)
X_test = X_test.drop(columns=high_cardinality_col_names)

In [340]:
from sklearn.preprocessing import OrdinalEncoder

# Names of the remaining object-dtype columns, in X's column order.
is_object_dtype = X.dtypes == 'object'
category_col_names = X.columns[is_object_dtype].tolist()

# Fit the encoder on the training columns only, then apply the fitted
# mapping to both frames.
od_encoder = OrdinalEncoder()
od_encoder.fit(X[category_col_names])
X[category_col_names] = od_encoder.transform(X[category_col_names])
X_test[category_col_names] = od_encoder.transform(X_test[category_col_names])

Imputing missing values

In [341]:
from sklearn.impute import SimpleImputer

# Median imputer: the per-column medians are computed on the training frame
# by fit_transform and reused unchanged for the test frame by transform.
sp_imputer = SimpleImputer(missing_values=pd.NA, strategy='median')

# Wrap the imputed values back into DataFrames that carry the row index and
# column labels of the frames they were computed from.
X_ = pd.DataFrame(sp_imputer.fit_transform(X),
                  index=X.index, columns=X.columns)
X_test_ = pd.DataFrame(sp_imputer.transform(X_test),
                       index=X_test.index, columns=X_test.columns)

Splitting the training data into training and validation sets

In [342]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the imputed training rows for validation; the fixed
# random_state makes the shuffle repeatable.
split_parts = train_test_split(X_, y,
                               train_size=0.8, test_size=0.2,
                               random_state=0)
X_train, X_valid, y_train, y_valid = split_parts

Train the model

In [343]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from xgboost import XGBRegressor

# Gradient-boosted regressor, with the validation split passed as eval_set
# and early_stopping_rounds=5.
modelXGBRegressor = XGBRegressor(n_estimators=200, learning_rate=0.1,
                                 early_stopping_rounds=5)
modelXGBRegressor.fit(X_train, y_train,
                      eval_set=[(X_valid, y_valid)],
                      verbose=False)

# The regressor's output is not bounded to [0, 1]; clip before rounding so
# that every prediction is a valid 0/1 label, then store it as an integer.
valid_scores = modelXGBRegressor.predict(X_valid)
predictions_XGBRegressor = np.round(np.clip(valid_scores, 0, 1)).astype(int)

# NOTE(review): the same validation split drives early stopping and the
# metrics below, so these scores are likely optimistic.
mae_XGBRegressor = mean_absolute_error(y_valid, predictions_XGBRegressor)
acc_XGBRegressor = accuracy_score(y_valid, predictions_XGBRegressor)

print("MAE of XGBRegressor: {p: .4f}".format(p = mae_XGBRegressor))
print("Accuracy of XGBRegressor: {p: .4f}".format(p = acc_XGBRegressor))

MAE of XGBRegressor:  0.1453
Accuracy of XGBRegressor:  0.8547


Predictions for a submission

In [344]:
# Predict on the imputed test frame (X_test_): the model was trained on
# imputed data, whereas X_test is not imputed.
test_scores = modelXGBRegressor.predict(X_test_)

# The regressor's output is not bounded to [0, 1]; clip before rounding so
# that only the labels 0 and 1 are written to the submission.
submission_predictions = pd.DataFrame(
    {'Survived': np.round(np.clip(test_scores, 0, 1)).astype(int)},
    index=X_test_.index)

submission_predictions.to_csv("submission_predictions.csv")
