In [30]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

%matplotlib inline

In [32]:
#load training data
train_raw = pandas.read_csv('train.csv', sep=',', index_col=0)

#load test data (with answer)
test_raw = pandas.read_csv('test.csv', sep=',', index_col=0)
gender_sub = pandas.read_csv('gender_submission.csv', sep=',', index_col=0)

data_len = len(train_raw)
itest_len = data_len // 10
valid_len = data_len // 10 * 2
train_len = data_len - valid_len - itest_len

test_len = len(test_raw)

def from_panda_to_numpy(df, 
                        age_min, age_max, 
                        sibsp_min, sibsp_max, 
                        parch_min, parch_max,
                        fare_min, fare_max ):
    df_len = len(df)

    def convert_name_to_label(names_):
        lower_name = names_.str.lower()
        result = np.zeros(len(names_))
        result[lower_name.str.contains('mrs.')] = 1
        result[lower_name.str.contains('mr.')] = 2
        result[lower_name.str.contains('ms.')] = 3
        result[lower_name.str.contains('mlle.')] = 4
        result[lower_name.str.contains('miss.')] = 5
        result[lower_name.str.contains('sir.')] = 6
        result[lower_name.str.contains('rev.')] = 7
        result[lower_name.str.contains('mme.')] = 8
        result[lower_name.str.contains('master.')] = 9
        result[lower_name.str.contains('major.')] = 10
        result[lower_name.str.contains('lady.')] = 11
        result[lower_name.str.contains('jonkheer.')] = 12
        result[lower_name.str.contains('dr.')] = 13
        result[lower_name.str.contains('don.')] = 14
        result[lower_name.str.contains('col.')] = 15
        result[lower_name.str.contains('capt.')] = 16
        result[lower_name.str.contains('countess.')] = 17

        return result
    
    def convert_sex_to_label(sex_):
        lower_sex = sex_.str.lower()
        result = np.zeros(len(sex_))
        result[lower_sex == 'male'] = 0
        result[lower_sex == 'female'] = 1

        return result

    def convert_cabin_to_label(cabin_):
        lower_cabin = cabin_.str.lower()
        result = np.zeros(len(cabin_))
        lower_cabin = lower_cabin.replace(np.nan, '', regex=True)
        result[lower_cabin.str.contains('a')] = 1
        result[lower_cabin.str.contains('b')] = 2
        result[lower_cabin.str.contains('c')] = 3
        result[lower_cabin.str.contains('d')] = 4
        result[lower_cabin.str.contains('e')] = 5
        result[lower_cabin.str.contains('f')] = 6
        result[lower_cabin.str.contains('g')] = 7
        result[lower_cabin.str.contains('t')] = 8

        return result

    def convert_embark_to_label(embark_):
        lower_embark = embark_.str.lower()
        result = np.zeros(len(cabin_))
        result[lower_embark.str.contains('c')] = 1
        result[lower_embark.str.contains('q')] = 2
        result[lower_embark.str.contains('s')] = 3
        
    def clean_up_age(age_):
        result = np.zeros(len(age_), dtype=np.float32)
        result[:] = age_
        result[result < 1] *= 100 
        
        return result
    
    def clean_up_fare(fare_):
        result = np.zeros(len(fare_), dtype=np.float32)
        result[:] = fare_
        result[np.isnan(result)] = 0
        
        return result
        
        
    full_dataset = np.zeros((df_len, 32), dtype=np.float32)
    full_dataset[:, 0] = df["Pclass"] / 3
    salutaion_ = convert_name_to_label(df["Name"])
    full_dataset[:, 1] = salutaion_ == 1
    full_dataset[:, 2] = salutaion_ == 2
    full_dataset[:, 3] = salutaion_ == 3
    full_dataset[:, 4] = salutaion_ == 4
    full_dataset[:, 5] = salutaion_ == 5
    full_dataset[:, 6] = salutaion_ == 6
    full_dataset[:, 7] = salutaion_ == 7
    full_dataset[:, 8] = salutaion_ == 8
    full_dataset[:, 9] = salutaion_ == 9
    full_dataset[:, 10] = salutaion_ == 10
    full_dataset[:, 11] = salutaion_ == 11
    full_dataset[:, 12] = salutaion_ == 12
    full_dataset[:, 13] = salutaion_ == 13
    full_dataset[:, 14] = salutaion_ == 14
    full_dataset[:, 15] = salutaion_ == 15
    full_dataset[:, 16] = salutaion_ == 16
    full_dataset[:, 17] = salutaion_ == 17
    full_dataset[:, 18] = convert_sex_to_label(df["Sex"])
    full_dataset[:, 19] = (clean_up_age(df["Age"]) - age_min) / (age_max - age_min)
    full_dataset[:, 20] = (df["SibSp"] - sibsp_min) / (sibsp_max - sibsp_min)
    full_dataset[:, 21] = (df["Parch"] - parch_min) / (parch_max - parch_min)
    full_dataset[:, 22] = (clean_up_fare(df["Fare"]) - fare_min) / (fare_max - fare_min)
    carbin_type = convert_cabin_to_label(df["Cabin"])
    full_dataset[:, 23] = carbin_type == 0
    full_dataset[:, 24] = carbin_type == 1
    full_dataset[:, 25] = carbin_type == 2
    full_dataset[:, 26] = carbin_type == 3
    full_dataset[:, 27] = carbin_type == 4
    full_dataset[:, 28] = carbin_type == 5
    full_dataset[:, 29] = carbin_type == 6
    full_dataset[:, 30] = carbin_type == 7
    full_dataset[:, 31] = carbin_type == 8
    
    return full_dataset

age_min_ = min(0, train_raw["Age"].min())
age_max_ = train_raw["Age"].max()
sibsp_min_ = train_raw["SibSp"].min()
sibsp_max_ = train_raw["SibSp"].max()
parch_min_ = train_raw["Parch"].min()
parch_max_ = train_raw["Parch"].max()
fare_min_ = train_raw["Fare"].min()
fare_max_ = train_raw["Fare"].max()
    
full_dataset = from_panda_to_numpy(train_raw, 
                                   age_min_, age_max_, sibsp_min_, sibsp_max_, parch_min_, parch_max_, fare_min_, fare_max_)
full_label = np.zeros((data_len), dtype=np.float32)
full_label[:] = train_raw["Survived"]

test_dataset = from_panda_to_numpy(test_raw,
                                   age_min_, age_max_, sibsp_min_, sibsp_max_, parch_min_, parch_max_, fare_min_, fare_max_)
test_labels = np.zeros((test_len), dtype=np.float32)
test_labels[:] = gender_sub["Survived"]



In [33]:
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(full_dataset, full_label, test_size=test_size, random_state=seed)

In [34]:
model = XGBClassifier()
model.fit(X_train, y_train)

print(model)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)


In [37]:
# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 80.34%


In [36]:
#output dataset for submission
r_pred = model.predict(test_dataset)
r_predictions = [int(round(value)) for value in r_pred]

submission_df = pandas.DataFrame(index=gender_sub.index, columns=["Survived"])
submission_df["Survived"] = r_predictions
submission_df.to_csv("submission_xgboost.csv", sep=',')

In [22]:
X_test.shape, test_dataset.shape

((295, 32), (418, 32))

In [29]:
r_predictions

[0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
