In [89]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score

import matplotlib.pyplot as plt
import seaborn as sns

from catboost import CatBoostClassifier

# import keras.backend as K
# from keras.models import Input, Model, load_model, save_model
# from keras.layers import Dense, Flatten, Embedding, concatenate
# from keras.layers import Dropout, GaussianNoise, SpatialDropout1D
# from keras.callbacks import EarlyStopping, ModelCheckpoint
# from keras import optimizers

# Apply seaborn's default plot theme globally for any figures drawn later in the notebook.
sns.set()

In [48]:
# Read both Titanic splits from disk in one pass and report how many rows each has.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./input/titanic/train.csv', './input/titanic/test.csv')
)
print(f'Data size: {len(train):d}, {len(test):d}')

Data size: 891, 418


In [50]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [51]:
# Number of missing values in each column of the training frame.
# Uses the DataFrame's own column-wise reduction: routing a DataFrame through
# `np.sum` depends on pandas' deprecated `axis=None` handling, which is slated to
# collapse the whole frame to a single scalar instead of one count per column.
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [52]:
# Number of missing values in each column of the test frame.
# Uses the DataFrame's own column-wise reduction: routing a DataFrame through
# `np.sum` depends on pandas' deprecated `axis=None` handling, which is slated to
# collapse the whole frame to a single scalar instead of one count per column.
test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [68]:
# pre-processing
# NOTE: this cell overwrites `Age`, `Fare` and `Embarked` in place, so it is not
# idempotent -- running it twice applies log1p to `Fare` a second time. Reload the
# CSVs before re-running.

# Fill missing ages in BOTH splits with the median age of the training split, so no
# statistic is derived from the test data.
# Results are assigned back explicitly: `df[col].fillna(..., inplace=True)` mutates a
# temporary selection, which raises a FutureWarning in pandas 2.x and silently leaves
# the frame unchanged once Copy-on-Write is enabled.
age_median = train['Age'].median()
train['Age'] = train['Age'].fillna(age_median)
test['Age'] = test['Age'].fillna(age_median)

# Compress the fare distribution with log(1 + x); NaNs pass through unchanged.
train['Fare'] = np.log1p(train['Fare'])
test['Fare'] = np.log1p(test['Fare'])
# The fill value is the mean of the *log-transformed* training fares, matching the
# scale of the column it is inserted into.
fare_mean = train['Fare'].mean()
test['Fare'] = test['Fare'].fillna(fare_mean)

# Missing embarkation ports in the training split are filled with the fixed value 'S'.
train['Embarked'] = train['Embarked'].fillna('S')

In [70]:
# Columns treated as categorical; each is replaced by integer codes.
categorical = ['Pclass', 'Sex', 'Embarked']
for col in categorical:
    # Learn the value -> code mapping from the training split only, then apply the
    # same mapping to both splits (a value unseen in training would raise here).
    lbl = LabelEncoder().fit(train[col])
    train[col], test[col] = lbl.transform(train[col]), lbl.transform(test[col])

In [71]:
# Numeric columns that get standardised one at a time.
continuous = ['Age', 'SibSp', 'Parch', 'Fare']
for col in continuous:
    # Mean and variance come from the training split only; the fitted scaler is then
    # applied to both splits. The scaler expects 2-D input, hence the added axis,
    # and the result is flattened back to a 1-D column.
    scaler = StandardScaler().fit(train[col].values[:, None])
    train[col] = scaler.transform(train[col].values[:, None]).ravel()
    test[col] = scaler.transform(test[col].values[:, None]).ravel()



In [83]:
# Binary survival label as a plain integer array.
target = train['Survived'].astype(int)
y = target.to_numpy()

# Seeded, shuffled 5-fold stratified splitter. The labels are passed as the "X"
# argument too, since only their length matters for generating the indices.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=21)

# Take just the first fold as a single train/validation hold-out split.
index_train, index_valid = next(iter(skf.split(y, y)))

In [84]:
# Feature matrices for the hold-out split: categorical columns first, then continuous,
# so that positions 0..len(categorical)-1 are the categorical features.
# The rows are sliced from a DataFrame rather than concatenated into one ndarray:
# np.concatenate would upcast the integer category codes to float, and CatBoost
# refuses a floating-point ndarray when `cat_features` is non-empty.
X_train = train[categorical + continuous].iloc[index_train]
X_valid = train[categorical + continuous].iloc[index_valid]
y_train, y_valid = y[index_train], y[index_valid]

In [85]:
# Positional indices of the categorical features (they occupy the leading columns).
cat_features = [*range(len(categorical))]

# CatBoost hyper-parameters. The binary target is modelled with the MultiClass loss
# and two classes, so predict_proba yields one probability column per class.
catboost_params = dict(
    iterations=2000,
    max_depth=4,
    learning_rate=0.02,
    loss_function='MultiClass',
    rsm=0.7,
    l2_leaf_reg=1,
    bagging_temperature=0.1,
    thread_count=4,
    classes_count=2,
    logging_level='Silent',
    random_seed=21,
)

model = CatBoostClassifier(**catboost_params)
# use_best_model keeps the iteration that scored best on the eval set, which here is
# the validation fold itself.
model.fit(
    X_train,
    y_train,
    cat_features=cat_features,
    eval_set=[(X_valid, y_valid)],
    use_best_model=True,
)

<catboost.core.CatBoostClassifier at 0x7f82e24d12b0>

In [93]:
# Column 1 of predict_proba is the probability assigned to class 1 (survived).
y_preds = model.predict_proba(X_valid)[:, 1]
# ROC AUC of those probabilities on the hold-out fold.
score = roc_auc_score(y_true=y_valid, y_score=y_preds)
print(f'Score: {score:.6f}')

Score: 0.912055


In [96]:
# K-fold cross-validation: train one CatBoost model per fold, collect out-of-fold
# probabilities for the training rows, and average the per-fold test predictions.

# Positional indices of the categorical features (they occupy the leading columns).
cat_features = list(range(len(categorical)))

# Same hyper-parameters as the single hold-out run. The binary target is modelled with
# the MultiClass loss and two classes, so predict_proba yields two columns.
catboost_params = {
    'iterations': 2000,
    'max_depth': 4,
    'learning_rate': 0.02,
    'loss_function': 'MultiClass',
    'rsm': 0.7,
    'l2_leaf_reg': 1,
    'bagging_temperature': 0.1,
    'thread_count': 4,
    'classes_count': 2,
    'logging_level': 'Silent',
    'random_seed': 21
}

K = 5

skf = StratifiedKFold(n_splits=K, random_state=21, shuffle=True)
test_preds = np.zeros([len(test), 2])  # fold-averaged class probabilities for the test rows
oof_preds = np.zeros([len(train), 2])  # out-of-fold class probabilities for the training rows

# The test features are identical for every fold, so build them once outside the loop.
# They are kept as a DataFrame (categoricals first) instead of a concatenated ndarray:
# np.concatenate would upcast the integer category codes to float, and CatBoost refuses
# a floating-point ndarray when `cat_features` is non-empty.
X_test = test[categorical + continuous]

for i, (index_train, index_valid) in enumerate(skf.split(y, y)):
    X_train = train[categorical + continuous].iloc[index_train]
    X_valid = train[categorical + continuous].iloc[index_valid]
    y_train, y_valid = y[index_train], y[index_valid]

    model = CatBoostClassifier(**catboost_params)
    # NOTE(review): the validation fold both selects the best iteration
    # (use_best_model) and produces the reported AUC, so the fold scores are likely
    # somewhat optimistic.
    model.fit(X_train, y_train, cat_features=cat_features, use_best_model=True, eval_set=[(X_valid, y_valid)])

    y_preds = model.predict_proba(X_valid)
    oof_preds[index_valid] = y_preds
    score = roc_auc_score(y_valid, y_preds[:, 1])
    print(f'Fold {i+1:d} / {K:d}: AUC - {score:.6f}')

    # Each fold contributes 1/K of the final averaged test prediction.
    test_preds += model.predict_proba(X_test) / K

Fold 1 / 5: AUC - 0.912055
Fold 2 / 5: AUC - 0.839328
Fold 3 / 5: AUC - 0.827941
Fold 4 / 5: AUC - 0.885094
Fold 5 / 5: AUC - 0.911360


In [101]:
# Overall ROC AUC of the out-of-fold class-1 probabilities against the true labels.
roc_auc_score(y_true=y, y_score=oof_preds[:, 1])

0.8744847090403604

In [100]:
# Build the submission: one row per test passenger, with the hard class label taken
# as the class whose fold-averaged probability is larger.
submit = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': np.argmax(test_preds, axis=1),
})
submit.to_csv('./submissions/titanic_submit.csv', index=False)
# Preview goes last: a bare expression is only rendered when it is the final
# statement of a cell, so a `head()` placed before `to_csv` showed nothing.
submit.head()

# Leaderboard (LB) score for this submission: 0.78947

In [13]:
# Group the columns of `train` by storage dtype (int64 / object / float64).
# NOTE(review): the recorded outputs of this and the following cells list columns such
# as `v3`, `ID` and `target`, which are not among the Titanic columns shown earlier,
# and the execution counts are lower than the cells above -- these cells look like they
# were run against a different `train` frame. Confirm before relying on them.
cols_int, cols_obj, cols_flt = (
    train.columns[train.dtypes == dtype_name]
    for dtype_name in ('int64', 'object', 'float64')
)
print(f'columns: integer - {len(cols_int):d}, object - {len(cols_obj):d}, float - {len(cols_flt):d}')

columns: integer - 6, object - 19, float - 108


In [14]:
# First rows of the object-dtype (string-valued, i.e. categorical) columns.
train.loc[:, cols_obj].head()

Unnamed: 0,v3,v22,v24,v30,v31,v47,v52,v56,v66,v71,v74,v75,v79,v91,v107,v110,v112,v113,v125
0,C,XDX,C,C,A,C,G,DI,C,F,B,D,E,A,E,B,O,,AU
1,C,GUV,C,C,A,E,G,DY,A,F,B,D,D,B,B,A,U,G,AF
2,C,FQ,E,,A,C,F,AS,A,B,B,B,E,G,C,B,S,,AE
3,C,ACUE,D,C,B,C,H,BW,A,F,B,D,B,B,B,B,J,,CJ
4,C,HIT,E,,A,I,H,,C,F,B,D,C,G,C,A,T,G,Z


In [15]:
# First rows of the int64 columns.
train.loc[:, cols_int].head()

Unnamed: 0,ID,target,v38,v62,v72,v129
0,3,1,0,1,1,0
1,4,1,0,2,2,0
2,5,1,0,1,3,2
3,6,1,0,1,2,1
4,8,1,0,1,1,0


In [18]:
# First rows of the float64 columns.
train.loc[:, cols_flt].head()

Unnamed: 0,v1,v2,v4,v5,v6,v7,v8,v9,v10,v11,...,v120,v121,v122,v123,v124,v126,v127,v128,v130,v131
0,1.335739,8.727474,3.921026,7.915266,2.599278,3.176895,0.012941,9.999999,0.503281,16.434108,...,1.059603,0.803572,8.0,1.98978,0.035754,1.804126,3.113719,2.024285,0.636365,2.857144
1,,,,9.191265,,,2.30163,,1.31291,,...,,,,,0.598896,,,1.957825,,
2,0.943877,5.310079,4.410969,5.326159,3.979592,3.928571,0.019645,12.666667,0.765864,14.756098,...,2.138728,2.238806,9.333333,2.477596,0.013452,1.773709,3.922193,1.120468,0.883118,1.176472
3,0.797415,8.304757,4.22593,11.627438,2.0977,1.987549,0.171947,8.965516,6.542669,16.347483,...,1.166281,1.956521,7.018256,1.812795,0.002267,1.41523,2.954381,1.990847,1.677108,1.034483
4,,,,,,,,,1.050328,,...,,,,,,,,,,
