## GBDT is, as always, the best estimator
It can handle NaNs, so we will do different data preparation for this estimator specifically.

In [114]:
import lightgbm as lgbm
import numpy as np
import os
import pandas as pd
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn import svm

In [339]:
features = ['Pclass', 'Title', 'Fam_type', 'Embarked', 'Fare']#, 'Age', 'Cabin_info']#, 'Deck', 'Sex']
cat_features = ['Pclass', 'Title', 'Sex', 'Fam_type', 'Embarked', 'Deck', 'Cabin_info']

In [340]:
df_train = pd.read_csv(os.path.join('data', 'train.csv'), index_col=0)
df_test = pd.read_csv(os.path.join('data', 'test.csv'), index_col=0)

# Because of the amount of missing values and high number of categories, Deck is probably a bad feature
df_train['Deck'] = df_train['Cabin'].dropna().str[0]
df_test['Deck'] = df_test['Cabin'].dropna().str[0]
# New binary cabin feature
df_train['Cabin_info'] = df_train['Cabin'].apply(lambda x: 1 if x is not np.nan else 0)
df_test['Cabin_info'] = df_test['Cabin'].apply(lambda x: 1 if x is not np.nan else 0)

# Pro tips from Massimiliano Viola's kernel
df_train.loc[(df_train['Fare']==0), 'Fare'] = np.nan
df_test.loc[(df_test['Fare']==0), 'Fare'] = np.nan

# Combine siblings, spouses, parents and children into a family size feature
df_train['Fam_type'] = pd.cut(df_train['SibSp'] + df_train['Parch'] + 1, [0,1,4,7,11], labels=['Solo', 'Small', 'Big', 'Very big'])
df_test['Fam_type'] = pd.cut(df_test['SibSp'] + df_test['Parch'] + 1, [0,1,4,7,11], labels=['Solo', 'Small', 'Big', 'Very big'])

# Create Title feature based on name (replaces Sex as feature)
df_train['Title'] = df_train['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())
df_test['Title']  = df_test['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())
df_train['Title'] = df_train['Title'].replace(['Mme', 'Ms', 'Lady', 'Mlle', 'the Countess', 'Dona'], 'Miss')
df_test['Title']  = df_test['Title'].replace(['Mme', 'Ms', 'Lady', 'Mlle', 'the Countess', 'Dona'], 'Miss')
df_train['Title'].replace(['Major', 'Col', 'Capt', 'Don', 'Sir', 'Jonkheer'], 'Mr', inplace=True)
df_test['Title'].replace(['Major', 'Col', 'Capt', 'Don', 'Sir', 'Jonkheer'], 'Mr', inplace=True)

In [341]:
for feature in cat_features:
    df_train[feature] = df_train[feature].astype('category')
    df_test[feature] = df_test[feature].astype('category')

In [342]:
# EDIT: This has a big random impact on the estimator. Keeping it as NaN forces LightGBM to deal with it. Is that better?
# # Impute age values based random sampling from distributions conditioned on Pclass and Sex (from both sets)
# # Add 0.5 as that is the way to show that a value is estimated in the rest of the data
# df_total = pd.concat([df_train.drop('Survived', axis=1), df_test])

# def impute_age(row):
#     if np.isnan(row['Age']):
#         return int(df_total[(df_total['Pclass'] == row['Pclass']) & (df_total['Sex'] == row['Sex'])]['Age'].dropna().sample())+0.5
#     else:
#         return row['Age']

# df_train['Age'] = df_train.apply(impute_age, axis=1)
# df_test['Age']  = df_test.apply(impute_age, axis=1)

In [343]:
df_train[features].head()

Unnamed: 0_level_0,Pclass,Title,Fam_type,Embarked,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,3,Mr,Small,S,7.25
2,1,Mrs,Small,C,71.2833
3,3,Miss,Solo,S,7.925
4,1,Mrs,Small,S,53.1
5,3,Mr,Solo,S,8.05


In [344]:
X = df_train[features]
y = df_train['Survived']

In [345]:
# With categorical features
clf = lgbm.LGBMClassifier(num_leaves=800, max_depth=-1, learning_rate=0.003, 
                          n_estimators=2000, min_child_samples=40, colsample_bytree=0.67,
                          sub_sample=1.0, subsample_freq=0, random_seed=1234)
scores = cross_val_score(clf, X, y, cv=5)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
# Small standard deviation is probably better than small increments in accuracy

Accuracy: 0.816 (+/- 0.034)


In [346]:
scores

array([0.81564246, 0.80898876, 0.84831461, 0.79775281, 0.80898876])

In [347]:
clf.fit(X, y)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.67,
               importance_type='split', learning_rate=0.003, max_depth=-1,
               min_child_samples=40, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=2000, n_jobs=-1, num_leaves=800, objective=None,
               random_seed=1234, random_state=None, reg_alpha=0.0,
               reg_lambda=0.0, silent=True, sub_sample=1.0, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)

In [348]:
# from sklearn.metrics import accuracy_score
# accuracy_score(y, clf_final.predict(X))

In [349]:
pred = clf.predict(df_test[features])
submit = pd.DataFrame(index=df_test.index, data=pred, columns=['Survived'])

In [350]:
submit['Survived'].value_counts(normalize=True)

0    0.643541
1    0.356459
Name: Survived, dtype: float64

In [351]:
submit.to_csv(os.path.join('submissions', '200809_lgbm_7.csv'), header=True)