In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
import re
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
from catboost import CatBoostClassifier

In [4]:
RANDOM_STATE = 5
train = pd.read_csv('train.csv', index_col='PassengerId')
test = pd.read_csv('test.csv', index_col='PassengerId')

In [8]:
train.count()

Survived    891
Pclass      891
Name        891
Sex         891
Age         714
SibSp       891
Parch       891
Ticket      891
Fare        891
Cabin       204
Embarked    889
dtype: int64

In [5]:
train = pd.read_csv('train.csv', index_col='PassengerId')
test = pd.read_csv('test.csv', index_col='PassengerId')
y_train = train['Survived']
X_train = train.drop(['Survived'], axis=1)
X_test = test.copy()

X_full = [X_train, X_test]

def get_title(name):
    title_search = re.search('([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    else:
        return ""

for df in X_full:
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

    df['IsAlone'] = df['FamilySize'].apply(lambda x: 1 if x == 1 else 0)

    df['Embarked'] = df['Embarked'].fillna('S')
#     df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

    df['Fare'] = df['Fare'].fillna(X_train['Fare'].median())

    age_mean = X_train['Age'].mean()
    age_std = X_train['Age'].std()
    age_null_count = df['Age'].isnull().sum()
    age_null_random_list = np.random.randint(age_mean - age_std, age_mean + age_std, size=age_null_count)
    df['Age'][np.isnan(df['Age'])] = age_null_random_list
    df['Age'] = df['Age'].astype(int)

#     df['Sex'] = df['Sex'].map({'male': 0, 'female': 1}).astype(int)
    
    df['Title'] = df['Name'].apply(get_title)

    df['Title'] = df['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 
                                       'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace('Ms', 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')
    
#     title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
#     df['Title'] = df['Title'].map(title_mapping)
    df['Title'] = df['Title'].fillna('0')
    
    df.loc[ df['Fare'] <= 7.91, 'Fare'] = 0
    df.loc[(df['Fare'] > 7.91) & (df['Fare'] <= 14.454), 'Fare'] = 1
    df.loc[(df['Fare'] > 14.454) & (df['Fare'] <= 31), 'Fare']   = 2
    df.loc[ df['Fare'] > 31, 'Fare'] = 3
    df['Fare'] = df['Fare'].astype(int)
    
    # Mapping Age
    df.loc[ df['Age'] <= 16, 'Age'] = 0
    df.loc[(df['Age'] > 16) & (df['Age'] <= 32), 'Age'] = 1
    df.loc[(df['Age'] > 32) & (df['Age'] <= 48), 'Age'] = 2
    df.loc[(df['Age'] > 48) & (df['Age'] <= 64), 'Age'] = 3
    df.loc[ df['Age'] > 64, 'Age'] = 4
    
    df.drop(['Name', 'Ticket', 'Cabin', 'SibSp'], axis=1, inplace=True)

In [10]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 9 columns):
Pclass        891 non-null int64
Sex           891 non-null object
Age           891 non-null int64
Parch         891 non-null int64
Fare          891 non-null int64
Embarked      891 non-null object
FamilySize    891 non-null int64
IsAlone       891 non-null int64
Title         891 non-null object
dtypes: int64(6), object(3)
memory usage: 69.6+ KB


In [11]:
X, y = X_train, y_train

In [6]:
X_train_concat_feat = X_train.copy()
cols = X_train.columns
for i in range(len(cols)):
    for j in range(i+1, len(cols)):
        feature = '%s_%s' % (cols[i], cols[j])
        X_train_concat_feat[feature] = X_train[cols[i]].map(str) + '_' + X_train[cols[j]].map(str)

In [13]:
X_train_concat_feat.head()

Unnamed: 0_level_0,Pclass,Sex,Age,Parch,Fare,Embarked,FamilySize,IsAlone,Title,Pclass_Sex,...,Fare_Embarked,Fare_FamilySize,Fare_IsAlone,Fare_Title,Embarked_FamilySize,Embarked_IsAlone,Embarked_Title,FamilySize_IsAlone,FamilySize_Title,IsAlone_Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3,male,1,0,0,S,2,0,Mr,3_male,...,0_S,0_2,0_0,0_Mr,S_2,S_0,S_Mr,2_0,2_Mr,0_Mr
2,1,female,2,0,3,C,2,0,Mrs,1_female,...,3_C,3_2,3_0,3_Mrs,C_2,C_0,C_Mrs,2_0,2_Mrs,0_Mrs
3,3,female,1,0,1,S,1,1,Miss,3_female,...,1_S,1_1,1_1,1_Miss,S_1,S_1,S_Miss,1_1,1_Miss,1_Miss
4,1,female,2,0,3,S,2,0,Mrs,1_female,...,3_S,3_2,3_0,3_Mrs,S_2,S_0,S_Mrs,2_0,2_Mrs,0_Mrs
5,3,male,2,0,1,S,1,1,Mr,3_male,...,1_S,1_1,1_1,1_Mr,S_1,S_1,S_Mr,1_1,1_Mr,1_Mr


In [7]:
X_test_concat_feat = X_test.copy()
cols = X_test.columns
for i in range(len(cols)):
    for j in range(i+1, len(cols)):
        feature = '%s_%s' % (cols[i], cols[j])
        X_test_concat_feat[feature] = X_test[cols[i]].map(str) + '_' + X_test[cols[j]].map(str)

In [33]:
X_test_concat_feat.head()

Unnamed: 0_level_0,Pclass,Sex,Age,Parch,Fare,Embarked,FamilySize,IsAlone,Title,Pclass_Sex,...,Fare_Embarked,Fare_FamilySize,Fare_IsAlone,Fare_Title,Embarked_FamilySize,Embarked_IsAlone,Embarked_Title,FamilySize_IsAlone,FamilySize_Title,IsAlone_Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
892,3,male,2,0,0,Q,1,1,Mr,3_male,...,0_Q,0_1,0_1,0_Mr,Q_1,Q_1,Q_Mr,1_1,1_Mr,1_Mr
893,3,female,2,0,0,S,2,0,Mrs,3_female,...,0_S,0_2,0_0,0_Mrs,S_2,S_0,S_Mrs,2_0,2_Mrs,0_Mrs
894,2,male,3,0,1,Q,1,1,Mr,2_male,...,1_Q,1_1,1_1,1_Mr,Q_1,Q_1,Q_Mr,1_1,1_Mr,1_Mr
895,3,male,1,0,1,S,1,1,Mr,3_male,...,1_S,1_1,1_1,1_Mr,S_1,S_1,S_Mr,1_1,1_Mr,1_Mr
896,3,female,1,1,1,S,3,0,Mrs,3_female,...,1_S,1_3,1_0,1_Mrs,S_3,S_0,S_Mrs,3_0,3_Mrs,0_Mrs


In [8]:
cat_features = []
for i, col in enumerate(X_train_concat_feat.dtypes):
    if col == object:
        cat_features.append(i)

In [28]:
X_train_1, X_valid, y_train_1, y_valid = \
        train_test_split(X_train_concat_feat, y_train, test_size=0.33, random_state=RANDOM_STATE)

In [123]:
# specify the training parameters 
model = CatBoostClassifier(iterations=100, depth=5, learning_rate=1, 
                           loss_function='Logloss', logging_level='Silent')
#train the model
model.fit(X_train_1, y_train_1, cat_features=cat_features)
# make the prediction using the resulting model
y_preds = model.predict(X_valid)
# preds_proba = model.predict_proba(test_data)

In [129]:
accuracy_score(y_valid, y_preds)

0.8338983050847457

In [18]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

In [23]:
%%time
cat = CatBoostClassifier(random_seed=RANDOM_STATE,  
                        logging_level='Silent',
                        cat_features=cat_features)
gs = GridSearchCV(estimator=cat, param_grid={'learning_rate': [1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001],
                                            'iterations': [300],
                                            'depth': [7],
                                            'loss_function': ['Logloss', 'CrossEntropy']},
                  scoring='accuracy', cv=skf)
gs.fit(X_train_concat_feat, y_train);
print(gs.best_params_)

{'depth': 7, 'iterations': 300, 'learning_rate': 0.01, 'loss_function': 'Logloss'}
CPU times: user 38min 10s, sys: 3min 27s, total: 41min 37s
Wall time: 5min 33s


In [29]:
# specify the training parameters 
model = CatBoostClassifier(iterations=500, depth=7, learning_rate=0.01, 
                           loss_function='Logloss', logging_level='Silent')
#train the model
model.fit(X_train_1, y_train_1, cat_features=cat_features)
# make the prediction using the resulting model
y_preds = model.predict(X_valid)
# preds_proba = model.predict_proba(test_data)

In [30]:
accuracy_score(y_valid, y_preds)

0.8440677966101695

In [9]:
X_train_concat_feat.shape, train.shape

((891, 45), (891, 11))

### Submission #002

In [10]:
# specify the training parameters 
model = CatBoostClassifier(iterations=500, depth=7, learning_rate=0.01, 
                           loss_function='Logloss', logging_level='Silent',
                          cat_features=cat_features, random_seed=RANDOM_STATE)
#train the model
model.fit(X_train_concat_feat, y_train)
# make the prediction using the resulting model
y_preds = model.predict(X_test_concat_feat).astype(int)
# preds_proba = model.predict_proba(test_data)

In [11]:
y_fin = pd.DataFrame(y_preds, columns=['Survived'], index=X_test.index)
y_fin.Survived.value_counts()

0    263
1    155
Name: Survived, dtype: int64

In [46]:
y_fin.to_csv('submissions/cat_002.csv', header=True, index_label='PassengerId') 

### Submission #003

In [17]:
# specify the training parameters 
model = CatBoostClassifier(iterations=2000, depth=7, learning_rate=0.01, 
                           loss_function='Logloss', logging_level='Silent',
                          cat_features=cat_features, random_seed=RANDOM_STATE)
#train the model
model.fit(X_train_concat_feat, y_train)
# make the prediction using the resulting model
y_preds = model.predict(X_test_concat_feat).astype(int)
# preds_proba = model.predict_proba(test_data)

In [18]:
y_fin = pd.DataFrame(y_preds, columns=['Survived'], index=X_test.index)
y_fin.Survived.value_counts()

0    263
1    155
Name: Survived, dtype: int64

In [14]:
y_fin.to_csv('submissions/cat_003.csv', header=True, index_label='PassengerId') 