In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

%matplotlib inline
warnings.filterwarnings('ignore')
plt.rcParams['figure.figsize'] = (18, 8)

In [4]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
gender_submission = pd.read_csv('data/gender_submission.csv')

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


## Sex

In [6]:
train['Sex_clean'] = train['Sex'].astype('category').cat.codes
test['Sex_clean'] = test['Sex'].astype('category').cat.codes

## Embarked

In [7]:
train['Embarked'].isnull().sum()

2

In [8]:
test['Embarked'].isnull().sum()

0

In [9]:
train['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [10]:
train['Embarked'].fillna('S', inplace=True)

In [11]:
train['Embarked'].isnull().sum()

0

In [12]:
train['Embarked_clean'] = train['Embarked'].astype('category').cat.codes
test['Embarked_clean'] = test['Embarked'].astype('category').cat.codes

## Family

In [13]:
train['Family'] = 1 + train['SibSp'] + train['Parch']
test['Family'] = 1 + test['SibSp'] + test['Parch']

In [14]:
train['Solo'] = (train['Family'] == 1)

In [15]:
test['Solo'] = (test['Family'] == 1)

 ## Fare

In [16]:
train['FareBin'] = pd.qcut(train['Fare'], 5)
test['FareBin'] = pd.qcut(test['Fare'], 5)

In [17]:
train['FareBin'].value_counts()

(7.854, 10.5]        184
(21.679, 39.688]     180
(-0.001, 7.854]      179
(39.688, 512.329]    176
(10.5, 21.679]       172
Name: FareBin, dtype: int64

In [18]:
test['FareBin'].value_counts()

(-0.001, 7.796]     85
(46.62, 512.329]    84
(21.438, 46.62]     83
(11.025, 21.438]    83
(7.796, 11.025]     82
Name: FareBin, dtype: int64

In [19]:
train['Fare_clean'] = train['FareBin'].astype('category').cat.codes
test['Fare_clean'] = test['FareBin'].astype('category').cat.codes

In [20]:
train['Fare_clean'].value_counts()

1    184
3    180
0    179
4    176
2    172
Name: Fare_clean, dtype: int64

In [21]:
test['Fare_clean'].value_counts()

 0    85
 4    84
 3    83
 2    83
 1    82
-1     1
Name: Fare_clean, dtype: int64

## Title

In [22]:
train['Title'] = train['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
test['Title'] = test['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

In [23]:
train['Title'] = train['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Other')

In [24]:
train['Title'].value_counts()

Mr        517
Miss      182
Mrs       125
Master     40
Other      23
Mlle        2
Ms          1
Mme         1
Name: Title, dtype: int64

In [25]:
train['Title'] = train['Title'].replace('Mlle', 'Miss')
train['Title'] = train['Title'].replace('Ms', 'Miss')
train['Title'] = train['Title'].replace('Mme', 'Mrs')

In [26]:
train['Title'].value_counts()

Mr        517
Miss      185
Mrs       126
Master     40
Other      23
Name: Title, dtype: int64

In [27]:
test['Title'] = test['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Other')

In [28]:
test['Title'] = test['Title'].replace('Mlle', 'Miss')
test['Title'] = test['Title'].replace('Ms', 'Miss')
test['Title'] = test['Title'].replace('Mme', 'Mrs')

In [29]:
test['Title'].value_counts()

Mr        240
Miss       79
Mrs        72
Master     21
Other       6
Name: Title, dtype: int64

In [30]:
train['Title_clean'] = train['Title'].astype('category').cat.codes
test['Title_clean'] = test['Title'].astype('category').cat.codes

In [31]:
train['Title_clean'].value_counts()

2    517
1    185
3    126
0     40
4     23
Name: Title_clean, dtype: int64

In [32]:
test['Title_clean'].value_counts()

2    240
1     79
3     72
0     21
4      6
Name: Title_clean, dtype: int64

## Age

In [33]:
train['Age'].isnull().sum()

177

In [34]:
test['Age'].isnull().sum()

86

In [35]:
train["Age"].fillna(train.groupby("Title")["Age"].transform("median"), inplace=True)
test["Age"].fillna(test.groupby("Title")["Age"].transform("median"), inplace=True)

In [36]:
train.loc[ train['Age'] <= 10, 'Age_clean'] = 0
train.loc[(train['Age'] > 10) & (train['Age'] <= 16), 'Age_clean'] = 1
train.loc[(train['Age'] > 16) & (train['Age'] <= 20), 'Age_clean'] = 2
train.loc[(train['Age'] > 20) & (train['Age'] <= 26), 'Age_clean'] = 3
train.loc[(train['Age'] > 26) & (train['Age'] <= 30), 'Age_clean'] = 4
train.loc[(train['Age'] > 30) & (train['Age'] <= 36), 'Age_clean'] = 5
train.loc[(train['Age'] > 36) & (train['Age'] <= 40), 'Age_clean'] = 6
train.loc[(train['Age'] > 40) & (train['Age'] <= 46), 'Age_clean'] = 7
train.loc[(train['Age'] > 46) & (train['Age'] <= 50), 'Age_clean'] = 8
train.loc[(train['Age'] > 50) & (train['Age'] <= 60), 'Age_clean'] = 9
train.loc[ train['Age'] > 60, 'Age_clean'] = 10

In [37]:
test.loc[ test['Age'] <= 10, 'Age_clean'] = 0
test.loc[(test['Age'] > 10) & (test['Age'] <= 16), 'Age_clean'] = 1
test.loc[(test['Age'] > 16) & (test['Age'] <= 20), 'Age_clean'] = 2
test.loc[(test['Age'] > 20) & (test['Age'] <= 26), 'Age_clean'] = 3
test.loc[(test['Age'] > 26) & (test['Age'] <= 30), 'Age_clean'] = 4
test.loc[(test['Age'] > 30) & (test['Age'] <= 36), 'Age_clean'] = 5
test.loc[(test['Age'] > 36) & (test['Age'] <= 40), 'Age_clean'] = 6
test.loc[(test['Age'] > 40) & (test['Age'] <= 46), 'Age_clean'] = 7
test.loc[(test['Age'] > 46) & (test['Age'] <= 50), 'Age_clean'] = 8
test.loc[(test['Age'] > 50) & (test['Age'] <= 60), 'Age_clean'] = 9
test.loc[ test['Age'] > 60, 'Age_clean'] = 10

In [38]:
train['Age_clean'].value_counts()

4.0     209
3.0     176
5.0     127
2.0      79
0.0      68
7.0      52
6.0      45
9.0      42
1.0      36
8.0      35
10.0     22
Name: Age_clean, dtype: int64

In [39]:
test['Age_clean'].value_counts()

4.0     103
3.0     100
5.0      36
2.0      35
6.0      29
7.0      28
0.0      26
9.0      20
8.0      18
1.0      12
10.0     11
Name: Age_clean, dtype: int64

## Cabin

In [40]:
train['Cabin'].str[:1].value_counts()

C    59
B    47
D    33
E    32
A    15
F    13
G     4
T     1
Name: Cabin, dtype: int64

In [41]:
mapping = {
    'A': 0,
    'B': 1,
    'C': 2,
    'D': 3,
    'E': 4,
    'F': 5,
    'G': 6,
    'T': 7
}

In [42]:
train['Cabin_clean'] = train['Cabin'].str[:1]

In [43]:
train['Cabin_clean'] = train['Cabin_clean'].map(mapping)

In [44]:
train[['Pclass', 'Cabin_clean']].head(10)

Unnamed: 0,Pclass,Cabin_clean
0,3,
1,1,2.0
2,3,
3,1,2.0
4,3,
5,3,
6,1,4.0
7,3,
8,3,
9,2,


In [45]:
train.groupby('Pclass')['Cabin_clean'].median()

Pclass
1    2.0
2    4.5
3    5.0
Name: Cabin_clean, dtype: float64

In [46]:
train['Cabin_clean'].head(10)

0    NaN
1    2.0
2    NaN
3    2.0
4    NaN
5    NaN
6    4.0
7    NaN
8    NaN
9    NaN
Name: Cabin_clean, dtype: float64

In [47]:
test['Cabin_clean'] = test['Cabin'].str[:1]
test['Cabin_clean'] = test['Cabin_clean'].map(mapping)

In [48]:
train['Cabin_clean'] = train.groupby('Pclass')['Cabin_clean'].transform('median')
test['Cabin_clean'] = test.groupby('Pclass')['Cabin_clean'].transform('median')

In [49]:
train['Cabin_clean'].value_counts()

5.0    491
2.0    216
4.5    184
Name: Cabin_clean, dtype: int64

In [50]:
test['Cabin_clean'].value_counts()

5.0    311
2.0    107
Name: Cabin_clean, dtype: int64

## Feature & label

In [51]:
feature = [
    'Pclass',
    'SibSp',
    'Parch',
    'Sex_clean',
    'Embarked_clean',
    'Family',
    'Solo',
    'Title_clean',
    'Age_clean',
    'Cabin_clean',
    'Fare_clean',
]

In [52]:
label = [
    'Survived',
]

## Model Selection

In [53]:
data = train[feature]
target = train[label]

In [54]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [70]:
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)

In [71]:
x_train, x_test, y_train, y_test = train_test_split(data, target, random_state=0)

In [80]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [74]:
clf = RandomForestClassifier(n_estimators=200, max_depth=8, random_state=0)
cross_val_score(clf, data, target, cv=k_fold, scoring='accuracy', ).mean()

0.8226841448189763

In [83]:
clf = XGBClassifier(learning_rate=0.025, max_depth=6, n_estimators=500, colsample_bytree=0.8, subsample=0.9)
cross_val_score(clf, data, target, cv=k_fold, scoring='accuracy', ).mean()

0.8271535580524345

## Make Prediction

In [84]:
x_train = train[feature]
x_test = test[feature]
y_train = train[label]

In [85]:
clf.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.025, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=500, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=0.9,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [86]:
pred = clf.predict(x_test)

In [87]:
gender_submission['Survived'] = pred

In [88]:
gender_submission.to_csv('titanic-submission-4.csv',index=False)