In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
# Load the train and test splits from the local data/ directory.
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')
# Summary statistics for every training column, non-numeric ones included.
train_data.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Ilett, Miss. Bertha",male,,,,347082.0,,C23 C25 C27,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


In [3]:
# Row count of the training split; rows [0, size_train) of `dataset` are the
# training rows, everything after is the test split.
size_train = train_data.shape[0]
# Stack train on top of test with a fresh 0..n-1 index so both splits go
# through the same feature pipeline.
dataset = pd.concat([train_data, test_data], ignore_index=True)

In [4]:
# Missing-value count per column of the combined train+test frame.
dataset.isna().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

In [5]:
# Peek at 20 random rows. The fixed seed keeps the displayed rows identical
# across "Restart & Run All" executions.
dataset.sample(20, random_state=0)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
275,276,1.0,1,"Andrews, Miss. Kornelia Theodosia",female,63.0,1,0,13502,77.9583,D7,S
859,860,0.0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
376,377,1.0,3,"Landergren, Miss. Aurora Adelia",female,22.0,0,0,C 7077,7.25,,S
17,18,1.0,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0,,S
549,550,1.0,2,"Davies, Master. John Morgan Jr",male,8.0,1,1,C.A. 33112,36.75,,S
30,31,0.0,1,"Uruchurtu, Don. Manuel E",male,40.0,0,0,PC 17601,27.7208,,C
420,421,0.0,3,"Gheorgheff, Mr. Stanio",male,,0,0,349254,7.8958,,C
852,853,0.0,3,"Boulos, Miss. Nourelain",female,9.0,1,1,2678,15.2458,,C
657,658,0.0,3,"Bourke, Mrs. John (Catherine)",female,32.0,1,1,364849,15.5,,Q
1242,1243,,2,"Stokes, Mr. Philip Joseph",male,25.0,0,0,F.C.C. 13540,10.5,,S


# EDA

In [6]:
# Master switch for the exploratory plots: plotting code in this notebook is
# wrapped in `if EDA:` and is skipped while this is False.
EDA = False

### Age

In [7]:
if EDA:
    # Overlay the Age density for Survived == True (red) and Survived == False
    # (blue) on one shared axis.
    # NOTE(review): newer seaborn releases deprecate `shade` in favour of
    # `fill` - confirm against the installed version before switching.
    ax = None
    for outcome, colour in ((True, 'r'), (False, 'b')):
        ages = train_data.loc[train_data['Survived'] == outcome, 'Age']
        ax = sns.kdeplot(ages, shade=True, color=colour, ax=ax)
    # Legend entries are listed in plotting order.
    ax_legend = ax.legend(['Survived', 'Not Survived'])

In [8]:
if EDA:
    # One Age distribution panel per value of Survived.
    g = sns.FacetGrid(data=train_data, col='Survived').map(sns.distplot, 'Age')

In [9]:
if EDA:
    features = ['SibSp', 'Pclass', 'Sex', 'Parch', 'Embarked']
    # Box plot of Age against each candidate predictor (one figure per feature).
    for f in features:
        sns.catplot(x=f, y='Age', data=dataset, kind='box')
    # Correlation is only defined for numeric columns. Select them explicitly
    # so the string-valued 'Sex' and 'Embarked' columns cannot make `.corr()`
    # raise on pandas versions that no longer drop them silently.
    numeric_corr = dataset[features + ['Age']].select_dtypes(include='number').corr()
    # Give the heatmap its own figure; otherwise it is painted onto the axes of
    # the last catplot, which are still the current axes at this point.
    corr_fig, corr_ax = plt.subplots()
    sns.heatmap(numeric_corr, annot=True, ax=corr_ax)

In [10]:
if EDA:
    # Bar plot of Age per Pclass, split by Sex, over the combined frame.
    sns.catplot(data=dataset, x='Pclass', y='Age', hue='Sex', kind='bar')

### Fare

In [11]:
if EDA:
    # Raw Fare distribution (left panel) next to its log-transformed version
    # (right panel).
    fig, axes = plt.subplots(1, 2, figsize=[12, 4])
    ax = sns.distplot(train_data['Fare'], ax=axes[0])
    # Non-positive fares have no logarithm, so they are sent to the sentinel
    # value -10 instead.
    log_fare = train_data['Fare'].map(lambda fare: np.log(fare) if fare > 0 else -10)
    ax = sns.distplot(log_fare, ax=axes[1])

### Pclass

In [12]:
if EDA:
    # Bar plot of Survived per Pclass, split by Sex (training rows only).
    g = sns.catplot(data=train_data, x='Pclass', y='Survived', hue='Sex', kind='bar')

### Embarked

In [13]:
if EDA:
    # Bar plot of Survived per Embarked value, split by Sex (training rows only).
    g = sns.catplot(data=train_data, x='Embarked', y='Survived', hue='Sex', kind='bar')

In [14]:
if EDA:
    # Passenger counts per Pclass, one panel per Embarked value.
    sns.catplot(data=train_data, x='Pclass', col='Embarked', kind='count')

### Family

In [15]:
if EDA:
    # Bar plot of Survived per SibSp value (training rows only).
    sns.catplot(data=train_data, x='SibSp', y='Survived', kind='bar')

In [16]:
if EDA:
    # Bar plot of Survived per Parch value (training rows only).
    sns.catplot(data=train_data, x='Parch', y='Survived', kind='bar')

### Sex

In [17]:
if EDA:
    # Bar plot of Survived per Sex (training rows only).
    sns.catplot(data=train_data, x='Sex', y='Survived', kind='bar')

## Data processing 

In [18]:
# Missing-value count per column, shown again as the starting point for the
# imputation steps in this section.
dataset.isna().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

### Age, Fare, Embarked

In [19]:
# Fill the missing Embarked values with 'S'.
# The result is assigned back to the column rather than calling
# `fillna(..., inplace=True)` on `dataset['Embarked']`: that form mutates a
# temporary selection (chained assignment), which warns on recent pandas and
# leaves `dataset` unchanged once Copy-on-Write is active.
dataset['Embarked'] = dataset['Embarked'].fillna('S')

In [20]:
# Show the rows whose Fare is missing.
dataset.loc[dataset['Fare'].isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1043,1044,,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S


In [21]:
# Impute missing Fare with the median fare of passengers sharing
# Pclass == 3 and Embarked == 'S'.
# Assigned back to the column instead of `fillna(..., inplace=True)` on the
# column selection, which is chained assignment and does not update `dataset`
# under pandas Copy-on-Write.
same_class_and_port = (dataset['Pclass'] == 3) & (dataset['Embarked'] == 'S')
dataset['Fare'] = dataset['Fare'].fillna(dataset.loc[same_class_and_port, 'Fare'].median())

In [22]:
# Impute missing Age with the median Age of the passenger's (Pclass, Sex) group.
group_median_age = dataset.groupby(['Pclass', 'Sex'])['Age'].transform('median')
dataset['Age'] = dataset['Age'].fillna(group_median_age)

In [23]:
# Re-check missing values after imputing Embarked, Fare and Age.
dataset.isnull().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age               0
SibSp             0
Parch             0
Ticket            0
Fare              0
Cabin          1014
Embarked          0
dtype: int64

In [24]:
# Discretise Age into 10 quantile-based bins.
dataset['AgeBand'] = pd.qcut(dataset['Age'], q=10)
if EDA:
    sns.countplot(data=dataset, x='AgeBand', hue='Survived')

In [25]:
# Discretise Fare into 13 quantile-based bins.
dataset['FareBand'] = pd.qcut(dataset['Fare'], q=13)
if EDA:
    sns.countplot(data=dataset, x='FareBand', hue='Survived')

### Cabin

In [26]:
# Reduce Cabin to its first non-whitespace character; rows with no cabin
# recorded get the placeholder 'M'.
cabin_initial = dataset['Cabin'].str.extract(r'^(\S)', expand=False)
dataset['Cabin'] = cabin_initial.fillna('M')

In [27]:
if EDA:
    # Bar plot of Survived per cabin letter, training rows only.
    # `.iloc` slices by position and excludes the end point; `.loc[:size_train]`
    # is label-based and inclusive, so it also picked up the first test row.
    sns.catplot(x='Cabin', y='Survived', data=dataset.iloc[:size_train], kind='bar')

In [28]:
# Collapse individual cabin letters into coarser groups. Each replacement is
# assigned back to the column: calling `replace(..., inplace=True)` on
# `dataset['Cabin']` is chained assignment and does not modify `dataset` under
# pandas Copy-on-Write.
cabin_groups = {
    'ABC': ['A', 'B', 'C', 'T'],
    'DE': ['D', 'E'],
    'FG': ['F', 'G'],
}
for group_label, letters in cabin_groups.items():
    dataset['Cabin'] = dataset['Cabin'].replace(letters, group_label)

### Name

In [29]:
# Sample names that contain an opening parenthesis.
# The pattern is a raw string: '\(' in a normal string literal is an invalid
# escape sequence and triggers a warning on current Python versions.
# A fixed seed keeps the displayed sample stable across re-runs.
dataset.loc[dataset['Name'].str.contains(r'\('), 'Name'].sample(20, random_state=0)

1218          Rosenshine, Mr. George (Mr George Thorne")"
1238      Whabee, Mrs. George Joseph (Shawneene Abi-Saab)
1059    Cassebeer, Mrs. Henry Arthur Jr (Eleanor Genev...
426           Clarke, Mrs. Charles V (Ada Maria Winfield)
559          de Messemaeker, Mrs. Guillaume Joseph (Emma)
895          Hirvonen, Mrs. Alexander (Helga E Lindqvist)
754                      Herman, Mrs. Samuel (Jane Laver)
763             Carter, Mrs. William Ernest (Lucile Polk)
25      Asplund, Mrs. Carl Oscar (Selma Augusta Emilia...
347             Davison, Mrs. Thomas Henry (Mary E Finck)
1070    Compton, Mrs. Alexander Taylor (Mary Eliza Ing...
774                 Hocking, Mrs. Elizabeth (Eliza Needs)
567           Palsson, Mrs. Nils (Alma Cornelia Berglund)
553                     Leeni, Mr. Fahim ("Philip Zenni")
272             Mellinger, Mrs. (Elizabeth Anne Maidment)
85      Backstrom, Mrs. Karl Alfred (Maria Mathilda Gu...
670     Brown, Mrs. Thomas William Solomon (Elizabeth ...
415           

In [30]:
# Surname: everything before the first comma of Name.
dataset['Surname'] = dataset['Name'].str.extract(r'^([^,]+),', expand=False)
# Title: the first run of letters that is immediately followed by a period.
dataset['Title'] = dataset['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
# Record 'Mrs' before it is merged into 'Miss' below. The builtin `int` is
# used because the `np.int` alias was removed from NumPy.
dataset['Married'] = (dataset['Title'] == 'Mrs').astype(int)
# Merge rare titles into two buckets. The result is assigned back to the
# column instead of `replace(..., inplace=True)` on the column selection,
# which is chained assignment and has no effect under pandas Copy-on-Write.
dataset['Title'] = (
    dataset['Title']
    .replace(['Ms', 'Mrs', 'Mlle', 'Countess', 'Lady', 'Dona', 'Mme'], 'Miss')
    .replace(['Dr', 'Col', 'Major', 'Jonkheer', 'Capt', 'Rev', 'Don', 'Sir'], 'Noble')
)

In [31]:
if EDA:
    # Title counts split by Survived, training rows only.
    sns.countplot(data=dataset.iloc[:size_train], x='Title', hue='Survived')

In [32]:
# Family size = the passenger (1) plus SibSp plus Parch.
dataset['FamilySize'] = 1 + dataset['SibSp'] + dataset['Parch']
if EDA:
    sns.countplot(data=dataset, x='FamilySize', hue='Survived')

In [33]:
# Bucket FamilySize into a categorical FamilyType:
#   1 -> 'Alone', 2-4 -> 'Small', 5-6 -> 'Medium', anything else -> 'Large'.
# Built in one step so no intermediate NaNs need to be filled afterwards; the
# previous `dataset['FamilyType'].fillna('Large', inplace=True)` was chained
# assignment and would leave NaNs in place under pandas Copy-on-Write.
family_size = dataset['FamilySize']
dataset['FamilyType'] = np.select(
    [
        family_size == 1,
        (family_size > 1) & (family_size < 5),
        (family_size >= 5) & (family_size < 7),
    ],
    ['Alone', 'Small', 'Medium'],
    default='Large',
)

In [34]:
if EDA:
    # FamilyType counts split by Survived.
    sns.countplot(data=dataset, x='FamilyType', hue='Survived')

In [35]:
# Per-surname statistics, computed from the training rows only.
surname_train_rows = dataset.iloc[:size_train]
surname_test_rows = dataset.iloc[size_train:]
surname_groups = surname_train_rows.groupby('Surname')
# Median Survived value per surname.
surname_survived = surname_groups['Survived'].median()
# Median FamilySize per surname (despite the name, this is not a row count).
surname_count = surname_groups['FamilySize'].median()
surname_train = set(surname_train_rows['Surname'].tolist())
surname_test = set(surname_test_rows['Surname'].tolist())

# Surnames that occur in both the training and the test rows.
surname_set = surname_train & surname_test

In [36]:
# Fallback value: mean of the non-missing Survived labels.
average_mean = dataset['Survived'].mean()


def _surname_has_signal(surname):
    """Return True when the surname is shared by train and test and its median family size exceeds 1."""
    return surname in surname_set and surname_count[surname] > 1


# Surname-level survival estimate, falling back to the overall mean when the
# surname carries no usable signal.
# NOTE(review): for training rows the estimate includes the row's own label -
# confirm this leakage is intended.
dataset['SurnameSurvived'] = dataset['Surname'].map(
    lambda surname: surname_survived[surname] if _surname_has_signal(surname) else average_mean
)
# 1 when the estimate above came from surname data, 0 when it is the fallback.
dataset['SurnameSurvivedNotNA'] = dataset['Surname'].map(
    lambda surname: 1 if _surname_has_signal(surname) else 0
)

### Ticket

In [37]:
# Number of rows (train + test) sharing each ticket value.
dataset['TicketFreq'] = dataset.groupby('Ticket')['Ticket'].transform('count')

# Per-ticket statistics, computed from the training rows only.
ticket_train_rows = dataset.iloc[:size_train]
ticket_test_rows = dataset.iloc[size_train:]
ticket_groups = ticket_train_rows.groupby('Ticket')
# Median Survived value per ticket.
ticket_survived = ticket_groups['Survived'].median().to_dict()
# Median TicketFreq per ticket.
ticket_count = ticket_groups['TicketFreq'].median().to_dict()
ticket_train = set(ticket_train_rows['Ticket'].tolist())
ticket_test = set(ticket_test_rows['Ticket'].tolist())

# Tickets that occur in both the training and the test rows.
ticket_set = ticket_train & ticket_test

In [38]:
def _ticket_has_signal(ticket):
    """Return True when the ticket is shared by train and test and its median frequency exceeds 1."""
    return ticket in ticket_set and ticket_count[ticket] > 1


# Ticket-level survival estimate, falling back to the overall mean when the
# ticket carries no usable signal.
dataset['TicketSurvived'] = dataset['Ticket'].map(
    lambda ticket: ticket_survived[ticket] if _ticket_has_signal(ticket) else average_mean
)
# 1 when the estimate above came from ticket data, 0 when it is the fallback.
dataset['TicketSurvivedNotNA'] = dataset['Ticket'].map(
    lambda ticket: 1 if _ticket_has_signal(ticket) else 0
)

In [39]:
# Average the ticket-based and surname-based estimates (and their indicator
# flags) into single features, then discard the four intermediate columns.
intermediate_columns = ['TicketSurvived', 'SurnameSurvived', 'TicketSurvivedNotNA', 'SurnameSurvivedNotNA']
estimate_sum = dataset['TicketSurvived'] + dataset['SurnameSurvived']
indicator_sum = dataset['TicketSurvivedNotNA'] + dataset['SurnameSurvivedNotNA']
dataset['SurvivalRate'] = estimate_sum / 2
dataset['SurvivalRateNotNA'] = indicator_sum / 2
dataset = dataset.drop(columns=intermediate_columns)

### Finalize

In [40]:
# Drop the raw columns that the engineered features above were derived from.
raw_columns = ['Name', 'Ticket', 'Surname', 'FamilySize', 'Age', 'Fare']
dataset = dataset.drop(columns=raw_columns)

In [41]:
# Preview the engineered feature table before encoding.
dataset.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,SibSp,Parch,Cabin,Embarked,AgeBand,FareBand,Title,Married,FamilyType,TicketFreq,SurvivalRate,SurvivalRateNotNA
0,1,0.0,3,male,1,0,M,S,"(21.0, 22.0]","(-0.001, 7.25]",Mr,0,Small,1,0.383838,0.0
1,2,1.0,1,female,1,0,ABC,C,"(34.0, 40.0]","(56.496, 83.475]",Miss,1,Small,2,1.0,1.0
2,3,1.0,3,female,0,0,M,S,"(25.0, 26.0]","(7.896, 8.05]",Miss,0,Alone,1,0.383838,0.0
3,4,1.0,1,female,1,0,ABC,S,"(34.0, 40.0]","(34.075, 56.496]",Miss,1,Small,2,0.383838,0.0
4,5,0.0,3,male,0,0,M,S,"(34.0, 40.0]","(7.896, 8.05]",Mr,0,Alone,1,0.383838,0.0
5,6,0.0,3,male,0,0,M,Q,"(22.0, 25.0]","(8.05, 10.5]",Mr,0,Alone,1,0.383838,0.0
6,7,0.0,1,male,0,0,DE,S,"(48.0, 80.0]","(34.075, 56.496]",Mr,0,Alone,2,0.191919,0.5
7,8,0.0,3,male,3,1,M,S,"(0.169, 16.0]","(15.742, 23.25]",Master,0,Medium,5,0.0,1.0
8,9,1.0,3,female,0,2,M,S,"(26.0, 29.5]","(10.5, 13.0]",Miss,1,Small,3,0.383838,0.0
9,10,1.0,2,female,1,0,M,C,"(0.169, 16.0]","(26.55, 34.075]",Miss,1,Small,2,0.383838,0.0


In [42]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# One-hot encode the nominal columns; dummy columns are appended in the order
# listed here.
one_hot_columns = ['Pclass', 'FamilyType', 'Cabin', 'Embarked', 'Title']
dataset = pd.get_dummies(dataset, columns=one_hot_columns)

# Integer-encode the remaining non-numeric columns, one fresh encoder each.
for encoded_column in ('Sex', 'AgeBand', 'FareBand'):
    dataset[encoded_column] = LabelEncoder().fit_transform(dataset[encoded_column])

## Start training

In [43]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score

In [44]:
# Separate the label and the non-feature id column from the model inputs,
# then split back into training rows and test rows by position.
model_inputs = dataset.drop(columns=['Survived', 'PassengerId'])
y = dataset['Survived'].iloc[:size_train]
X = model_inputs.iloc[:size_train]
X_test = model_inputs.iloc[size_train:]

In [45]:
# Fit a random forest on the training rows and write the test-set predictions
# to a submission CSV.
model = RandomForestClassifier(
    criterion='gini',
    n_estimators=1100,
    max_depth=5,
    min_samples_split=4,
    min_samples_leaf=5,
    # 'sqrt' is what 'auto' meant for classifiers; 'auto' itself is no longer
    # accepted by current scikit-learn releases.
    max_features='sqrt',
    oob_score=True,
    random_state=47,
    n_jobs=-1,
    verbose=1,
)
model.fit(X, y)
# The labels were floats (Survived has NaN for test rows), so predictions are
# cast back to integers. Builtin `int` replaces the removed `np.int` alias.
y_pred = model.predict(X_test).astype(int)
results = pd.DataFrame({'PassengerId': dataset.iloc[size_train:]['PassengerId'], 'Survived': y_pred})
results.to_csv('submission_0807_rf.csv', index=False)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 1100 out of 1100 | elapsed:    2.3s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 1100 out of 1100 | elapsed:    0.3s finished


## Finish: evaluate against the test labels

In [46]:
# Score the predictions against the labelled test file.
# NOTE(review): assumes the rows of test_label.csv are in the same order as
# the test rows used for `y_pred` - confirm.
test_labels = pd.read_csv('data/test_label.csv')
y_test = test_labels['Survived']
score = accuracy_score(y_test, y_pred)
print(score)

0.80622009569378
