In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')
train_data.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Slabenoff, Mr. Petco",male,,,,1601.0,,G6,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


In [3]:
size_train = len(train_data)
dataset = pd.concat([train_data, test_data]).reset_index(drop=True)

In [4]:
dataset.isna().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

In [5]:
dataset.sample(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
74,75,1.0,3,"Bing, Mr. Lee",male,32.0,0,0,1601,56.4958,,S
5,6,0.0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
368,369,1.0,3,"Jermyn, Miss. Annie",female,,0,0,14313,7.75,,Q
203,204,0.0,3,"Youseff, Mr. Gerious",male,45.5,0,0,2628,7.225,,C
767,768,0.0,3,"Mangan, Miss. Mary",female,30.5,0,0,364850,7.75,,Q
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
1269,1270,,1,"Hipkins, Mr. William Edward",male,55.0,0,0,680,50.0,C39,S
225,226,0.0,3,"Berglund, Mr. Karl Ivar Sven",male,22.0,0,0,PP 4348,9.35,,S
194,195,1.0,1,"Brown, Mrs. James Joseph (Margaret Tobin)",female,44.0,0,0,PC 17610,27.7208,B4,C
221,222,0.0,2,"Bracken, Mr. James H",male,27.0,0,0,220367,13.0,,S


# EDA

In [6]:
EDA = False

### Age

In [7]:
if EDA:
    ax = sns.kdeplot(train_data.loc[train_data['Survived']==True, 'Age'], shade=True, color='r')
    ax = sns.kdeplot(train_data.loc[train_data['Survived']==False, 'Age'], shade=True, color='b', ax=ax)
    ax_legend = ax.legend(['Survived', 'Not Survived'])

In [8]:
if EDA:
    g = sns.FacetGrid(data=train_data, col='Survived')
    g = g.map(sns.distplot, 'Age')

In [9]:
if EDA:
    features = ['SibSp', 'Pclass', 'Sex', 'Parch', 'Embarked']
    for f in features:
        sns.catplot(x=f, y='Age', data=dataset, kind='box')
    sns.heatmap(dataset[features + ['Age']].corr(), annot=True)

In [10]:
if EDA:
    sns.catplot(x='Pclass', y='Age', hue='Sex', data=dataset, kind='bar')

### Fare

In [11]:
if EDA:
    fig = plt.figure(figsize=[12, 4])
    axes = fig.subplots(1, 2)
    ax = sns.distplot(train_data['Fare'], ax=axes[0])
    ax = sns.distplot(train_data['Fare'].map(lambda x: np.log(x) if x > 0 else -10), ax=axes[1])

### Pclass

In [12]:
if EDA:
    g = sns.catplot(x='Pclass', y='Survived', hue='Sex', data=train_data, kind='bar')

### Embarked

In [13]:
if EDA:
    g = sns.catplot(x='Embarked', y='Survived', hue='Sex', data=train_data, kind='bar')

In [14]:
if EDA:
    sns.catplot(x='Pclass', col='Embarked', data=train_data, kind='count')

### Family

In [15]:
if EDA:
    sns.catplot(x='SibSp', y='Survived', data=train_data, kind='bar')

In [16]:
if EDA:
    sns.catplot(x='Parch', y='Survived', data=train_data, kind='bar')

### Sex

In [17]:
if EDA:
    sns.catplot(x='Sex', y='Survived', data=train_data, kind='bar')

## Data processing 

In [18]:
dataset.isna().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

### Age, Fare, Embarked

In [19]:
dataset['Embarked'].fillna('S', inplace=True)

In [20]:
dataset[dataset['Fare'].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1043,1044,,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S


In [21]:
dataset['Fare'].fillna(dataset.loc[(dataset['Pclass'] == 3) & (dataset['Embarked'] == 'S'), 'Fare'].median(), inplace=True)

In [22]:
dataset['Age'] = dataset.groupby(['Pclass', 'Sex'])['Age'].transform(lambda x: x.fillna(x.median()))

In [23]:
dataset.isnull().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age               0
SibSp             0
Parch             0
Ticket            0
Fare              0
Cabin          1014
Embarked          0
dtype: int64

In [24]:
dataset['AgeBand'] = pd.qcut(dataset['Age'], 10)
if EDA:
    sns.countplot(x='AgeBand', hue='Survived', data=dataset)

In [25]:
dataset['FareBand'] = pd.qcut(dataset['Fare'], 13)
if EDA:
    sns.countplot(x='FareBand', hue='Survived', data=dataset)

### Cabin

In [26]:
dataset['Cabin'] = dataset['Cabin'].str.extract(r'^(\S)', expand=False).fillna('M')

In [27]:
if EDA:
    sns.catplot(x='Cabin', y='Survived', data=dataset.loc[:size_train], kind='bar')

In [28]:
dataset['Cabin'].replace(['A', 'B', 'C', 'T'], 'ABC', inplace=True)
dataset['Cabin'].replace(['D', 'E'], 'DE', inplace=True)
dataset['Cabin'].replace(['F', 'G'], 'FG', inplace=True)

### Name

In [29]:
dataset.loc[dataset['Name'].str.contains('\('), 'Name'].sample(20)

166                Chibnall, Mrs. (Edith Martha Bowerman)
312                 Lahtinen, Mrs. William (Anna Sylfven)
885                  Rice, Mrs. William (Margaret Norton)
362                       Barbara, Mrs. (Catherine David)
591       Stephenson, Mrs. Walter Bertram (Martha Eustis)
1252              Mallet, Mrs. Albert (Antoinette Magnin)
427     Phillips, Miss. Kate Florence ("Mrs Kate Louis...
1286       Smith, Mrs. Lucien Philip (Mary Eloise Hughes)
254              Rosblom, Mrs. Viktor (Helena Wilhelmina)
678               Goodwin, Mrs. Frederick (Augusta Tyler)
40         Ahlin, Mrs. Johan (Johanna Persdotter Larsson)
763             Carter, Mrs. William Ernest (Lucile Polk)
995             Thomas, Mrs. Alexander (Thamine Thelma")"
230          Harris, Mrs. Henry Birkhardt (Irene Wallach)
1075    Douglas, Mrs. Frederick Charles (Mary Helene B...
801           Collyer, Mrs. Harvey (Charlotte Annie Tate)
432     Louch, Mrs. Charles Alexander (Alice Adelaide ...
1005          

In [30]:
dataset['Surname'] = dataset['Name'].str.extract(r'^([^,]+),', expand=False)
dataset['Title'] = dataset['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
dataset['Title'].replace(['Ms', 'Mlle', 'Countess', 'Lady', 'Dona', 'Mme'], 'Miss', inplace=True)
dataset['Title'].replace(['Dr', 'Col', 'Major', 'Jonkheer', 'Capt', 'Rev', 'Don', 'Sir'], 'Noble', inplace=True)

In [31]:
if EDA:
    sns.countplot(x='Title', hue='Survived', data=dataset[:size_train])

In [32]:
dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
if EDA:
    sns.countplot(x='FamilySize', hue='Survived', data=dataset)

In [33]:
dataset.loc[dataset['FamilySize'] == 1, 'FamilyType'] = 'Alone'
dataset.loc[(dataset['FamilySize'] > 1) & (dataset['FamilySize'] < 5), 'FamilyType'] = 'Small'
dataset.loc[(dataset['FamilySize'] >= 5) & (dataset['FamilySize'] < 7), 'FamilyType'] = 'Small'
dataset['FamilyType'].fillna('Large', inplace=True)

In [34]:
if EDA:
    sns.countplot(x='FamilyType', hue='Survived', data=dataset)

In [35]:
surname_survived = dataset[:size_train].groupby('Surname')['Survived'].mean().to_dict()
surname_count = dataset[:size_train].groupby('Surname')['Survived'].count().to_dict()
surname_train = set(dataset.iloc[:size_train]['Surname'].tolist())
surname_test = set(dataset.iloc[size_train:]['Surname'].tolist())

surname_set = surname_train.intersection(surname_test)

In [36]:
average_mean = dataset['Survived'].mean()
dataset['SurnameSurvived'] = dataset['Surname'].transform(lambda x: surname_survived[x] if x in surname_set and surname_count[x] > 1 else average_mean)
dataset['SurnameSurvivedisNA'] = dataset['Surname'].transform(lambda x: 0 if x in surname_set and surname_count[x] > 1 else 1)

### Ticket

In [37]:
# ticket_survived = dataset[:size_train].groupby('Ticket')['Survived'].mean().to_dict()
# ticket_count = dataset[:size_train].groupby('Ticket')['Survived'].count().to_dict()
# ticket_train = set(dataset.iloc[:size_train]['Ticket'].tolist())
# ticket_test = set(dataset.iloc[size_train:]['Ticket'].tolist())

# ticket_set = ticket_train.intersection(ticket_test)

In [38]:
# dataset['TicketSurvived'] = dataset['Ticket'].transform(lambda x: ticket_survived[x] if x in ticket_set and ticket_count[x] > 1 else average_mean)
# dataset['TicketSurvivedisNA'] = dataset['Ticket'].transform(lambda x: 0 if x in ticket_set and ticket_count[x] > 1 else 1)
# dataset['TicketFreq'] = dataset.groupby('Ticket')['Ticket'].transform('count')

### Finalize

In [39]:
dataset.drop(['Name', 'Ticket', 'Surname', 'FamilySize', 'SibSp', 'Parch', 'Age', 'Fare'], axis='columns', inplace=True)

In [40]:
dataset.sample(10)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Cabin,Embarked,AgeBand,FareBand,Title,FamilyType,SurnameSurvived,SurnameSurvivedisNA
870,871,0.0,3,male,M,S,"(25.0, 26.0]","(7.75, 7.896]",Mr,Alone,0.383838,1
703,704,0.0,3,male,M,Q,"(22.0, 25.0]","(7.25, 7.75]",Mr,Alone,0.383838,1
296,297,0.0,3,male,M,C,"(22.0, 25.0]","(-0.001, 7.25]",Mr,Alone,0.383838,1
408,409,0.0,3,male,M,S,"(16.0, 21.0]","(7.75, 7.896]",Mr,Alone,0.383838,1
821,822,1.0,3,male,M,S,"(26.0, 29.5]","(8.05, 10.5]",Mr,Alone,0.383838,1
1296,1297,,2,male,DE,C,"(16.0, 21.0]","(13.0, 15.742]",Mr,Alone,0.383838,1
1180,1181,,3,male,M,S,"(22.0, 25.0]","(7.896, 8.05]",Mr,Alone,0.0,0
1174,1175,,3,female,M,C,"(0.169, 16.0]","(13.0, 15.742]",Miss,Small,0.383838,1
1203,1204,,3,male,M,S,"(22.0, 25.0]","(7.25, 7.75]",Mr,Alone,0.383838,1
391,392,1.0,3,male,M,S,"(16.0, 21.0]","(7.75, 7.896]",Mr,Alone,0.383838,1


In [41]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

dataset = pd.get_dummies(dataset, columns=['Pclass'])
dataset = pd.get_dummies(dataset, columns=['FamilyType'])
dataset = pd.get_dummies(dataset, columns=['Cabin'])
dataset = pd.get_dummies(dataset, columns=['Embarked'])
dataset = pd.get_dummies(dataset, columns=['Title'])
dataset['Sex'] = LabelEncoder().fit_transform(dataset['Sex'])
dataset['AgeBand'] = LabelEncoder().fit_transform(dataset['AgeBand'])
dataset['FareBand'] = LabelEncoder().fit_transform(dataset['FareBand'])

## Start training

In [42]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score

In [43]:
y = dataset.iloc[:size_train]['Survived']
X = dataset.iloc[:size_train].drop(columns=['Survived', 'PassengerId'], axis=1)
X_test = dataset.iloc[size_train:].drop(columns=['Survived', 'PassengerId'], axis=1)
y_test = pd.read_csv('data/test_label.csv')['Survived']

In [53]:
model = RandomForestClassifier(n_estimators=400, max_depth=5, random_state=47, n_jobs=-1, verbose=1)
model.fit(X, y)
y_pred = model.predict(X_test).astype(np.int)
results = pd.DataFrame({'PassengerId': dataset.iloc[size_train:]['PassengerId'], 'Survived': y_pred})
results.to_csv('submission_0806_rf.csv', index=False)

## Finish

In [54]:
score = accuracy_score(y_test, y_pred)
print(score)

0.7799043062200957
