In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect every file path below the Kaggle input directory, then print them one per line.
input_file_paths = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
# API for outlier problem:
# I refer to the machinelearningcoban blog of author Nguyen Huu Tiep.
# This addresses the skewness problem: we will use it to clean and process the numerical variables.
from typing import Tuple
from sklearn.base import BaseEstimator, TransformerMixin


def find_boxplot_boundaries(
    col: pd.Series, whisker_coeff: float = 1.5
) -> Tuple[float, float]:
    """Compute the lower and upper whisker limits of a box plot.

    Args:
        col: pandas Series whose quartiles are evaluated.
        whisker_coeff: multiple of the inter-quartile range added beyond
            the first and third quartiles.

    Returns:
        A ``(lower, upper)`` tuple where ``lower = Q1 - whisker_coeff * IQR``
        and ``upper = Q3 + whisker_coeff * IQR``.
    """
    first_quartile, third_quartile = col.quantile(0.25), col.quantile(0.75)
    whisker_length = whisker_coeff * (third_quartile - first_quartile)
    return first_quartile - whisker_length, third_quartile + whisker_length


class BoxplotOutlierClipper(BaseEstimator, TransformerMixin):
    """Clip values that fall outside the box-plot whiskers.

    ``fit`` learns the lower/upper whisker limits from a series via
    ``find_boxplot_boundaries``; ``transform`` clips any series to those
    limits.

    Args:
        whisker_coeff: multiple of the inter-quartile range used for the
            whisker length.

    Attributes:
        lower: lower clipping bound, ``None`` until ``fit`` is called.
        upper: upper clipping bound, ``None`` until ``fit`` is called.
    """

    def __init__(self, whisker_coeff: float = 1.5):
        # scikit-learn's get_params()/clone()/repr read each constructor
        # argument back from an attribute of the same name, so the value
        # must be stored as ``whisker_coeff``.
        self.whisker_coeff = whisker_coeff
        self.lower = None
        self.upper = None

    @property
    def whisker(self) -> float:
        """Backward-compatible alias for ``whisker_coeff``."""
        return self.whisker_coeff

    @whisker.setter
    def whisker(self, value: float) -> None:
        self.whisker_coeff = value

    def fit(self, X: pd.Series, y=None):
        """Learn the clipping bounds from ``X``.

        ``y`` is accepted and ignored so the transformer can also be called
        as ``fit(X, y)``, e.g. from a scikit-learn Pipeline.
        """
        self.lower, self.upper = find_boxplot_boundaries(X, self.whisker_coeff)
        return self

    def transform(self, X):
        """Return ``X`` clipped to the bounds learned by ``fit``."""
        return X.clip(self.lower, self.upper)

In [4]:
# Training data: read the training CSV and preview its first rows.
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
train_data.head()

In [5]:
# Test data set: read the test CSV and preview its first rows.
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
test_data.head()

# 1. Feature Engineering

In this step, I will filter out those features that have an impact on passenger survival.

**For readers who are not familiar with the dataset, here is what some of the features (columns) mean:**
* Pclass: Similar to airline classes, the Titanic had **three classes {1, 2, 3}**. This feature potentially affects passenger survival; perhaps the 1st class was given priority. We will discuss this further in the notebook, so I will not make assumptions too early, which could cause bias.
* sibsp: Number of siblings / spouses aboard the Titanic. This variable also has the potential to influence survival outcomes.
* Parch: Number of parents and children on board. Same as sibsp.
* Sex: male or female
* Ticket: Ticket number
* Fare: The price of ticket
* Cabin: Cabin number
* embarked: Port of Embarkation, C = Cherbourg, Q = Queenstown, S = Southampton

Based on what I explained above, I classify them into 2 types:

**Category features:** Pclass {1,2,3} (We could also treat Pclass as a numerical feature), Sex {'male', 'female'}, embarked {'C', 'Q', 'S'}

**Numerical feature:** sibsp, parch, fare


Depending on the choice of model, we could transform categorical features into numerical ones. I won't transform them until the analysis step is finished.

I didn't list Ticket and Cabin because I believe they won't give much information: **Cabin** may be helpful but it has too many null values, and the **Ticket** number could imply the class of a passenger, but Pclass and Fare already carry that information. Also, the data type of Ticket is a little bit vague. I will drop the Ticket and Cabin columns.

I also assume the Name column will give us some interesting information; for example, a name with the honorific "Dr" could indicate a higher chance of survival. I will extract the title from each name to create a new feature.

In [6]:
# Summary statistics (count, mean, quartiles, ...) of the numerical columns.
# The discussion that follows refers to these figures, which head() does not show.
train_data.describe()

**Let us discuss some basic statistics metrics about the training set:**
* The first thing I notice: count(Age) = 714, which shows that Age is the only numerical column containing NULL values.
* The mean of Survived = 0.38, which means about 38% of the passengers survived, so survivors and non-survivors are relatively balanced.
* SibSp and Parch: recall that they are the numbers of relatives travelling with a specific person. The 75th percentile of SibSp is 1, i.e. at least 75% of the passengers have at most 1 sibling/spouse aboard, which means the data is quite skewed. The same holds for the Parch column. Another sign is the difference between their mean and median.
* Fare is skewed, similar to SibSp and Parch.

In [7]:
# Remove the Ticket and Cabin columns from both splits.
unused_columns = ['Ticket', 'Cabin']
train_df = train_data.drop(columns=unused_columns)
test_df = test_data.drop(columns=unused_columns)
# Keep both frames in one list so later steps can loop over them together.
combine = [train_df, test_df]


In [8]:
# Extract the honorific: the run of letters that follows a space and ends with a period.
# The pattern is a raw string because "\." in a normal string literal is an
# invalid escape sequence (it triggers a warning on recent Python versions).
for dataset in combine:
    dataset['Title'] = dataset.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
train_df['Title']

In [9]:
# Cross-tabulate Title against Sex to see which titles are tied to one gender.
pd.crosstab(index=train_df['Title'], columns=train_df['Sex'])

In [10]:
# Collapse the uncommon honorifics into "Rare" and fold alternate spellings
# into Miss/Mrs using a single replacement table.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
               'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_replacements = {rare_title: 'Rare' for rare_title in rare_titles}
title_replacements.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

for dataset in combine:
    dataset['Title'] = dataset['Title'].replace(title_replacements)

# Mean survival per title, lowest first.
(
    train_df[['Title', 'Survived']]
    .groupby(['Title'], as_index=False)
    .mean()
    .sort_values(by=['Survived'])
)

Mrs and Miss are the titles with the highest survival rates.

In [11]:
# Encode Title as an ordinal number: the larger the number, the higher the
# survival rate of that title (Mr = 1 is the lowest, Mrs = 5 the highest).
# Titles missing from the table become 0.
title_mapping = {"Mr": 1, "Rare": 2, "Master": 3, "Miss": 4, "Mrs": 5}
for dataset in combine:
    dataset['Title'] = dataset['Title'].map(title_mapping).fillna(0)

train_df.head()

In [12]:
# Name is dropped from both splits; PassengerId is dropped from the training
# frame only (test_df keeps it).
train_df = train_df.drop(columns=['Name', 'PassengerId'])
test_df = test_df.drop(columns=['Name'])
combine = [train_df, test_df]
(train_df.shape, test_df.shape)

In [13]:
# Merge SibSp and Parch into a single feature:
# FamilySize = SibSp + Parch + 1 (the +1 presumably counts the passenger).
for dataset in combine:
    dataset['FamilySize'] = 1 + dataset['SibSp'] + dataset['Parch']

# Mean survival per family size, highest first.
(
    train_df[['FamilySize', 'Survived']]
    .groupby(['FamilySize'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [14]:
# Rank each family size (key) with an ordinal score (value); the scores
# presumably follow the survival-rate ordering shown by the previous cell.
map_family = {4: 8, 3: 7, 2: 6, 7: 5, 1: 4, 5: 3, 6: 2, 8: 1, 11: 0}
for dataset in combine:
    # Encode FamilySize itself. The original applied this mapping to 'Title',
    # which overwrote the title encoding and left FamilySize unranked.
    # Sizes missing from the table fall back to 0, mirroring the Title encoding.
    dataset['FamilySize'] = dataset['FamilySize'].map(map_family).fillna(0)
    # After creating the new feature, SibSp and Parch are no longer needed.
    # inplace=True keeps train_df/test_df and the entries of `combine` the same objects.
    dataset.drop(['SibSp', 'Parch'], axis=1, inplace=True)

train_df.head()

Now, when referring to family size, we are talking about a group of passengers that shares a specific survival rate.

In [15]:
# Embarked is categorical, so a mean or median is not available: missing
# values are filled with the most frequent port of the training set.
freq_port = train_df['Embarked'].mode().iloc[0]
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].fillna(freq_port)

# Mean survival per port, highest first.
(
    train_df[['Embarked', 'Survived']]
    .groupby(['Embarked'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [16]:
# Encode Embarked as an integer score: S = 1, Q = 2, C = 3.
embarked_mapping = {'S': 1, 'Q': 2, 'C': 3}
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].map(embarked_mapping)

train_df.head()

In [17]:
# Encode Sex as a binary flag: female = 1, male = 0.
sex_mapping = {'female': 1, 'male': 0}
for dataset in combine:
    dataset['Sex'] = dataset['Sex'].map(sex_mapping)

train_df.head()

**Now it's time to process the numerical data; we'll check whether they contain NULL values.**

In [18]:
# Fare before any processing: histogram on the left, horizontal box plot on the right.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
fare_hist_ax, fare_box_ax = axes
train_df['Fare'].hist(ax=fare_hist_ax, bins=50)
train_df[['Fare']].boxplot(ax=fare_box_ax, vert=False)

In the Fare column, we have a right-skewed distribution as well as some outliers. Now is the time to use the API that I introduced earlier.

In [19]:
# Inspect the test-set Fare column before the outlier clipping in the next cell.
test_df['Fare']

In [20]:
# Learn the whisker limits from the training fares only, then apply the same
# limits to both splits. Re-fitting on the test set (as the original loop did)
# clips the two splits with different bounds and uses test statistics.
fare_clipper = BoxplotOutlierClipper().fit(train_df['Fare'])
for dataset in combine:
    dataset['Fare'] = fare_clipper.transform(dataset['Fare'])

# Fare after clipping: histogram on the left, horizontal box plot on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
train_df['Fare'].hist(bins=50, ax=axes[0])
train_df['Fare'].to_frame().boxplot(ax=axes[1], vert=False)

The distribution is no longer as skewed, and the box plot looks nicer. However, it is still not very Gaussian. I'll leave it as it is for now and come back to it after working on the model. Feature engineering is a lengthy process.

"Coming up with features is difficult, time-consuming, requires expert knowledge. “Applied machine learning” is basically feature engineering." - Andrew Ng

In [21]:
# Inspect the test-set Fare column after the outlier clipping.
test_df['Fare']

In [22]:
import scipy
from scipy import stats
from sklearn.impute import SimpleImputer

# Replace NaN Fare with the most frequent value of the training set
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(train_df[["Fare"]])
train_df[["Fare"]] = imputer.transform(train_df[["Fare"]])
test_df[["Fare"]] = imputer.transform(test_df[["Fare"]])

# Standardise both splits with the TRAINING mean and standard deviation so
# that train and test share one scale. The original z-scored each split with
# its own statistics. ddof=0 matches the default of scipy.stats.zscore, so the
# training values are unchanged by this fix.
fare_mean = train_df['Fare'].mean()
fare_std = train_df['Fare'].std(ddof=0)
for dataset in combine:
    # The +1 offset is kept from the original transformation.
    dataset['Fare'] = (dataset['Fare'] - fare_mean) / fare_std + 1

train_df['Fare'].head()

In [23]:
# Inspect the test-set Fare column after imputation and standardisation.
test_df['Fare']

In [24]:
# Histogram of the training Fare column after clipping, imputation and standardisation.
train_df['Fare'].hist(bins=50)

In [25]:
# Simple heatmaps to see where we are missing data.
# Each split gets its own Axes: two sns.heatmap calls without `ax` draw on
# the same Axes, so the second plot would overwrite the first.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
sns.heatmap(train_data.isnull(), yticklabels=False, cbar=False, cmap='viridis', ax=axes[0])
axes[0].set_title('Missing values: train_data')
sns.heatmap(test_data.isnull(), yticklabels=False, cbar=False, cmap='viridis', ax=axes[1])
axes[1].set_title('Missing values: test_data');

Looking at the plot, I can see that almost all values of Cabin are null.

The proportion of missing Age values is likely small enough for reasonable replacement with some form of imputation. Looking at the Cabin column, it looks like we are just missing too much of that data to do something useful with it at a basic level, which is why the Cabin column was dropped earlier.

In [26]:
# Look at the Age distribution before deciding how to fill its NULL values:
# histogram on the left, horizontal box plot on the right.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
age_hist_ax, age_box_ax = axes
train_df['Age'].hist(ax=age_hist_ax, bins=50)
train_df[['Age']].boxplot(ax=age_box_ax, vert=False)

In [27]:
# Show the non-null count and dtype of Age in the training set before imputation.
train_df[["Age"]].info()

In [28]:
from sklearn.impute import SimpleImputer

# Learn the median Age from the training set once, then fill the missing
# ages of both splits with that value.
imputer = SimpleImputer(strategy="median").fit(train_df[["Age"]])
for dataset in (train_df, test_df):
    dataset[["Age"]] = imputer.transform(dataset[["Age"]])

In [29]:
# Show the non-null count and dtype of Age in the training set after imputation.
train_df[['Age']].info()

In [30]:
# Standardize the Age column 
import scipy
from scipy import stats

# Standardise both splits with the TRAINING mean and standard deviation so
# that train and test share one scale. The original z-scored each split with
# its own statistics. ddof=0 matches the default of scipy.stats.zscore, so the
# training values are unchanged by this fix.
age_mean = train_df['Age'].mean()
age_std = train_df['Age'].std(ddof=0)
for dataset in combine:
    # The +1 offset is kept from the original transformation.
    dataset['Age'] = (dataset['Age'] - age_mean) / age_std + 1

train_df['Age'].head()

The test data is not much different from the train data set.

In [31]:
# Preview the processed training frame.
train_df.head()

In [32]:
# Preview the processed test frame.
test_df.head()

I will use some charts to explore trends and to understand more about the data. Let's see what is the data trying to tell me!

In [33]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [34]:
# Passenger counts per survival outcome, split by sex (raw training data).
sns.set_style('whitegrid')
sns.countplot(data=train_data, x='Survived', hue='Sex', palette='rainbow')

*Women are more likely to survive this disaster.*

In [35]:
# Passenger counts per survival outcome, split by ticket class (raw training data).
sns.set_style('whitegrid')
sns.countplot(data=train_data, x='Survived', hue='Pclass', palette='rainbow')

There is not a big difference in the number of survivors among the 3 classes. However, the 3rd class has a huge number of deaths: about twice as many as the 1st and 2nd classes combined.

In [36]:
# Stacked histogram of the raw Fare values, coloured by survival outcome.
sns.histplot(train_data, x="Fare", hue="Survived", multiple="stack")

It seems that the lower the ticket price, the higher the number of deaths. And I also discovered an outlier, this person's fare is twice as much as the person with the second biggest fare.

In [37]:
# Passenger counts per survival outcome, split by port of embarkation.
sns.countplot(data=train_data, x='Survived', hue='Embarked', palette='rainbow')

Most people came from port S, port Q have the least number of people.

In [38]:
# Passenger counts per survival outcome, split by the SibSp value.
sns.countplot(data=train_data, x='Survived', hue='SibSp', palette='rainbow')

**Based on the observations from charts and feature engineering, I can have the following conclusions:**
* Female passengers have a higher chance of survival than male passengers.

# 2. Building models

In [43]:
# Split the processed frames into model inputs and the target column.
# PassengerId is removed from the test features.
Y_train = train_df["Survived"]
X_train = train_df.drop(columns="Survived")
X_test = test_df.drop(columns="PassengerId").copy()
(X_train.shape, Y_train.shape, X_test.shape)

In [40]:
from sklearn.ensemble import RandomForestClassifier

# Baseline: a random forest trained on four raw columns of the untouched data.
# Its test matrix has its own name (X_test_baseline). The original reassigned
# X_test here, so when the notebook runs top to bottom the later models, which
# are fitted on the engineered X_train, predicted on the wrong columns.
y = train_data["Survived"]

features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(train_data[features])
# Align the test dummy columns with the training ones, so a category missing
# from one split cannot change the column layout.
X_test_baseline = pd.get_dummies(test_data[features]).reindex(columns=X.columns, fill_value=0)

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test_baseline)

# NOTE: the final random-forest cell writes 'submission.csv' again and
# overwrites this file.
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

In [46]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [47]:
# Logistic regression with default settings.
# acc_log is the accuracy on the TRAINING data, in percent.
logreg = LogisticRegression().fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(100 * logreg.score(X_train, Y_train), 2)
acc_log

In [48]:
# Support-vector classifier with default settings.
# acc_svc is the accuracy on the TRAINING data, in percent.
svc = SVC().fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
acc_svc = round(100 * svc.score(X_train, Y_train), 2)
acc_svc

In [49]:
# k-nearest neighbours with k = 3.
# acc_knn is the accuracy on the TRAINING data, in percent.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
acc_knn = round(100 * knn.score(X_train, Y_train), 2)
acc_knn

In [50]:
# Decision tree. random_state is fixed (same seed as the baseline forest) so
# the fitted tree and its predictions are reproducible between runs.
# acc_decision_tree is the accuracy on the TRAINING data, in percent.
decision_tree = DecisionTreeClassifier(random_state=1)
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
acc_decision_tree

In [52]:
# Random forest on the engineered features. random_state is fixed (same seed
# as the baseline forest) so the submission is reproducible between runs.
random_forest = RandomForestClassifier(n_estimators=100, random_state=1)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)
# Accuracy on the TRAINING data, in percent. The original also called score()
# once more and discarded the result; that redundant call is removed.
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
print(acc_random_forest)

# Write the submission file from the random-forest predictions.
output = pd.DataFrame({'PassengerId': test_df.PassengerId, 'Survived': Y_pred})
output.to_csv('submission.csv', index=False)