In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found anywhere below the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Any results you write to the current directory are saved as output.

## Workflow stages
The competition solution workflow goes through seven stages described in the Data Science Solutions book.

1. Question or problem definition.
1. Acquire training and testing data.
1. Wrangle, prepare, cleanse the data.
1. Analyze, identify patterns, and explore the data.
1. Model, predict and solve the problem.
1. Visualize, report, and present the problem solving steps and final solution.
1. Supply or submit the results.

In [None]:
import pandas as pd
import numpy as np
import random as rnd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Read both competition splits; `combine` lets later cells apply the same
# transformation to the two frames in a single loop.
train_df, test_df = (
    pd.read_csv('/kaggle/input/titanic/train.csv'),
    pd.read_csv('/kaggle/input/titanic/test.csv'),
)
combine = [train_df, test_df]

In [None]:
print(train_df.columns.values)

In [None]:
train_df.head()

In [None]:
train_df.tail()

In [None]:
# Column dtypes and non-null counts for BOTH splits. The second report was
# previously a repeat of train_df.info(), so the test split was never inspected.
train_df.info()
print('_'*40)
test_df.info()

In [None]:
train_df.describe()

In [None]:
train_df.describe(include=['O'])

## Assumptions based on data analysis

### Correlating.

We want to know how well each feature correlates with Survival. We want to do this early in our project and match these quick correlations with modelled correlations later in the project.

### Completing.

1. We may want to complete Age feature as it is definitely correlated to survival.
1. We may want to complete the Embarked feature as it may also correlate with survival or another important feature.

### Correcting.

1. Ticket feature may be dropped from our analysis as it contains high ratio of duplicates (22%) and there may not be a correlation between Ticket and survival.
1. Cabin feature may be dropped as it is highly incomplete or contains many null values both in training and test dataset.
1. PassengerId may be dropped from training dataset as it does not contribute to survival.
1. Name feature is relatively non-standard and may not contribute directly to survival, so it may be dropped.

### Creating.

1. We may want to create a new feature called Family based on Parch and SibSp to get total count of family members on board.
1. We may want to engineer the Name feature to extract Title as a new feature.
1. We may want to create a new feature for Age bands. This turns a continuous numerical feature into an ordinal categorical feature.
1. We may also want to create a Fare range feature if it helps our analysis.

### Classifying.

We may also add to our assumptions based on the problem description noted earlier.

1. Women (Sex=female) were more likely to have survived.
1. Children (Age<?) were more likely to have survived.
1. The upper-class passengers (Pclass=1) were more likely to have survived.

In [None]:
train_df[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean().sort_values(by='Survived', ascending=False)

In [None]:
train_df[['Sex', 'Survived']].groupby(['Sex'], as_index=False).mean().sort_values(by='Survived', ascending=False)

In [None]:
train_df[['SibSp', 'Survived']].groupby(['SibSp'], as_index=False).mean().sort_values(by='Survived', ascending=False)

In [None]:
train_df[['Parch', 'Survived']].groupby(['Parch'], as_index=False).mean().sort_values(by='Survived', ascending=False)

In [None]:
# One age histogram per value of Survived, laid out side by side.
g = sns.FacetGrid(data=train_df, col='Survived')
g.map(plt.hist, 'Age', bins=20)

In [None]:
# Age histograms with passenger class on the rows and survival on the columns.
# `height` is the per-facet size argument: seaborn 0.9 renamed it from `size`,
# and later releases reject the old spelling.
grid = sns.FacetGrid(train_df, col='Survived', row='Pclass', height=2.2, aspect=1.6)
grid.map(plt.hist, 'Age', alpha=.5, bins=20)
grid.add_legend()

In [None]:
# Survival rate by class and sex, one facet row per embarkation port.
# `height` replaces the renamed/removed `size` argument. `order` and `hue_order`
# are fixed explicitly because FacetGrid.map calls the plot function once per
# facet: without them each facet infers its own category/hue order from its own
# subset of rows, so positions and colours may not mean the same thing in every
# facet.
grid = sns.FacetGrid(train_df, row='Embarked', height=2.2, aspect=1.6)
grid.map(sns.pointplot, 'Pclass', 'Survived', 'Sex', palette='deep',
         order=[1, 2, 3], hue_order=['male', 'female'])
grid.add_legend()

In [None]:
# Mean fare by sex, faceted by embarkation port (rows) and survival (columns).
# `height` replaces the renamed/removed `size` argument. `order` pins the bar
# order so every facet places 'male' and 'female' at the same x position
# instead of inferring the order from each facet's own rows.
grid = sns.FacetGrid(train_df, row='Embarked', col='Survived', height=2.2, aspect=1.6)
grid.map(sns.barplot, 'Sex', 'Fare', alpha=.5, ci=None, order=['male', 'female'])
grid.add_legend()

In [None]:
# Remove Ticket and Cabin from both splits, printing the shapes before and after.
# `combine` is rebuilt because drop() returns new frames.
print("Before Dropping: ", train_df.shape, test_df.shape, combine[0].shape, combine[1].shape)
dropped_columns = ['Ticket', 'Cabin']
train_df = train_df.drop(columns=dropped_columns)
test_df = test_df.drop(columns=dropped_columns)
combine = [train_df, test_df]
print("After Dropping: ", train_df.shape, test_df.shape, combine[0].shape, combine[1].shape)

In [None]:
# Extract the title from Name: the first run of letters that is immediately
# followed by a period. The pattern is a raw string so that `\.` reaches the
# regex engine without Python flagging it as an invalid escape sequence.
for dataset in combine:
    dataset['Title'] = dataset.Name.str.extract(r'([A-Za-z]+)\.', expand=False)

# Cross-tabulate the extracted titles against Sex on the training split.
pd.crosstab(train_df['Title'], train_df['Sex'])

In [None]:
# Fold the listed titles into a single 'Rare' bucket and normalise the
# remaining variants to 'Miss' and 'Mrs'.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_aliases = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
for dataset in combine:
    bucketed = dataset['Title'].replace(rare_titles, 'Rare')
    dataset['Title'] = bucketed.replace(title_aliases)

# Mean survival rate per consolidated title.
title_survival = train_df[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()
title_survival

In [None]:
# Encode titles as ordinals. The key must be "Rare" (the bucket created by the
# consolidation cell); the previous spelling "Rate" matched nothing, so every
# rare title fell through to the fillna(0) default.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
for dataset in combine:
    dataset['Title'] = dataset['Title'].map(title_mapping)
    # Titles absent from the mapping (or missing) become 0.
    dataset['Title'] = dataset['Title'].fillna(0)

# Preview the training split explicitly rather than whichever frame the loop
# variable happened to end on.
train_df.head()

In [None]:
# Cast Title to int on BOTH splits. Previously only the frame left in the
# leaked loop variable (the last element of `combine`) was converted, leaving
# the other split's Title column as float.
for dataset in combine:
    dataset['Title'] = dataset['Title'].astype(int)
train_df.head()

In [None]:
# Name is dropped from both splits; PassengerId is dropped from the training
# split only (the test split's PassengerId is used later for the submission files).
train_df = train_df.drop(columns=['Name', 'PassengerId'])
test_df = test_df.drop(columns=['Name'])
combine = [train_df, test_df]
(train_df.shape, test_df.shape)

In [None]:
# Column labels of the training frame at this point in the pipeline.
remaining_columns = train_df.columns.values
print(remaining_columns)

In [None]:
# Age histograms with passenger class on the rows and sex on the columns.
# `height` is the per-facet size argument: seaborn 0.9 renamed it from `size`,
# and later releases reject the old spelling.
grid = sns.FacetGrid(train_df, row='Pclass', col='Sex', height=2.2, aspect=1.6)
grid.map(plt.hist, 'Age', alpha=.5, bins=20)
grid.add_legend()

In [None]:
# 2 x 3 zero-filled table that the imputation cell indexes as [Sex, Pclass - 1].
guess_ages = np.zeros(shape=(2, 3))
guess_ages

In [None]:
# Encode Sex numerically (female -> 1, male -> 0) on both splits.
sex_codes = {'female': 1, 'male': 0}
for dataset in combine:
    encoded_sex = dataset['Sex'].map(sex_codes)
    dataset['Sex'] = encoded_sex.astype(int)

In [None]:
# First five rows of the training frame after encoding Sex.
train_df.head(n=5)

In [None]:
for dataset in combine:
    # Phase 1: for every (Sex, Pclass) cell, take the median of the known ages
    # in this split and round it to the nearest 0.5.
    for sex in range(2):
        for class_idx in range(3):
            in_group = (dataset['Sex'] == sex) & (dataset['Pclass'] == class_idx + 1)
            known_ages = dataset[in_group]['Age'].dropna()
            median_age = known_ages.median()
            guess_ages[sex, class_idx] = int(median_age/0.5 + 0.5) * 0.5
    print(guess_ages)
    # Phase 2: write each cell's guess into the rows of that cell whose Age is missing.
    for sex in range(2):
        for class_idx in range(3):
            needs_fill = (
                (dataset['Age'].isnull())
                & (dataset['Sex'] == sex)
                & (dataset['Pclass'] == class_idx + 1)
            )
            dataset.loc[needs_fill, 'Age'] = guess_ages[sex, class_idx]
    # With no gaps left, Age can be stored as integers.
    dataset['Age'] = dataset['Age'].astype(int)

# Shows the frame the loop ended on (the last element of `combine`).
dataset.head()

In [None]:
# Cut Age into five equal-width intervals and compare the mean survival rate per interval.
train_df['AgeBandChk'] = pd.cut(train_df['Age'], bins=5)
age_band_survival = train_df[['AgeBandChk', 'Survived']].groupby(['AgeBandChk'], as_index=False).mean()
age_band_survival.sort_values(by='AgeBandChk', ascending=True)

In [None]:
# (exclusive lower bound, inclusive upper bound, ordinal) for the interior age bands;
# the two open-ended bands (<= 16 and > 64) are handled separately.
age_band_edges = [(16, 32, 1), (32, 48, 2), (48, 64, 3)]
for dataset in combine:
    dataset['AgeBand'] = dataset.Age * 0
    dataset.loc[dataset['Age'] <= 16, 'AgeBand'] = 0
    for lower, upper, band in age_band_edges:
        in_band = (dataset['Age'] > lower) & (dataset['Age'] <= upper)
        dataset.loc[in_band, 'AgeBand'] = band
    dataset.loc[dataset['Age'] > 64, 'AgeBand'] = 4
train_df.head()

In [None]:
# AgeBandChk was only needed for the survival-by-interval table; remove it and
# rebuild `combine` so it points at the new training frame.
train_df = train_df.drop(columns=['AgeBandChk'])
combine = [train_df, test_df]
train_df.head()

In [None]:
# Interaction feature: the product of the ordinal age band and the passenger class.
for dataset in combine:
    dataset['AgeBand*Class'] = dataset['AgeBand'] * dataset['Pclass']

interaction_preview = train_df[['AgeBand*Class', 'AgeBand', 'Pclass']]
interaction_preview.head(10)

In [None]:
# FamilySize = SibSp + Parch + 1.
for dataset in combine:
    dataset['FamilySize'] = 1 + dataset['SibSp'] + dataset['Parch']

# Mean survival rate per family size, highest first.
family_survival = train_df[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean()
family_survival.sort_values(by='Survived', ascending=False)

In [None]:
# IsAlone is 1 where FamilySize == 1 and 0 otherwise.
for dataset in combine:
    travels_alone = dataset['FamilySize'] == 1
    dataset['IsAlone'] = travels_alone.astype('int64')

In [None]:
# Most frequent embarkation port in the training split; used to fill the gaps in both splits.
freq_port = train_df['Embarked'].dropna().mode().iloc[0]
print(freq_port)
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].fillna(value=freq_port)

# Mean survival rate per port, highest first.
embarked_survival = train_df[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean()
embarked_survival.sort_values(by='Survived', ascending=False)

In [None]:
# Encode the embarkation port numerically (S -> 0, C -> 1, Q -> 2) on both splits.
port_codes = {'S': 0, 'C': 1, 'Q': 2}
for dataset in combine:
    encoded_port = dataset['Embarked'].map(port_codes)
    dataset['Embarked'] = encoded_port.astype(int)

train_df.head()

In [None]:
# Fill missing test-set fares with the median fare (median() skips NaN by
# default). The result is assigned back to the column: calling
# fillna(inplace=True) on the `test_df['Fare']` selection may operate on a
# temporary copy and leave test_df itself unchanged.
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())

In [None]:
# Cut Fare into quartiles and compare the mean survival rate per quartile.
train_df['FareBandChk'] = pd.qcut(train_df['Fare'], q=4)
fare_band_survival = train_df[['FareBandChk', 'Survived']].groupby(['FareBandChk'], as_index=False).mean()
fare_band_survival.sort_values(by='FareBandChk', ascending=True)

In [None]:
# (exclusive lower bound, inclusive upper bound, ordinal) for the interior fare bands;
# the two open-ended bands (<= 7.91 and > 31.0) are handled separately.
fare_band_edges = [(7.91, 14.454, 1), (14.454, 31.0, 2)]
for dataset in combine:
    dataset['FareBand'] = 0
    dataset.loc[dataset['Fare'] <= 7.91, 'FareBand'] = 0
    for lower, upper, band in fare_band_edges:
        in_band = (dataset['Fare'] > lower) & (dataset['Fare'] <= upper)
        dataset.loc[in_band, 'FareBand'] = band
    dataset.loc[dataset['Fare'] > 31.0, 'FareBand'] = 3

# FareBandChk was only needed for the quartile table above.
train_df = train_df.drop(columns=['FareBandChk'])
combine = [train_df, test_df]

train_df.head(10)

In [None]:
# Features/target for training; the test features exclude PassengerId, which
# is kept in test_df for building the submission files.
y_train = train_df['Survived']
X_train = train_df.drop(columns='Survived')
X_test = test_df.drop(columns='PassengerId').copy()
(X_train.shape, y_train.shape, X_test.shape)

In [None]:
# Logistic Regression: fit on the training set, predict the test set, and
# record the accuracy on the training set as a percentage.

logreg = LogisticRegression().fit(X_train, y_train)
y_pred = logreg.predict(X_test)
logreg_train_accuracy = logreg.score(X_train, y_train)
acc_log = round(logreg_train_accuracy * 100, 2)
acc_log

In [None]:
# Inspect the fitted logistic-regression coefficients, one per feature.
#
# Feature names are taken from X_train (the exact frame the model was fitted
# on) instead of deleting position 0 from train_df.columns, which silently
# mislabels every coefficient if 'Survived' is not the first column.
# NOTE: the column is labelled "Correlation" but holds model coefficients.

coeff_df = pd.DataFrame({'Feature': X_train.columns})
coeff_df["Correlation"] = pd.Series(logreg.coef_[0])
coeff_df.sort_values(by='Correlation', ascending=False)

In [None]:
# Write the current predictions as a two-column PassengerId/Survived CSV.
submission_columns = {"PassengerId": test_df['PassengerId'], "Survived": y_pred}
submission = pd.DataFrame(submission_columns)
submission.to_csv('second_submission_logreg.csv', index=False)

In [None]:
# Support Vector Machine

svc = SVC()
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
acc_svc = round(svc.score(X_train, y_train)*100, 2)
acc_svc

In [None]:
submission = pd.DataFrame({"PassengerId": test_df['PassengerId'], "Survived": y_pred})
submission.to_csv('second_submission_svc.csv', index=False)

In [None]:
# k-Nearest Neightbors (k-NN)

knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_train, y_train) * 100, 2)
acc_knn

In [None]:
submission = pd.DataFrame({"PassengerId": test_df['PassengerId'], "Survived": y_pred})
submission.to_csv('second_submission_knn.csv', index=False)

In [None]:
# Gaussian Naive Bayes

gaussian = GaussianNB()
gaussian.fit(X_train, y_train)
y_pred = gaussian.predict(X_test)
acc_gaussian = round(gaussian.score(X_train, y_train) * 100, 2)
acc_gaussian

In [None]:
submission = pd.DataFrame({"PassengerId": test_df['PassengerId'], "Survived": y_pred})
submission.to_csv('second_submission_gaussian.csv', index=False)

In [None]:
# Perceptron

perceptron = Perceptron()
perceptron.fit(X_train, y_train)
y_pred = perceptron.predict(X_test)
acc_perceptron = round(perceptron.score(X_train, y_train)*100, 2)
acc_perceptron

In [None]:
submission = pd.DataFrame({"PassengerId": test_df['PassengerId'], "Survived": y_pred})
submission.to_csv('second_submission_perceptron.csv', index=False)

In [None]:
# Linear SVC

linear_svc = LinearSVC()
linear_svc.fit(X_train, y_train)
y_pred = linear_svc.predict(X_test)
acc_linear_svc = round(linear_svc.score(X_train, y_train)*100, 2)
acc_linear_svc

In [None]:
submission = pd.DataFrame({"PassengerId": test_df['PassengerId'], "Survived": y_pred})
submission.to_csv('second_submission_linear-svc.csv', index=False)

In [None]:
# Stochatic Gradient Descent

sgd = SGDClassifier()
sgd.fit(X_train, y_train)
y_pred = sgd.predict(X_test)
acc_sgd = round(sgd.score(X_train, y_train)*100, 2)
acc_sgd

In [None]:
submission = pd.DataFrame({"PassengerId": test_df['PassengerId'], "Survived": y_pred})
submission.to_csv('second_submission_sgd.csv', index=False)

In [None]:
# Decision Tree

decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, y_train)*100, 2)
acc_decision_tree

In [None]:
submission = pd.DataFrame({"PassengerId": test_df['PassengerId'], "Survived": y_pred})
submission.to_csv('second_submission_decision_tree.csv', index=False)

In [None]:
# Random Forest

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)
acc_random_forest = round(random_forest.score(X_train, y_train)*100, 2)
acc_random_forest

In [None]:
submission = pd.DataFrame({"PassengerId": test_df['PassengerId'], "Survived": y_pred})
submission.to_csv('second_submission_random_forest.csv', index=False)

In [None]:
# Training-set accuracy (%) of every model, best first.
model_names = [
    'Support Vector Machines', 'KNN', 'Logistic Regression',
    'Random Forest', 'Naive Bayes', 'Perceptron',
    'Stochastic Gradient Decent', 'Linear SVC',
    'Decision Tree',
]
training_scores = [
    acc_svc, acc_knn, acc_log,
    acc_random_forest, acc_gaussian, acc_perceptron,
    acc_sgd, acc_linear_svc, acc_decision_tree,
]
models = pd.DataFrame({'Model': model_names, 'Score': training_scores})
models.sort_values(by='Score', ascending=False)

In [None]:
# Training accuracy next to a hard-coded test score for every model, best test score first.
# NOTE(review): the test scores are literals, presumably leaderboard results of
# the submission files (confirm). They are fractions, whereas the training
# scores are percentages.
model_names = [
    'Support Vector Machines', 'KNN', 'Logistic Regression',
    'Random Forest', 'Naive Bayes', 'Perceptron',
    'Stochastic Gradient Decent', 'Linear SVC',
    'Decision Tree',
]
training_scores = [
    acc_svc, acc_knn, acc_log,
    acc_random_forest, acc_gaussian, acc_perceptron,
    acc_sgd, acc_linear_svc, acc_decision_tree,
]
test_scores = [
    0.67464, 0.65071, 0.76076,
    0.74162, 0.76076, 0.69377,
    0.71770, 0.67942, 0.68421,
]
models = pd.DataFrame({
    'Model': model_names,
    'Training Score': training_scores,
    'Test Score': test_scores,
})
models.sort_values(by='Test Score', ascending=False)

In [None]:
train_df.head()

In [None]:
test_df.head()

In [None]:
# Drop the raw columns from both splits, leaving the features derived from
# them earlier (AgeBand, FareBand, IsAlone, AgeBand*Class).
raw_columns = ['Parch', 'SibSp', 'FamilySize', 'Fare', 'Age']
train_df = train_df.drop(columns=raw_columns)
test_df = test_df.drop(columns=raw_columns)
combine = [train_df, test_df]
train_df.head()

In [None]:
# Rebuild the feature matrices and target from the reduced frames.
y_train = train_df['Survived']
X_train = train_df.drop(columns='Survived')
X_test = test_df.drop(columns='PassengerId').copy()
(X_train.shape, y_train.shape, X_test.shape)

In [None]:
# Third round: refit every classifier on the reduced feature set, print its
# training accuracy (%), and write one submission CSV per model.
#
# The nine copy-pasted fit/predict/score/save stanzas are expressed as one
# table plus one loop so the steps cannot drift apart between models.
# Estimators with a stochastic fit are seeded so that Restart & Run All
# reproduces the same scores and submission files.

logreg = LogisticRegression()
svc = SVC()
knn = KNeighborsClassifier(n_neighbors=3)
gaussian = GaussianNB()
perceptron = Perceptron(random_state=0)
linear_svc = LinearSVC(random_state=0)
sgd = SGDClassifier(random_state=0)
decision_tree = DecisionTreeClassifier(random_state=0)
random_forest = RandomForestClassifier(n_estimators=100, random_state=0)

# (score label, estimator, submission file), in the order the models are run.
third_round = [
    ("acc_log", logreg, 'third_submission_logreg.csv'),
    ("acc_svc", svc, 'third_submission_svc.csv'),
    ("acc_knn", knn, 'third_submission_knn.csv'),
    ("acc_gaussian", gaussian, 'third_submission_gaussian.csv'),
    ("acc_perceptron", perceptron, 'third_submission_perceptron.csv'),
    ("acc_linear_svc", linear_svc, 'third_submission_linear-svc.csv'),
    ("acc_sgd", sgd, 'third_submission_sgd.csv'),
    ("acc_decision_tree", decision_tree, 'third_submission_decision_tree.csv'),
    ("acc_random_forest", random_forest, 'third_submission_random_forest.csv'),
]

third_scores = {}
for label, model, filename in third_round:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Accuracy on the training set, as a percentage rounded to two decimals.
    third_scores[label] = round(model.score(X_train, y_train) * 100, 2)
    print(label + ": ", third_scores[label])
    submission = pd.DataFrame({"PassengerId": test_df['PassengerId'], "Survived": y_pred})
    submission.to_csv(filename, index=False)

# Re-bind the individual score variables that the comparison-table cell reads.
acc_log = third_scores["acc_log"]
acc_svc = third_scores["acc_svc"]
acc_knn = third_scores["acc_knn"]
acc_gaussian = third_scores["acc_gaussian"]
acc_perceptron = third_scores["acc_perceptron"]
acc_linear_svc = third_scores["acc_linear_svc"]
acc_sgd = third_scores["acc_sgd"]
acc_decision_tree = third_scores["acc_decision_tree"]
acc_random_forest = third_scores["acc_random_forest"]

In [None]:
# Third-round training accuracy next to a hard-coded test score for every model.
# NOTE(review): the test scores are literals, presumably leaderboard results of
# the third_submission_* files (confirm). They are fractions, whereas the
# training scores are percentages.
model_names = [
    'Support Vector Machines', 'KNN', 'Logistic Regression',
    'Random Forest', 'Naive Bayes', 'Perceptron',
    'Stochastic Gradient Decent', 'Linear SVC',
    'Decision Tree',
]
training_scores_2 = [
    acc_svc, acc_knn, acc_log,
    acc_random_forest, acc_gaussian, acc_perceptron,
    acc_sgd, acc_linear_svc, acc_decision_tree,
]
test_scores_2 = [
    0.79425, 0.75119, 0.7799,
    0.77511, 0.75119, 0.7799,
    0.76555, 0.77511, 0.78468,
]
models2 = pd.DataFrame({
    'Model': model_names,
    'Training Score 2': training_scores_2,
    'Test Score 2': test_scores_2,
})

In [None]:
# Join the two rounds on the model name and rank by the earlier round's training score.
models_merge = models.merge(models2, on='Model')
models_merge.sort_values(by='Training Score', ascending=False)