# Data Science Project Walkthrough: Survival Prediction on the Titanic

## Importing Libraries

In [0]:
import pandas as pd                # For data manipulation and analysis
import pandas_profiling            # For Exploratory Data Analysis
import numpy as np                 # Implements multi-dimensional array and matrices
import matplotlib.pyplot as plt    # Plotting library for Python programming language
import seaborn as sns              # Provides a high level interface for drawing attractive and informative statistical graphics

## Download the dataset

In [0]:
import os                          # For using OS dependent functionality
import zipfile                     # Read and write zip files
from six.moves import urllib       # For making requests to URLs

DOWNLOAD_URL = 'https://raw.githubusercontent.com/sun-yitao/Python-for-Machine-Learning-Workshop/master/titanic.zip'
DATASET_PATH = os.path.join('datasets', 'titanic')


def fetch_titanic_data(url=DOWNLOAD_URL, dataset_path=DATASET_PATH, force_download=False):
    """Download the Titanic zip archive and extract it into ``dataset_path``.

    The archive is stored as ``<dataset_path>/titanic.zip``. If that file is
    already present it is reused, so re-running the notebook does not hit the
    network again; pass ``force_download=True`` to fetch a fresh copy.

    Args:
        url: Location of the zip archive.
        dataset_path: Directory that receives the archive and its contents;
            created if it does not exist.
        force_download: Download even when the archive is already on disk.

    Returns:
        None. The archive path is printed and the members are extracted.
    """
    os.makedirs(dataset_path, exist_ok=True)
    zip_file_path = os.path.join(dataset_path, 'titanic.zip')
    if force_download or not os.path.exists(zip_file_path):
        # Download to a temporary name and rename afterwards, so an interrupted
        # transfer never leaves a truncated file that later runs would reuse.
        partial_path = zip_file_path + '.part'
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, zip_file_path)
    print(zip_file_path)
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(dataset_path)

In [3]:
# Fetch and extract the archive using the function's defaults (DOWNLOAD_URL -> DATASET_PATH).
fetch_titanic_data()

datasets/titanic/titanic.zip


## Load Dataset

In [4]:
# Read the training split from the dataset directory and report its (rows, columns).
train_csv_path = 'datasets/titanic/train.csv'
train_df = pd.read_csv(train_csv_path)
print(f'Train dataframe shape: {train_df.shape}')

Train dataframe shape: (891, 12)


In [0]:
# Read the test split from the dataset directory and report its (rows, columns).
# Later cells (cleaning, feature engineering) require `test_df` to exist.
test_df = pd.read_csv('datasets/titanic/test.csv')
print(f'Test dataframe shape: {test_df.shape}')

## Exploratory Data Analysis


In [0]:
# Preview the first rows of the training data (rendered because it is the cell's last expression).
train_df.head()

### Description of Features
| Column Name                       | Description                                                                                        |
| ----------------------------------|:--------------------------------------------------------------------------------------------------:|
| PassengerId                       | Unique identifier of the passenger                                                                 |
| Survived                          | Whether the passenger survived or not (0 = No, 1 = Yes)                                            |
| Pclass                            | Class of ticket                                                                                    |
| Name                              | Name of passenger                                                                                  |
| Sex                               | Sex of passenger                                                                                   |
| Age                               | Age of passenger                                                                                   |
| SibSp                             | Number of siblings and/or spouses travelling with the passenger                                    |
| Parch                             | Number of parents and/or children travelling with the passenger                                    |
| Ticket                            | Ticket number                                                                                      |
| Fare                              | Price of ticket                                                                                    |
| Cabin                             | Cabin number                                                                                       |
| Embarked                          | Port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)                               |

### Data Analysis Report with Pandas-Profiling

In [0]:
# Build a pandas-profiling report for the training frame; as the cell's last expression it is displayed inline.
pandas_profiling.ProfileReport(train_df)

In [0]:
%matplotlib inline  
sns.set_style("darkgrid")  # For better axes visibility in dark mode

# Number of passengers in each ticket class.
fig, ax = plt.subplots(figsize=(8, 8))
sns.countplot(x='Pclass', data=train_df, ax=ax)
# countplot's y-axis is the number of rows per class, not the survival outcome,
# so label it as a count. The trailing ';' hides the repr of ax.set().
ax.set(xlabel='Pclass', ylabel='Number of passengers');

In [0]:
# Passenger counts per ticket class, split by the 'Survived' column.
fig, ax = plt.subplots(figsize=(8, 8))
sns.countplot(x='Pclass', hue='Survived', data=train_df, ax=ax)

# Replace the raw hue values in the legend with readable labels
# (assumes the first legend entry is Survived == 0 and the second is Survived == 1).
legend = ax.get_legend()
legend.set_title('Survival')
for position, label in enumerate(('No', "yes")):
    legend.texts[position].set_text(label)

In [0]:
# Kernel density estimate of passenger age, one curve per survival outcome.
fig, ax = plt.subplots(figsize=(10, 8))

# Missing ages have not been imputed yet at this point; drop them explicitly so
# the density estimator is never handed NaN values.
age_not_survived = train_df.loc[train_df['Survived'] == 0, 'Age'].dropna()
age_survived = train_df.loc[train_df['Survived'] == 1, 'Age'].dropna()

# NOTE(review): `shade=` is kept for compatibility with the older seaborn this
# notebook targets; newer seaborn releases call this option `fill=`.
sns.kdeplot(age_not_survived, color='r', shade=True, label='Not Survived', ax=ax)
sns.kdeplot(age_survived, color='b', shade=True, label='Survived', ax=ax)

# A KDE's y-axis is an estimated probability density, not a survivor ratio.
ax.set(xlabel='Age', ylabel='Density')
# Draw the legend explicitly so the two `label=` values are always shown.
ax.legend();

## Cleaning the data

In [0]:
# The test split has only one missing Fare; fill it with the column mean (a common heuristic).
# Assign the result back instead of calling fillna(inplace=True) on `test_df.Fare`:
# that in-place call acts on an intermediate Series and is not guaranteed to
# write through to test_df.
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].mean())

# The entire data: train + test. DataFrame.append was removed in pandas 2.0;
# pd.concat is its replacement and likewise keeps each frame's own index labels.
data_df = pd.concat([train_df, test_df], sort=False)

In [0]:
# Count the missing values per column in each split, with a row of asterisks between the two listings.
separator = '*' * 20
print(train_df.isna().sum())
print(separator)
print(test_df.isna().sum())

# Fare distribution for each survival outcome.
sns.boxplot(y='Fare', x='Survived', data=train_df)
plt.show()

In [0]:
# Dropping outliers (Fare >= 400) which may skew model predictions.
# .copy() makes the filtered frame independent of the original, so the column
# assignments in later cells do not trigger SettingWithCopyWarning or write to
# a temporary object.
train_df = train_df[train_df['Fare'] < 400].copy()
train_df.head()

In [0]:
# Extract the title from each name: the run of letters immediately before a '.'.
# The pattern is a raw string because '\.' in an ordinary string literal is an
# invalid escape sequence. expand=False returns a Series for the single capture
# group instead of a one-column DataFrame.
data_df['Title'] = data_df['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

# Mean age per title, indexed by the title label.
mean_ages = data_df.groupby('Title')['Age'].mean()
mean_ages

In [0]:
# Distinct titles found in the combined train + test data.
titles = data_df['Title'].unique()
print(titles)

In [0]:
# Impute each missing age with the mean age of the passengers sharing that title.
# The mean is looked up by title label. Pairing `titles` with `mean_ages`
# positionally is wrong: unique() lists titles in order of appearance, whereas
# the groupby result is ordered by sorted title, so the two do not line up.
data_df['Age'] = data_df['Age'].fillna(data_df['Title'].map(mean_ages))

# data_df holds the train rows followed by the test rows; split the imputed ages
# back at that boundary instead of hard-coding the row count. Column assignment
# aligns on index labels, so train rows filtered out earlier are simply skipped.
n_train_rows = len(data_df) - len(test_df)
train_df['Age'] = data_df['Age'].iloc[:n_train_rows]
test_df['Age'] = data_df['Age'].iloc[n_train_rows:]
test_df.isnull().sum()

## Feature Engineering

In [0]:
# Re-inspect the training frame before deriving new feature columns.
train_df.head()

In [0]:
# family_size = SibSp + Parch + 1 (the +1 counts the passenger themselves); added to both splits.
for split_df in (train_df, test_df):
    split_df['family_size'] = split_df['SibSp'] + split_df['Parch'] + 1

def family_group(size):
    """Bucket a family size into a coarse category.

    Returns 'alone' when ``size <= 1``, 'small' when ``1 < size <= 4``,
    and 'large' for every other value.
    """
    if size <= 1:
        return 'alone'
    return 'small' if size <= 4 else 'large'

# Label every passenger in both splits with the family_group() bucket of their family_size.
for split_df in (train_df, test_df):
    split_df['family_group'] = split_df['family_size'].map(family_group)

In [0]:
# calculated_fare = Fare divided by family_size, added to both splits.
for split_df in (train_df, test_df):
    split_df['calculated_fare'] = split_df['Fare'] / split_df['family_size']

## Model Training

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [0]:
# One-hot encode the categorical features.
# 'Sex' is encoded as well: it is a text column in this dataset and would
# otherwise reach the scikit-learn estimator as strings, which it cannot fit on.
categorical_columns = ['Pclass', 'Embarked', 'family_group', 'Sex']
# Columns not used as model inputs (Fare and family_size are superseded by the
# derived calculated_fare / family_group features).
unused_columns = ['Ticket', 'Name', 'Cabin', 'family_size', 'Fare', 'PassengerId']

# Reassign rather than drop(inplace=True) so each line produces the final frame in one step.
train_df = pd.get_dummies(train_df, columns=categorical_columns).drop(columns=unused_columns)
test_df = pd.get_dummies(test_df, columns=categorical_columns).drop(columns=unused_columns)

In [0]:
# Check the training frame after one-hot encoding and column removal.
train_df.head()

In [0]:
# Separate the features from the target. `columns=` replaces the positional
# axis argument (`drop('Survived', 1)`), which pandas 2.0 no longer accepts.
X = train_df.drop(columns='Survived')
y = train_df['Survived']

# Hold out 30% of the labelled rows for evaluation; the fixed random_state
# makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

In [0]:
# Fit a 3-nearest-neighbours classifier on the training split, then print its
# predictions for the held-out split followed by their accuracy.
clf = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
predict = clf.predict(X_test)
print(predict)
print(accuracy_score(y_test, predict))