In [None]:
import math
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [None]:
# Load the passenger table from 'titanic.csv' (path is relative to the working directory).
titanic_data = pd.read_csv('titanic.csv')

In [None]:
# Remove the PassengerId and Ticket columns; they are not used further down.
titanic_data = titanic_data.drop(columns=['PassengerId', 'Ticket'])

In [None]:
# Preview the first rows of the frame after the column drop.
titanic_data.head()

In [None]:
# Report how many entries are missing (null) in each text-valued column.
# The counts come from isnull(), so the label says "Missing", not "Zero".
for column in ['Name', 'Sex', 'Cabin', 'Embarked']:
    missing_count = titanic_data[column].isnull().sum()
    print('Missing values in train ' + column.lower() + ' = ' + str(missing_count))

In [None]:
# Summary statistics of the numeric columns.
titanic_data.describe()

# Extract title out of name

In [None]:
def extract_title(df):
    """
    Replaces each full name in the Name column by a coarse title category.

    Every pattern below matches the whole name string when it contains the
    given title, so the complete name is replaced by the category
    (e.g. 'Braund, Mr. Owen Harris' -> 'Mr'). Names that match none of the
    patterns are left unchanged.

    Arguments:
        df {Pandas dataframe} -- should contain a Name column

    Returns:
        df {Pandas dataframe} -- same dataframe (modified in place) except changed Name column content
    """
    # Raw strings keep `\.` as a regex escape for a literal dot instead of an
    # invalid Python string escape (which newer Python versions warn about).
    extraction = {r'.*Mrs\..*': 'Mrs',
                  r'.*Sir\..*': 'Royalty',
                  r'.*Mr\..*': 'Mr',
                  r'.*Capt\..*': 'Officer',
                  r'.*Col\..*': 'Officer',
                  r'.*Countess\..*': 'Royalty',
                  r'.*Dona\..*': 'Royalty',
                  r'.*Don\..*': 'Royalty',
                  r'.*Dr\..*': 'Officer',
                  r'.*Jonkheer.*': 'Royalty',
                  r'.*Lady\..*': 'Royalty',
                  r'.*Major\..*': 'Officer',
                  r'.*Master\..*': 'Master',
                  r'.*Mlle\..*': 'Miss',
                  r'.*Mme\..*': 'Mrs',
                  r'.*Ms\..*': 'Mrs',
                  r'.*Rev\..*': 'Officer',
                  r'.*Miss\..*': 'Miss'}
    df['Name'] = df['Name'].replace(extraction, regex=True)
    return df

# From here on the Name column holds the title category, not the full name.
titanic_data = extract_title(titanic_data)

# Fill in the small number of missing values for fare

In [None]:
# Replace missing fares by the mean fare.
# Assign the result back to the column: calling fillna(..., inplace=True) on
# the selected column is chained assignment, which recent pandas warns about
# and which does not update the frame when Copy-on-Write is enabled.
titanic_data['Fare'] = titanic_data['Fare'].fillna(titanic_data['Fare'].mean())

# Filling in missing values for age

In [None]:
def fillna_age(df):
    """
    Will fill all missing values of the Age column
    based on the median values of the Age
    after a groupby on the Name, Pclass, and Sex.

    An Age counts as missing when it is not greater than 0 (this covers NaN).
    If a (Name, Pclass, Sex) group has no known age at all, the overall
    median Age is used instead of failing on the lookup.

    Arguments:
        df {Pandas dataframe} -- should contain a Age, Pclass, Name, and Sex column

    Returns:
        df {Pandas dataframe} -- same dataframe (modified in place) except all missing values of the Age column are filled
    """
    # Per-row median Age of the row's (Name, Pclass, Sex) group; NaN ages are
    # skipped by the median, and a group without any known age yields NaN.
    group_median = df.groupby(['Name', 'Pclass', 'Sex'])['Age'].transform('median')

    # Fall back to the overall median where the group offers no value.
    fill_values = group_median.fillna(df['Age'].median())

    # NaN > 0 evaluates to False, so this mask selects NaN as well as ages <= 0.
    needs_fill = ~(df['Age'] > 0)
    df['Age'] = df['Age'].mask(needs_fill, fill_values)
    return df

# Name already holds the title category here, so ages are imputed per (title, Pclass, Sex) group.
titanic_data = fillna_age(titanic_data)

# Extract cabin class out of cabin

In [None]:
# Keep only the first character of every cabin string (the "cabin class");
# non-string entries such as missing values are passed through untouched.
# Done column-wise so it does not depend on the index being 0..n-1 and avoids
# chained indexing like titanic_data['Cabin'][i].
titanic_data['Cabin'] = titanic_data['Cabin'].map(
    lambda cabin_code: cabin_code[:1] if isinstance(cabin_code, str) else cabin_code
)

# Filling in missing values for cabin

In [None]:
# Rows with no missing value in any of the selected columns, with Fare cast to int.
cabin = (
    titanic_data[['Cabin', 'SibSp', 'Parch', 'Age', 'Fare', 'Pclass']]
    .dropna()
    .assign(Fare=lambda complete_rows: complete_rows['Fare'].astype(int))
)

# Average passenger class per cabin letter.
grouped_cabin = cabin.groupby(['Cabin'])['Pclass'].mean()
print(grouped_cabin)

In [None]:
# Can be improved
# Impute missing cabin letters from the passenger class:
#   Pclass 1 -> a letter drawn at random from `classone`
#   Pclass 2 -> 'F'
#   Pclass 3 -> 'G'
# Rows with any other Pclass keep their missing cabin.
classone = ['A', 'B', 'C', 'D', 'E', 'T']

# Dedicated, seeded generator so a fresh Restart & Run All reproduces the
# same random assignment instead of changing on every run.
cabin_rng = random.Random(42)

# Compute the mask once, before any assignment, so all three rules see the
# same set of originally-missing rows.
cabin_missing = titanic_data['Cabin'].isna()

first_class_missing = cabin_missing & (titanic_data['Pclass'] == 1)
if first_class_missing.any():
    titanic_data.loc[first_class_missing, 'Cabin'] = [
        cabin_rng.choice(classone) for _ in range(int(first_class_missing.sum()))
    ]
titanic_data.loc[cabin_missing & (titanic_data['Pclass'] == 2), 'Cabin'] = 'F'
titanic_data.loc[cabin_missing & (titanic_data['Pclass'] == 3), 'Cabin'] = 'G'

# Visualizations

In [None]:
# List the columns available for the plots below.
titanic_data.columns

In [None]:
# Survived against passenger class (seaborn is imported in the notebook's top import cell).
sns.barplot(data=titanic_data, x='Pclass', y='Survived')

In [None]:
# Survived against the title category stored in Name.
sns.barplot(data=titanic_data, x='Name', y='Survived')

In [None]:
# Survived against Sex.
sns.barplot(data=titanic_data, x='Sex', y='Survived')

In [None]:
# Age for each Survived value.
sns.barplot(data=titanic_data, x='Survived', y='Age')

In [None]:
# Fare for each Survived value.
sns.barplot(data=titanic_data, x='Survived', y='Fare')

In [None]:
# Survived (x axis) against the cabin letter (y axis).
sns.barplot(data=titanic_data, x='Survived', y='Cabin')

In [None]:
# Survived (x axis) against Embarked (y axis).
sns.barplot(data=titanic_data, x='Survived', y='Embarked')

In [None]:
# Survived against SibSp.
sns.barplot(data=titanic_data, x='SibSp', y='Survived')

In [None]:
# Survived against Parch.
sns.barplot(data=titanic_data, x='Parch', y='Survived')

# SibSp + Parch into familysize

In [None]:
# FamilySize = SibSp + Parch + 1 (the +1 presumably counts the passenger; confirm).
relatives_aboard = titanic_data['SibSp'] + titanic_data['Parch']
titanic_data['FamilySize'] = relatives_aboard + 1

# The two source columns are now folded into FamilySize, so remove them.
titanic_data = titanic_data.drop(columns=['SibSp', 'Parch'])

In [None]:
# Survived against the combined FamilySize feature.
sns.barplot(data=titanic_data, x='FamilySize', y='Survived')

# Combine FamilySize 8 + 11 because both have 0 chance of surviving

In [None]:
# Relabel every FamilySize of 11 as 8 so the two groups are merged.
# A boolean-mask assignment does this in one step and does not rely on the
# index labels being 0..n-1, unlike a row-by-row loop over range(len(...)).
titanic_data.loc[titanic_data['FamilySize'] == 11, 'FamilySize'] = 8