# Data Wrangling

In [1]:
# Pandas is a software library written for the Python programming language for data manipulation and analysis.
import pandas as pd
# NumPy is a library for the Python programming language, adding support for large, 
# multi-dimensional arrays and matrices, along with a large collection of high-level 
# mathematical functions to operate on these arrays
import numpy as np
from scipy.stats import mode
import string

In [2]:
# Functions
def substrings_in_string(big_string, substrings):
    """Return the first entry of ``substrings`` that occurs inside ``big_string``.

    Candidates are tried in the order given, so when one candidate is contained
    in another (e.g. 'Mr' in 'Mrs') the longer one must be listed first.

    Parameters
    ----------
    big_string : str
        Text to search in.
    substrings : iterable of str
        Candidate substrings, in priority order.

    Returns
    -------
    str or float
        The first matching candidate, or ``np.nan`` when none matches. In the
        no-match case ``big_string`` is also printed so it can be inspected.
    """
    hit = next(
        (candidate for candidate in substrings if big_string.find(candidate) >= 0),
        None,
    )
    if hit is not None:
        return hit
    # Nothing matched: surface the offending value, then signal "missing".
    print (big_string)
    return np.nan

def replace_titles(x):
    """Collapse a passenger's raw title into one of Mr / Mrs / Miss / Master.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide the keys ``'Title'`` and ``'Sex'``.

    Returns
    -------
    The normalised title. Titles that are not listed below (including the
    already-normalised ones and missing values) are returned unchanged.
    """
    title = x['Title']
    if title in ('Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col'):
        return 'Mr'
    if title in ('Countess', 'Mme'):
        return 'Mrs'
    if title in ('Mlle', 'Ms'):
        return 'Miss'
    if title == 'Dr':
        # 'Dr' is gender-neutral, so fall back to the Sex column. The data
        # stores the value in lowercase ('male'/'female'); compare
        # case-insensitively so that male doctors are not mislabelled 'Mrs'.
        return 'Mr' if str(x['Sex']).lower() == 'male' else 'Mrs'
    return title

In [3]:
# Read the training CSV twice: `train` is the working frame that later cells
# modify, `train_backup` is a second, independent copy of the raw file.
train_backup = pd.read_csv('train.csv')
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Read the test CSV twice: `test` is the working frame that later cells
# modify, `test_backup` is a second, independent copy of the raw file.
test_backup = pd.read_csv('test.csv')
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
# Count missing values per column in the raw training set
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
# Count missing values per column in the raw test set
test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [7]:
# Treat a fare of 0 as missing, then fill every missing fare with the mean
# fare of the passenger's class (Pclass). The mean is computed after the zeros
# have been blanked out, so they do not drag the class average down.
for df in (train, test):
    fare_no_zero = df['Fare'].replace(0, np.nan)
    df['Fare'] = fare_no_zero
    class_mean_fare = df.groupby('Pclass')['Fare'].transform('mean')
    df['Fare'] = fare_no_zero.fillna(class_mean_fare)

In [8]:
# Fill the remaining missing values, separately for train and test
for df in [train, test]:
    # Age: use the mean of the known ages in the same frame
    meanAge = df['Age'].mean()
    df['Age'] = df['Age'].fillna(meanAge)
    # Embarked: use the most frequent port. Series.mode() works on string
    # columns and ignores NaN, whereas scipy.stats.mode is meant for numeric
    # arrays and its return shape ([0][0] indexing) differs between SciPy
    # versions.
    modeEmbarked = df['Embarked'].mode()[0]
    df['Embarked'] = df['Embarked'].fillna(modeEmbarked)
    # Cabin: mark absent cabins explicitly so they can become their own category
    df['Cabin'] = df['Cabin'].fillna('Unknown')

# Feature Engineering

In [9]:
# Lookup lists are loop-invariant, so they are built once.
# Deck letters, plus the 'Unknown' placeholder used for missing cabins.
cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']
# Order matters: substrings_in_string returns the first match, so 'Mrs' has to
# come before 'Mr' (which is a substring of it).
title_list = ['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
              'Dr', 'Ms', 'Mlle', 'Col', 'Capt', 'Mme', 'Countess',
              'Don', 'Jonkheer']

for df in [train, test]:
    # Deck: first deck letter found in the cabin string
    df['Deck'] = df['Cabin'].map(lambda x: substrings_in_string(x, cabin_list))

    # Title: first known title found in the passenger's name
    df['Title'] = df['Name'].map(lambda x: substrings_in_string(x, title_list))

    # Collapse all titles into Mr, Mrs, Miss, Master
    df['Title'] = df.apply(replace_titles, axis=1)

    # Family_Size: number of relatives aboard (the passenger is not counted)
    df['Family_Size'] = df['SibSp'] + df['Parch']

    # Age*Class interaction term
    df['Age*Class'] = df['Age'] * df['Pclass']

    # Fare_Per_Person: split the fare across the whole party. The +1 counts
    # the passenger themselves; dividing by Family_Size alone is a division
    # by zero (-> inf) for everyone travelling without relatives.
    df['Fare_Per_Person'] = df['Fare'] / (df['Family_Size'] + 1)

In [10]:
# Inspect the dtype of every column in train, including the engineered features
train.dtypes

PassengerId          int64
Survived             int64
Pclass               int64
Name                object
Sex                 object
Age                float64
SibSp                int64
Parch                int64
Ticket              object
Fare               float64
Cabin               object
Embarked            object
Deck                object
Title               object
Family_Size          int64
Age*Class          float64
Fare_Per_Person    float64
dtype: object

In [11]:
# Inspect the dtype of every column in test, including the engineered features
test.dtypes

PassengerId          int64
Pclass               int64
Name                object
Sex                 object
Age                float64
SibSp                int64
Parch                int64
Ticket              object
Fare               float64
Cabin               object
Embarked            object
Deck                object
Title               object
Family_Size          int64
Age*Class          float64
Fare_Per_Person    float64
dtype: object

In [12]:
# Re-count missing values per column in train after imputation and feature
# engineering. By default pandas does not treat +/-inf as missing, so a count
# of 0 here does not rule out infinite values.
train.isna().sum()

PassengerId        0
Survived           0
Pclass             0
Name               0
Sex                0
Age                0
SibSp              0
Parch              0
Ticket             0
Fare               0
Cabin              0
Embarked           0
Deck               0
Title              0
Family_Size        0
Age*Class          0
Fare_Per_Person    0
dtype: int64

In [13]:
# Re-count missing values per column in test after imputation and feature
# engineering. By default pandas does not treat +/-inf as missing, so a count
# of 0 here does not rule out infinite values.
test.isna().sum()

PassengerId        0
Pclass             0
Name               0
Sex                0
Age                0
SibSp              0
Parch              0
Ticket             0
Fare               0
Cabin              0
Embarked           0
Deck               0
Title              0
Family_Size        0
Age*Class          0
Fare_Per_Person    0
dtype: int64