# Model selection By using cross validation

In [1]:
import pandas as pd
import numpy as np

In [2]:
# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df.drop('PassengerId', axis=1, inplace=True)

In [5]:
df['Title'] = df.Name.apply(lambda name: name.split(',')[1].split('.')[0].strip())


In [6]:
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr


In [7]:
normalized_titles = {
    "Capt":       "Officer",
    "Col":        "Officer",
    "Major":      "Officer",
    "Jonkheer":   "Royalty",
    "Don":        "Royalty",
    "Sir" :       "Royalty",
    "Dr":         "Officer",
    "Rev":        "Officer",
    "the Countess":"Royalty",
    "Dona":       "Royalty",
    "Mme":        "Mrs",
    "Mlle":       "Miss",
    "Ms":         "Mrs",
    "Mr" :        "Mr",
    "Mrs" :       "Mrs",
    "Miss" :      "Miss",
    "Master" :    "Master",
    "Lady" :      "Royalty"
}

In [8]:
# map the normalized titles to the current titles
df.Title = df.Title.map(normalized_titles)

# view value counts for the normalized titles
print(df.Title.value_counts())

Mr         517
Miss       184
Mrs        127
Master      40
Officer     18
Royalty      5
Name: Title, dtype: int64


In [9]:
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr


In [10]:
# group by Sex, Pclass, and Title
grouped = df.groupby(['Sex','Pclass', 'Title'])

# view the median Age by the grouped features
grouped.Age.median()

Sex     Pclass  Title  
female  1       Miss       30.0
                Mrs        40.0
                Officer    49.0
                Royalty    40.5
        2       Miss       24.0
                Mrs        31.5
        3       Miss       18.0
                Mrs        31.0
male    1       Master      4.0
                Mr         40.0
                Officer    51.0
                Royalty    40.0
        2       Master      1.0
                Mr         31.0
                Officer    46.5
        3       Master      4.0
                Mr         26.0
Name: Age, dtype: float64

In [11]:
# apply the grouped median value on the Age NaN
df.Age = grouped.Age.apply(lambda x: x.fillna(x.median()))
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr


In [12]:
# fill Cabin NaN with U for unknown
df.Cabin = df.Cabin.fillna('U')

In [13]:
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,U,S,Mr
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,U,S,Miss
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,U,S,Mr


In [14]:
# find most frequent Embarked value and store in variable
most_embarked = df.Embarked.value_counts().index[0]

# fill NaN with most_embarked value
df.Embarked = df.Embarked.fillna(most_embarked)

In [15]:
# fill NaN with median fare
df.Fare = df.Fare.fillna(df.Fare.median())


In [16]:
# view the percentage of those that survived vs. those that died in the Titanic
df.Survived.value_counts(normalize=True)

0    0.616162
1    0.383838
Name: Survived, dtype: float64

In [17]:
# group by sex
group_by_sex = df.groupby('Sex')

# survival rate by sex
group_by_sex.Survived.mean()

Sex
female    0.742038
male      0.188908
Name: Survived, dtype: float64

In [18]:
# size of families (including the passenger)
df['FamilySize'] = df.Parch + df.SibSp + 1

In [19]:
# map first letter of cabin to itself
df.Cabin = df.Cabin.map(lambda x: x[0])

# view normalized count
df.Cabin.value_counts(normalize=True)

U    0.771044
C    0.066218
B    0.052750
D    0.037037
E    0.035915
A    0.016835
F    0.014590
G    0.004489
T    0.001122
Name: Cabin, dtype: float64

In [20]:
# Convert the male and female groups to integer form
df.Sex = df.Sex.map({"male": 0, "female":1})

In [21]:
# create dummy variables for categorical features
pclass_dummies = pd.get_dummies(df.Pclass, prefix="Pclass")
title_dummies = pd.get_dummies(df.Title, prefix="Title")
cabin_dummies = pd.get_dummies(df.Cabin, prefix="Cabin")
embarked_dummies = pd.get_dummies(df.Embarked, prefix="Embarked")

In [22]:
# concatenate dummy columns with main dataset
df = pd.concat([df, pclass_dummies, title_dummies, cabin_dummies, embarked_dummies], axis=1)

# drop categorical fields
df.drop(['Pclass', 'Title', 'Cabin', 'Embarked', 'Name', 'Ticket'], axis=1, inplace=True)

df.head()

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,FamilySize,Pclass_1,Pclass_2,Pclass_3,...,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_U,Embarked_C,Embarked_Q,Embarked_S
0,0,0,22.0,1,0,7.25,2,0,0,1,...,0,0,0,0,0,0,1,0,0,1
1,1,1,38.0,1,0,71.2833,2,1,0,0,...,1,0,0,0,0,0,0,1,0,0
2,1,1,26.0,0,0,7.925,1,0,0,1,...,0,0,0,0,0,0,1,0,0,1
3,1,1,35.0,1,0,53.1,2,1,0,0,...,1,0,0,0,0,0,0,0,0,1
4,0,0,35.0,0,0,8.05,1,0,0,1,...,0,0,0,0,0,0,1,0,0,1


In [23]:
# create X and y for data and target values
X = df.drop('Survived', axis=1).values
y = df.Survived.values

In [24]:
# classifier models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


In [25]:
from sklearn.model_selection import cross_val_score

In [28]:
log = LogisticRegression()
cross_val_score(log, X, y, cv=10, scoring='accuracy')

array([0.82222222, 0.85393258, 0.7752809 , 0.86516854, 0.83146067,
       0.79775281, 0.82022472, 0.82022472, 0.8988764 , 0.85393258])

In [29]:
forrest = RandomForestClassifier()
cross_val_score(forrest, X, y, cv=10, scoring='accuracy')

array([0.78888889, 0.84269663, 0.75280899, 0.83146067, 0.85393258,
       0.84269663, 0.78651685, 0.78651685, 0.84269663, 0.86516854])

In [41]:
log = LogisticRegression()
cross_val_score(log, X, y, cv=10, scoring='accuracy').mean()

0.8339076154806492

In [42]:
forrest = RandomForestClassifier()
cross_val_score(forrest, X, y, cv=10, scoring='accuracy').mean()

0.8170911360799

In [43]:
forrest = RandomForestClassifier()
cross_val_score(forrest, X, y, cv=25, scoring='accuracy').mean()

0.8172380952380953

In [40]:
forrest = RandomForestClassifier()
cross_val_score(forrest, X, y, cv=25, scoring='accuracy').mean()

0.8173015873015873