In [1]:
#Import the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
#Load the training data from csv file
train_data = pd.read_csv('train.csv', header=0)

In [3]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
train_data.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


Explor the data:<br>
-Check number of rows and columns <br>
-Data types of features<br>
-Statistics of the features <br>
-Drop un-needed features<br>
-Encode Categorical features <br>
-Process missing values<br>
-Feature engineering if required (add new features if needed)<br>
-Scale features if needed <br>
-Create the model <br>
-Train the model <br>
-Score the model <br>
-Improve the model <br>

In [5]:
#Check number of rows and columns
nrows = len(train_data)
ncols = len(train_data.columns)

print('Number of rows: ',nrows)
print('Number of columns: ',ncols)

Number of rows:  891
Number of columns:  12


In [6]:
#Data types of features and missing values
train_data.info()

#Age field has 891-714 = 177 missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [7]:
#Statistics of the features
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [8]:
#Drop un-needed features
#some features will not be used in the model, so we will drop them from the dataset
#PassengerId, Name, Ticket, Cabin

train_data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

In [9]:
#Verify columns were dropped
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [10]:
#Check for missing or nan values
#Embarked feature
train_data.Embarked.unique()

#We have nan in Embarked

array(['S', 'C', 'Q', nan], dtype=object)

In [11]:
#Since missing values are just 2, Replace nan values with the mode
"""
mode = train_data.Embarked.mode()
train_data['Embarked'].fillna(mode[0], inplace=True)
"""

"\nmode = train_data.Embarked.mode()\ntrain_data['Embarked'].fillna(mode[0], inplace=True)\n"

In [12]:
#Another way to do it is to use SmpleImputer class

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='most_frequent')
train_data['Embarked'] = imputer.fit_transform(X=train_data[['Embarked']])

In [13]:
#Check unique values in Sex, to make sure we don't have funny values
train_data.Sex.unique()

array(['male', 'female'], dtype=object)

In [14]:
#label encode Categorical features
#Sex
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
train_data['Sex'] =  label_encoder.fit_transform(train_data['Sex'])

In [15]:
#Hot encode Categorical features
#Embarked, since we have 3 values, label Encoding should work fine, but let's go with Hot encode to be safe
train_data = pd.concat([train_data , pd.get_dummies(train_data[['Embarked']])], axis=1)

#Drop Embarked col
train_data.drop(columns=['Embarked'], axis=1, inplace=True)

In [16]:
#Another way to hot encoding
"""
from sklearn.preprocessing import OneHotEncoder
oneHot_enconder = OneHotEncoder()
encoded = oneHot_enconder.fit_transform(train_data[['Embarked']])
encoded_df = pd.DataFrame(encoded.todense())

#Apppend the encoded DF to train_data
train_data = pd.concat([train_data, encoded_df], axis=1)

#Drop Embarked columns
train_data.drop(columns=['Embarked'], axis=1, inplace=True)
"""

"\nfrom sklearn.preprocessing import OneHotEncoder\noneHot_enconder = OneHotEncoder()\nencoded = oneHot_enconder.fit_transform(train_data[['Embarked']])\nencoded_df = pd.DataFrame(encoded.todense())\n\n#Apppend the encoded DF to train_data\ntrain_data = pd.concat([train_data, encoded_df], axis=1)\n\n#Drop Embarked columns\ntrain_data.drop(columns=['Embarked'], axis=1, inplace=True)\n"

In [17]:
train_data.tail()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
886,0,2,1,27.0,0,0,13.0,0,0,1
887,1,1,0,19.0,0,0,30.0,0,0,1
888,0,3,0,,1,2,23.45,0,0,1
889,1,1,1,26.0,0,0,30.0,1,0,0
890,0,3,1,32.0,0,0,7.75,0,1,0


In [18]:
#Fix missing values in Age
#We have 177 missing values in Age feature, we should not just replace it with one of the feature statistics (mean, median or mode)
#Will be using IterativeImputer where we use other features to predict the value of the Age feature

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

iter_impute = IterativeImputer(max_iter=10, random_state=0)
iter_impute.fit(train_data.values)

IterativeImputer(add_indicator=False, estimator=None,
                 imputation_order='ascending', initial_strategy='mean',
                 max_iter=10, max_value=100, min_value=1,
                 missing_values=np.nan, n_nearest_features=None,
                 random_state=0, sample_posterior=False, tol=0.001,
                 verbose=0)

X_test = train_data.values

imputed = np.round(iter_impute.transform(X_test))

In [19]:
train_data = pd.DataFrame(imputed, columns=['Survived','Pclass','Sex','Age','SibSp','Parch','Fare','Embarked_C','Embarked_Q','Embarked_S'])

In [20]:
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0.0,3.0,1.0,22.0,1.0,0.0,7.0,0.0,0.0,1.0
1,1.0,1.0,0.0,38.0,1.0,0.0,71.0,1.0,0.0,0.0
2,1.0,3.0,0.0,26.0,0.0,0.0,8.0,0.0,0.0,1.0
3,1.0,1.0,0.0,35.0,1.0,0.0,53.0,0.0,0.0,1.0
4,0.0,3.0,1.0,35.0,0.0,0.0,8.0,0.0,0.0,1.0


In [21]:
#Make sure we processed all missing values
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
Survived      891 non-null float64
Pclass        891 non-null float64
Sex           891 non-null float64
Age           891 non-null float64
SibSp         891 non-null float64
Parch         891 non-null float64
Fare          891 non-null float64
Embarked_C    891 non-null float64
Embarked_Q    891 non-null float64
Embarked_S    891 non-null float64
dtypes: float64(10)
memory usage: 69.7 KB


In [22]:
#Since iterative imputation can predict negative values, let's check if we have negative or zero values in Age
train_data[train_data.Age <= 0]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
159,0.0,3.0,1.0,-5.0,8.0,2.0,70.0,0.0,0.0,1.0
180,0.0,3.0,0.0,-5.0,8.0,2.0,70.0,0.0,0.0,1.0
201,0.0,3.0,1.0,-5.0,8.0,2.0,70.0,0.0,0.0,1.0
324,0.0,3.0,1.0,-5.0,8.0,2.0,70.0,0.0,0.0,1.0
792,0.0,3.0,0.0,-5.0,8.0,2.0,70.0,0.0,0.0,1.0
803,1.0,3.0,1.0,0.0,0.0,1.0,9.0,1.0,0.0,0.0
846,0.0,3.0,1.0,-5.0,8.0,2.0,70.0,0.0,0.0,1.0
863,0.0,3.0,0.0,-5.0,8.0,2.0,70.0,0.0,0.0,1.0


In [23]:
#We have nagative values in Age, replace them with 1
filter_age = train_data.Age <=0 
train_data.loc[filter_age, 'Age'] = 1

In [24]:
#Feature engineering
#1. Add a new feature that contains the total number of SibSp+Parch
train_data['Total_Family'] = train_data.SibSp+train_data.Parch

#Drop SibSp and Parch features
train_data = train_data.drop(columns=['SibSp','Parch'], axis=1)

In [25]:
train_data.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked_C,Embarked_Q,Embarked_S,Total_Family
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,0.647587,29.421998,32.214366,0.188552,0.08642,0.725028,0.904602
std,0.486592,0.836071,0.47799,13.56546,49.693352,0.391372,0.281141,0.446751,1.613459
min,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,2.0,0.0,21.0,8.0,0.0,0.0,0.0,0.0
50%,0.0,3.0,1.0,29.0,14.0,0.0,0.0,1.0,0.0
75%,1.0,3.0,1.0,36.0,31.0,0.0,0.0,1.0,1.0
max,1.0,3.0,1.0,80.0,512.0,1.0,1.0,1.0,10.0


In [26]:
#Split the train_data into train and test
#We have a separate test set but this is for me to test the model
from sklearn.model_selection import train_test_split
X = train_data.iloc[:, 1:]
y = train_data.Survived

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [27]:
#Algorithm spot-check, try multiple algorithms and check which one gives the best results

In [28]:
#Scale features (Age, Total_Family, Fare)
from sklearn.preprocessing import MinMaxScaler
mm_scaler = MinMaxScaler()
mm_scaler.fit(X_train)

X_train = mm_scaler.fit_transform(X_train)
X_test = mm_scaler.transform(X_test)

In [29]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(min_samples_split = 2)

classifier.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [30]:
#Check the score
score = classifier.score(X_test, y_test)
score

0.8333333333333334

In [31]:
#Check the score on training data
score = classifier.score(X_train, y_train)
score

0.9762796504369539