In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

sns.set(rc={'figure.figsize':(12,10)})

Load Dataset

In [2]:
data = pd.read_csv('Dataset\Titanic-Dataset.csv')

In [3]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Type of Features

- Categorical : Sex and Embarked
- Continuous : Age, Fare
- Discrete : SibSp, Parch
- Alphanumeric: Cabin

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


Numerical Value Analysis

In [None]:
heatmap = sns.heatmap(data[["Survived", "SibSp","Parch", "Age", "Fare"]].corr(), annot=True)

Conclusion:

Only Fare feature seems to have a significative correlation with the Survival probability.

It doesn't means that other features are not usefull. Subpopulations in these features can be correlated with Survival. To determine this, we need to explore in detail these features

SibSp

In [11]:
data['SibSp'].unique()

array([1, 0, 3, 4, 2, 5, 8])

In [None]:

bargraph_sibsp = sns.catplot(x='SibSp', y='Survived', data=data, kind='bar', height=8)
plt.show()


It seems that passangers having lots of siblings/spouses have less chances to survive.
Single passenger or with two passenger (SibSp 1 or 2) have more chances to survive

Age

In [None]:

age_visual = sns.FacetGrid(data, col='Survived')
age_visual.map(sns.histplot, 'Age', kde=True)
age_visual.set_ylabels("Survival Probability")

plt.show()


We observed that Age distribution are not the same in Survived and not Survived subpopulations. There is a peak corresponding to young passenger, that Survived. 

Sex

In [None]:
plt.figure(figsize=(8,6))
age_plot = sns.barplot(x="Sex", y="Survived", data=data)

In [22]:
data[['Sex', 'Survived']].groupby('Sex').mean()

Unnamed: 0_level_0,Survived
Sex,Unnamed: 1_level_1
female,0.742038
male,0.188908


It is clearly visible that Male have less chance to survice then Female. So Sex, might play an important role in the prediction of survival

PClass

In [None]:
pclass = sns.catplot(x='Pclass', y='Survived', data=data, kind='bar', height=7)
plt.show()

PClass VS Survived by Sex

In [None]:
pclassbysex = sns.catplot(x='Pclass', y='Survived', data=data, kind='bar', hue="Sex", height=7)
plt.show()

Embarked

In [30]:
data['Embarked'].isnull().sum()

np.int64(0)

In [31]:
data['Embarked'].value_counts()

Embarked
S    646
C    168
Q     77
Name: count, dtype: int64

In [32]:
data['Embarked'].fillna('S', inplace=True)

In [None]:
sns.catplot(x='Embarked', y='Survived', data=data, kind='bar', height=7)

Passenger coming from Cherbourg (C) have more chances to survive

Lets find the reason

In [None]:
sns.catplot(x="Pclass", col="Embarked", data=data, kind='count', height=7, hue="Pclass", palette="Set2")

Cherbourg passengers are mostly in first class which have higher survival rate.

Preparing Data

In [38]:
# Treating missing values in Age column
mean = data['Age'].mean()
std = data['Age'].std()
is_null = data['Age'].isnull().sum()
rand_age = np.random.randint(mean - std, mean + std, size = is_null)
age_slice = data['Age'].copy()
age_slice[np.isnan(age_slice)] = rand_age
data['Age'] = age_slice
data['Age'] = data['Age'].astype(int)

In [39]:
data['Age'].isnull().sum()

np.int64(0)

In [43]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       891 non-null    int64  
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  891 non-null    object 
dtypes: float64(1), int64(5), object(2)
memory usage: 55.8+ KB


In [None]:
col_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin']
data.drop(columns=col_to_drop, axis=1, inplace=True)

In [44]:
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22,1,0,7.25,S
1,1,1,female,38,1,0,71.2833,C
2,1,3,female,26,0,0,7.925,S
3,1,1,female,35,1,0,53.1,S
4,0,3,male,35,0,0,8.05,S


In [45]:
gender = {"male":0, "female":1}
data['Sex'] = data['Sex'].map(gender)

In [47]:
port = {"S":0, "C":1, "Q":2}
data['Embarked'] = data['Embarked'].map(port)

In [49]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    int64  
 3   Age       891 non-null    int64  
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  891 non-null    int64  
dtypes: float64(1), int64(7)
memory usage: 55.8 KB


Splitting Data

In [50]:
x = data.drop('Survived', axis=1)
y = data['Survived']

In [56]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, random_state=42)

Feature Scaling

In [58]:
sc_x = StandardScaler()
xtrain = sc_x.fit_transform(xtrain) 
xtest = sc_x.transform(xtest)

Classification

In [59]:
logReg = LogisticRegression()
svc_classifier = SVC()
knn_classifier = KNeighborsClassifier(n_neighbors=5)    
rf_classifier = RandomForestClassifier(n_estimators=1000, criterion='entropy', random_state=42)
dt_classifier = DecisionTreeClassifier()


In [60]:
logReg.fit(xtrain, ytrain)
svc_classifier.fit(xtrain, ytrain)
knn_classifier.fit(xtrain, ytrain)
rf_classifier.fit(xtrain, ytrain)
dt_classifier.fit(xtrain, ytrain)



0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [61]:
logReg_predict = logReg.predict(xtest)
svc_predict = svc_classifier.predict(xtest)
knn_predict = knn_classifier.predict(xtest)
rf_predict = rf_classifier.predict(xtest)
dt_predict = dt_classifier.predict(xtest)

In [62]:
from sklearn.metrics import accuracy_score
 

In [63]:
print("Logistic Regression accuracy: ", accuracy_score(ytest, logReg_predict))
print("Support Vector Machine accuracy: ", accuracy_score(ytest, svc_predict))  
print("K-Nearest Neighbors accuracy: ", accuracy_score(ytest, knn_predict))
print("Random Forest accuracy: ", accuracy_score(ytest, rf_predict))
print("Decision Tree accuracy: ", accuracy_score(ytest, dt_predict))

Logistic Regression accuracy:  0.8097014925373134
Support Vector Machine accuracy:  0.8171641791044776
K-Nearest Neighbors accuracy:  0.7985074626865671
Random Forest accuracy:  0.7873134328358209
Decision Tree accuracy:  0.746268656716418
