### Building a machine learning model to predict who survived the Titanic shipwreck

* import the necessary libraries
* explore and analyze my data
* prepare my data for modelling
* model my data
* evaluate my model

In [122]:
# importing needed libraries, pyforest contains most of the libraries for exploration and analysis

import pyforest
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the passenger records from the local CSV file and preview the first rows
df = pd.read_csv("Titanic.csv")
df.head()

In [124]:
# no. of rows and columns

df.shape

(891, 12)

In [125]:
# checking for the names of the columns

df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [126]:
# details of each column, dtypes and null values

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [127]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns

df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [128]:
# Number of passengers per ticket class (1 = first class, 2 = second class, 3 = third class)

df.value_counts('Pclass')

Pclass
3    491
1    216
2    184
Name: count, dtype: int64

In [129]:
# How many passengers had each number of siblings/spouses aboard the Titanic

df.value_counts('SibSp')

SibSp
0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: count, dtype: int64

In [130]:
# Number of passengers who did not survive (0) and who survived (1)

df.value_counts('Survived')

Survived
0    549
1    342
Name: count, dtype: int64

In [131]:
# Boolean mask of the whole frame: True wherever a value is missing

df.isnull()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,False,False,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,True,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
886,False,False,False,False,False,False,False,False,False,False,True,False
887,False,False,False,False,False,False,False,False,False,False,False,False
888,False,False,False,False,False,True,False,False,False,False,True,False
889,False,False,False,False,False,False,False,False,False,False,False,False


In [132]:
# Number of missing values in each column

df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [None]:
# Using a heatmap to visualize the columns with null values

sns.heatmap(df.isnull(), yticklabels=False, cbar=False, cmap='viridis')

<IPython.core.display.Javascript object>

In [None]:
# Bar chart of how many passengers did not survive (0) versus survived (1)

sns.set_style('darkgrid')
sns.countplot(x='Survived', data=df)

In [None]:
# Survival counts split by sex (0 = non-survivors, 1 = survivors)
sns.set_style('whitegrid')
sns.countplot(x='Survived', hue='Sex', data=df, palette='RdBu_r')

In [None]:
# Survival counts broken down by passenger class (1 = first class, 2 = second class, 3 = third class)
sns.set_style('whitegrid')
sns.countplot(x='Survived', hue='Pclass', data=df, palette='rainbow')

In [None]:
# Histogram of passenger ages; rows where Age is missing are dropped before plotting.
# histplot is used because sns.distplot is deprecated in seaborn.

sns.set_style('whitegrid')
sns.histplot(df['Age'].dropna(), kde=False, color='darkblue', bins=40)

In [None]:
# The Age distribution again, drawn with pandas' own histogram method (30 bins)
df['Age'].hist(bins=30, color='green', alpha=0.8)

In [None]:
# countplot of the number of siblings/spouses aboard

sns.countplot(x='SibSp', data=df)

In [None]:
# Histogram of the Fare paid by the passengers.
# histplot is used because sns.distplot is deprecated in seaborn.

sns.set_style('whitegrid')
sns.histplot(df['Fare'].dropna(), kde=False, color='darkblue', bins=40)

In [None]:
# The Fare distribution again, drawn with pandas' own histogram method (40 bins)
df['Fare'].hist(color='green', bins=40, figsize=(8,4))

## DATA CLEANING

In [None]:
# Boxplot of Age within each passenger class, to compare the age
# distributions across classes before the missing ages are filled in

plt.figure(figsize=(12, 7))
sns.boxplot(x='Pclass', y='Age', data=df, palette='winter')

In [None]:
def impute_age(cols):
    """Return a passenger's age, filling a missing value based on ticket class.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Two-element row holding the age first and the passenger class second
        (the shape ``df[['Age', 'Pclass']].apply(impute_age, axis=1)`` passes
        for every row).

    Returns
    -------
    The original age when it is present; otherwise 37 for class 1, 29 for
    class 2 and 24 for any other class.
    """
    # ``cols[0]`` on a Series with string labels relies on positional fallback,
    # which pandas has deprecated. Use ``.iloc`` when available and keep plain
    # indexing for lists/tuples/arrays.
    if hasattr(cols, "iloc"):
        age, pclass = cols.iloc[0], cols.iloc[1]
    else:
        age, pclass = cols[0], cols[1]

    if not pd.isnull(age):
        return age

    # Per-class fill-in ages; presumably read off the Age-by-Pclass boxplot - TODO confirm.
    if pclass == 1:
        return 37
    if pclass == 2:
        return 29
    return 24

In [None]:
# Fill the missing ages row by row: impute_age receives each (Age, Pclass) pair
df['Age'] = df[['Age', 'Pclass']].apply(impute_age, axis=1)

In [None]:
# Re-draw the null-value heatmap after filling in Age
sns.heatmap(df.isnull(), yticklabels=False, cbar=False, cmap='viridis')

In [None]:
# Cabin is missing for most passengers (687 of 891 rows), so drop the column.
# errors='ignore' keeps this cell safe to re-run once Cabin is already gone.
df = df.drop(columns='Cabin', errors='ignore')

In [None]:
# Preview the frame after dropping the Cabin column
df.head()

In [None]:
# Drop every row that still has a missing value
# (presumably only the 2 rows without an Embarked value at this point - verify)
df.dropna(inplace=True)

In [None]:
# visualizing again to check for null values

sns.heatmap(df.isnull(), yticklabels=False, cbar=False, cmap='viridis')

## DATA MODELLING
In order to model my data, I'll try out different models: a Decision Tree, a Random Forest and a Support Vector Machine (SVM).
I'd also evaluate these models to see which works well on the data and which will predict better.

In [None]:
# defining my independent features from my data and assigning to a variable X

df_features = ['Pclass', 'Age', 'Sex']

X = df[df_features]

In [None]:
# Summary statistics of the numeric feature columns
X.describe()

In [None]:
# Preview the first 50 rows of the feature frame
X.head(50)

In [None]:
# Encode the categorical Sex column as integers (male -> 0, female -> 1).
# X is rebuilt from df so the cell is safe to re-run and no frame derived
# from df is modified in place.

X = df[df_features].assign(Sex=lambda frame: frame['Sex'].map({'male': 0, 'female': 1}))

In [None]:
# Check the features after encoding Sex
X.head()

In [None]:
# This is the target variable

y = df.Survived

In [None]:
# Preview the first target values
y.head()

In [None]:
# Modelling my data on Decision Tree

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# random_state is fixed so the fitted tree is the same on every run
df_model = DecisionTreeClassifier(random_state=1)

# NOTE: fitted on the full X and y; no rows are held out for evaluation
df_model.fit(X, y)

In [None]:
# Show the first five passengers together with the model's predictions for them

sample_passengers = X.head()
sample_predictions = df_model.predict(sample_passengers)
print(f"Making predictions for the following passengers: \n {sample_passengers}")
print("The predicitons are:")
print(sample_predictions)

In [None]:
# Checking the accuracy of this model.
# NOTE: the model was fitted on this same X/y, so the first figure is the
# training accuracy (optimistic); the cross-validated score is the fairer
# estimate of how the model performs on unseen passengers.

from sklearn.model_selection import cross_val_score

predicted_survivors = df_model.predict(X)
accuracy = accuracy_score(y, predicted_survivors)
print('Accuracy score of the training data: ', accuracy * 100)

# cross_val_score fits clones of the estimator, so df_model itself is untouched
cv_accuracy = cross_val_score(df_model, X, y, cv=5, scoring='accuracy').mean()
print('5-fold cross-validated accuracy: ', cv_accuracy * 100)

In [None]:
# Evaluating the error of my model.
# All metrics below are computed on the same X/y the model was fitted on.

from sklearn.metrics import mean_absolute_error

predicted_survivors = df_model.predict(X)
# Keep the error instead of discarding it so it can be reported with the other metrics
mae = mean_absolute_error(y, predicted_survivors)


print("Decision Tree Classifier:")
print(f"Mean Absolute Error: {mae}")
print(f"Accuracy: {accuracy_score(y, predicted_survivors)}")
print(f"Precision: {precision_score(y, predicted_survivors)}")
print(f"Recall: {recall_score(y, predicted_survivors)}")
print(f"F1 Score: {f1_score(y, predicted_survivors)}")
print("Confusion Matrix:")
print(confusion_matrix(y, predicted_survivors))
print("Classification Report:")
print(classification_report(y, predicted_survivors))

In [None]:
# Modelling my data on Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Rebinds df_model to a Random Forest, replacing the Decision Tree fitted earlier;
# random_state is fixed so the fitted forest is the same on every run
df_model = RandomForestClassifier(random_state=42)

# NOTE: fitted on the full X and y; no rows are held out for evaluation
df_model.fit(X, y)

In [None]:
# Show the first five passengers together with the model's predictions for them


sample_passengers = X.head()
sample_predictions = df_model.predict(sample_passengers)
print(f"Making predictions for the following passengers: \n {sample_passengers}")
print("The predicitons are:")
print(sample_predictions)

In [None]:
# Evaluating the model performance on the data.
# NOTE: the model was fitted on this same X/y, so these are training-set
# metrics (optimistic); the cross-validated accuracy is the fairer estimate.

from sklearn.model_selection import cross_val_score

predicted_survivors = df_model.predict(X)
accuracy = accuracy_score(y, predicted_survivors)
print('Accuracy score of the training data: ', accuracy * 100)

# cross_val_score fits clones of the estimator, so df_model itself is untouched
cv_accuracy = cross_val_score(df_model, X, y, cv=5, scoring='accuracy').mean()
print('5-fold cross-validated accuracy: ', cv_accuracy * 100)


print("Random Forest Classifier:")
print(f"Accuracy: {accuracy_score(y, predicted_survivors)}")
print(f"Precision: {precision_score(y, predicted_survivors)}")
print(f"Recall: {recall_score(y, predicted_survivors)}")
print(f"F1 Score: {f1_score(y, predicted_survivors)}")
print("Confusion Matrix:")
print(confusion_matrix(y, predicted_survivors))
print("Classification Report:")
print(classification_report(y, predicted_survivors))

In [None]:
# Modelling with the svm classifier model

from sklearn import svm

In [None]:
# fitting the model with my data

# Linear-kernel support vector classifier; rebinds df_model, replacing the Random Forest
df_model = svm.SVC(kernel= 'linear')

# NOTE: fitted on the full X and y; no rows are held out for evaluation
df_model.fit(X, y)

In [None]:
# Show the first five passengers together with the model's predictions for them

sample_passengers = X.head()
sample_predictions = df_model.predict(sample_passengers)
print(f"Making predictions for the following passengers: \n {sample_passengers}")
print("The predicitons are:")
print(sample_predictions)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Checking the accuracy of my model on the data.
# NOTE: the model was fitted on this same X/y, so the first figure is the
# training accuracy (optimistic); the cross-validated score is the fairer
# estimate of how the model performs on unseen passengers.

from sklearn.model_selection import cross_val_score

predicted_survivors = df_model.predict(X)
accuracy = accuracy_score(y, predicted_survivors)
print('Accuracy score of the training data: ', accuracy * 100)

# cross_val_score fits clones of the estimator, so df_model itself is untouched
cv_accuracy = cross_val_score(df_model, X, y, cv=5, scoring='accuracy').mean()
print('5-fold cross-validated accuracy: ', cv_accuracy * 100)

In [None]:
# Report the classification metrics for the SVM, using the
# predicted_survivors computed in the accuracy cell


print("SVM Classifier:")
scalar_metrics = (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 Score", f1_score),
)
for metric_label, metric_fn in scalar_metrics:
    print(f"{metric_label}: {metric_fn(y, predicted_survivors)}")
print("Confusion Matrix:")
print(confusion_matrix(y, predicted_survivors))
print("Classification Report:")
print(classification_report(y, predicted_survivors))

### Conclusion

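* Both the Decision Tree Classifier and the Random Forest Classifier performed well on the data, giving a high number of true positives (TP) and true negatives (TN) and similar accuracy for both. Note that these metrics were computed on the same data the models were fitted on, so they are likely optimistic; a held-out test set or cross-validation is needed to confirm how well the models generalize.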
* The lower performance of the SVM classifier indicates that it may not be the best choice for this particular dataset, possibly due to the non-linear nature of the data.