In [17]:
# import all required libraries

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import recall_score, accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

In [2]:
# read the dataset
df_main = pd.read_csv("titanic.csv")
# create a copy of the main dataframe
df = df_main.copy()

### Explore dataset

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 887 entries, 0 to 886
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Survived                 887 non-null    int64  
 1   Pclass                   887 non-null    int64  
 2   Name                     887 non-null    object 
 3   Sex                      887 non-null    object 
 4   Age                      887 non-null    float64
 5   Siblings/Spouses Aboard  887 non-null    int64  
 6   Parents/Children Aboard  887 non-null    int64  
 7   Fare                     887 non-null    float64
dtypes: float64(2), int64(4), object(2)
memory usage: 55.6+ KB


- The column names have / and whitespace, so that can be changed
- Sex and Name are in object type, so they are probably in string

In [4]:
# Replace the column names i.e replace "/" and whitespaces to underscore for easy access of data
df.columns = df.columns.map(lambda x: x.replace('/','_').replace(' ', '_'))

In [5]:
df.columns

Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'Siblings_Spouses_Aboard',
       'Parents_Children_Aboard', 'Fare'],
      dtype='object')

In [6]:
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings_Spouses_Aboard,Parents_Children_Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05


Name doesn't contribute anything to the model, so it can be dropped. Sex is a categorial variable so it can be converted to ordinal

In [7]:
df = df.drop("Name", axis = 1)

In [8]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Siblings_Spouses_Aboard,Parents_Children_Aboard,Fare
0,0,3,male,22.0,1,0,7.25
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.925
3,1,1,female,35.0,1,0,53.1
4,0,3,male,35.0,0,0,8.05


In [10]:
# mapping male to 0 and female to 1
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

In [13]:
df['Sex'].value_counts()

0    573
1    314
Name: Sex, dtype: int64

In [14]:
df.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,Siblings_Spouses_Aboard,Parents_Children_Aboard,Fare
count,887.0,887.0,887.0,887.0,887.0,887.0,887.0
mean,0.385569,2.305524,0.354002,29.471443,0.525366,0.383315,32.30542
std,0.487004,0.836662,0.47848,14.121908,1.104669,0.807466,49.78204
min,0.0,1.0,0.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,0.0,20.25,0.0,0.0,7.925
50%,0.0,3.0,0.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,1.0,38.0,1.0,0.0,31.1375
max,1.0,3.0,1.0,80.0,8.0,6.0,512.3292


We can see that the std of the columns like Age and Fare are large than others, so we need to normalize these values. Also Survived is out target variable so we will create a separate variable for features and target

In [37]:
# check if any column have missing values
df.isna().sum()

Survived                   0
Pclass                     0
Sex                        0
Age                        0
Siblings_Spouses_Aboard    0
Parents_Children_Aboard    0
Fare                       0
dtype: int64

There are no missing values.

In [16]:
X = df.drop(['Survived'], axis = 1)
y = df['Survived']

In [20]:
# Normalization
X_standard = StandardScaler().fit_transform(X)
pd.DataFrame(X_standard).describe()

Unnamed: 0,0,1,2,3,4,5
count,887.0,887.0,887.0,887.0,887.0,887.0
mean,-2.403189e-17,2.002657e-17,2.403189e-17,1.602126e-17,-1.7022590000000002e-17,1.301727e-16
std,1.000564,1.000564,1.000564,1.000564,1.000564,1.000564
min,-1.561277,-0.7402655,-2.05835,-0.4758557,-0.4749808,-0.6493034
25%,-0.3653765,-0.7402655,-0.6533568,-0.4758557,-0.4749808,-0.4900196
50%,0.8305236,-0.7402655,-0.1042546,-0.4758557,-0.4749808,-0.3587899
75%,0.8305236,1.350867,0.6042645,0.4299039,-0.4749808,-0.02347391
max,0.8305236,1.350867,3.580045,6.770221,6.959866,9.647949


Now all the vales are within the range

In [21]:
# Create training and test dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.2, random_state = 42)

### Logistic Regression

In [22]:
# Create list of hyperparameters
logistic_parameters = {'penalty': ['none', 'l2']}
regressor = LogisticRegression(random_state = 42)
regressor_grid = GridSearchCV(regressor, logistic_parameters)
regressor_grid.fit(X_train, y_train)

GridSearchCV(estimator=LogisticRegression(random_state=42),
             param_grid={'penalty': ['none', 'l2']})

In [25]:
best_params = regressor_grid.best_params_
print(regressor_grid.best_score_)
print(regressor_grid.best_estimator_.score(X_test, y_test))

0.8138148037159125
0.7471910112359551


In [36]:
# Train Logistic Regression model using the best parameters
logistic_regressor = LogisticRegression(**best_params).fit(X_train, y_train)
logistic_pred = logistic_regressor.predict(X_test)
logistic_accuracy = accuracy_score(y_test, logistic_pred)
logistic_recall = recall_score(y_test, logistic_pred)
print("The accuracy of Logistic Regression is {:.2f} and recall is {:.2f}".format(logistic_accuracy, logistic_recall))

The accuracy of Logistic Regression is 0.75 and recall is 0.55


We weill focus mostly on recall since we want to correclty predict the survivals.