In [1]:
#Packages

import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import copy

from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, roc_auc_score, r2_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.neighbors import KNeighborsClassifier


import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv(r"C:\Users\USER\Desktop\DATA\Titanic.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [3]:
y = df['Survived']
X = df[['PassengerId','Pclass','Sex','Age','Fare']]

In [4]:
X.isna().sum()

PassengerId     0
Pclass          0
Sex             0
Age            86
Fare            1
dtype: int64

In [5]:
y.isna().sum()

0

In [6]:
#Filling the missing age values with the average age

X['Age'].fillna(X['Age'].mean().round(2), inplace=True)
X

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare
0,892,3,male,34.50,7.8292
1,893,3,female,47.00,7.0000
2,894,2,male,62.00,9.6875
3,895,3,male,27.00,8.6625
4,896,3,female,22.00,12.2875
...,...,...,...,...,...
413,1305,3,male,30.27,8.0500
414,1306,1,female,39.00,108.9000
415,1307,3,male,38.50,7.2500
416,1308,3,male,30.27,8.0500


In [7]:
#Filling the missing age values with the average age
X['Fare'].fillna(X['Fare'].mean().round(2), inplace=True)
X

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare
0,892,3,male,34.50,7.8292
1,893,3,female,47.00,7.0000
2,894,2,male,62.00,9.6875
3,895,3,male,27.00,8.6625
4,896,3,female,22.00,12.2875
...,...,...,...,...,...
413,1305,3,male,30.27,8.0500
414,1306,1,female,39.00,108.9000
415,1307,3,male,38.50,7.2500
416,1308,3,male,30.27,8.0500


In [8]:
X.isna().sum() #no more blank values

PassengerId    0
Pclass         0
Sex            0
Age            0
Fare           0
dtype: int64

In [9]:
#Transforming the sex column into numerical values
encoder = LabelEncoder()
X['Sex'] = encoder.fit_transform(X['Sex'])
X

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare
0,892,3,1,34.50,7.8292
1,893,3,0,47.00,7.0000
2,894,2,1,62.00,9.6875
3,895,3,1,27.00,8.6625
4,896,3,0,22.00,12.2875
...,...,...,...,...,...
413,1305,3,1,30.27,8.0500
414,1306,1,0,39.00,108.9000
415,1307,3,1,38.50,7.2500
416,1308,3,1,30.27,8.0500


In [10]:
# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [11]:
model = LogisticRegression()
model

In [12]:
model.fit(X_train,y_train)

In [13]:
#Making predictions
test_pred = model.predict(X_test)
train_pred = model.predict(X_train)

#Checking for model accuracy
train_score = model.score(X_train,y_train).round(2)
test_score = model.score(X_test,y_test).round(2)

print(f"Training Accuracy: {train_score}")
print(f"Testing Accuracy: {test_score}")

Training Accuracy: 1.0
Testing Accuracy: 1.0


In [14]:
#Prediction
pred = model.predict(X_test)
pred

array([0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0], dtype=int64)

In [16]:
model.predict([[400,3,0,25,300]]) #Based on this new input the prediction is the person survived.

array([1], dtype=int64)

In [17]:
print("Prediction Accuracy:", accuracy_score(y_test, pred).round(2))

Prediction Accuracy: 1.0


In [18]:
#Precision
precision_score(y_test, pred).round(2)

1.0

In [19]:
#Recall
recall_score(y_test, pred).round(2)

1.0

Checking if there is accuracy difference if splitting is done before label encoding

In [20]:
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.2)

In [21]:
X_train

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare
399,1291,3,1,31.00,7.7333
202,1094,1,1,47.00,227.5250
404,1296,1,1,43.00,27.7208
209,1101,3,1,25.00,7.8958
4,896,3,0,22.00,12.2875
...,...,...,...,...,...
72,964,3,0,29.00,7.9250
2,894,2,1,62.00,9.6875
352,1244,2,1,18.00,73.5000
113,1005,3,0,18.50,7.2833


In [22]:
scaler = StandardScaler()
scaler

In [23]:
X_train_Scaled = scaler.fit_transform(X_train)
X_train_Scaled

array([[ 1.60350174,  0.89479772,  0.77336028,  0.04299979, -0.50450019],
       [-0.02399267, -1.4677512 ,  0.77336028,  1.32310062,  3.36922231],
       [ 1.6448087 , -1.4677512 ,  0.77336028,  1.00307541, -0.15223016],
       ...,
       [ 1.21521627, -0.28647674,  0.77336028, -0.99708213,  0.65460611],
       [-0.75925664,  0.89479772, -1.29305839, -0.95707897, -0.51243123],
       [ 1.67785427,  0.89479772, -1.29305839, -0.01540481, -0.5047205 ]])

In [24]:
X_test_Scaled = scaler.fit_transform(X_test)
X_test_Scaled

array([[ 5.60713731e-01,  7.92405816e-01,  6.88247202e-01,
         5.78386919e-02, -4.77605747e-01],
       [-7.26679348e-01, -4.40225453e-01,  6.88247202e-01,
        -2.69120835e-01, -3.81776954e-01],
       [-1.58211817e+00,  7.92405816e-01, -1.45296631e+00,
         5.78386919e-02, -4.77605747e-01],
       [-3.96361387e-01, -4.40225453e-01, -1.45296631e+00,
        -7.28548742e-01,  7.80076562e-02],
       [ 1.20289783e-01,  7.92405816e-01,  6.88247202e-01,
        -7.28548742e-01, -4.93577212e-01],
       [ 1.39921317e+00,  7.92405816e-01,  6.88247202e-01,
        -6.51977425e-01, -4.83413552e-01],
       [ 9.50319531e-01, -1.67285672e+00,  6.88247202e-01,
         3.71644361e-02, -1.30105378e-01],
       [ 1.33145564e+00,  7.92405816e-01,  6.88247202e-01,
         5.78386919e-02, -4.83413552e-01],
       [ 6.96228792e-01, -1.67285672e+00, -1.45296631e+00,
         2.64058924e+00, -1.19457734e-01],
       [-1.42270648e-01,  7.92405816e-01, -1.45296631e+00,
         4.20021025e-01

In [25]:
model.fit(X_train_Scaled,y_train)

In [26]:
pred_scaled = model.predict(X_test_Scaled)
pred_scaled

array([0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0], dtype=int64)

In [27]:
accuracy_score(y_test, pred_scaled).round(2)

1.0

No differences noted!