# Import Library and Dataset

In [82]:
# Library

import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

import warnings
warnings.filterwarnings('ignore')

In [47]:
# Import Dataset
df = sns.load_dataset('titanic')
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


# EDA (Exploratory Data Analysis)

In [48]:
#Check Missing Values

df.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [49]:
df['deck'].unique()

[NaN, 'C', 'E', 'G', 'D', 'A', 'B', 'F']
Categories (7, object): ['A', 'B', 'C', 'D', 'E', 'F', 'G']

In [50]:
df.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [51]:
# To see the Data Type, N Unique, and its sample
list_df = []
for col in df.columns :
    list_df.append([
        col,
        df[col].dtype,
        df[col].nunique(),
        list(df[col].drop_duplicates().sample(2).values)
    ])

pd.DataFrame(list_df, columns = ['Columns', 'Data Type', 'N Unique', 'Unique Sample'])

Unnamed: 0,Columns,Data Type,N Unique,Unique Sample
0,survived,int64,2,"[1, 0]"
1,pclass,int64,3,"[2, 1]"
2,sex,object,2,"[male, female]"
3,age,float64,88,"[30.5, 14.0]"
4,sibsp,int64,7,"[4, 0]"
5,parch,int64,7,"[3, 5]"
6,fare,float64,248,"[14.4542, 71.2833]"
7,embarked,object,3,"[S, Q]"
8,class,category,3,"[First, Second]"
9,who,object,3,"[child, woman]"


In [52]:
#Check Missing Values

df.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [54]:
# Drop Columns
df.drop(['embarked','embark_town','pclass','fare','sibsp','parch','adult_male','deck','alive'], axis = 1,inplace= True)

In [55]:
# Check Missing Values after Being Dropped
df.isna().sum()

survived      0
sex           0
age         177
class         0
who           0
alone         0
dtype: int64

In [61]:
# Drop NA
df.dropna(inplace=True)
df

Unnamed: 0,survived,sex,age,class,who,alone
0,0,male,22.0,Third,man,False
1,1,female,38.0,First,woman,False
2,1,female,26.0,Third,woman,True
3,1,female,35.0,First,woman,False
4,0,male,35.0,Third,man,True
...,...,...,...,...,...,...
885,0,female,39.0,Third,woman,False
886,0,male,27.0,Second,man,True
887,1,female,19.0,First,woman,True
889,1,male,26.0,First,man,True


# Data after Being Cleaned and Determine X & Y

In [62]:
#Data after being cleaned
df

Unnamed: 0,survived,sex,age,class,who,alone
0,0,male,22.0,Third,man,False
1,1,female,38.0,First,woman,False
2,1,female,26.0,Third,woman,True
3,1,female,35.0,First,woman,False
4,0,male,35.0,Third,man,True
...,...,...,...,...,...,...
885,0,female,39.0,Third,woman,False
886,0,male,27.0,Second,man,True
887,1,female,19.0,First,woman,True
889,1,male,26.0,First,man,True


In [63]:
# Create Dummy Var

df_dummy= pd.get_dummies(df, columns=['sex', 'class', 'who', 'alone'], dtype=int, drop_first= True)
df_dummy

Unnamed: 0,survived,age,sex_male,class_Second,class_Third,who_man,who_woman,alone_True
0,0,22.0,1,0,1,1,0,0
1,1,38.0,0,0,0,0,1,0
2,1,26.0,0,0,1,0,1,1
3,1,35.0,0,0,0,0,1,0
4,0,35.0,1,0,1,1,0,1
...,...,...,...,...,...,...,...,...
885,0,39.0,0,0,1,0,1,0
886,0,27.0,1,1,0,1,0,1
887,1,19.0,0,0,0,0,1,1
889,1,26.0,1,0,0,1,0,1


## Determine X & Y
- X = 'sex', 'age', 'class', 'who', 'alone' 
- Y = 'survived'

in order to predict who survived

# Create Regression Model

In [64]:
# Train Test Split
x = df_dummy.drop('survived', axis = 1)
y = df_dummy['survived']

# Mandatory, sistematis
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y, 
    test_size= 0.2, 
    random_state= 2023, 
    stratify= y 
) 


## Logistic Regression

In [65]:
logreg = LogisticRegression()
logreg.fit(xtrain, ytrain)

In [66]:
xtest

Unnamed: 0,age,sex_male,class_Second,class_Third,who_man,who_woman,alone_True
52,49.0,0,0,0,0,1,0
880,25.0,0,1,0,0,1,0
787,8.0,1,0,1,0,0,0
63,4.0,1,0,1,0,0,0
747,30.0,0,1,0,0,1,1
...,...,...,...,...,...,...,...
103,33.0,1,0,1,1,0,1
290,26.0,0,0,0,0,1,1
210,24.0,1,0,1,1,0,1
74,32.0,1,0,1,1,0,1


In [87]:
# Predict

pred = logreg.predict(xtest)

print(classification_report(ytest,pred))

#Accuracy: 84% mampu memprediksi dengan benar

              precision    recall  f1-score   support

           0       0.87      0.86      0.86        85
           1       0.80      0.81      0.80        58

    accuracy                           0.84       143
   macro avg       0.83      0.83      0.83       143
weighted avg       0.84      0.84      0.84       143



## Decision Tree

In [88]:
# Decision Tree
dt = DecisionTreeClassifier()
dt.fit(xtrain, ytrain) 
pred_tree = dt.predict(xtest)

print(classification_report(ytest,pred_tree))


              precision    recall  f1-score   support

           0       0.90      0.85      0.87        85
           1       0.79      0.86      0.83        58

    accuracy                           0.85       143
   macro avg       0.85      0.85      0.85       143
weighted avg       0.86      0.85      0.85       143



## K-Nearest Neighbor

In [78]:
# K-Nearest Neighbor

knn = KNeighborsClassifier()
knn.fit(xtrain, ytrain)
knn_predict = knn.predict(xtest)


## Support Vector Classifier

In [79]:
# Support Vector Classifier

svc = SVC(kernel='linear')
svc.fit(xtrain, ytrain)
svc_predict = svc.predict(xtest)

# Conclusion

In [90]:
# Comparing Model

print(f'Accuracy score dari LogReg = {accuracy_score(ytest,pred)}')
print(f'Accuracy score dari DT = {accuracy_score(ytest,pred_tree)}')
print(f'Accuracy score dari KNN = {accuracy_score(ytest,knn_predict)}')
print(f'Accuracy score dari SVC = {accuracy_score(ytest,svc_predict)}')

Accuracy score dari LogReg = 0.8391608391608392
Accuracy score dari DT = 0.8531468531468531
Accuracy score dari KNN = 0.8111888111888111
Accuracy score dari SVC = 0.7972027972027972


In [84]:
# Confusion Matrix

print(confusion_matrix(ytest,pred))


[[73 12]
 [11 47]]


Berdasarkan confusion matrix di atas,

- 47 individu diprediksi selamat dan pada kenyataannya selamat
- 72 individu diprediksi tidak selamat dan pada kenyataannya tidak selamat

Dengan menggunakan akurasi skor sebagai matrixnya,  model **Decision Tree** yang **terbaik** untuk memprediksi korban selamat
