In [267]:
import numpy as np
np.random.seed(42)
import pandas as pd
import matplotlib.pyplot as plt

### Dataset preparation

In [268]:
data = pd.read_csv("../data/titanic/final_dataset.csv")

data_y = data.Survived
data_x = data.drop(["Survived"], axis=1)

In [269]:
print(data_x)
print(data_y)

     Pclass     Sex  Age  SibSp  Parch
0         1  female   38    1.0    0.0
1         3  female   26    0.0    0.0
2         1  female   35    1.0    0.0
3         3    male   35    0.0    0.0
4         1    male   54    0.0    0.0
..      ...     ...  ...    ...    ...
699       3  female   39    0.0    5.0
700       2    male   27    0.0    0.0
701       1  female   19    0.0    0.0
702       1    male   26    0.0    0.0
703       3    male   32    0.0    0.0

[704 rows x 5 columns]
0      1.0
1      1.0
2      1.0
3      0.0
4      0.0
      ... 
699    0.0
700    0.0
701    1.0
702    1.0
703    0.0
Name: Survived, Length: 704, dtype: float64


In [270]:
data_x[data_x["Sex"] == "female"] = 0
data_x[data_x["Sex"] == "male"] = 1

x = data_x.to_numpy()
y = data_y.to_numpy()

In [271]:
print(f"x shape:\n{x.shape}")
print(f"y shape:\n{y.shape}")

print(f"x:\n{x[:5]}")
print(f"y:\n{y[:5]}")

x shape:
(704, 5)
y shape:
(704,)
x:
[[0 0 0 0.0 0.0]
 [0 0 0 0.0 0.0]
 [0 0 0 0.0 0.0]
 [1 1 1 1.0 1.0]
 [1 1 1 1.0 1.0]]
y:
[1. 1. 1. 0. 0.]


### Dataset Split

In [272]:
num_samples = x.shape[0]
num_features = x.shape[1]
num_classes = y.shape[0]
num_survived = np.count_nonzero(y == 1)
num_not_survived = np.count_nonzero(y == 0)

print(f"Num_samples: {num_samples}")
print(f"Num_features: {num_features}")
print(f"Survived: {num_survived}")
print(f"Not survived: {num_not_survived}")
print(f"Survived rate: {1 - num_survived / num_not_survived}")
print(f"Not survived rate: {num_survived / num_not_survived}")

Num_samples: 704
Num_features: 5
Survived: 281
Not survived: 423
Survived rate: 0.3356973995271868
Not survived rate: 0.6643026004728132


In [273]:
test_size = num_samples // 3

random_idxs = np.random.permutation(num_samples) # [0, num_samples-1]

x_train = x[random_idxs[:-test_size]]
y_train = y[random_idxs[:-test_size]]

x_test = x[random_idxs[-test_size:]]
y_test = y[random_idxs[-test_size:]]

In [274]:
print(f"x_train shape:\n{x_train.shape}")
print(f"y_train shape:\n{y_train.shape}")

print(f"x_test shape:\n{x_test.shape}")
print(f"y_test shape:\n{y_test.shape}")

x_train shape:
(470, 5)
y_train shape:
(470,)
x_test shape:
(234, 5)
y_test shape:
(234,)


### KNN Model

In [275]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(max_depth=4)
clf.fit(x_train, y_train)

accuracy = clf.score(x_test, y_test)
print(f"accuracy: {accuracy*100.0:.4}%")

y_pred = clf.predict(x_test)
print(f"y_pred:\n{y_pred}")

accuracy: 80.34%
y_pred:
[0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 1. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0. 1. 1. 0.
 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0.
 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0.
 1. 1. 0. 0. 0. 1. 1. 0. 0. 1. 0. 1. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0.
 0. 1. 0. 0. 1. 0. 0. 0. 1. 1. 1. 1. 1. 0. 1. 1. 0. 0. 0. 1. 0. 1. 1. 0.
 0. 1. 1. 0. 1. 1. 1. 1. 0. 1. 0. 1. 0. 1. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 1. 1. 0. 0. 1. 1. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0.
 0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 0. 1. 1. 0. 0. 1. 1. 1.]
