In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, LeaveOneOut
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load the Titanic passenger table from the working directory.
titanic_data = pd.read_csv('titanic.csv')

# One-hot encode 'Sex'. With drop_first=True the first dummy level is dropped,
# leaving a single indicator column (used below as 'Sex_male').
titanic_data = pd.get_dummies(titanic_data, columns=['Sex'], drop_first=True)


def _fill_with_group_mean(ages):
    """Return *ages* with missing entries replaced by the mean of *ages*."""
    return ages.fillna(ages.mean())


# Impute missing ages per passenger class: each missing 'Age' receives the
# mean age of the rows that share its 'Pclass'.
age_by_class = titanic_data.groupby('Pclass')['Age']
titanic_data['Age'] = age_by_class.transform(_fill_with_group_mean)

# Class labels (the 'Survived' column).
labels = titanic_data['Survived']

# Min-max normalise the numeric features in place.
# NOTE(review): the scaler (and the age means above) are fitted on the whole
# dataset before any train/test split, so held-out rows influence the
# preprocessing -- confirm this is acceptable for the reported error ratios.
scaled_columns = ['Age', 'Pclass', 'Fare']
scaler = MinMaxScaler()
titanic_data[scaled_columns] = scaler.fit_transform(titanic_data[scaled_columns])

# Feature matrix and target vector shared by every validation scheme.
feature_columns = ['Sex_male'] + scaled_columns
X = titanic_data[feature_columns]
y = titanic_data['Survived']

# k-NN classifier with k=3; this one estimator object is refitted by each
# validation scheme in the script.
knn = KNeighborsClassifier(n_neighbors=3)

# Hold-out validation: 70% of the rows train the model, the remaining 30% are
# used for evaluation (random_state fixes the split for reproducibility).
holdout_split = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = holdout_split
y_pred = knn.fit(X_train, y_train).predict(X_test)
holdout_accuracy = accuracy_score(y_test, y_pred)
# Error ratio is the complement of accuracy.
error_ratio_holdout = 1 - holdout_accuracy

# 10-fold cross-validation: refit the classifier on each training partition
# and record the error ratio (1 - accuracy) measured on the held-out fold.
kf = KFold(n_splits=10)
error_ratios_kfold = []
for fit_rows, eval_rows in kf.split(X):
    knn.fit(X.iloc[fit_rows], y.iloc[fit_rows])
    fold_predictions = knn.predict(X.iloc[eval_rows])
    fold_accuracy = accuracy_score(y.iloc[eval_rows], fold_predictions)
    error_ratios_kfold.append(1 - fold_accuracy)

# Leave-one-out validation: every split holds out a single row, so each
# recorded error ratio is either 0 (correct) or 1 (misclassified).
loo = LeaveOneOut()
error_ratios_loo = []
for kept_rows, left_out_row in loo.split(X):
    knn.fit(X.iloc[kept_rows], y.iloc[kept_rows])
    loo_prediction = knn.predict(X.iloc[left_out_row])
    loo_accuracy = accuracy_score(y.iloc[left_out_row], loo_prediction)
    error_ratios_loo.append(1 - loo_accuracy)

# Report the error ratio of each validation scheme; the K-Fold and LOO figures
# are the mean over all of their splits.
for caption, ratio in (
    ("Error Ratio Holdout Method:", error_ratio_holdout),
    ("Average Error Ratio K-Fold:", np.mean(error_ratios_kfold)),
    ("Error Ratio LOO:", np.mean(error_ratios_loo)),
):
    print(caption, ratio)

     Pclass        Age
0         3  22.000000
1         1  38.000000
2         3  26.000000
3         1  35.000000
4         3  35.000000
5         3  25.140620
6         1  54.000000
7         3   2.000000
8         3  27.000000
9         2  14.000000
10        3   4.000000
11        1  58.000000
12        3  20.000000
13        3  39.000000
14        3  14.000000
15        2  55.000000
16        3   2.000000
17        2  29.877630
18        3  31.000000
19        3  25.140620
20        2  35.000000
21        2  34.000000
22        3  15.000000
23        1  28.000000
24        3   8.000000
25        3  38.000000
26        3  25.140620
27        1  19.000000
28        3  25.140620
29        3  25.140620
30        1  40.000000
31        1  38.233441
32        3  25.140620
33        2  66.000000
34        1  28.000000
35        1  42.000000
36        3  25.140620
37        3  21.000000
38        3  18.000000
39        3  14.000000
40        3  40.000000
41        2  27.000000
42        3

Error Ratio Holdout Method: 0.18656716417910446
Average Error Ratio K-Fold: 0.18068664169787768
Error Ratio LOO: 0.1739618406285073
