In [63]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import accuracy_score

# Step 1: Load datasets titanic.csv
dataset = pd.read_csv('titanic.csv')
dataset.head()

# Mengubah nilai 'Sex' menjadi numerik
dataset['Sex'] = dataset['Sex'].map({'male': 0, 'female': 1})

# Step 2: Extract relevant features from the dataset
data = dataset[['Sex', 'Age', 'Pclass', 'Fare', 'Survived']]

# Step 3 and 4: Extract training data and labels with non-null Age values
train_data = data[data['Age'].notnull()][['Sex', 'Pclass', 'Fare', 'Survived']]
train_label = data[data['Age'].notnull()]['Age']

# Step 5: Extract test data with null Age values
test_data = data[data['Age'].isnull()][['Sex', 'Pclass', 'Fare', 'Survived']]

# Step 6 and 7: Normalize train_data and test_data using Min-Max scaling
scaler = MinMaxScaler()
train_data_scaled = scaler.fit_transform(train_data)
test_data_scaled = scaler.transform(test_data)

# Step 8: Perform 3-NN classification
knn_age = KNeighborsRegressor(n_neighbors=3)
knn_age.fit(train_data_scaled, train_label)
age_result = knn_age.predict(test_data_scaled)

# Step 9: Fill missing Age values in the dataset using class_result
dataset.loc[dataset['Age'].isnull(), 'Age'] = age_result

# Step 10: Load datasets titanic_test.csv
test_dataset = pd.read_csv('titanic_test.csv')
# Mengubah nilai 'Sex' menjadi numerik
test_dataset['Sex'] = test_dataset['Sex'].map({'male': 0, 'female': 1})

# Step 11 and 12: Extract training data and labels from the updated dataset
train_data = dataset[['Sex', 'Age', 'Pclass', 'Fare']]
train_label = dataset['Survived']

# Step 13: Extract test data and remove rows with missing values
dropped_rows = test_dataset[['Sex', 'Age', 'Pclass', 'Fare']][test_dataset[['Sex', 'Age', 'Pclass', 'Fare']].isnull().any(axis=1)]
# print("Dropped rows indices:", dropped_rows.index.tolist())
test_data = test_dataset[['Sex', 'Age', 'Pclass', 'Fare']].dropna()

# Step 14: Load datasets titanic_testlabel.csv
test_label = pd.read_csv('titanic_testlabel.csv')
# Get corresponding test labels for the rows that were not dropped
remaining_test_labels = test_label.drop(dropped_rows.index)

# Step 15 and 16: Normalize train_data and test_data using Min-Max scaling
train_data_scaled = scaler.fit_transform(train_data)
test_data_scaled = scaler.transform(test_data)

# Step 17: Perform 3-NN classification for the final test data
knn_survived = KNeighborsClassifier(n_neighbors=3)
knn_survived.fit(train_data_scaled, train_label)
class_result_final = knn_survived.predict(test_data_scaled)

# Step 18: Calculate errors and error ratio
error = len(remaining_test_labels[remaining_test_labels['Survived'] != class_result_final])
error_ratio = (error / len(test_label)) * 100

# Output error and error ratio
print("Total errors:", error)
print("Error ratio:", error_ratio, "%")


Total errors: 53
Error ratio: 12.679425837320574 %
