In [8]:
import numpy as np
import pandas as pd

# Machine Learning - KNN Stuff
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

from sklearn import metrics 
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [2]:
# Load the test data
train_data = pd.read_csv("train.csv")
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Preprocess the train data
train_data.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True) # Drop 
train_data['Sex'] = train_data['Sex'].map({'male': 0, 'female': 1})
train_data['Embarked'] = train_data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

# Fill N/A Values
train_data.fillna(train_data.mean(), inplace=True) 

train_data # display 

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,0,22.000000,1,0,7.2500,0.0
1,2,1,1,1,38.000000,1,0,71.2833,1.0
2,3,1,3,1,26.000000,0,0,7.9250,0.0
3,4,1,1,1,35.000000,1,0,53.1000,0.0
4,5,0,3,0,35.000000,0,0,8.0500,0.0
...,...,...,...,...,...,...,...,...,...
886,887,0,2,0,27.000000,0,0,13.0000,0.0
887,888,1,1,1,19.000000,0,0,30.0000,0.0
888,889,0,3,1,29.699118,1,2,23.4500,0.0
889,890,1,1,0,26.000000,0,0,30.0000,1.0


* **PassengerId**: ID for each passenger
* **Survived**: See if the passenger survived
* **Pclass**: 1st, 2nd, and 3rd class
* **Sex**: 0-male, 1-female
* **Age**: Age of passenger
* **SibSp**: Number of siblings/spouse for passenger
* **Parch**: Number of parents/children for passenger
* **Fare**: Price of ticket
* **Embarked**: Location of Departure, 'S': 0, 'C': 1, 'Q': 2

In [4]:
# Features and Target Variables
# Features = Pclass, Sex, Age, SibSp, Parch, Fare, Embarked
X = train_data.drop(columns=['PassengerId', 'Survived'])
y = train_data['Survived']

In [5]:
X # Features

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,0,22.000000,1,0,7.2500,0.0
1,1,1,38.000000,1,0,71.2833,1.0
2,3,1,26.000000,0,0,7.9250,0.0
3,1,1,35.000000,1,0,53.1000,0.0
4,3,0,35.000000,0,0,8.0500,0.0
...,...,...,...,...,...,...,...
886,2,0,27.000000,0,0,13.0000,0.0
887,1,1,19.000000,0,0,30.0000,0.0
888,3,1,29.699118,1,2,23.4500,0.0
889,1,0,26.000000,0,0,30.0000,1.0


In [6]:
y # Target Variable

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [9]:
# Standarize Features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# K-Fold
kf = KFold(n_splits=5, shuffle=True, random_state=20)

for i in range(1, 11):
    print(f"\nKNN with {i} neighbors")

    # Create a KNN classifier with i neighbors
    knn = KNeighborsClassifier(n_neighbors=i)

    # Keep track of accuracy and confusion matrix for each fold
    fold_accuracies = []
    fold_confusion_matrices = []

    # Perform cross-validation
    for train_index, test_index in kf.split(X_scaled):
        X_train_fold, X_test_fold = X_scaled[train_index], X_scaled[test_index]
        y_train_fold, y_test_fold = y[train_index], y[test_index]

        # Fit the model on the current fold
        knn.fit(X_train_fold, y_train_fold)

        # Predict on the test fold
        y_pred_fold = knn.predict(X_test_fold)

        # Calculate accuracy
        accuracy = accuracy_score(y_test_fold, y_pred_fold)
        fold_accuracies.append(accuracy)

        # Calculate confusion matrix
        c_matrix = confusion_matrix(y_test_fold, y_pred_fold)
        fold_confusion_matrices.append(c_matrix)

        # Output confusion matrix for this fold
        print(f"Confusion Matrix for fold:")
        print(c_matrix)

    # Output the average accuracy for this neighbor count
    print(f"Average Accuracy: {np.mean(fold_accuracies) * 100:.2f}%")




KNN with 1 neighbors
Confusion Matrix for fold:
[[94 18]
 [22 45]]
Confusion Matrix for fold:
[[83 30]
 [15 50]]
Confusion Matrix for fold:
[[76 24]
 [24 54]]
Confusion Matrix for fold:
[[84 32]
 [16 46]]
Confusion Matrix for fold:
[[79 29]
 [25 45]]
Average Accuracy: 73.62%

KNN with 2 neighbors
Confusion Matrix for fold:
[[105   7]
 [ 27  40]]
Confusion Matrix for fold:
[[101  12]
 [ 21  44]]
Confusion Matrix for fold:
[[93  7]
 [33 45]]
Confusion Matrix for fold:
[[107   9]
 [ 20  42]]
Confusion Matrix for fold:
[[98 10]
 [31 39]]
Average Accuracy: 80.13%

KNN with 3 neighbors
Confusion Matrix for fold:
[[100  12]
 [ 20  47]]
Confusion Matrix for fold:
[[93 20]
 [17 48]]
Confusion Matrix for fold:
[[84 16]
 [19 59]]
Confusion Matrix for fold:
[[98 18]
 [17 45]]
Confusion Matrix for fold:
[[91 17]
 [22 48]]
Average Accuracy: 80.02%

KNN with 4 neighbors
Confusion Matrix for fold:
[[106   6]
 [ 24  43]]
Confusion Matrix for fold:
[[100  13]
 [ 27  38]]
Confusion Matrix for fold:
[[92

In [9]:
# Load and Proprocess Data: Same as train.csv
test_data = pd.read_csv("test.csv")
test_data.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
test_data['Sex'] = test_data['Sex'].map({'male': 0, 'female': 1})
test_data['Embarked'] = test_data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
test_data.fillna(test_data.mean(), inplace=True) 
X_test_final = test_data.drop(columns=['PassengerId'])

# Standardize and Predict
X_test_final_scale = scaler.transform(X_test_final)
predictions = knn.predict(X_test_final_scale)

# CSV output
output = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': predictions})
output.to_csv('knn6.csv', index=False)