In [1]:
import numpy as np
import pandas as pd

# Machine Learning - KNN Stuff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

from sklearn import metrics 
import os
# Print the full path of every file found anywhere under the Kaggle input directory
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

In [3]:
# Read the training set (train.csv) and preview its first five rows
train_data = pd.read_csv("train.csv")
train_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Preprocess the train data
# Drop the free-text / identifier-like columns that the KNN model does not use.
train_data = train_data.drop(columns=['Name', 'Ticket', 'Cabin'])

# Encode the categorical columns as integers.
train_data['Sex'] = train_data['Sex'].map({'male': 0, 'female': 1})

# Embarked is a nominal category, so a missing port is filled with the most
# common port *before* encoding. Mean-filling the encoded column would produce
# a fractional code that does not correspond to any real port.
embarked_mode = train_data['Embarked'].mode().iloc[0]
train_data['Embarked'] = (
    train_data['Embarked']
    .fillna(embarked_mode)
    .map({'S': 0, 'C': 1, 'Q': 2})
)

# Fill the remaining N/A values (numeric columns such as Age) with the column mean.
train_data = train_data.fillna(train_data.mean())

train_data # display

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,0,22.000000,1,0,7.2500,0.0
1,2,1,1,1,38.000000,1,0,71.2833,1.0
2,3,1,3,1,26.000000,0,0,7.9250,0.0
3,4,1,1,1,35.000000,1,0,53.1000,0.0
4,5,0,3,0,35.000000,0,0,8.0500,0.0
...,...,...,...,...,...,...,...,...,...
886,887,0,2,0,27.000000,0,0,13.0000,0.0
887,888,1,1,1,19.000000,0,0,30.0000,0.0
888,889,0,3,1,29.699118,1,2,23.4500,0.0
889,890,1,1,0,26.000000,0,0,30.0000,1.0


* **PassengerId**: ID for each passenger
* **Survived**: Whether the passenger survived (0 = no, 1 = yes)
* **Pclass**: 1st, 2nd, and 3rd class
* **Sex**: 0-male, 1-female
* **Age**: Age of passenger
* **SibSp**: Number of siblings/spouses the passenger had aboard
* **Parch**: Number of parents/children the passenger had aboard
* **Fare**: Price of ticket
* **Embarked**: Port of embarkation, encoded as 'S': 0, 'C': 1, 'Q': 2

In [5]:
# Separate the label from the model inputs.
# Label:  Survived
# Inputs: Pclass, Sex, Age, SibSp, Parch, Fare, Embarked
#         (PassengerId is left out of the inputs as well)
y = train_data['Survived']
X = train_data.drop(['PassengerId', 'Survived'], axis=1)

In [6]:
X # Feature matrix: train_data without the PassengerId and Survived columns

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,0,22.000000,1,0,7.2500,0.0
1,1,1,38.000000,1,0,71.2833,1.0
2,3,1,26.000000,0,0,7.9250,0.0
3,1,1,35.000000,1,0,53.1000,0.0
4,3,0,35.000000,0,0,8.0500,0.0
...,...,...,...,...,...,...,...
886,2,0,27.000000,0,0,13.0000,0.0
887,1,1,19.000000,0,0,30.0000,0.0
888,3,1,29.699118,1,2,23.4500,0.0
889,1,0,26.000000,0,0,30.0000,1.0


In [7]:
y # Target vector: the Survived column of train_data

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [26]:
# FIND THE BEST NEIGHBOR

# Split Data: 90% Train, 10% Test on Train.csv
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=20)

# Standardize once, outside the loop: the scaled data does not depend on the
# number of neighbors. The scaler is fit on the training split only and then
# applied to the hold-out split.
scaler = StandardScaler()
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)

# Prints out the KNN model performance for 1-20 neighbors
for i in range(1, 21):
    print(str(i) + " neighbor KNN model.")

    # Train KNN model
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train_scale, y_train)

    # Predict on the 10% Test Set (train.csv)
    y_pred = knn.predict(X_test_scale)

    # Calculate FP, FN, Accuracy.
    # labels=[0, 1] pins the matrix layout (rows = actual, columns = predicted),
    # so [0, 1] is always the false-positive cell and [1, 0] the false-negative
    # cell, even if a class is absent from y_test or y_pred.
    accuracy = accuracy_score(y_test, y_pred)
    c_matrix = metrics.confusion_matrix(y_test, y_pred, labels=[0, 1])
    FP = c_matrix[0, 1]
    FN = c_matrix[1, 0]

    # Output results
    print(f"Model Accuracy: {accuracy * 100:.2f}%")
    print(f"Confusion Matrix:\n{c_matrix}")
    print(f"False Positives (FP): {FP}")
    print(f"False Negatives (FN): {FN}")



1 neighbor KNN model.
Model Accuracy: 85.56%
Confusion Matrix:
[[50  7]
 [ 6 27]]
False Positives (FP): 7
False Negatives (FN): 6
2 neighbor KNN model.
Model Accuracy: 84.44%
Confusion Matrix:
[[53  4]
 [10 23]]
False Positives (FP): 4
False Negatives (FN): 10
3 neighbor KNN model.
Model Accuracy: 82.22%
Confusion Matrix:
[[49  8]
 [ 8 25]]
False Positives (FP): 8
False Negatives (FN): 8
4 neighbor KNN model.
Model Accuracy: 86.67%
Confusion Matrix:
[[54  3]
 [ 9 24]]
False Positives (FP): 3
False Negatives (FN): 9
5 neighbor KNN model.
Model Accuracy: 81.11%
Confusion Matrix:
[[47 10]
 [ 7 26]]
False Positives (FP): 10
False Negatives (FN): 7
6 neighbor KNN model.
Model Accuracy: 85.56%
Confusion Matrix:
[[53  4]
 [ 9 24]]
False Positives (FP): 4
False Negatives (FN): 9
7 neighbor KNN model.
Model Accuracy: 86.67%
Confusion Matrix:
[[50  7]
 [ 5 28]]
False Positives (FP): 7
False Negatives (FN): 5
8 neighbor KNN model.
Model Accuracy: 85.56%
Confusion Matrix:
[[52  5]
 [ 8 25]]
False 

In [24]:
# Set the Neighbor and Fit (OPTIMAL)
# Choose k from the data instead of reusing the search loop's counter `i`:
# after a loop, `i` only holds the last value tried, not the best one, and its
# value depends on which cells happened to run before this one.
candidate_ks = range(1, 21)
holdout_accuracy = {
    k: KNeighborsClassifier(n_neighbors=k)
    .fit(X_train_scale, y_train)
    .score(X_test_scale, y_test)  # mean accuracy on the hold-out split
    for k in candidate_ks
}
# max() returns the first maximum it meets, so ties resolve to the smallest k.
best_k = max(holdout_accuracy, key=holdout_accuracy.get)
print(f"Using k={best_k} (hold-out accuracy {holdout_accuracy[best_k] * 100:.2f}%)")

knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train_scale, y_train)

In [25]:
# Load and preprocess data: same column handling as train.csv
test_data = pd.read_csv("test.csv")
test_data = test_data.drop(columns=['Name', 'Ticket', 'Cabin'])
test_data['Sex'] = test_data['Sex'].map({'male': 0, 'female': 1})
test_data['Embarked'] = test_data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

# Impute with statistics taken from the training features (X), not from the
# test set itself, so test rows are filled with the same values the scaler and
# model saw during fitting. Embarked is a category code, so it uses the most
# common training code rather than a mean.
fill_values = X.mean()
fill_values['Embarked'] = X['Embarked'].mode().iloc[0]
test_data = test_data.fillna(fill_values)

# Select the features in exactly the column order used to fit the scaler;
# this also leaves PassengerId out of the model inputs.
X_test_final = test_data[X.columns]

# Standardize and Predict
X_test_final_scale = scaler.transform(X_test_final)
predictions = knn.predict(X_test_final_scale)

# CSV output
output = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': predictions})
output.to_csv('knn6.csv', index=False)